In [5]:
import glob
from collections import defaultdict
from pprint import pprint
import pickle
import operator
import os
import nltk
import sklearn
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.util import ngrams
import random
import re

In [2]:
# First I reproduce Riloff et al.'s bootstrapping experiments exactly.
# The gist of it is: 
#    - Sarcasm in a sentence can be modeled as 
#         the presence of a positive sentiment followed by a negative situation.
#    - Self-labeled sarcastic tweets, marked #sarcasm or #sarcastic 
#         were used in a bootstrapping process
#         to automatically discover positive and negative phrases.
# I start with 
# a) reproducing this bootstrappping method exactly
# b) using other markers of sarcasm such as #yeahright, #not 
#        to discover the positive and negative phrases
# Possible next step:
#     Is the bootstrapping method invariant to initialization?
#         It should be if it is robust. Try with a different initial word than love.

In [4]:
# "positive sentiments that are expressed as a 
#    verb phrase or as a predicative expression 
#    (predicate adjective or predicate nominal),
#    and negative activities or states that can be a
#    complement to a verb phrase."

# "we try to recognize these syntactic structures heuristically using
#     only part-of-speech tags and proximity"

In [25]:
def bootstrap(seedword,tweets):
    '''
    Input: A seed positive word and a set of tweets
    Output: A set of positive sentiments and negative situations that indicate a possibly sarcastic tweet

    negphrases_count1 = 0
    posphrases_count1 = 1
    
    new_neg_phrases = learn_new_negative_phrases(seedword,tweets)
    neg_phrases_count2 = len(new_neg_phrases) + neg_phrases_count1
    
    while neg_phrases_count1 != neg_phrases_count2 and pos_phrases_count1
    
    new_pos_phrases = []
    for neg_phrase in new_neg_phrases:
        new_pos_phrases.extend(learn_new_positive_phrases(neg_phrase,tweets))
        
    for pos_phrase in new_pos_phrases
    pprint(neg_best_candidates)
    '''    
    new_neg = learn_new_phrases(seedword,'neg',tweets)

    #while (new keep getting added)

    new_pos = []
    for p in new_neg:
        new_pos.extend(learn_new_phrases(p,'pos',tweets))
    
    for p in new_pos:
        new_neg.extend(learn_new_phrases(p,'pos',tweets))
    
    
def learn_new_phrases(phrases,pos_or_neg,tweets):

    print('Looking for {} phrases'.format(pos_or_neg))

    candidates_sarc = defaultdict(int)
    candidates_no_sarc = defaultdict(int)

    for tweet in tweets:
        for seed in phrases:
            if pos_or_neg == 'pos':
                candidate_phrases = get_possibly_pos_phrase(seed,tweet)
            elif pos_or_neg == 'neg':
                candidate_phrases = get_possibly_neg_phrase(seed,tweet)

            for phrase in candidate_phrases:
                if pos_or_neg == 'pos':
                    res = pos_phrase_with_desired_syntactic_structure(phrase)
                elif pos_or_neg == 'pos_pred':
                    res = pos_pred_phrase_with_desired_syntactic_structure(phrase)
                elif pos_or_neg == 'neg':
                    res = neg_phrase_with_desired_syntactic_structure(phrase)

                for p in res:
                    if tweet['label'] == 'SARCASM':
                        candidates_sarc[p] += 1
                    elif tweet['label'] == 'NOT_SARCASM':
                        candidates_no_sarc[p] += 1
    
    return best_candidates(candidates_sarc,candidates_no_sarc,pos_or_neg)
    
    
def best_candidates(sarc_phrase_counts,not_sarc_phrase_counts):
    prob = {}
    for phrase,count in sarc_phrase_counts.items():
        if count >= 3:
            if phrase not in not_sarc_phrase_counts:
                #TODO: Is this the right thing to do
                #  or can there be more nuance?
                prob[phrase] = 1
            else:
                prob[phrase] = count/not_sarc_phrase_counts[phrase]
        else:
            print("Discarded cuz too few ({})".format(count),phrase)
    prob = sorted(prob.items(), key=operator.itemgetter(1), reverse=True)
    res = [x[0] for x in prob[:20] if x[1]>=0.8]
    print("Discarded cuz not probable enough:",prob[len(res):])
    return res
    
        
#The 7 POS bigram patterns are:
#  V+V, V+ADV, ADV+V, “to”+V,
#  V+NOUN, V+PRO, V+ADJ
def has_POS_bigram_pattern(tokens,tags):
    if tokens[0] == 'to' and tags[1] == 'V':
        return True
    if tags[0] == 'R' and tags[1] in ['V','T']:
        return True
    if tags[0] in ['V','T'] and tags[1] in ['N','O','^','S','Z','A','V','R','T']:
        return True

'''
The 20 POS trigram patterns are designed to capture 
    seven general types of verb phrases: 
        #verb and adverb mixtures,
        #an infinitive VP that includes an adverb, 
        a verb phrase followed by a noun phrase, 
        a verb phrase followed by a prepositional phrase,
        a verb followed by an adjective phrase, 
        #or an infinitive VP followed by an adjective, noun, or pronoun.
        
Made my own thing up with this as a guide:
    http://examples.yourdictionary.com/verb-phrase-examples.html
    
TODO: This certainly needs more attention
'''
def has_POS_trigram_pattern(tokens,tags):
    third_tag_candidates = set(['N','O','^','S','Z','A','V','R','T','P'])
    if tokens[0] == 'to' and tags[1] == 'V' and tags[2] in third_tag_candidates:
        return True
    if tags[0] == 'V' and tags[1] == 'V' and tags[2] in third_tag_candidates:
        return True
    
def pos_pred_phrase_with_desired_syntactic_structure(phrase):
    '''
    To learn predicative expressions, we use 24 copular
    verbs from Wikipedia and their inflections'''
    copular_verbs = {'act','acted','acting','acts','appear','appeared','appearing','appears','are','be','became','become','becomes','becoming','bled','bleed','bleeding','bleeds','came','come','comes','coming','constitute','constituted','constitutes','constituting','die','died','dies','dying','end','ended','ending','ends','fall','falling','falls','feel','feeling','feels','fell','felt','freeze','freezes','freezing','froze','get','gets','getting','go','goes','going','got','grew','grow','growing','grows','is','keep','keeping','keeps','kept','look','looked','looking','looks','prove','proved','proves','proving','ran','remain','remained','remaining','remains','run','running','runs','seem','seemed','seeming','seems','shine','shines','shining','shone','smell','smelled','smelling','smells','sound','sounded','sounding','sounds','stay','stayed','staying','stays','taste','tasted','tastes','tasting','turn','turned','turning','turns','was','wax','waxed','waxes','waxing','went','were'}
    
    '''
    We
    extract positive sentiment candidates by extracting
    1-grams, 2-grams, and 3-grams that appear immediately
    after a copular verb and occur within 5 words
    of the negative situation phrase, on either side. 
    '''
    
    
    '''
    We then apply POS patterns to identify n-grams
    that correspond to predicate adjective and predicate
    nominal phrases. For predicate adjectives, we retain
    ADJ and ADV+ADJ n-grams. We use a few
    heuristics to check that the adjective is not part of a
    noun phrase (e.g., we check that the following word
    is not a noun). For predicate nominals, we retain
    ADV+ADJ+N, DET+ADJ+N and ADJ+N n-grams.
    We excluded noun phrases consisting only of nouns
    because they rarely seemed to represent a sentiment.
    The sentiment in predicate nominals was usually
    conveyed by the adjective
    '''

#TODO: This could use a 4-gram pattern for better accuracy
def neg_phrase_with_desired_syntactic_structure(phrase):
    # The inverted order should take care of subsumption
    
    if len(phrase['tokens']) == 3 and has_POS_trigram_pattern(phrase['tokens'],phrase['tags']):
        return [tuple(phrase['tokens'][:3])]
    
    if len(phrase['tokens']) >= 2 and has_POS_bigram_pattern(phrase['tokens'][:2],phrase['tags'][:2]):
        return [tuple(phrase['tokens'][:2])]

    if len(phrase['tokens']) >= 1 and phrase['tags'][0] == 'V':
        return [tuple(phrase['tokens'][:1])]

    return [tuple()]
    
    
def get_possibly_neg_phrase(pos_phrase,tweetobj):
    '''
    In: a positive sentiment word or phrase and a tweet object
    Out: 1,2,3-grams of words on the right of the given phrase
    '''
    out = []
    tokens = tweetobj['tokens']
    postags = tweetobj['tags']
    len(poss)
    for i,token in enumerate(tokens):
        if token == pos_phrase:
            subout = {'tokens':[],'tags':[]}
            j = 1
            while j + i < len(tokens) and j <= 3:
                subout['tokens'].append(tokens[i+j])
                subout['tags'].append(postags[i+j])
                j += 1
            out.append(subout)
    return out


def get_possibly_pos_phrase(neg_phrase,tweetobj):
    '''
    In: a positive sentiment word or phrase and a tweet object
    Out: 1,2-grams of words on the left of the given phrase
    '''
    out = []
    tokens = tweetobj['tokens']
    postags = tweetobj['tags']
    for i,token in enumerate(tokens):
        if token == pos_phrase:
            subout = {'tokens':[],'tags':[]}
            j = 1
            while j + i < len(tokens) and j <= 3:
                subout['tokens'].append(tokens[i+j])
                subout['tags'].append(postags[i+j])
                j += 1
            out.append(subout)
    return out

In [15]:
def test_get_possibly_neg_phrase():
    for tweet in original_tweets:
        if tweet['label'] == 'SARCASM':
            res = get_possibly_neg_phrase('love',tweet)
            if res:
                print(res)

#test_get_possibly_neg_phrase()

bootstrap('love',original_riloff_tweets)

Discarded cuz too few (1) working
Discarded cuz too few (2) doing
Discarded cuz too few (1) being called
Discarded cuz too few (2) being
Discarded cuz too few (1) reading
Discarded cuz too few (1) getting
Discarded cuz too few (1) being talked down
Discarded cuz too few (1) getting lied to
Discarded cuz too few (2) getting up
Discarded cuz too few (1) missing
Discarded cuz too few (1) to see
Discarded cuz too few (1) waking
Discarded cuz too few (1) being up
Discarded cuz too few (2) feeling
Discarded cuz too few (1) going
Discarded cuz too few (1) getting yelled at
Discarded cuz too few (1) being able
Discarded cuz too few (1) having
Discarded cuz too few (1) freaking being ignored
Discarded cuz too few (1) to do
Discarded cuz too few (1) being left to
Discarded cuz too few (1) waiting
Discarded cuz too few (1) not sleeping
Discarded cuz not probable enough: []
['waking up']


In [149]:
#START: Creating the Riloff dataset and dumping it in a ready to use format

original_riloff_tweets = []
with open('sarcasm-annos-emnlp13-tweets.tsv') as f:
    for line in f:
        try:
            tweetid,label,tweet = line.strip().split('\t')
            original_riloff_tweets.append({'id':int(tweetid),'label':label,'tweet':tweet})
        except ValueError:
            print(line)
            
original_riloff_tweets.append({'appeasing_dummy':None})
            
i = 0
original_riloff_tweets[i].update({'tokens':[],'tags':[],'conf':[]})
with open('riloff-tweets.tagged') as f:
    for line in f:
        if line.strip() == '':
            i += 1
            original_riloff_tweets[i].update({'tokens':[],'tags':[],'conf':[]})
        else:
            word,tag,conf = line.strip().split()
            original_riloff_tweets[i]['tokens'].append(word)
            original_riloff_tweets[i]['tags'].append(tag)
            original_riloff_tweets[i]['conf'].append(conf)
            
pickle.dump(original_riloff_tweets, open('data/march22/riloff-tokenized-and-tagged.pkl','wb'))

#END: Creating the Riloff dataset and dumping it in a ready to use format

In [6]:
original_riloff_tweets = pickle.load(open('data/march22/riloff-tokenized-and-tagged.pkl','rb'))

In [196]:
seenalready = set()
with open('data/march22/shereen-sarc.orig','w') as fw:
    for filename in glob.iglob('data/shereen/sarcasm-more-week*.tweet'):
        #print(filename)
        with open(filename) as f:
            for line in f:
                if line not in seenalready:
                    fw.write(line)
                    seenalready.add(line)
                    #They're all unique, turns out.
                    #And there's 75680 of them.
    with open('data/shereen/sarcasm.tweet') as f:
        for line in f:
            if line not in seenalready:
                fw.write(line)
                seenalready.add(line)
                #There might be one here that is a repeat, considering the entire set
                
print(len(seenalready))
!wc -l data/march22/shereen-sarc.orig

seenalready = set()
repeat_count = 0
with open('data/march22/shereen-not_sarc.orig','w') as fw:
    for filename in glob.iglob('data/shereen/random-*not-sarc*.pkl'):
        with open(filename,'rb') as f:
            for line in pickle.load(f):
                if line not in seenalready:
                    fw.write(line+'\n')
                    seenalready.add(line)
                else:
                    repeat_count += 1
    for filename in glob.iglob('data/shereen/random*.tweet'):
        with open(filename) as f:
            for line in f:
                if line not in seenalready:
                    fw.write(line)
                    seenalready.add(line)
                else:
                    repeat_count += 1

print(repeat_count)
print(len(seenalready))
!wc -l data/march22/shereen-not_sarc.orig
# This one has plenty repeats, but nearly 2m non-repeating tweets nontheless.

In [16]:
original_shereen_sarc_tweets = []
with open('data/march22/shereen-sarc.orig') as f:
    for line in f:
        try:
            tweet = line.strip()
            original_shereen_sarc_tweets.append({'label':'SARCASM','tweet':tweet})
        except ValueError:
            print(line)
            
original_shereen_sarc_tweets.append({'appeasing_dummy':None})
            
i = 0
original_shereen_sarc_tweets[i].update({'tokens':[],'tags':[],'conf':[]})
with open('data/march22/shereen-sarc.tagged') as f:
    for line in f:
        if line.strip() == '':
            i += 1
            original_shereen_sarc_tweets[i].update({'tokens':[],'tags':[],'conf':[]})
        else:
            word,tag,conf = line.strip().split()
            try:
                original_shereen_sarc_tweets[i]['tokens'].append(word)
                original_shereen_sarc_tweets[i]['tags'].append(tag)
                original_shereen_sarc_tweets[i]['conf'].append(conf)
            except KeyError:
                print(line)
                
                
original_shereen_not_sarc_tweets = []
with open('data/march22/shereen-not_sarc.cleaned.orig') as f:
    for line in f:
        try:
            tweet = line.strip()
            original_shereen_not_sarc_tweets.append({'label':'NOT_SARCASM','tweet':tweet})
        except ValueError:
            print(line)
            
original_shereen_not_sarc_tweets.append({'appeasing_dummy':None})
            
i = 0
original_shereen_not_sarc_tweets[i].update({'tokens':[],'tags':[],'conf':[]})
with open('data/march22/shereen-not_sarc.cleaned.tagged') as f:
    for line in f:
        if line.strip() == '':
            i += 1
            original_shereen_not_sarc_tweets[i].update({'tokens':[],'tags':[],'conf':[]})
        else:
            word,tag,conf = line.strip().split()
            original_shereen_not_sarc_tweets[i]['tokens'].append(word)
            original_shereen_not_sarc_tweets[i]['tags'].append(tag)
            original_shereen_not_sarc_tweets[i]['conf'].append(conf)

In [21]:
pickle.dump(original_shereen_not_sarc_tweets+original_shereen_sarc_tweets,open('data/march22/shereen-tokenized-and-tagged.pkl','wb'))

In [30]:
original_shereen_tweets = pickle.load(open('data/march22/shereen-tokenized-and-tagged.pkl','rb'))

In [26]:
random

1965169

In [31]:
len(original_shereen_tweets)

2044898

In [29]:
79729*4

318916

In [32]:
# So while Riloff's dataset had a 20:80 split of sarc:no_sarc, I had started with a 4:96 split, unfortunately.
# I randomly sampled from the 1.9m no_sarc tweets to create a smaller no_sarc dataset that would cause
#   a roughly 20:80 split.

In [35]:
ls -lrt data/scraped/

total 17688
-rwxrwxrwx  1 kini  staff  4537730 Mar  7  2016 [31mscraped-with-JUSTKIDDING-until-30Jun2015.pkl[m[m*
-rwxrwxrwx  1 kini  staff  4512165 Mar  7  2016 [31mscraped-with-NOT-until-28Dec2015.pkl[m[m*
-rwxrwxrwx  1 kini  staff      784 Mar  7  2016 [31mthescript.py[m[m*


In [37]:
ls data/search-api/*

data/search-api/22ndMarch2016:
[31m#irony.pkl[m[m*
[31m#jk.pkl[m[m*
[31m#justkidding.pkl[m[m*
[31m#lol.pkl[m[m*
[31m#not.pkl[m[m*
[31m#sarcasm.pkl[m[m*
[31m#sarcastic.pkl[m[m*
[31m#sarcastictweet.pkl[m[m*
[31m#yaright.pkl[m[m*
[31m#yeahright.pkl[m[m*
[31m#yearight.pkl[m[m*
[30m[43m__pycache__[m[m/
[31maccesskeys.py[m[m*
[31msarcasm+yeahright+justkidding-22ndMarch-9861.pkl[m[m*
[31msarcasm+yeahright+justkidding-9266[m[m*
[31mtwittersearch.log[m[m*
[31mtwittersearch.py[m[m*

data/search-api/5thMarch2016:
[31mirony92.pkl[m[m*
[31mjk1796.pkl[m[m*
[31mjustkidding2603.pkl[m[m*
[31mlol494.pkl[m[m*
[31mnot391.pkl[m[m*
[31msarcasm+yeahright+justkidding-10347[m[m*
[31msarcasm+yeahright+justkidding-5thMarch-10345.pkl[m[m*
[31msarcasm7178.pkl[m[m*
[31msarcastic96.pkl[m[m*
[31msarcastictweet142.pkl[m[m*
[31myaright35.pkl[m[m*
[31myeahright566.pkl[m[m*
[31myearight47.pkl[m[m*


In [56]:
contents = {}
tweets = []
alreadyseen = set()
with open('data/march22/scraped-sarc.orig','w') as fw:
    for filename in glob.iglob('data/search-api/22ndMarch2016/*.pkl'):
        contents[filename] = pickle.load(open(filename,'rb'))
        if type(contents[filename][0]) == tuple:
            for tup in contents[filename]:
                tw_id, tweet = tup
                if tw_id not in alreadyseen:
                    alreadyseen.add(tw_id)
                    tweets.append({'label':'SARCASM','tweet':tweet})
                    fw.write('{}\t{}\n'.format(tw_id,tweet.replace('\n',' ')))
        elif type(contents[filename][0]) == dict:
            for d in contents[filename]:
                if d['id'] not in alreadyseen:
                    alreadyseen.add(d['id'])
                    temp = {'id':d['id'],'label':'SARCASM','tweet':d['text']}
                    fn = os.path.basename(filename)
                    if fn[0] == '#':
                        temp['#type'] = fn[:-4]
                    tweets.append(temp)
                    fw.write('{}\t{}\n'.format(d['id'], d['text'].replace('\n',' ')))

In [69]:

for key in contents.keys():
    print(remove_trailing_nums(os.path.basename(key)[:-4]))

irony
#yaright
justkidding
not
#yeahright
#yearight
#jk
jk
#lol
sarcasm+yeahright+justkidding-5thMarch-
sarcastictweet
lol
yaright
#sarcastic
#sarcasm
yearight
#irony
sarcasm
#not
#sarcastictweet
sarcasm+yeahright+justkidding-22ndMarch-
#justkidding
yeahright
sarcastic


In [71]:
print(len(tweets))

25140


In [68]:

def remove_trailing_nums(text):
    return re.sub("\d+$", "", text)

print(remove_trailing_nums('fhasbfhjabdj126357'))

fhasbfhjabdj


In [70]:
for filename in glob.iglob('data/search-api/5thMarch2016/*.pkl'):
    print()

irony
jk
justkidding
lol
not
sarcasm+yeahright+justkidding-5thMarch-
sarcasm
sarcastic
sarcastictweet
yaright
yeahright
yearight


In [72]:
with open('data/march22/scraped-sarc.orig','a') as fw:
    for filename in glob.iglob('data/search-api/5thMarch2016/*.pkl'):
        contents[filename] = pickle.load(open(filename,'rb'))
        if type(contents[filename][0]) == tuple:
            for tup in contents[filename]:
                tw_id, tweet = tup
                if tw_id not in alreadyseen:
                    alreadyseen.add(tw_id)
                    tweets.append({'label':'SARCASM','tweet':tweet})
                    fw.write('{}\t{}\n'.format(tw_id,tweet.replace('\n',' ')))
        elif type(contents[filename][0]) == dict:
            for d in contents[filename]:
                if d['id'] not in alreadyseen:
                    alreadyseen.add(d['id'])
                    temp = {'id':d['id'],'label':'SARCASM','tweet':d['text']}
                    fn = os.path.basename(filename)
                    if 'sarcasm+yeahright+justkidding-5thMarch-' not in fn:
                        temp['#type'] = '#'+remove_trailing_nums(os.path.basename(filename)[:-4])
                    tweets.append(temp)
                    fw.write('{}\t{}\n'.format(d['id'], d['text'].replace('\n',' ')))

In [73]:
print(len(tweets))

38571


In [76]:
for filename in glob.iglob('data/scraped/*.pkl'):
    contents[filename] = pickle.load(open(filename,'rb'))
    #print(type(contents[filename]))
    print(type(contents[filename][0]))

<class 'dict'>
<class 'dict'>


In [89]:
with open('data/march22/scraped-sarc.orig','a') as fw:
    for filename in glob.iglob('data/scraped/*.pkl'):
        contents[filename] = pickle.load(open(filename,'rb'))
        for d in contents[filename]:
            if d['id'] not in alreadyseen:
                alreadyseen.add(d['id'])
                temp = {'id':d['id'],'label':'SARCASM','tweet':d['text']}
                fn = os.path.basename(filename)
                if fn == 'scraped-with-JUSTKIDDING-until-30Jun2015.pkl':
                    temp['#type'] = '#justkidding'
                elif fn == 'scraped-with-NOT-until-28Dec2015.pkl':
                    temp['#type'] = '#not'
                tweets.append(temp)
                if type(d['text']) == bytes:
                    d['text'] = str(d['text'],'utf-8')
                fw.write('{}\t{}\n'.format(d['id'], d['text'].replace('\n',' ')))

In [92]:
tweets.append({'appeasing_dummy':None})
            
i = 0
tweets[i].update({'tokens':[],'tags':[],'conf':[]})
with open('data/march22/scraped-sarc.tagged') as f:
    for line in f:
        if line.strip() == '':
            i += 1
            tweets[i].update({'tokens':[],'tags':[],'conf':[]})
        else:
            word,tag,conf = line.strip().split()
            try:
                tweets[i]['tokens'].append(word)
                tweets[i]['tags'].append(tag)
                tweets[i]['conf'].append(conf)
            except KeyError:
                print(line)

IndexError: list index out of range

In [109]:
tweets = []
with open('data/march22/scraped-sarc.orig') as f:
    for line in f:
        tw_id,tweet = line.strip().split('\t')
        tweets.append({'id':tw_id, 'label':'SARCASM', 'tweet':tweet})
            
tweets.append({'appeasing_dummy':None})


In [110]:
len(tweets)

83568

In [112]:
i = 0
tweets[i].update({'tokens':[],'tags':[],'conf':[]})
with open('data/march22/scraped-sarc.tagged') as f:
    for line in f:
        if line.strip() == '':
            i += 1
            tweets[i].update({'tokens':[],'tags':[],'conf':[]})
        else:
            word,tag,conf = line.strip().split()
            tweets[i]['tokens'].append(word)
            tweets[i]['tags'].append(tag)
            tweets[i]['conf'].append(conf)

In [113]:
tweets.pop()

{'appeasing_dummy': None, 'conf': [], 'tags': [], 'tokens': []}

In [114]:
tweets[0]

{'conf': ['0.9672',
  '0.9602',
  '0.9914',
  '0.9939',
  '0.9991',
  '0.9978',
  '0.9903',
  '0.9767',
  '0.9993',
  '0.8113',
  '0.9961',
  '0.9847',
  '0.9195',
  '0.9970',
  '0.8172',
  '0.5607',
  '0.9740',
  '0.9938',
  '0.9988',
  '0.9788',
  '0.9962',
  '0.9979',
  '0.9525',
  '0.9942',
  '0.5641',
  '0.9967'],
 'id': '712194231151009792',
 'label': 'SARCASM',
 'tags': ['D',
  'N',
  'P',
  'N',
  'V',
  'P',
  'N',
  'O',
  'V',
  'V',
  '&',
  'V',
  'P',
  'D',
  'N',
  'V',
  'P',
  'A',
  'N',
  'O',
  'R',
  'V',
  'P',
  'D',
  'N',
  ','],
 'tokens': ['The',
  '#irony',
  'of',
  'people',
  'praying',
  'for',
  'victims',
  'who',
  'are',
  'killed',
  'and',
  'injured',
  'in',
  'a',
  'bomb',
  'attack',
  'by',
  'other',
  'people',
  'who',
  'also',
  'believe',
  'in',
  'a',
  'god',
  '.'],
 'tweet': 'The #irony of people praying for victims who are killed and injured in a bomb attack by other people who also believe in a god.'}

In [116]:
pickle.dump(tweets,open('scraped-tokenized-and-tagged.pkl','wb'))

In [117]:
len(tweets)

83567

In [119]:
shereen_discards = {'pos': {'possibly be', 'why ru', 'look is', 'comes', 'personally love', 'is only', 'not be', 'considering', 'delete', 'are finally', 'spy', 'should experience', 'feeling', 'was totally', 'texting', 'could be', 'taxing', "didn't try", 'will be', 'was only', 'really liked', 'are not', 'are now', '#excited', 'actually be', 'enjoy', 'excited', 'absolutely love', "can't stop", 'hold', 'is not', 'assume', 'throughly enjoy', 'subjected', 'are just', 'gotta love', 'are already', 'deserve', 'cycled', 'slapped', '#tired', 'conflicted', 'is', 'still be', 'was just', 'pull', 'always love', 'should stop', 'had', 'stealing are', 'miss', 'said was', 'ask', 'guess', 'really are', 'exercise', 'tired', 'hear', 'plan', 'really enjoying', 'are still', "don't bother", 'cheating', 'were totally', 'makes', 'is clearly', 'are always', 'just stop', 'admitted', 'deserves', 'is willingly', "can't resist", 'appreciated', 'were obviously', 'accused', 'was', 'have been', 'used', 'are', 'make', 'am currently', 'better then', 'doing', 'is #eddiemurphy', 'happens when', 'are screaming', 'not just', 'cancelled', 'loving', 'shpuld be', 'not even', 'so r', 'is simply', "didn't think", 'get', 'c', 'hate', 'once again', 'am just', 'think', 'just continue', 'might be', 'also have', 'r nt', 'write', 'leaves', 'started', 'were', 'called', 'love paying', 'knows', 'definitely love', 'apologize', 'end', 'is really', 'help', 'r still', 'go', 'have', 'lose', 'donating', 'be', 'love just', 'see', 'say', "can't miss", "aren't even", 'looting', 'is just', 'only just', 'is already', 'obviously love', 'is finally', 'more then', 'keep not', 'thinking', 'just love', 'based', 'can keep', 'talking', 'walk', 'love when', 'worry', 'just be', 'teach', 'includes', 'were just', 'keep', 'be hard', "isn't even", "wasn't even", 'regretting not', 'let', 'should start', 'seeing', 'love', 'r', 'mostly be', 'dont remember', 'do love', 'should be', 'missed', 'can avoid', 'was probably', 'are totally', 'really like'}, 'neg': {'getting invited on', 'getting told who', 'editing', 'getting kicked out', 'to join in', 'just getting', 'drinking coffee', '#mondaymorning', 'being me', 'to write', 'not qualifying', 'taking french', 'watching snapchat', 'to join', 'working retail', 'cancelled flights', 'missing pivotal', 'listening too', 'being stuck tagged', 'work', 'having loads', 'golfing', 'standing awkwardly', 'seeing tyler', 'finding multiple', 'helping people', 'walking', 'having surprise', 'being scheduled to', 'consistently getting', 'to drive ppl', 'trudging', 'closing', 'getting sexually', 'running errands', 'having allergies', 'to give', 'telling', 'delivering leaflets', 'being friends', 'getting played with', 'not talking', 'getting stuck with', 'not eating', 'working alone', 'having class', 'getting calls from', 'finally came', 'traveling', 'lagging', 'spending money', 'fighting', 'throwing alcohol', 'visiting mobile', 'being up', 'not having', 'getting booted off', 'to #laugh', 'finally getting', 'seeing commercials', 'watching', 'being asked to', 'serving', 'eating healthy', 'busing', 're-piercing', 'hate mail', 'getting blamed for', 'having random', 'being told when', 'writing', 'being taught about', 'love finding out', 'reading books', 'cuddles', 'love love to', 'waiting hours', 'being so', 'asking someone', 'randomly waking', 'knowing you', 'feeling part', 'being ignored by', 'hurt', 'wearing new', 'having such', 'sleeping', 'to help you', 'finding', 'working saturday', 'hearing shit', 'being spoken to', 'learning new', 'downloading nfl', 'how spring', 'queueing', 'being woken up', 'being poisoned', 'coming home', 'doing nothing', 'to win though', 'watching hockey', 'getting texted back', 'using what', 'getting shit', 'to do', 'working them', 'being sent home', 'not been', 'being preached about', 'being cheated on', 'getting sick', 'being told i', 'being called off', 'catching', 'having', 'having construction', 'walking outside', 'seeing it', 'eating starapples', 'to help', 'playing', 'being verbally', 'talking', 'wasting hours', 'seeing couples', 'surprise dentist', 'to change', 'put you', 'going grocery', 'is overwhelming', 'to be sarcastic', 'being able', 'being called ugly', 'feeling important', 'love looking at', 'doing that', 'getting mail', 'developing new', 'being schizo', 'getting spammy', 'being followed by', 'supporting', 'being treated differently', 'being pushed away', 'love', 'being hit over', 'doing so', 'getting patched', 'bring interrupted during', 'washing dishes', 'staying up', 'to stress about', 'seein', 'being injured', 'getting yelled at', 'to give you', 'working already', 'being screwed over', 'to wait', 'to send you', 'understanding', 'going places with', 'being covered in', 'doing group', 'having strep', 'getting identical', 'camping', 'punching myself', 'doing maths', 'having friends', 'seeing pictures', 'can i', 'being part', 'hearing ppl', 'seeing skiles', 'closing deli', 'discovering when', 'being generalized', 'feeling', 'composing giant', 'having headaches', 'running', 'being treated like', 'seeing this', 'updating apps', 'to tweet about', 'watching nba', 'scraping ice', 'cuddling', 'writing witty', 'smelling tobacco', 'being indirectly', 'painting', 'being stood up', 'finding vibrators', 'being abused by', 'paying so', 'outlines', 'being catcalled', 'lovee having', 'being randomly', 'texting absolutely', 'how secure', 'having insiders', 'telling people', 'smelling diesel', 'spending', 'being guilt', 'getting motion', 'getting errors', 'being blamed for', 'getting snap chats', 'arriving', 'scoring goals', 'finding shit', 'making plans', 'standing up', 'to see kt', 'hauling laundry', 'being excited for', 'sticking', 'to read this', 'getting called in', 'being depressed', 'wants', 'being called out', 'watching highlights', 'doing paperwork', 'to stay in', 'inhaling smoke', 'being stuck behind', 'sleeping alone', 'being excluded', 'to fail at', 'seeing snow', 'getting frostbite', 'being confused', 'getting', 'doing what', 'making them', 'seeing how', 'being trolled by', 'working extra', 'losing', 'being ignored so', 'playing phone', 'love this', 'hearing rush', 'looking', 'how organised', 'being british', 'falling', 'sitting alone', 'working when', 'celebrating', 'to hear about', 'being super', 'being invisible', 'to throw', 'being lied to', 'being disrespected in', 'is blind', 'doing', 'studying', 'being patronised', 'seeing knee', 'been cold', 'getting really', 'smelling cigarette', 'being questioned by', 'being stopped in', 'reading recipe', 'just watching', 'sitting home', 'constantly being', 'being called at', 'spending spring', 'listening', 'being rlly', 'to see great', 'being stuck at', 'to be around', 'showing up', 'hiding', 'receiving emails', 'to take', 'being surrounded by', 'feeling anxious', 'hearing you', 'being lied too', 'love lovee having', 'writing essays', 'expressing how', 'watching ps4/xboxone', 'seeing mexican', 'detoxing', 'not winning', "don't like", 'taking work', 'taking cold', 'to be #sarcastic', 'doing hw', 'quarrelling', 'to be treated', 'having scoliosis', 'to have', 'leaving work', 'delayed flights', 'being subtweeted', 'being sworn at', 'watching jerry', 'getting slapped on', 'feeling neglected', 'breathing', 'to see brady', 'planning', 'looking manly', 'leaving', 'writing haikus', 'to sit', 'flying', 'eating alone', 'bein hit wit', 'autocorrects', 'to spend', 'seeing selfies', 'to try', 'hearing other', 'getting pictures', 'to punch in', 'love waking up', 'being looked at', 'watching archer', 'running reports', 'to see season', 'to clean out', 'being awoken by', 'going here', 'being copied on', 'continues', 'not feeling', 'to make', 'watching millionaire', 'ruining', "isn't alchemy", 'being told how', 'sitting down', 'being left by', 'getting roped into', 'are better', 'being called', 'making up', 'putting', 'wasted power', 'cramming', 'seeing', 'will fuck you', 'doing homework', 'taking showers', 'learning', 'crying', 'shoveling snow', 'eating hot', 'let downs', 'having coughing attacks', 'just being', 'being sick', 'to eat you', 'working so', 'growing up', 'feeling ditched', 'eatting root', 'being exhausted', 'to be made', 'doing projects', 'sittin', '#stoned me', 'gettin up', 'seeing tax', 'watching everyone', 'sitting around', 'storming', 'correcting blue', 'moving', 'to do it', 'being forced to', 'getting music', 'love love this', 'being made out', 'getting stressed out', 'getting unbiased', 'surprise open to', 'how being', 'hearing people', 'being ignored thanks', 'sprinting', 'wailing children', 'live tweeting', 'looking after', 'dodging it', 'meeting booth', 'love love waking', 'being ditched after', 'to avoid', 'seeing that', 'being injury', 'just wasting', 'seeing spoilers', 'diy me', 'is not', 'living', 'getting subbed', 'learn business', 'driving', 'spending sunday', 'finding gray', 'losing money', 'havin', 'is', 'not getting', 'filling out', 'failing', 'being given', 'taking', 'using note', 'to know who', 'being invited to', 'rubbing new', 'tweeting', 'breaking them', 'making homemade', 'getting pushed to', 'being teased by', 'giving great', 'to hear them', 'only getting', 'being unfollowed', 'being told not', 'to see it', 'spending saturday', 'to say', 'to scroll endlessly', 'stripping frame', 'being hit up', 'ending', 'paying more', 'playing people', 'cleaning', 'riding', 'letting', 'being roasted', 'twisting shit', 'teaching', 'getting dirty', 'playing middle', 'revising', 'finding out', 'knowing', 'reading something', 'how telling', 'getting stabbed in', 'working saturdays', 'knowing i', 'being bored', 'to display that', 'eating', 'being used', 'pmsing so', 'getting pulled over', 'being volunteered to', 'updating treatment', 'being called retarded', 'being freezing', 'having alcoholics', 'having divorced parents', 'how knowing', 'being alone', 'bein', 'to support new', 'getting ready', 'seeing sunlight', 'finding typos', 'having bad', 'having anxiety', 'being treated as', 'going', 'getting bloodwork', 'tweet was', 'doing powerpoint', 'see', 'still being', 'spending hours', 'being isolated', 'getting warm/dying flowers', 'fixing problems', 'to hand over', 'love love it', 'eating cereal', 'wowed', 'stalking', 'helping customers', 'sitting right', 'to see someone', 'to go to', 'can screw you', 'is #sarcastic', 'missing league', "wasn't perfect", 'to reuse', 'getting cat', 'arguing', 'drowning', 'to hurt others', 'hearing them', 'being outside', 'being called weird', 'being hated so', 'getting blown off', 'living here', 'to get', 'working double', 'grading', 'to be', 'watching cops', 'is playing', 'suffocating', 'stunting', 'to work', 'attending musical', 'is when', 'drinking', 'having nothing', 'being severely', 'workin', 'watching warm', 'working super', 'getting random', 'helping self-absorbed', 'to play with', 'being thrown into', 'to rake', 'watching people', 'being forgotten', 'being yelled', 'being put in', 'being downtown', 'to stay', 'being taken for', 'getting woken up', 'to see in', 'laying awake', 'wearing such', '#sarcasm it', 'having emotional', 'forgetting', 'walking home', 'getting told multiple', 'hearing bass', 'texting', 'watching commercials', 'graves', 'doing work', 'bringing', 'to watch them', 'getting messages', 'being interrupted when', 'being put on', 'driving home', 'not sleeping', 'being here', 'being intl', 'limping', 'having panic', 'having big', 'argueing', 'feeling alone', 'laying', 'finding things', 'to go out', 'being kept awake', 'adding up', 'getting ignored', 'getting passed illegally', 'being patronised because', 'choking', 'being killed by', 'is love', 'rendering', 'traveling when', 'to buy', 'being sad', 'spending precious', 'to hangout with', 'does', 'to meet him', 'being wide', 'getting told', 'having tuition', 'to hate', 'to hear', 'to be threatened', 'paying paying out', 'being discriminated against', 'bugs should be', 'watching scary', 'never being', 'love love', 'to follow everton', 'being such', 'being completely', 'to wake up', 'stayin up', 'cleaning up', 'most- #grading', 'hanging', 'packing', 'reading', 'hanging out', 'playing crappy', 'being screamed at', 'getting anon', 'having drunks', 'getting hit on', 'coming', 'watching movies', 'being stuck in', 'being questioned constantly', 'planning conditions', 'getting phone', 'having vertigo', 'studying inside', 'saying such', 'setting alarms', 'getting called fat', 'watching it', 'is real', 'being grown up', 'to stomp on', 'is soooooooooooo', 'is i', "fixing people's", 'being kicked in', 'overhearing', 'asking questions', 'being woken at', 'watching #uk', 'being called diana', 'cruising', 'being yelled at', 'getting snapchats', 'to cheat', 'wasting time', 'wasting money', 'finishing work', 'seeing old', 'sucks', 'is having him', 'being constantly', 'getting cute', 'being awake', 'throwing up', 'could change', 'to sit here', 'doing laundry', 'watching horses', 'sending you', 'socialising alone', 'being left out', 'knows', 'seeing oduya', 'being', 'working sundays', 'getting hour', 'leaving school', 'watching deandre', 'having true', 'changing', 'waking up', 'seeing rodents', 'to see', 'being updated on', 'being stranded without', 'opening', 'seeing people', 'paying off', 'watching unmoving cars', 'blacking out', 'reading secret', 'spending time', 'watching roping in', 'sitting', 'being seriously', 'watching him', 'taking freezing cold', 'being ignored', 'running late', 'tweeting me', 'how exciting', 'when trying', 'watching games', 'waiting', 'to #ragequit when', 'being put second', 'being nocturnal', 'love love looking', 'gettin', 'did', 'seeing apm', 'how open', 'love it', 'getting work', 'having bots', 'coming down', 'doing someone', 'getting spammed on', 'using up', 'being home', 'eating dinner', 'siting', 'quitting', 'working holidays', 'not knowing', 'to see how', 'to sleep yet', 'being lectured about', 'being short', 'actually lifted', 'working', 'love love finding', 'waking'}}
riloff_discards = {'pos': set(), 'neg': {'being up', 'being wide', 'working', 'to see', 'getting home', 'waiting', 'going', 'getting', 'waking up', 'freaking being ignored', 'getting lied to', 'being left alone', 'having', 'missing', 'feeling', 'walking', 'being called', 'getting yelled at', 'being left to', 'not sleeping', 'waking', 'reading', 'getting up', 'coming back', 'being talked down', 'doing', 'being called into', 'being able', 'to do', 'being'}}
scraped_discards = {'neg': {'sitting', 'love love when', 'calling', 'being peed on', 'off silently', 'off again', 'trekking up', 'weirdly dressed', 'to pretend', 'to take', 'fighting isil', 'procrastinating', 'being bombarded with', 'working opposite', 'catching up', 'being low-key', 'feeling embarrassed about', 'paying taxes', 'closing', 'called', 'blended families', 'hearing classical', 'leafleting', 'getting thrown up', 'being wide', 'sound', 'did #not', 'repeating myself', 'wearing wet', 'love love that', 'following', 'being woken up', 'going out', 'smelling', 'to see a-rod', 'to clean', 'giving', 'wasting half', 'watching this', 'helping them', 'visiting', 'ruled', 'to get them', 'to bed early', 'seeing denver', 'being sicker', 'to be ripped', 'singing', 'breaking out', 'updating', 'erecting scaffolding', "working sunday's", 'off attacking', 'getting cancelled on', 'to go to', 'being treated like', 'off when', 'talking', 'seeing porn', 'being put last', 'start talking about', 'standing', "spending valentine's", 'being told', 'randomly waking', 'feeling sick', 'looking fab', 'out randomly', 'getting back', 'tick box', 'running', 'getting drinks', 'hearing how', 'has', 'going', 'committed', 'throwing up', 'feeling safe', 'being left out', 'spending weekends', 'seeing', 'love when', 'to #justkidding', 'accommodating them', 'doing so', 'to have', 'almost passing', 'concerned here', 'when flares', 'getting along', 'forgetting', 'losing', 'being home', 'getting made fun', 'getting shit', 'is rare', 'to say', 'upholding', 'crying', 'finally being', 'running so', 'not working', 'being me', 'being terribly', 'defending', 'watching michael', 'subjected', 'making faces', 'watching novice', 'watching fights where', 'adulting', 'memorizing chemical', 'to continue', 'working overtime', 'love me', 'accidentally falling', 'pulling', 'making tuition', 'to tell everyone', 'to get', 'to fill', 'missing jared', 'listening', 'getting air', 'coughing so', 'to be in', 'being pretty', 'being ignored whilst', 'making', 'getting decked in', 'waiting hand', 'going back', 'building html', 'seeing people', 'working nights', 'over entirely', 'may work', 'encouraging #pirates', 'falling asleep', 'to spell literature', 'being blanked', 'tuning in', 'being told how', 'being surrounded by', 'pushing', 'being attacked again', 'doing amrap', 'to see us', 'doing nice', 'filling out', 'happening', 'paying attention', 'seeing fire', 'making themselves', 'is always', 'blacking out', 'being put on', 'how inviting', 'being lied to', 'packing them', 'flying', 'rebooking flights', 'learning new', 'getting home', 'finding tyson', 'being consistently', "pushing people's", 'to rewind to', 'never getting', 'getting terrible', 'to set up', 'working outside', 'to shift', 'feeling', 'hating', 'babysitting', 'cleaning up', 'to lie', 'feeling sorry', 'tackling', 'getting lectures', 'love love', 'to walk', 'reading', 'was sent back', 'printing books', 'working night', 'to hear seagulls', 'to make new', 'to turn', 'knuckling', 'fucking up', 'leaving work', 'knows not', 'crying myself', 'having sleep', 'to see him', 'to catch up', 'excited', 'riding', 'spending friday', 'up shortly', 'is combine work', 'to see it', 'to come visit', 'folding laundry', 'affecting', 'to hear how', 'welcome here', 'having loud', 'suffering', 'having bad', 'finding', 'overthinking late', 'never sleeping', 'being emotional', 'working', 'to see you', 'feeling disposable', 'being not', 'entitled', 'leaving long', 'backfires', 'to show what', 'to be ignored', 'to spend', 'being so', 'is letting', 'is supporting donald', 'making this', 'making me', 'to sing bt', 'coming', 'put up', 'missing', 'carrying', 'being single', 'is really', 'to be', 'picking', 'to sell', 'studying wounds', 'hurt', 'being alone', 'staying up', 'marching', 'smelling cow', '#defeated', 'writing lesson', '#willing', 'to make', 'getting lectured about', 'playing', 'being', 'getting me', 'can do wonders', 'writing ssows on', 'being cold', '#trump', 'to give', 'being able', 'studying insects', 'to hear what', 'to be at', 'sitting in', 'doing laundry', 'being canadian', 'being power', 'to hear', 'being emotionally', 'is merely', 'being told to', 'paying off', 'to see', 'handles', 'to make you', 'getting calls at', 'never having', 'to have andy', 'to calm', 'having homework', 'getting nudes', 'to begin', 'to share', 'overhearing conversations', 'being hung over', 'getting', 'waiting', 'to raise', 'can take up', 'deicing', 'to try', 'to c yr', 'cleaning tack', 'chillin', 'to marry', 'having long', 'to tweet sad', 'getting harassed by', 'seen', 'feeling included', 'being ignored', 'to see that', 'feeling pain', 'to watch', 'hitting', 'digging', 'being good', 'getting woken up', "seeing lauren's", 'making videos', 'sleeping so', 'working closing shifts', 'having nose', 'being taken for', 'to be revelant', 'breaking down', 'to make indians', 'being voluntold', 'getting played', 'writing papers', 'to applaud', 'coming home', 'hearing bad', 'playing ball', 'getting let down', 'dealing', 'being understaffed', 'ignore karlee', 'driving', 'getting ignored', 'being hit by', 'seeing tweets', 'working weekends', 'not having', 'to save', 'looking', 'jackin', 'to see photos', 'made', 'being taxi', 'used', 'tidying', 'hearing people', 'ending', '#heading', 'to do', 'talks', 'to start ticketing', 'seeing jim', 'eating sand', 'to get nominated', 'makes us', '#woke', 'finishing', 'living', 'not getting', 'paying hard', 'to study hitler', 'being late', 'getting forced into', 'packing', 'to give back', 'to drive', 'handwashing me', 'tossing me', 'getting attacked by', 'to laugh at', 'to listened to', 'decorating', 'being delayed', 'helping', 'to sit', 'learning', 'says', 'when shows', 'being used', 'cancelled classes', 'cutting ties', 'being told that', 'making fun', 'injured', 'seeing enzo', 'growing up', 'messing things', 'up when', 'rocks so', 'assumes he', 'doing them', 'being kept in', 'readjusting you', 'being told by', 'walking outside', 'turning', 'to rip on', 'getting cute', 'paying', 'seeing white', 'to invest', 'to donate', 'twitch streams', 'feeling loved', 'not hearing', 'to get off', 'watching', 'wakin up', 'finding something', 'getting called into', 'having', 'beingtold', 'staying late', 'to do in', 'getting poked', '#born #leader', 'seeing things', 'to immerse myself', 'to ride', 'to test it', 'having earthquakes', 'watch', 'having you', 'to stop', 'hurts', 'being uninvolved', 'watching sports', 'drawing', 'to meet in', 'having anxiety', 'opening', 'fracking', 'bothered', 'was', 'to wake up', 'to have you', 'giving up', 'helping ppl', 'saying', 'being left alone', 'spending', 'attracting black', 'waking up', 'getting suprised with', 'being told what', 'breaking up', 'cleaning', 'being disrespected by', 'having roommates', 'trying', 'to pull ross', 'to stay connected', 'love love me', 'following you', "doesn't come together", 'to fly', 'havin', 'is literally', 'sleeping', 'how equated', 'love that', 'knowing', 'focusing', 'to run errands', 'is not', 'losing money', 'having sex', 'typing meeting minutes', 'being up', 'starting', 'being pale', 'hanging out', 'to meet her', 'scrolling', 'working out', 'impressed', 'standing outside', 'gone', 'puking', 'is when', 'having organic', 'shaking', 'being reunited with', 'getting migraines', 'seeing is when', 'putting', 'taking cold', 'is', 'worrying', 'is food', 'getting walked in', 'working retail', 'seeing ppl', 'to show how', 'randomly bumping', 'gave', 'to serve'}, 'pos': {'asking', 'splits', 'kill', 'will be', 'absolutely love', 'wtf', 'started', 'imagine', 'am', 'not be', 'might keep', 'got stuck', 'wheter', 'hmu', 'too much', 'can start', 'is slowly', 'literally been', 'study', 'being', 'scan not', 'return', 'forgot', 'still is', 'be expected', 'conflicted', 'almost want', 'was always', 'so #not', 'sweating', "doesn't love", 'scolded', 'then try', 'said', 'thought was', 'could be', 'now possibly', 'gets', 'yet not', 'falling', 'just love', 'talk', 'be trying', 'realize', "aren't even", 'went', 'thinking', 'means', 'told', 'definitely not', 'was told', 'quit', "isn't", 'whining', 'was really', 'considering', 'turning', 'swear', 'sign', 'change', 'was not', 'keep', 'am not', 'smiles', "wasn't really", 'ever love', 'progressed', 'knows', 'catch', 'rather be', 'totally still', 'too #not', 'is never', 'just luv', 'need more', 'just fail', 'is probably', 'r', 'stop', "wasn't there", 'allows', 'tried', 'cant stop', 'allright #not', 'gotta love', 'fired', 'tweet', 'see', 'am literally', 'end', 'spend', 'just have', 'clearly not', 'not still', "can't wait", '#not enjoy', 'obviously just', 'am totes', 'say', 'having', 'was sleep', 'getting stuck', 'set', 'cussing', 'not fully', 'say was', 'resort', 'been there', 'is still', "here isn't", 'are', 'even bother', 'played when', 'found', 'had', 'imply', 'do', 'now #not', 'else was', 'is beginning', 'can keep', "can't continue", 'regret not', 'is just', 'would love', 'believe', 'were born', 'wear is', '#not be', 'kill #not', 'love not', "don't start", 'works', 'been', 'sat', 'hold', 'makes', 'hear', 'should b', 'hope', 'loving', 'need', 'drinking is', "couldn't resist", 'know just', 'wants', 'feel', "don't try", 'was just', 'atleast am', 'freaking love', 'text', 'tired', 'was', 'guess', 'constantly deflecting', 'thought', 'debating quitting', 'protesting', 'just tried', 'collapsed', 'prosper', 'scamming', 'saying', 'spending', "won't be", 'is finally', 'texting stop', 'wondering', 'would like', 'gotta start', 'will quit', "don't have", 'was spent', 'use', 'just off', 'is #not', 'conned', 'delaying', 'trying', 'injure', 'have', 'just loved', 'am really', 'make be', 'encouraging when', 'believe is', 'r not', 'back then', 'would stop', 'just going', 'watched', 'has', 'am just', 'never ceases', 'know how', 'probably just', 'wasting', 'flirting', 'are never', 'forcing', 'overwhelmed', 'find', 'is ever', 'using', 'are not', 'is seriously', 'got', 'may be', 'serves', 'has been', 'are also', 'excited', 'gonna start', 'were', 'finally finished', 'running is', 'didn’t need', 'just loving', 'walk', 'are really', 'is not', 'stops', 'am totally', 'not even', 'finding', 'inspired', 'be', 'is supposedly', 'rocking', 'was finally', "haven't had", 'made', 'please keep', 'working', 'would be', 'kald #not', 'used', 'fallen', 'is already', 'empowering', 'can’t stand', 'manahel keep', 'is enough', 'reminiscing', 'spinning when', 'just now', 'ending', 'have been', "won't like", 'else is', 'lived is', 'not enough', 'really appreciate', 'was still', 'was probably', '#breeds', 'stop trying', 'just casually', 'coming', 'really be', 'should go', 'will keep', 'living', 'are totally', 'sleep', 'mean', 'love', 'was contemplating', 'am beginning', 'is really', 'wonder', 'ended', 'think', 'should be', 'seem need', 'spin', 'is', 'work', 'know', 'is truly', 'wait', 'assume', '#sarcasm keep', 'imagining', 'was #not', 'must be', 'really going', 'gonna keep', 'please', 'not actually', 'am always', 'is totally', 'always love', 'got caught', 'want', 'are just', 'knows how', 'really love', 'particularly enjoy', 'cares', '#not going', "isn't designed", 'is going', 'tweet was', 'gonna be', 'will start', 'obviously have', 'buffer when', 'go', 'just voluntarily', 'look'}}

In [120]:
predicate_expr = set(["great", "so much fun", "good", "so happy", "better", "my favorite thing", "cool", "funny", "nice", "always fun", "fun", "awesome", "the best feeling", "amazing", "happy", "ready today", "ready", "dry", "juicy", "my favorite part"])
pos_expr = set(["love", "missed", "loves", "enjoy", "cant wait", "excited", "wanted", "can't wait", "get", "appreciate", "decided", "loving", "really like", "looooove", "just keeps", "loveee", "randomly stop", "cannot wait", "just live", "please keep", "live", "stoked", "goin", "reading", "break", "just stops", "stops"])
neg_expr = set(["being ignored", "being sick", "waiting", "feeling", "waking up early", "being woken", "fighting", "staying", "writing", "being home", "cleaning", "not getting", "crying", "sitting at home", "being stuck", "starting", "being told", "being left", "getting ignored", "being treated", "doing homework", "learning", "getting up early", "going to bed", "getting sick", "riding", "being ditched", "getting ditched", "missing", "not sleeping", "not talking", "trying", "falling", "walking home", "getting yelled", "being awake", "being talked", "taking care", "doing nothing", "wasting", "throwing", "getting woken up", "to spend", "standing", "smelling", "getting woken", "arguing", "paying bills", "being locked", "shoveling", "getting called", "being at work", "having nothing", "getting invited", "getting blown", "dealing", "ending", "to wake", "when doesn't text", "getting ready", "to learn", "picking", "walking to class", "breaking", "being invited", "getting home", "setting", "dropping", "not seeing", "forgetting", "being called fat", "getting lied", "invited", "to sit here", "to be ignored", "being late", "doing laundry", "being taken", "practicing", "babysitting", "getting hit", "being used", "being used", "being reminded", "when falls", "working all day", "running late", "traveling", "peeing", "being hit", "having practice", "not being invited", "being bored", "stepping", "spending my day", "leaving", "almost getting", "being put", "passing", "being at school", "to study", "going to class", "coughing", "sitting in traffic", "being yelled", "fixing", "burning", "walking to school", "wakin", "seeing people", "being accused", "being up early", "scratches", "texting someone", "being invited places", "receiving", "being grounded", "checking", "getting my ass", "getting back", "getting bitched", "getting treated", "only getting", "reviewing", "sitting alone", "getting screwed", "going there", "getting stared", "calling", "watching scary movies", "getting no sleep", "taking tests", "getting locked", "reading tweets", "teaching", "waking up not", "sounding", "getting made", "sleeping alone", "not feeling", "being surrounded", "editing", "being stood up", "to randomly ask", "getting hacked", "getting texts", "having insomnia", "having homework", "blamed", "showing", "being blamed", "getting bad news", "getting played", "being stood", "scrolling", "being lied too", "being a loner", "going weeks", "being up late", "having class", "failing", "being cussed", "listening to women", "when ignores", "cutting", "bring", "burnt", "getting hate", "coming to school", "sitting here", "waking up early", "being called names", "getting replaced", "having bruises", "closing", "coming back", "getting punched", "getting phone", "spending all day", "being pushed", "spending", "not being able", "waking", "working", "sitting", "walking", "coming home", "living", "being lied", "getting", "coming", "going", "running", "to sit", "being called", "to read", "studying", "paying", "texting", "hearing", "replying", "gettin better", "gettin better", "gettin", "eating", "losing", "listening", "to get up", "finding", "to clean", "being able", "seeing", "to run", "to drive", "to go back", "looking", "taking", "putting", "driving", "to start", "posting", "to pay", "telling me", "ruined", "being woke", "hitting", "laying", "cuddling", "reading", "buying", "cancelled", "sending", "to see pictures", "to find out", "sharing", "finishing", "sweating", "to miss", "hurting"])

In [121]:
# HOW MANY DISCARDS ARE IN RILOFF'S FINAL LIST?
print("DS Riloff")
print("pos")
print(pos_expr & riloff_discards['pos'])
print("neg")
print(neg_expr & riloff_discards['neg'])
print("pospred")
print(pos_expr & riloff_discards['pos'])
print()
print("DS Shereen")
print("pos")
print(pos_expr & shereen_discards['pos'])
print("neg")
print(neg_expr & shereen_discards['neg'])
print("pospred")
print(pos_expr & shereen_discards['pos'])
print()
print("DS Scraped")
print("pos")
print(pos_expr & scraped_discards['pos'])
print("neg")
print(neg_expr & scraped_discards['neg'])
print("pospred")
print(pos_expr & scraped_discards['pos'])

DS Riloff
pos
set()
neg
{'missing', 'feeling', 'working', 'not sleeping', 'coming back', 'going', 'walking', 'being able', 'being called', 'reading', 'getting', 'waiting', 'getting home', 'waking'}
pospred
set()

DS Shereen
pos
{'really like', 'enjoy', 'get', 'missed', 'excited', 'loving', 'love'}
neg
{'running late', 'gettin', 'cleaning', 'forgetting', 'working', 'not sleeping', 'closing', 'looking', 'putting', 'reading', 'running', 'spending', 'sitting', 'doing laundry', 'riding', 'only getting', 'to sit here', 'crying', 'being awake', 'feeling', 'getting ignored', 'walking home', 'taking', 'going', 'falling', 'being able', 'being bored', 'not getting', 'doing homework', 'ending', 'getting sick', 'being home', 'studying', 'waking', 'eating', 'getting ready', 'fighting', 'writing', 'getting phone', 'editing', 'to sit', 'texting', 'listening', 'being used', 'cuddling', 'sleeping alone', 'leaving', 'sitting alone', 'being called', 'getting', 'doing nothing', 'being yelled', 'seeing', 'a

In [None]:
'''
A
out.scraped.full_length
out.shereen.full_length
out.riloff.full_length

B
out.riloff.full_length.big_change
out.shereen.full_length.big_change
out.scraped.full_length.big_change

C
out.riloff.inv_sub
out.shereen.inv_sub
out.scraped.inv_sub


results['A/B/C']['shereen/scraped']['neg/pos/discards']

A
  scraped
    pos
      11
    discards
      pos
        312
      neg
        495
    neg
      24
  shereen
    pos
      2
    discards
      pos
        166
      neg
        731
    neg
      20
C
  scraped
    pos
      17
    discards
      pos
        409
      neg
        269
    neg
      18
  shereen
    pos
      19
    discards
      pos
        578
      neg
        336
    neg
      35
B
  scraped
    pos
      18
    discards
      pos
        405
      neg
        1508
    neg
      40
  shereen
    pos
      18
    discards
      pos
        481
      neg
        1239
    neg
      46

'''




# Code versions:
#This contains an interpretation which finds candidate phrases 
#  looking at only one opposite polarity phrase at a time
#  e.g. Get all negative phrases for 'love'. 
#       Then all negative phrases for 'really like' etc.
# A
replicate_riloff.py

#This contains an interpretation which finds candidate phrases 
#  looking at the entire opposite polarity set
#  e.g. Get all negative phrases for {'love','really like'} etc. 
# B
replicate_riloff-big_correction.py

#This contains an interpretation which correctly understands subsumption
#  So far e.g. “waiting forever” was chosen over “waiting” 
#  e.g. Now, waiting is chosen  
# C
replicate_riloff-inverted_subsumption.py

In [1]:
pwd

'/home/ashish/Desktop/245/Project/code'

In [4]:
results = pickle.load(open('results_summary.pkl','rb'))

In [5]:
results.keys()

dict_keys(['C', 'B', 'A'])

In [7]:
### EXPERIMENTS ###
!ls -lrt

total 456
-rw-r--r--  1 ashish ashish   1828 Feb 12  2016 tweet-analysis.py
-rw-r--r--  1 ashish ashish    252 Mar  6  2016 accesskeys.py
-rw-r--r--  1 ashish ashish   1945 Mar  6  2016 twittersearch.py
-rw-r--r--  1 ashish ashish   1574 Mar  6  2016 download_tweets_pythonv3.py
-rw-r--r--  1 ashish ashish   1716 Mar  6  2016 gettweetsbyid.py
-rw-r--r--  1 ashish ashish  28941 Mar  6  2016 quickjava-2.0.7-fx.xpi
-rw-r--r--  1 ashish ashish   2429 Mar  7  2016 gettweetsbyscraping2.py
-rw-r--r--  1 ashish ashish   2182 Mar  7  2016 gettweetsbyscraping.py
-rw-r--r--  1 ashish ashish   2788 Mar  7  2016 gettweetsbyscraping-selenium.py
-rw-r--r--  1 ashish ashish   2708 Mar  7  2016 gettweetsbyscraping-selenium2.py
-rw-r--r--  1 ashish ashish    784 Mar  7  2016 thescript.py
-rw-r--r--  1 ashish ashish  77020 Mar 21 19:22 Classification of Tweets as Sarcastic or Not Sarcastic.ipynb
-rw-r--r--  1 ashish ashish 127663 Mar 23 23:32 results_summary.pkl
-rw-r--r--  1 ashish ashish  

In [31]:
riloff_eval_dataset = pickle.load(open('data/riloff-emnlp/sarcasm-annos-emnlp13-tweet_objs-2124.pkl','rb'))

In [32]:
riloff_eval_dataset[0]['text']

'Very low entries in this giveaway! Hop over and check it out! http://t.co/OrUSN9ne'

In [35]:
x = []
y = []
with open('data/riloff-emnlp/sarcasm-annos-emnlp13-tweets-noids.tsv') as f:
    for line in f:
        label,tweet = line.strip().split('\t')
        x.append(tweet)
        y.append(label)

In [38]:
z = list(zip(x,y))

In [41]:

for i in range(10):
    random.shuffle(z)

In [42]:
x,y = zip(*z)

In [43]:

c = defaultdict(int)
for label in y:
    c[label] += 1

In [44]:
c

defaultdict(int, {'NOT_SARCASM': 1666, 'SARCASM': 458})

In [3]:
riloff_eval = pickle.load(open('../../riloff-tokenized-and-tagged-lowercase.pkl','rb'))    

In [4]:
len(riloff_eval)

2124

ImportError: No module named 'nltk'

In [13]:

bigrams = ngrams(riloff_eval[0]['tokens'],2)

In [14]:
print(list(bigrams))

[('very', 'low'), ('low', 'entries'), ('entries', 'in'), ('in', 'this'), ('this', 'giveaway'), ('giveaway', '!'), ('!', 'hop'), ('hop', 'over'), ('over', 'and'), ('and', 'check'), ('check', 'it'), ('it', 'out'), ('out', '!'), ('!', 'http://t.co/orusn9ne')]


In [72]:
categories = ['NOT_SARCASM','SARCASM']

In [57]:
train_set_x = [tweet.lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in x[:200]]
train_set_y = y[:200]
test_set_x =  [tweet.lower().replace('#sarcasm','').replace('#sarcastic','') for tweet in x[200:]]
test_set_y = y[200:]

In [47]:


count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_set_x)
print(X_train_counts.shape)


#tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
#X_train_tf = tf_transformer.transform(X_train_counts)
#print(X_train_tf.shape)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

clf = MultinomialNB().fit(X_train_tfidf, train_set_y)
#To try to predict the outcome on a new document we need to extract the features using almost the same feature extracting chain as before. The difference is that we call transform instead of fit_transform on the transformers, since they have already been fit to the training set:
#docs_new = [tweetobj['tweet'] for tweetobj in riloff_eval]
X_new_counts = count_vect.transform(test_set_x)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

count_vect = CountVectorizer()

predicted = clf.predict(X_new_tfidf)

#for doc, category in zip(docs_new, predicted):
#    print('%r => %s' % (doc, twenty_train.target_names[category]))


(200, 1087)
(200, 1087)


In [48]:

np.mean(predicted == test_set_y) 

0.78534303534303529

In [50]:



text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5, random_state=42)),
])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
np.mean(predicted == test_set_y)            


0.7718295218295218

In [51]:
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

             precision    recall  f1-score   support

    SARCASM       0.85      0.86      0.85      1511
NOT_SARCASM       0.47      0.46      0.47       413

avg / total       0.77      0.77      0.77      1924



In [58]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5, random_state=42)),
])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
#print(np.mean(predicted == test_set_y))         
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

0.766632016632
             precision    recall  f1-score   support

    SARCASM       0.84      0.86      0.85      1511
NOT_SARCASM       0.45      0.41      0.43       413

avg / total       0.76      0.77      0.76      1924



In [60]:
# Moment of truth:
results = pickle.load(open('results_summary.pkl','rb'))

In [73]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5, random_state=42)),
])
_ = text_clf.fit(train_set_x, train_set_y)
predicted = text_clf.predict(test_set_x)
#print(np.mean(predicted == test_set_y)) 
print("SVM with hinge loss")
print(metrics.classification_report(test_set_y, predicted,
    target_names=categories))

for _id,resultset in results.items():
    print(_id)
    for dataset,wordsets in resultset.items():
        predicted = []
        if dataset == 'shereen': 
            print('DatasetSarc')
        else: 
            print('DatasetOther')
        #print(dataset)
        for tweet in test_set_x:
            predicted.append('NOT_SARCASM')
            for posphrase in wordsets['pos']:
                if posphrase in tweet:
                    for negphrase in wordsets['neg']:
                        if negphrase in tweet:
                            predicted[-1] = "SARCASM"
        print(metrics.classification_report(test_set_y,predicted,target_names=categories))   

SVM with hinge loss
             precision    recall  f1-score   support

NOT_SARCASM       0.84      0.86      0.85      1511
    SARCASM       0.45      0.41      0.43       413

avg / total       0.76      0.77      0.76      1924

B
DatasetSarc
             precision    recall  f1-score   support

NOT_SARCASM       0.79      1.00      0.88      1511
    SARCASM       0.69      0.02      0.04       413

avg / total       0.77      0.79      0.70      1924

DatasetOther
             precision    recall  f1-score   support

NOT_SARCASM       0.79      1.00      0.88      1511
    SARCASM       0.50      0.01      0.01       413

avg / total       0.72      0.79      0.69      1924

A
DatasetSarc
             precision    recall  f1-score   support

NOT_SARCASM       0.79      1.00      0.88      1511
    SARCASM       0.75      0.01      0.01       413

avg / total       0.78      0.79      0.69      1924

DatasetOther
             precision    recall  f1-score   support

NOT_SARCASM 

In [65]:
c = defaultdict(int)
for label in test_set_y:
    c[label] += 1

In [66]:
c

defaultdict(int, {'NOT_SARCASM': 1511, 'SARCASM': 413})

In [67]:
413/1511

0.27332892124420916

In [74]:
pwd

'/home/ashish/Desktop/245/Project/code'