In [208]:
import json 
import csv 
import pandas
import pandas as pd

In [209]:
data = pd.read_csv('Desktop/Sarcasm_Corr/twitter_training.csv')
pd.options.display.max_columns = None
df = data[['context/0','response','label']]
df.head()

Unnamed: 0,context/0,response,label
0,A minor child deserves privacy and should be k...,@USER @USER @USER I don't get this .. obviousl...,SARCASM
1,@USER @USER Why is he a loser ? He's just a Pr...,@USER @USER trying to protest about . Talking ...,SARCASM
2,Donald J . Trump is guilty as charged . The ev...,@USER @USER @USER He makes an insane about of ...,SARCASM
3,Jamie Raskin tanked Doug Collins . Collins loo...,@USER @USER Meanwhile Trump won't even release...,SARCASM
4,Man ... y ‚Äô all gone ‚Äú both sides ‚Äù the apocal...,@USER @USER Pretty Sure the Anti-Lincoln Crowd...,SARCASM


In [210]:
"""### Vader """
def vader(sentence):
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    vader = SentimentIntensityAnalyzer()
    score = vader.polarity_scores(sentence)
    return score

# creating lists to keep pos, neu, neg, and compound scores --- later to be used to create a dataframe


# extracting vader scores for each entry in the data
# (we're not using context yet.)
# note that the compound score is rescaled to the [0,1] range
# some classiifers don't take negative values (e.g., MultinomialNB)


def vader_scores(df):
    vs_compound = []
    vs_pos = []
    vs_neu = []
    vs_neg = []
    for row in df:
        score = vader(row)
        neg = float(score['neg'])
        vs_neg.append(neg)
        neu =float(score['neu'])
        vs_neu.append(neu)
        pos =float(score['pos'])
        vs_pos.append(pos)
        compound = float((score['compound']+1)/2) # rescaling to the [0,1] range
        vs_compound.append(compound)
    return vs_compound, vs_pos, vs_neu, vs_neg


In [211]:
cvs_compound, cvs_pos, cvs_neu, cvs_neg = vader_scores(df['context/0'])
df = df.assign(cntx_vader_neu = cvs_neu, cntx_vader_pos = cvs_pos, cntx_vader_neg = cvs_neg, cntx_vader_compound = cvs_compound)

In [212]:
rvs_compound, rvs_pos, rvs_neu, rvs_neg = vader_scores(df['response'])
df = df.assign(resp_vader_neu = rvs_neu, resp_vader_pos = rvs_pos, resp_vader_neg = rvs_neg, resp_vader_compound = rvs_compound)


In [213]:
df1 = df[df["label"]=="SARCASM"].copy(deep=False)  #Divide DATA by label
df2 = df[df["label"]=="NOT_SARCASM"].copy(deep=False) #Divide DATA by label

# Compare Response to Context

## VADER

In [214]:
from scipy import stats

def correlation_calc(response, context, a):   #Pearson
    r_list = response.tolist()
    c_list = context.tolist()
    correlation,p_value = stats.pearsonr(r_list,c_list)
    return a, correlation

def correlation_calcSpearman(response, context, a):   #Spearman
    r_list = response.tolist()
    c_list = context.tolist()
    correlation,p_value = stats.spearmanr(r_list,c_list)
    return a, correlation


In [215]:
print('Pearson')
print('Compound')
print(correlation_calc(df1['resp_vader_compound'], df1['cntx_vader_compound'], "Sarcasm Vader Compound Correlation: "))
print(correlation_calc(df2['resp_vader_compound'], df2['cntx_vader_compound'], "Not Sarcasm Vader Compound Correlation: "))
print()
print('Positive')
print(correlation_calc(df1['resp_vader_pos'], df1['cntx_vader_pos'], "Sarcasm Vader Positive Correlation: "))
print(correlation_calc(df2['resp_vader_pos'], df2['cntx_vader_pos'], "Not Sarcasm Vader Positive Correlation: "))
print()
print('Negative')
print(correlation_calc(df1['resp_vader_neg'], df1['cntx_vader_neg'], "Sarcasm Vader Negative Correlation: "))
print(correlation_calc(df2['resp_vader_neg'], df2['cntx_vader_neg'], "Not Sarcasm Vader Negative Correlation: "))
print()
print('Neutral')
print(correlation_calc(df1['resp_vader_neu'], df1['cntx_vader_neu'], "Sarcasm Vader Neutral Correlation: "))
print(correlation_calc(df2['resp_vader_neu'], df2['cntx_vader_neu'], "Not Sarcasm Vader Neutral Correlation: "))


print()
print()

print('Spearman')
print('Compound')
print(correlation_calcSpearman(df1['resp_vader_compound'], df1['cntx_vader_compound'], "Sarcasm Vader Compound Correlation: "))
print(correlation_calcSpearman(df2['resp_vader_compound'], df2['cntx_vader_compound'], "Not Sarcasm Vader Compound Correlation: "))
print()
print('Positive')
print(correlation_calcSpearman(df1['resp_vader_pos'], df1['cntx_vader_pos'], "Sarcasm Vader Positive Correlation: "))
print(correlation_calcSpearman(df2['resp_vader_pos'], df2['cntx_vader_pos'], "Not Sarcasm Vader Positive Correlation: "))
print()
print('Negative')
print(correlation_calcSpearman(df1['resp_vader_neg'], df1['cntx_vader_neg'], "Sarcasm Vader Negative Correlation: "))
print(correlation_calcSpearman(df2['resp_vader_neg'], df2['cntx_vader_neg'], "Not Sarcasm Vader Negative Correlation: "))
print()
print('Neutral')
print(correlation_calcSpearman(df1['resp_vader_neu'], df1['cntx_vader_neu'], "Sarcasm Vader Neutral Correlation: "))
print(correlation_calcSpearman(df2['resp_vader_neu'], df2['cntx_vader_neu'], "Not Sarcasm Vader Neutral Correlation: "))


Pearson
Compound
('Sarcasm Vader Compound Correlation: ', 0.05594913449243477)
('Not Sarcasm Vader Compound Correlation: ', 0.23635737454486683)

Positive
('Sarcasm Vader Positive Correlation: ', 0.05585535471132459)
('Not Sarcasm Vader Positive Correlation: ', 0.22339329839762206)

Negative
('Sarcasm Vader Negative Correlation: ', 0.03501089511117264)
('Not Sarcasm Vader Negative Correlation: ', 0.19671787773880817)

Neutral
('Sarcasm Vader Neutral Correlation: ', 0.03829393755024069)
('Not Sarcasm Vader Neutral Correlation: ', 0.13686775083814706)


Spearman
Compound
('Sarcasm Vader Compound Correlation: ', 0.052523120730035615)
('Not Sarcasm Vader Compound Correlation: ', 0.2527067392959428)

Positive
('Sarcasm Vader Positive Correlation: ', 0.056421158178478495)
('Not Sarcasm Vader Positive Correlation: ', 0.19645995360852453)

Negative
('Sarcasm Vader Negative Correlation: ', 0.03528235341728887)
('Not Sarcasm Vader Negative Correlation: ', 0.18531708761548674)

Neutral
('Sarcasm 

# VAD

In [189]:

# Vad will pull all the words that are found in the dictionary, then sum up all the scores, and divide by # of words that matched dictionary entry
import pandas as pd
import nltk



"""### Dataframe
Columns: Context & Response
"""

df['tokenized_context']  = df['context/0'].str.replace('[^\w\s]','')  #get rid of punctuation
df['tokenized_response']  = df['response'].str.replace('[^\w\s]','')  #get rid of punctuation

"""tokenize context & response"""
df['tokenized_context'] = df.apply(lambda row: nltk.word_tokenize(row['tokenized_context']), axis=1) #tokenize
df['tokenized_response'] = df.apply(lambda row: nltk.word_tokenize(row['tokenized_response']), axis=1) #tokenize



from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

class LemmatizationWithPOSTagger(object):
    def __init__(self):
        pass
    def get_wordnet_pos(self,treebank_tag):
        """
        return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v) 
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            # As default pos in lemmatization is Noun
            return wordnet.NOUN

    def pos_tag(self,tokens):
        # find the pos tagginf for each tokens [('What', 'WP'), ('can', 'MD'), ('I', 'PRP') ....
        pos_tokens = [nltk.pos_tag(token) for token in tokens]

        # lemmatization using pos tagg   
        # convert into feature set of [('What', 'What', ['WP']), ('can', 'can', ['MD']), ... ie [original WORD, Lemmatized word, POS tag]
        pos_tokens = [ [(lemmatizer.lemmatize(word,self.get_wordnet_pos(pos_tag))) for (word,pos_tag) in pos] for pos in pos_tokens]
        #pos_tokens = [ [(lemmatizer.lemmatize(word,self.get_wordnet_pos(pos_tag)), [pos_tag]) for (word,pos_tag) in pos] for pos in pos_tokens]

        return pos_tokens

lemmatizer = WordNetLemmatizer()
lemmatization_using_pos_tagger = LemmatizationWithPOSTagger()

#step 2 lemmatization

df['lemma_response'] = lemmatization_using_pos_tagger.pos_tag((df['tokenized_response'])) #response lemma
df['lemma_context'] = lemmatization_using_pos_tagger.pos_tag((df['tokenized_context'])) #context lemma


"""VAD dictionary"""
import csv
reader = csv.reader(open('Desktop/UPDATED_NLP_COURSE/vad1.csv'))

d = {}
for row in reader:
    key = row[1]
    value = row[2:]
    d[key] = value
    
d.pop('Word')          


""" CHECK LEMMAS AGAINST VAD DICTIONARY"""

"""response"""

list_totals = [[(list(map(float,d[token]))) for token in row if token in d] for row in df['lemma_response'].array]

def find_sum(x):
    total = [[num[x] for num in l]for l in list_totals]
    full_list = []
    for i in total:
        if len(i) != 0:
            full_list.append(i)
        else:
            i = [5.0,5.0,5.0]    #if there is a tweet that has nothing, assign average value
            full_list.append(i)
    average = [((sum(x))/(len(x))) for x in full_list]     #sum of all values divided by how many there were
    return average


vr_total = find_sum(0)
ar_total = find_sum(1)
dr_total = find_sum(2)

df['valence_response'] = vr_total
df['arousal_response'] = ar_total
df['dominance_response'] = dr_total



"""context"""

list_totals = [[(list(map(float,d[token]))) for token in row if token in d] for row in df['lemma_context'].array]


vc_total = find_sum(0)
ac_total = find_sum(1)
dc_total = find_sum(2)


df['valence_context'] = vc_total
df['arousal_context'] = ac_total
df['dominance_context'] = dc_total

In [190]:
# # Divide data by label again 
df1 = df[df["label"]=="SARCASM"].copy(deep=False)  #Divide DATA by label
df2 = df[df["label"]=="NOT_SARCASM"].copy(deep=False) #Divide DATA by label

In [191]:
print("Valence")
print(correlation_calc(df1['valence_response'], df1['valence_context'], "Sarcasm Valence Correlation"))
print(correlation_calc(df2['valence_response'], df2['valence_context'], "Not Sarcasm Valence Correlation"))
print()
print("Dominance")
print(correlation_calc(df1['dominance_response'], df1['dominance_context'], "Sarcasm Dominance Correlation"))
print(correlation_calc(df2['dominance_response'], df2['dominance_context'], "Not Sarcasm Dominance Correlation"))
print()
print("Arousal")
print(correlation_calc(df1['arousal_response'], df1['arousal_context'], "Sarcasm Arousal Correlation"))
print(correlation_calc(df2['arousal_response'], df2['arousal_context'], "Not Sarcasm Arousal Correlation"))

Valence
('Sarcasm Valence Correlation', 0.08821719063645983)
('Not Sarcasm Valence Correlation', 0.2672168944449087)

Dominance
('Sarcasm Dominance Correlation', 0.07598156129154873)
('Not Sarcasm Dominance Correlation', 0.18235810047910916)

Arousal
('Sarcasm Arousal Correlation', 0.045377398347435344)
('Not Sarcasm Arousal Correlation', 0.11804622628070281)


In [192]:
df2

Unnamed: 0,context/0,response,label,cntx_vader_neu,cntx_vader_pos,cntx_vader_neg,cntx_vader_compound,resp_vader_neu,resp_vader_pos,resp_vader_neg,resp_vader_compound,tokenized_context,tokenized_response,lemma_response,lemma_context,valence_response,arousal_response,dominance_response,valence_context,arousal_context,dominance_context
2500,"7v6 last night , the 6 held their own until th...",@USER @USER @USER Keith you need to give your ...,NOT_SARCASM,0.899,0.000,0.101,0.29985,1.000,0.000,0.000,0.50000,"[7v6, last, night, the, 6, held, their, own, u...","[USER, USER, USER, Keith, you, need, to, give,...","[USER, USER, USER, Keith, you, need, to, give,...","[7v6, last, night, the, 6, hold, their, own, u...",5.545714,4.321429,5.137143,5.982857,4.378571,5.645714
2501,... by topping the ball for a perfectly placed...,@USER I'll concede a Cardinals World Series if...,NOT_SARCASM,0.781,0.219,0.000,0.81845,1.000,0.000,0.000,0.50000,"[by, topping, the, ball, for, a, perfectly, pl...","[USER, Ill, concede, a, Cardinals, World, Seri...","[USER, Ill, concede, a, Cardinals, World, Seri...","[by, top, the, ball, for, a, perfectly, place,...",4.273333,4.810000,4.906667,6.140000,3.627143,5.911429
2502,God does not belong to a political party .,"@USER @USER The Bible is NOT Political , it IS...",NOT_SARCASM,0.510,0.490,0.000,0.79295,0.866,0.134,0.000,0.78595,"[God, does, not, belong, to, a, political, party]","[USER, USER, The, Bible, is, NOT, Political, i...","[USER, USER, The, Bible, be, NOT, Political, i...","[God, do, not, belong, to, a, political, party]",5.788333,3.696667,5.651667,5.860000,4.162500,5.447500
2503,üôè Show me the path where I should go oh Lord p...,@USER @USER @USER <URL> Thank you so much #Pra...,NOT_SARCASM,0.825,0.121,0.053,0.71075,0.839,0.161,0.000,0.81755,"[Show, me, the, path, where, I, should, go, oh...","[USER, USER, USER, URL, Thank, you, so, much, ...","[USER, USER, USER, URL, Thank, you, so, much, ...","[Show, me, the, path, where, I, should, go, oh...",5.882000,3.476000,6.052000,6.459231,4.022308,5.815385
2504,@USER please help me . my Ipone 6 is broken an...,@USER they said i must pay off 50 % for it & t...,NOT_SARCASM,0.522,0.237,0.240,0.49200,0.805,0.153,0.042,0.81620,"[USER, please, help, me, my, Ipone, 6, is, bro...","[USER, they, said, i, must, pay, off, 50, for,...","[USER, they, say, i, must, pay, off, 50, for, ...","[USER, please, help, me, my, Ipone, 6, be, bre...",5.991000,3.836000,5.916000,6.196000,4.048000,5.932000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,@USER Apologies for the inconvenience you face...,@USER You don't . I have purchased a lot on Am...,NOT_SARCASM,0.776,0.167,0.057,0.79465,0.881,0.119,0.000,0.77750,"[USER, Apologies, for, the, inconvenience, you...","[USER, You, dont, I, have, purchased, a, lot, ...","[USER, You, dont, I, have, purchase, a, lot, o...","[USER, Apologies, for, the, inconvenience, you...",6.026667,3.969167,5.608333,5.611333,3.856000,5.927333
4996,"@USER ü§î idk tho , I think I ‚Äô m #hungry . But ...",@USER #Emotions you say ü§î never knew that I th...,NOT_SARCASM,0.631,0.155,0.214,0.49585,0.791,0.209,0.000,0.78595,"[USER, idk, tho, I, think, I, m, hungry, But, ...","[USER, Emotions, you, say, never, knew, that, ...","[USER, Emotions, you, say, never, know, that, ...","[USER, idk, tho, I, think, I, m, hungry, But, ...",6.996000,4.370000,6.726000,5.785000,4.045000,5.740000
4997,"@USER @USER @USER Peace to you , and two count...","@USER @USER @USER You are so right ... "" Yes !...",NOT_SARCASM,0.446,0.554,0.000,0.95640,0.811,0.189,0.000,0.75165,"[USER, USER, USER, Peace, to, you, and, two, c...","[USER, USER, USER, You, are, so, right, Yes, S...","[USER, USER, USER, You, be, so, right, Yes, Si...","[USER, USER, USER, Peace, to, you, and, two, c...",6.465000,3.822500,5.915000,6.268333,3.616667,5.840000
4998,Bernie Sanders told Elizabeth Warren in privat...,@USER @USER @USER Another lazy delusional vote...,NOT_SARCASM,0.847,0.000,0.153,0.26415,0.754,0.079,0.167,0.29905,"[Bernie, Sanders, told, Elizabeth, Warren, in,...","[USER, USER, USER, Another, lazy, delusional, ...","[USER, USER, USER, Another, lazy, delusional, ...","[Bernie, Sanders, tell, Elizabeth, Warren, in,...",5.187500,4.394167,5.709167,5.815714,4.218571,5.931429


# LIWC

In [193]:
data1 = pd.read_csv('Desktop/Sarcasm_Corr/twitter_test_label.csv')
df1 = data1[['context/0','response','label']]

df1['tokenized_context']  = df1['context/0'].str.replace('[^\w\s]','')  #get rid of punctuation
df1['tokenized_response']  = df1['response'].str.replace('[^\w\s]','')  #get rid of punctuation

"""tokenize context & response"""
df1['tokenized_context'] = df1.apply(lambda row: nltk.word_tokenize(row['tokenized_context']), axis=1) #tokenize
df1['tokenized_response'] = df1.apply(lambda row: nltk.word_tokenize(row['tokenized_response']), axis=1) #tokenize

"""### Tokenizer"""

"""### LIWC """
import csv 
from liwc import Liwc

lwc = Liwc("Desktop/UPDATED_NLP_COURSE/liwc_dictionaries_shared/LIWC2007_English100131.dic")
liwcresults =[]

for token in df1['tokenized_response']:  #response scores
    results = (lwc.parse(token))
    liwcresults.append(results)

# could be done more efficiently and as a list comprehension, but the idea that you collect all the fieldnames
liwc_keys = []
for i in range(len(liwcresults)):
    for k in liwcresults[i].keys():
        if k not in liwc_keys:
            liwc_keys.append(k)

    

#look at each category in range 3,000, if the value is not in the list of keys, append new key. 

with open('liwc_feature_chart_response.csv', 'w') as csv_file:  
    dict_writer = csv.DictWriter(csv_file, liwc_keys)       
    writer = csv.writer(csv_file)
    dict_writer.writeheader()
    dict_writer.writerows(liwcresults)
    
liwcdata = pd.read_csv("liwc_feature_chart_response.csv")   #dataframe of liwc feeatures
df1 = pd.concat([liwcdata, df1], axis=1, sort=False)   #combine dataframes
df1.fillna(0, inplace=True)  #put zeroes wherever there are no values



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['tokenized_context']  = df1['context/0'].str.replace('[^\w\s]','')  #get rid of punctuation
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['tokenized_response']  = df1['response'].str.replace('[^\w\s]','')  #get rid of punctuation
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['tokenize

In [194]:
#ASSIGN RESPONSE VALUE

keep_same = {'context/0', 'response', 'label', 'tokenized_context', 'tokenized_response'}
df1.columns = ['{}{}'.format(c, '' if c in keep_same else '_resp') for c in df1.columns]

In [195]:

liwcresults =[]

for token in df1['tokenized_context']:  #response scores
    results = (lwc.parse(token))
    liwcresults.append(results)

# could be done more efficiently and as a list comprehension, but the idea that you collect all the fieldnames
liwc_keys = []
for i in range(len(liwcresults)):
    for k in liwcresults[i].keys():
        if k not in liwc_keys:
            liwc_keys.append(k)

    

#look at each category in range 3,000, if the value is not in the list of keys, append new key. 

with open('liwc_feature_chart_context.csv', 'w') as csv_file:  
    dict_writer = csv.DictWriter(csv_file, liwc_keys)       
    writer = csv.writer(csv_file)
    dict_writer.writeheader()
    dict_writer.writerows(liwcresults)
    
liwcdata = pd.read_csv("liwc_feature_chart_context.csv")   #dataframe of liwc feeatures
df1 = pd.concat([liwcdata, df1], axis=1, sort=False)   #combine dataframes
df1.fillna(0, inplace=True)  #put zeroes wherever there are no values

In [217]:
liwcresults

[Counter({'funct': 2,
          'adverb': 1,
          'time': 1,
          'relativ': 1,
          'pronoun': 1,
          'ipron': 1,
          'affect': 1,
          'negemo': 1,
          'cogmech': 1,
          'discrep': 1}),
 Counter({'time': 3,
          'relativ': 9,
          'funct': 19,
          'article': 4,
          'verb': 4,
          'past': 3,
          'social': 2,
          'percept': 3,
          'hear': 2,
          'pronoun': 4,
          'ipron': 3,
          'space': 5,
          'quant': 1,
          'preps': 6,
          'ppron': 1,
          'we': 1,
          'achieve': 2,
          'motion': 1,
          'present': 1,
          'cause': 3,
          'cogmech': 6,
          'affect': 1,
          'negemo': 1,
          'excl': 2,
          'insight': 1,
          'auxverb': 2,
          'adverb': 2,
          'feel': 1,
          'conj': 1,
          'certain': 1}),
 Counter({'funct': 4,
          'pronoun': 3,
          'ppron': 2,
          'shehe': 2,


In [196]:
keep_same = {'context/0', 'response', 'label', 'tokenized_context', 'tokenized_response','time_resp',
 'relativ_resp',
 'funct_resp',
 'pronoun_resp',
 'ipron_resp',
 'adverb_resp',
 'cogmech_resp',
 'excl_resp',
 'leisure_resp',
 'conj_resp',
 'incl_resp',
 'verb_resp',
 'past_resp',
 'social_resp',
 'ppron_resp',
 'i_resp',
 'cause_resp',
 'humans_resp',
 'certain_resp',
 'achieve_resp',
 'preps_resp',
 'tentat_resp',
 'space_resp',
 'affect_resp',
 'filler_resp',
 'posemo_resp',
 'present_resp',
 'they_resp',
 'shehe_resp',
 'negemo_resp',
 'anger_resp',
 'quant_resp',
 'auxverb_resp',
 'article_resp',
 'insight_resp',
 'work_resp',
 'you_resp',
 'motion_resp',
 'discrep_resp',
 'assent_resp',
 'negate_resp',
 'inhib_resp',
 'home_resp',
 'percept_resp',
 'hear_resp',
 'anx_resp',
 'sad_resp',
 'see_resp',
 'money_resp',
 'bio_resp',
 'health_resp',
 'sexual_resp',
 'nonfl_resp',
 'future_resp',
 'swear_resp',
 'ingest_resp',
 'feel_resp',
 'number_resp',
 'body_resp',
 'relig_resp',
 'family_resp',
 'we_resp',
 'death_resp',
 'friend_resp'}
df1.columns = ['{}{}'.format(c, '' if c in keep_same else '_cntx') for c in df1.columns]

In [197]:
df = df1.drop(['context/0', 'response','tokenized_context','tokenized_response'], axis=1)

In [198]:
df1 = df[df["label"]=="SARCASM"].copy(deep=False)  #Divide DATA by label
df2 = df[df["label"]=="NOT_SARCASM"].copy(deep=False) #Divide DATA by label

In [199]:
df1

Unnamed: 0,funct_cntx,adverb_cntx,time_cntx,relativ_cntx,pronoun_cntx,ipron_cntx,affect_cntx,negemo_cntx,cogmech_cntx,discrep_cntx,article_cntx,verb_cntx,past_cntx,social_cntx,percept_cntx,hear_cntx,space_cntx,quant_cntx,preps_cntx,ppron_cntx,we_cntx,achieve_cntx,motion_cntx,present_cntx,cause_cntx,excl_cntx,insight_cntx,auxverb_cntx,feel_cntx,conj_cntx,certain_cntx,shehe_cntx,leisure_cntx,posemo_cntx,tentat_cntx,anger_cntx,work_cntx,incl_cntx,humans_cntx,you_cntx,anx_cntx,they_cntx,nonfl_cntx,future_cntx,negate_cntx,sad_cntx,number_cntx,filler_cntx,money_cntx,relig_cntx,bio_cntx,sexual_cntx,see_cntx,body_cntx,health_cntx,i_cntx,friend_cntx,swear_cntx,inhib_cntx,family_cntx,home_cntx,ingest_cntx,assent_cntx,death_cntx,time_resp,relativ_resp,funct_resp,pronoun_resp,ipron_resp,adverb_resp,cogmech_resp,excl_resp,leisure_resp,conj_resp,incl_resp,verb_resp,past_resp,social_resp,ppron_resp,i_resp,cause_resp,humans_resp,certain_resp,achieve_resp,preps_resp,tentat_resp,space_resp,affect_resp,filler_resp,posemo_resp,present_resp,they_resp,shehe_resp,negemo_resp,anger_resp,quant_resp,auxverb_resp,article_resp,insight_resp,work_resp,you_resp,motion_resp,discrep_resp,assent_resp,negate_resp,inhib_resp,home_resp,percept_resp,hear_resp,anx_resp,sad_resp,see_resp,money_resp,bio_resp,health_resp,sexual_resp,nonfl_resp,future_resp,swear_resp,ingest_resp,feel_resp,number_resp,body_resp,relig_resp,family_resp,we_resp,death_resp,friend_resp,label
1,19.0,2.0,3.0,9.0,4.0,3.0,1.0,1.0,6.0,0.0,4.0,4.0,3.0,2.0,3.0,2.0,5.0,1.0,6.0,1.0,1.0,2.0,1.0,1.0,3.0,2.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,3.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SARCASM
2,4.0,0.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,1.0,1.0,1.0,4.0,1.0,2.0,0.0,1.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SARCASM
5,7.0,1.0,2.0,3.0,0.0,0.0,2.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,4.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SARCASM
8,9.0,0.0,0.0,3.0,2.0,1.0,2.0,0.0,3.0,1.0,0.0,4.0,0.0,1.0,0.0,0.0,2.0,0.0,3.0,1.0,0.0,0.0,1.0,4.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,1.0,1.0,2.0,1.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.0,11.0,4.0,3.0,0.0,4.0,0.0,0.0,0.0,1.0,9.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0,1.0,2.0,0.0,3.0,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SARCASM
9,14.0,1.0,1.0,1.0,6.0,3.0,1.0,0.0,2.0,0.0,2.0,3.0,2.0,3.0,0.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SARCASM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,4.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,4.0,2.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,SARCASM
1789,4.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SARCASM
1794,17.0,2.0,4.0,9.0,1.0,1.0,2.0,1.0,5.0,0.0,1.0,5.0,1.0,1.0,0.0,0.0,4.0,1.0,7.0,0.0,0.0,1.0,1.0,4.0,0.0,1.0,0.0,4.0,0.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,2.0,5.0,7.0,2.0,1.0,2.0,3.0,1.0,0.0,0.0,0.0,2.0,0.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,3.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,SARCASM
1796,4.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,4.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,14.0,2.0,1.0,1.0,6.0,2.0,0.0,3.0,3.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,2.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,SARCASM


In [201]:
j =df1.iloc[:, :64] #Sarcasm Context

In [202]:
h =df1.iloc[:,64 :-1] #Sarcasm Response

In [203]:
h = h.reindex(sorted(h.columns), axis=1)
j = j.reindex(sorted(j.columns), axis=1)

In [204]:
#Sarcasm

Sarcasm_Scores_List = []
for i in range(len(h.columns)):
    r_list = h.iloc[:,i].tolist()
    c_list = j.iloc[:,i].tolist()
    correlation, p_value = stats.pearsonr(r_list,c_list)
    print((h.columns[i],j.columns[i],correlation))
    Sarcasm_Scores_List.append((correlation))

('achieve_resp', 'achieve_cntx', 0.08596522186430626)
('adverb_resp', 'adverb_cntx', 0.08531259704593028)
('affect_resp', 'affect_cntx', 0.08709781440329778)
('anger_resp', 'anger_cntx', 0.10625037751469493)
('anx_resp', 'anx_cntx', 0.041087704286221806)
('article_resp', 'article_cntx', 0.05632156887423635)
('assent_resp', 'assent_cntx', -0.038399515854432124)
('auxverb_resp', 'auxverb_cntx', 0.16595676987654373)
('bio_resp', 'bio_cntx', 0.09700002330000784)
('body_resp', 'body_cntx', 0.06101402152986163)
('cause_resp', 'cause_cntx', 0.008483045490866755)
('certain_resp', 'certain_cntx', -0.0009506283670423291)
('cogmech_resp', 'cogmech_cntx', 0.04978660740880248)
('conj_resp', 'conj_cntx', -0.014211437486800878)
('death_resp', 'death_cntx', 0.09729936234894491)
('discrep_resp', 'discrep_cntx', 0.024179798420093566)
('excl_resp', 'excl_cntx', -0.013357967814571894)
('family_resp', 'family_cntx', 0.05039095683494532)
('feel_resp', 'feel_cntx', 0.030187006337216896)
('filler_resp', 'fill

In [205]:
k =df2.iloc[:, :64] #NotSarcasm context
m =df2.iloc[:,64 :-1] #NotSarcasm response


In [219]:
#Not Sarcasm
k = k.reindex(sorted(k.columns), axis=1)
m = m.reindex(sorted(m.columns), axis=1)

for i in range(len(m.columns)):
    r_list = m.iloc[:,i].tolist()
    c_list = k.iloc[:,i].tolist()
    correlation, p_value = stats.pearsonr(r_list,c_list)
    print(m.columns[i],k.columns[i],correlation)
    Not_Sarcasm_Scores_List.append((correlation))

achieve_resp achieve_cntx 0.07145373930128096
adverb_resp adverb_cntx 0.0520418147155224
affect_resp affect_cntx 0.08734424883723413
anger_resp anger_cntx 0.07277417162018122
anx_resp anx_cntx 0.07586829248759333
article_resp article_cntx 0.08514027692952233
assent_resp assent_cntx 0.04161970206990138
auxverb_resp auxverb_cntx 0.09013935118686361
bio_resp bio_cntx 0.1467246416716215
body_resp body_cntx 0.10095073400456936
cause_resp cause_cntx 0.03910743840639916
certain_resp certain_cntx 0.0509487056494122
cogmech_resp cogmech_cntx 0.12885291122826198
conj_resp conj_cntx 0.08840081091221613
death_resp death_cntx 0.06737583210186694
discrep_resp discrep_cntx 0.00934146220533316
excl_resp excl_cntx 0.0807508010721035
family_resp family_cntx 0.038098585022822234
feel_resp feel_cntx 0.014433955821640906
filler_resp filler_cntx 0.01982373351227297
friend_resp friend_cntx 0.08091835183769447
funct_resp funct_cntx 0.09686464576550413
future_resp future_cntx 0.014296753952200284
health_resp h

In [220]:
def Average(lst): 
    return sum(lst) / len(lst)

print("NotSarcasm Average", (Average(Not_Sarcasm_Scores_List)))
print()
print("Sarcasm Average", (Average(Sarcasm_Scores_List)))



NotSarcasm Average 0.06027429415156402

Sarcasm Average 0.06149014526623917


# Questions
### Vader
Does it need any preprocessing?
Compound score standerdized, why?
Try to treat each tweet as a vector, then compute correlations

In [None]:
def correlation_calc(response, context, a):   #Pearson
    r_list = response.tolist()
    c_list = context.tolist()
    correlation,p_value = stats.pearsonr(r_list,c_list)
    
    