In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv('data/8_sentences_train.csv',sep='|',index_col=0)

In [3]:
df_train_pos = df_train[df_train['label']==1]
df_train_neg = df_train[df_train['label']==0].sample(n = df_train_pos.shape[0])

df_array = [df_train_pos,df_train_neg]
df_train_balanced = pd.concat(df_array)
df_train_balanced = df_train_balanced.sample(frac=1)
df_train_balanced = df_train_balanced.fillna("")

In [4]:
all_sentences = df_train_balanced['sentence'] + ' ' + \
                df_train_balanced['prev1'] + ' ' + \
                df_train_balanced['prev2'] + ' ' + \
                df_train_balanced['prev3'] + ' ' + \
                df_train_balanced['prev4']     

In [5]:
vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1,2))
vectorizer.fit(all_sentences.values.astype('U'))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [6]:
X_sentence = vectorizer.transform(df_train_balanced['sentence'].values.astype('U'))
X_prev1 = vectorizer.transform(df_train_balanced['prev1'].values.astype('U'))
X_prev2 = vectorizer.transform(df_train_balanced['prev2'].values.astype('U'))
X_prev3 = vectorizer.transform(df_train_balanced['prev3'].values.astype('U'))
X_prev4 = vectorizer.transform(df_train_balanced['prev4'].values.astype('U'))

list_sentence = [X_sentence,X_prev1,X_prev2,X_prev3,X_prev4]

In [7]:
X = hstack(list_sentence)
y = df_train_balanced['label']
lr = LogisticRegression()
lr.fit(X,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [8]:
sp_df = pd.read_csv('data/All-seasons.csv')
sp_df = sp_df[sp_df['Season']!='Season']
sp_df['Episode_ID'] = sp_df['Season'] + sp_df['Episode']
episode_ids = list(set(sp_df['Episode_ID']))

In [9]:
sp_res = pd.DataFrame(columns=['prev4','prev3','prev2','prev1','sentence','Character'])
for id in episode_ids:
    episode_df = sp_df[sp_df['Episode_ID']==id].reset_index() 
    res = []
    prev4 = ""
    prev3 = ""
    prev2 = ""
    for idx,row in episode_df.iterrows():
        if idx==0:
            prev1 = row['Line']
        else:
            sentence = row['Line']
            char = row['Character']
            res.append([prev4,prev3,prev2,prev1,sentence,char])
            prev4 = prev3
            prev3 = prev2
            prev2 = prev1
            prev1 = sentence 
    local_res = pd.DataFrame(data=res,columns=['prev4','prev3','prev2','prev1','sentence','Character']) 
    sp_res = sp_res.append(local_res)
    
    
print('processed south park dataset shape: ',sp_res.shape)

processed south park dataset shape:  (70625, 6)


In [10]:
X_test_sentence = vectorizer.transform(sp_res['sentence'].values.astype('U'))
X_test_prev1 = vectorizer.transform(sp_res['prev1'].values.astype('U'))
X_test_prev2 = vectorizer.transform(sp_res['prev2'].values.astype('U'))
X_test_prev3 = vectorizer.transform(sp_res['prev3'].values.astype('U'))
X_test_prev4 = vectorizer.transform(sp_res['prev4'].values.astype('U'))

list_test_sentence = [X_test_sentence,X_test_prev1,X_test_prev2,X_test_prev3,X_test_prev4]

X_test = hstack(list_test_sentence)
sp_res['sarcasm_prediction']= lr.predict(X_test)

print('prediction of sarcasm proportion: ',(sp_res['sarcasm_prediction']==1).sum()/sp_res.shape[0])

prediction of sarcasm proportion:  0.194123893805


In [13]:
sarcasm_char = sp_res.groupby('Character').agg({'sarcasm_prediction':'sum'})/sp_res.groupby('Character').agg({'sarcasm_prediction':'count'})
sarcasm_char['number of sentences'] = sp_res.groupby('Character').agg({'sarcasm_prediction':'count'})

sarcasm_char_reduced = sarcasm_char[sarcasm_char['number of sentences']>100]
sarcasm_char_reduced2 = sarcasm_char[sarcasm_char['number of sentences']>700]

In [18]:
sarcasm_char_reduced.sort_values('sarcasm_prediction',ascending=False).head(40)

Unnamed: 0_level_0,sarcasm_prediction,number of sentences
Character,Unnamed: 1_level_1,Unnamed: 2_level_1
Crowd,0.391304,115
Kids,0.313869,137
Token,0.310469,277
Clyde,0.285714,315
The Boys,0.281818,110
Man,0.280952,210
Man 2,0.268293,123
Ike,0.265,200
Kenny,0.257955,880
Man 1,0.257426,101


In [16]:
sarcasm_char_reduced2.sort_values('sarcasm_prediction',ascending=False)

Unnamed: 0_level_0,sarcasm_prediction,number of sentences
Character,Unnamed: 1_level_1,Unnamed: 2_level_1
Kenny,0.257955,880
Butters,0.226466,2592
Kyle,0.203926,7081
Stan,0.201516,7652
Cartman,0.188303,9729
Randy,0.182927,2460
Mr. Garrison,0.181174,988
Chef,0.173581,916
Sharon,0.166667,858
