In [22]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd, numpy as np, re, time
import itertools
import seaborn as sns
# Global seaborn theme for any plots drawn later: white-grid style, seaborn colour codes enabled, fonts scaled 1.5x.
sns.set(style = "whitegrid", color_codes = True,font_scale = 1.5)
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides any warnings the estimators below may emit
# (e.g. about convergence) -- confirm that is intended before trusting the scores.
warnings.filterwarnings('ignore')

# Reddit Dataset

In [23]:
# Load the Reddit sarcasm CSV (path is relative to the working directory) into `df`.
df = pd.read_csv("train-balanced-sarcasm.csv")

# Data Cleaning
### Locating the NaN entries (read in as floats) in the `comment` column that are causing issues

In [24]:
# Row count of the raw frame, before any cleaning.
print(len(df))

1010826


In [25]:
# Per-column flag: True when the column contains at least one missing value.
print(df.isna().any())

label             False
comment            True
author            False
subreddit         False
score             False
ups               False
downs             False
date              False
created_utc       False
parent_comment    False
dtype: bool


In [26]:
# Model input is the comment text; the target is the sarcasm label.
features, labels = df['comment'], df['label']
# Both lengths are printed on their own line so a mismatch is obvious.
print(len(features), len(labels), sep="\n")

1010826
1010826


In [27]:
# Locate the rows whose comment is a float rather than a string (the missing
# values show up as float NaN), and time the scan.
start = time.time()

# Collect index *labels* rather than positions: the rows are removed later with
# DataFrame.drop, which works on labels. For the default integer index produced
# by read_csv the two coincide, so the printed values are unchanged.
# No try/except is needed here: isinstance() cannot raise, and a bare
# `except: pass` would also swallow KeyboardInterrupt.
bad_indices = [
    label
    for label, value in zip(features.index, features)
    if isinstance(value, float)
]
counter = len(bad_indices)   # number of non-string comments found
i_count = len(features)      # number of rows scanned

print(counter)
print("i_count:", i_count)
print("bad_indices:", bad_indices)

end = time.time()
time_elapsed = end-start
print(time_elapsed, "seconds")

53
i_count: 1010826
bad_indices: [56269, 68590, 135348, 199910, 258718, 284331, 312969, 328775, 331735, 332600, 332631, 362293, 389792, 445204, 505371, 520619, 524263, 529336, 532823, 569280, 645450, 651242, 661519, 675235, 683899, 747602, 799033, 800812, 813274, 817886, 859333, 875251, 878050, 898863, 905291, 914178, 914615, 918700, 919882, 923678, 936221, 949593, 966886, 967116, 978220, 982492, 992907, 995023, 1001185, 1001891, 1002133, 1009303, 1010599]
0.6761200428009033 seconds


In [28]:
# Row count immediately before the bad rows are dropped (baseline for the next cell).
print(len(df))

1010826


In [29]:
# Remove all flagged rows in a single call. Dropping them one at a time copies
# the whole frame once per bad row.
# DataFrame.drop removes rows by index label; this assumes the values in
# bad_indices match the frame's labels (true for the default integer index).
df = df.drop(bad_indices)
print(len(df))

1010773


In [30]:
# Re-check the row count after the drop.
print(len(df))

1010773


In [31]:
# Re-derive the text and target columns from the cleaned frame.
features, labels = df['comment'], df['label']
# Both lengths are printed on their own line so a mismatch is obvious.
print(len(features), len(labels), sep="\n")

1010773
1010773


In [32]:
# Sanity check: count the comments that are falsy (e.g. empty strings), and time the scan.
start = time.time()

counter = 0   # falsy comments seen so far
i_count = 0   # rows visited
# Read the `comment` column by name. Unpacking whole-row tuples and indexing
# position 2 silently depends on the CSV's column order and builds a tuple of
# every column just to inspect one of them.
for comment in df['comment']:
    if not comment:
        counter += 1
        print(counter)
    i_count += 1

print(counter)
print("i_count", i_count)

end = time.time()
time_elapsed = end-start
print(time_elapsed, "seconds")

0
i_count 1010773
2.423959970474243 seconds


### Select desired sample size:

In [33]:
# Number of rows to sample for modeling. Edit this value to change the sample size.
# Kept as a plain constant rather than input() so the notebook can be re-run
# end-to-end (Restart & Run All, headless execution) without waiting on stdin.
sample_size = 100000

100000


In [34]:
# Draw the working sample and renumber the rows 0..n-1.
# random_state is pinned (matching the seed used for the train/test split and the
# random forest) so a re-run draws the same rows and the scores are comparable.
df = df.sample(int(sample_size), random_state = 0).reset_index(drop=True)

In [35]:
# Text and target columns of the sampled frame.
features, labels = df['comment'], df['label']
# Both lengths are printed on their own line so a mismatch is obvious.
print(len(features), len(labels), sep="\n")

100000
100000


## Modeling

In [36]:
# Start of the wall-clock timer; the elapsed time is printed by a later cell.
start = time.time()

In [37]:
# Stemming: reduce each whitespace-separated token to its Porter stem,
# then rebuild the comment as one space-joined string.
ps = PorterStemmer()
features = features.apply(
    lambda text: ' '.join(ps.stem(token) for token in text.split())
)

In [38]:
# Stop the timer and report the seconds elapsed since `start` was last set.
end = time.time()
time_elapsed = end-start
print(time_elapsed, "seconds")

25.09518074989319 seconds


In [39]:
# TF-IDF: transform the text into a numerical representation, keeping at most 5000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test split,
# so its vocabulary and IDF weights are learned from rows that later land in the test set.
tv = TfidfVectorizer(max_features = 5000)
# .toarray() converts the sparse result into a dense (rows x up-to-5000) array, which is memory-heavy.
features = tv.fit_transform(features).toarray()

In [40]:
# Hold out 5% of the rows for testing; the seed keeps the split repeatable.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=.05,
    random_state=0,
)

In [41]:
# Using linear support vector classifier
# Fit on the training split, then print train accuracy, test accuracy and the
# per-class report for the test split. The timer covers fitting and scoring.
start = time.time()

# random_state is pinned, matching the random forest cell, so repeated runs use
# the same seed for any randomised steps inside the solver.
lsvc = LinearSVC(random_state = 0)
lsvc.fit(features_train, labels_train)
test_predicted_labels = lsvc.predict(features_test)
print(lsvc.score(features_train, labels_train))
print(lsvc.score(features_test, labels_test))
print(classification_report(labels_test, test_predicted_labels))

end = time.time()
time_elapsed = end-start
print(time_elapsed, "seconds")

0.7065368421052631
0.6554
              precision    recall  f1-score   support

           0       0.64      0.69      0.67      2491
           1       0.67      0.62      0.64      2509

    accuracy                           0.66      5000
   macro avg       0.66      0.66      0.66      5000
weighted avg       0.66      0.66      0.66      5000

7.412353992462158 seconds


In [42]:
# Gaussian Naive Bayes: fit, print train then test accuracy, then the per-class
# report for the test split. The timer covers fitting and scoring.
start = time.time()

gnb = GaussianNB().fit(features_train, labels_train)
print(
    gnb.score(features_train, labels_train),
    gnb.score(features_test, labels_test),
    sep="\n",
)
test_predicted_labels = gnb.predict(features_test)
print(classification_report(labels_test, test_predicted_labels))

end = time.time()
time_elapsed = end - start
print(time_elapsed, "seconds")

0.6425263157894737
0.577
              precision    recall  f1-score   support

           0       0.58      0.54      0.56      2491
           1       0.57      0.62      0.59      2509

    accuracy                           0.58      5000
   macro avg       0.58      0.58      0.58      5000
weighted avg       0.58      0.58      0.58      5000

247.18753910064697 seconds


In [43]:
# Logistic regression: fit, print train then test accuracy, then the per-class
# report for the test split. The timer covers fitting and scoring.
start = time.time()

lr = LogisticRegression().fit(features_train, labels_train)
print(
    lr.score(features_train, labels_train),
    lr.score(features_test, labels_test),
    sep="\n",
)
test_predicted_labels = lr.predict(features_test)
print(classification_report(labels_test, test_predicted_labels))

end = time.time()
time_elapsed = end - start
print(time_elapsed, "seconds")

0.7046105263157895
0.6622
              precision    recall  f1-score   support

           0       0.65      0.70      0.67      2491
           1       0.68      0.63      0.65      2509

    accuracy                           0.66      5000
   macro avg       0.66      0.66      0.66      5000
weighted avg       0.66      0.66      0.66      5000

6.7619709968566895 seconds


In [44]:
# Using Random Forest Classification
# Fit a 10-tree forest, print train then test accuracy, then the per-class
# report for the test split. The timer covers fitting and scoring.
start = time.time()

# n_jobs = -1 lets the trees be built and queried on all available cores;
# the forest is still seeded by random_state, so only the wall-clock time should change.
rfc = RandomForestClassifier(n_estimators = 10, random_state = 0, n_jobs = -1)
rfc.fit(features_train, labels_train)
print(rfc.score(features_train, labels_train))
print(rfc.score(features_test, labels_test))
test_predicted_labels = rfc.predict(features_test)
print(classification_report(labels_test, test_predicted_labels))

end = time.time()
time_elapsed = end-start
print(time_elapsed, "seconds")

0.9635263157894737
0.6404
              precision    recall  f1-score   support

           0       0.62      0.71      0.66      2491
           1       0.66      0.58      0.62      2509

    accuracy                           0.64      5000
   macro avg       0.64      0.64      0.64      5000
weighted avg       0.64      0.64      0.64      5000

300.66705083847046 seconds


# "News Headline" Dataset

In [45]:
# SOURCE: https://github.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection
# NOTES - Dataset is much cleaner, with fewer spelling errors, and gives higher modeling scores overall
# lines = True reads the file as one JSON object per line.
# This rebinds `df`, replacing the Reddit frame used above.
df = pd.read_json("Sarcasm_Headlines_Dataset.json", lines = True)

In [46]:
# Row count of the headlines frame.
print(len(df))

28619


In [48]:
# Preview the first rows to check the column names and contents.
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.theonion.com/thirtysomething-scien...,thirtysomething scientists unveil doomsday clo...,1
1,https://www.huffingtonpost.com/entry/donna-edw...,dem rep. totally nails why congress is falling...,0
2,https://www.huffingtonpost.com/entry/eat-your-...,eat your veggies: 9 deliciously different recipes,0
3,https://local.theonion.com/inclement-weather-p...,inclement weather prevents liar from getting t...,1
4,https://www.theonion.com/mother-comes-pretty-c...,mother comes pretty close to using word 'strea...,1


In [53]:
# Model input is the headline text; the target is the is_sarcastic flag.
features, labels = df['headline'], df['is_sarcastic']
# Both lengths are printed on their own line so a mismatch is obvious.
print(len(features), len(labels), sep="\n")

28619
28619


## Modeling with Machine Learning Algorithms

In [54]:
# Stemming: reduce each whitespace-separated token to its Porter stem,
# then rebuild the headline as one space-joined string.
ps = PorterStemmer()
features = features.apply(
    lambda text: ' '.join(ps.stem(token) for token in text.split())
)

In [55]:
# TF-IDF: transform the text into a numerical representation, keeping at most 5000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/test split,
# so its vocabulary and IDF weights are learned from rows that later land in the test set.
tv = TfidfVectorizer(max_features = 5000)
# .toarray() converts the sparse result into a dense (rows x up-to-5000) array.
features = tv.fit_transform(features).toarray()

In [56]:
# Hold out 5% of the rows for testing; the seed keeps the split repeatable.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=.05,
    random_state=0,
)

In [57]:
# Using linear support vector classifier
# Fit on the training split, then print train accuracy, test accuracy and the
# per-class report for the test split. The timer covers fitting and scoring.
start = time.time()

# random_state is pinned, matching the random forest cell, so repeated runs use
# the same seed for any randomised steps inside the solver.
lsvc = LinearSVC(random_state = 0)
lsvc.fit(features_train, labels_train)
test_predicted_labels = lsvc.predict(features_test)
print(lsvc.score(features_train, labels_train))
print(lsvc.score(features_test, labels_test))
print(classification_report(labels_test, test_predicted_labels))

end = time.time()
time_elapsed = end-start
print(time_elapsed, "seconds")

0.9050684125349419
0.8218029350104822
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       717
           1       0.82      0.83      0.82       714

    accuracy                           0.82      1431
   macro avg       0.82      0.82      0.82      1431
weighted avg       0.82      0.82      0.82      1431

0.7443058490753174 seconds


In [58]:
# Gaussian Naive Bayes: fit, print train then test accuracy, then the per-class
# report for the test split. The timer covers fitting and scoring.
start = time.time()

gnb = GaussianNB().fit(features_train, labels_train)
print(
    gnb.score(features_train, labels_train),
    gnb.score(features_test, labels_test),
    sep="\n",
)
test_predicted_labels = gnb.predict(features_test)
print(classification_report(labels_test, test_predicted_labels))

end = time.time()
time_elapsed = end - start
print(time_elapsed, "seconds")

0.8031852287774018
0.7169811320754716
              precision    recall  f1-score   support

           0       0.70      0.77      0.73       717
           1       0.74      0.66      0.70       714

    accuracy                           0.72      1431
   macro avg       0.72      0.72      0.72      1431
weighted avg       0.72      0.72      0.72      1431

12.7919020652771 seconds


In [59]:
# Logistic regression: fit, print train then test accuracy, then the per-class
# report for the test split. The timer covers fitting and scoring.
start = time.time()

lr = LogisticRegression().fit(features_train, labels_train)
print(
    lr.score(features_train, labels_train),
    lr.score(features_test, labels_test),
    sep="\n",
)
test_predicted_labels = lr.predict(features_test)
print(classification_report(labels_test, test_predicted_labels))

end = time.time()
time_elapsed = end - start
print(time_elapsed, "seconds")

0.8796527879947036
0.8301886792452831
              precision    recall  f1-score   support

           0       0.84      0.81      0.83       717
           1       0.82      0.85      0.83       714

    accuracy                           0.83      1431
   macro avg       0.83      0.83      0.83      1431
weighted avg       0.83      0.83      0.83      1431

0.58504319190979 seconds


In [60]:
# Using Random Forest Classification
# Fit a 10-tree forest, print train then test accuracy, then the per-class
# report for the test split. The timer covers fitting and scoring.
start = time.time()

# n_jobs = -1 lets the trees be built and queried on all available cores;
# the forest is still seeded by random_state, so only the wall-clock time should change.
rfc = RandomForestClassifier(n_estimators = 10, random_state = 0, n_jobs = -1)
rfc.fit(features_train, labels_train)
print(rfc.score(features_train, labels_train))
print(rfc.score(features_test, labels_test))
test_predicted_labels = rfc.predict(features_test)
print(classification_report(labels_test, test_predicted_labels))

end = time.time()
time_elapsed = end-start
print(time_elapsed, "seconds")

0.9900691481535971
0.777078965758211
              precision    recall  f1-score   support

           0       0.77      0.78      0.78       717
           1       0.78      0.77      0.78       714

    accuracy                           0.78      1431
   macro avg       0.78      0.78      0.78      1431
weighted avg       0.78      0.78      0.78      1431

32.20409893989563 seconds
