In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
#vectorizer = CountVectorizer()
vectorizer = CountVectorizer(ngram_range=(1, 3))

In [3]:
df = pd.read_csv("Sarcasm Dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,1.0,0.0,0.0,0.0,0.0,0.0


In [4]:
print(df.shape)

(3468, 9)


In [5]:
df = df.dropna(subset=['tweet'])
df.shape

(3467, 9)

In [6]:
X = vectorizer.fit_transform(df['tweet'])
y = df["sarcastic"]
X

<3467x102294 sparse matrix of type '<class 'numpy.int64'>'
	with 168053 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [8]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
dic = {
    "Naive-Bayes Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

Unnamed: 0,Naive-Bayes Eval. Metric,Score
0,accuracy_score:,0.583573
1,precision_score:,0.643449
2,recall_score:,0.583573
3,f1_score:,0.602062


In [9]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
dic = {
    "Logistic Regression Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

Unnamed: 0,Logistic Regression Eval. Metric,Score
0,accuracy_score:,0.706052
1,precision_score:,0.620067
2,recall_score:,0.706052
3,f1_score:,0.609535


In [10]:
from sklearn.ensemble import RandomForestClassifier
rF = RandomForestClassifier()
rF.fit(X_train, y_train)
y_pred = rF.predict(X_test)
dic = {
    "Random Forest Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

Unnamed: 0,Random Forest Eval. Metric,Score
0,accuracy_score:,0.713256
1,precision_score:,0.713256
2,recall_score:,1.0
3,f1_score:,0.832632


In [11]:
from sklearn.svm import SVC
sv = SVC()
sv.fit(X_train, y_train)
y_pred = sv.predict(X_test)
dic = {
    "SVM Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

Unnamed: 0,SVM Eval. Metric,Score
0,accuracy_score:,0.713256
1,precision_score:,0.713256
2,recall_score:,1.0
3,f1_score:,0.832632


In [12]:
from sklearn.linear_model import Perceptron
tron = Perceptron()
tron.fit(X_train, y_train)
y_pred = tron.predict(X_test)
dic = {
    "Perceptron Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

Unnamed: 0,Perceptron Eval. Metric,Score
0,accuracy_score:,0.693084
1,precision_score:,0.621789
2,recall_score:,0.693084
3,f1_score:,0.626395


# Multi-Class Classifiers

In [13]:
X = vectorizer.fit_transform(df[df.sarcastic == 1]['tweet'])

In [14]:
y = df[df.sarcastic == 1].copy().drop(columns = ['Unnamed: 0', 'tweet', 'sarcastic'])
y.head()

Unnamed: 0,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0


In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [16]:
print(X.shape, y.shape)

(867, 28368) (867, 6)


In [17]:
from sklearn.ensemble import RandomForestClassifier
rF = RandomForestClassifier()
rF.fit(X_train, y_train)
y_pred = rF.predict(X_test)
dic = {
    "Random Forest Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Random Forest Eval. Metric,Score
0,accuracy_score:,0.701149
1,precision_score:,0.723477
2,recall_score:,0.850575
3,f1_score:,0.781895


In [18]:
from sklearn.tree import DecisionTreeClassifier
dtC = DecisionTreeClassifier()
dtC.fit(X_train, y_train)
y_pred = dtC.predict(X_test)
dic = {
    "DecisionTree Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

Unnamed: 0,DecisionTree Eval. Metric,Score
0,accuracy_score:,0.637931
1,precision_score:,0.763547
2,recall_score:,0.810345
3,f1_score:,0.782725


In [19]:
from sklearn.tree import ExtraTreeClassifier
etC = ExtraTreeClassifier()
etC.fit(X_train, y_train)
y_pred = etC.predict(X_test)
dic = {
    "ExtraTreeClassifier Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

Unnamed: 0,ExtraTreeClassifier Eval. Metric,Score
0,accuracy_score:,0.609195
1,precision_score:,0.733703
2,recall_score:,0.798851
3,f1_score:,0.762928


In [20]:
from sklearn.ensemble import ExtraTreesClassifier
etsC = ExtraTreesClassifier()
etsC.fit(X_train, y_train)
y_pred = etsC.predict(X_test)
dic = {
    "ExtraTreesClassifier Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,ExtraTreesClassifier Eval. Metric,Score
0,accuracy_score:,0.701149
1,precision_score:,0.723477
2,recall_score:,0.850575
3,f1_score:,0.781895


In [21]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
dic = {
    "KNeighbors Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,KNeighbors Eval. Metric,Score
0,accuracy_score:,0.701149
1,precision_score:,0.723477
2,recall_score:,0.850575
3,f1_score:,0.781895


In [22]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)
dic = {
    "MLPClassifier Eval. Metric": ["accuracy_score: ", 
               "precision_score: ", 
               "recall_score: ", 
               "f1_score: "],
    "Score": [accuracy_score(y_test, y_pred), 
              precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred)), 
              f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))]
}
pd.DataFrame(dic).head()

Unnamed: 0,MLPClassifier Eval. Metric,Score
0,accuracy_score:,0.701149
1,precision_score:,0.723477
2,recall_score:,0.850575
3,f1_score:,0.781895


# End