# Detecting Sarcasm in Reddit Comments – Using TF-IDF

**Team 4:** Nanda H Krishna, Rubini U and Vikram Reddy

**Checklist:**
1. [x] EDA and Pre-processing
2. [x] TF-IDF (Random Forest, Gradient Boosting, Gaussian Naïve Bayes, Multi-Layer Perceptron, Neural Network)
    - [x] TF-IDF on Pre-processed Text
    - [x] TF-IDF on Raw Text
    - [x] Effect of using n-grams (1- to 3-grams)
    - [x] Effect of using PCA
    - [x] Model Interpretability
3. [ ] BERT Embeddings

## Importing Modules

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import eli5
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# Fixed seed for the notebook's randomised steps (passed to the row sampling
# when the dataset is loaded).
random_state = 42

## Loading Dataset

First, we remove all rows containing NaNs from the dataset. We then restrict ourselves to a random sample of 125,000 instances due to compute power limitations.

In [None]:
# Load the full comment dataset. Later cells read the columns 'label',
# 'author', 'subreddit', 'score', 'ups', 'downs', 'comment',
# 'parent_comment', 'lemmatised_comment' and 'lemmatised_parent'.
df = pd.read_csv('sarcasm/dataset.csv')

In [None]:
df['label'].value_counts()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Discard incomplete rows, draw a seeded 125000-row subsample and renumber
# the surviving rows from zero.
df = (
    df.dropna()
      .sample(n=125000, random_state=random_state)
      .reset_index(drop=True)
)

In [None]:
df.shape

## Splitting Data

In [None]:
# Shuffle the rows ahead of the positional train/test split. The seed is
# passed explicitly so that the same rows land in each split on every run.
df = shuffle(df, random_state=random_state).reset_index(drop=True)

In [None]:
# Replace the author and subreddit values with integer codes. The fitted
# encoders stay bound to author_le / sub_le.
author_le, sub_le = LabelEncoder(), LabelEncoder()
df['author'] = author_le.fit_transform(df['author'])
df['subreddit'] = sub_le.fit_transform(df['subreddit'])

In [None]:
# The first 80% of the rows become the training set and the remainder the
# test set; both are renumbered from zero and the full frame is released.
split = int(len(df) * 0.8)
df_train, df_test = (
    part.reset_index(drop=True)
    for part in (df.iloc[:split], df.iloc[split:])
)
del df

In [None]:
print(df_train.shape, df_test.shape)

In [None]:
df_train.head()

In [None]:
df_test.head()

## TF-IDF on Pre-processed Text

We will be using the Random Forest Classifier, Gradient Boosting Classifier, Gaussian Naïve Bayes Classifier, Multi-Layer Perceptron and a Keras feed-forward Neural Network in this section.

In [None]:
# Stack the training comments on top of their parent comments to form the
# single corpus the vectoriser is fitted on.
content = pd.concat(
    [df_train[column]
     for column in ('lemmatised_comment', 'lemmatised_parent')],
    ignore_index=True,
)

In [None]:
# Learn the TF-IDF vocabulary and IDF weights from the training corpus only.
# Terms found in fewer than 2 documents are ignored and the vocabulary is
# capped at 1000 terms.
tfidf = TfidfVectorizer(min_df=2, max_features=1000)
tfidf.fit(content)

In [None]:
del content

In [None]:
def generate_dataframe(df, comment_col='lemmatised_comment',
                       parent_col='lemmatised_parent', vectorizer=None):
    """Build the TF-IDF feature table (plus metadata and label) for ``df``.

    The comment and parent texts are vectorised separately with the same
    fitted vectoriser. Each term therefore yields two columns,
    ``<term>_comment`` and ``<term>_parent``, followed by the metadata
    columns ``comment_author``, ``comment_subreddit``, ``comment_score``,
    ``comment_ups``, ``comment_downs`` and the target ``comment_label``.

    Args:
        df: Frame holding the two text columns and the ``author``,
            ``subreddit``, ``score``, ``ups``, ``downs`` and ``label``
            columns.
        comment_col: Name of the column with the comment text.
        parent_col: Name of the column with the parent comment text.
        vectorizer: Fitted vectoriser exposing ``transform`` and
            ``vocabulary_``. Defaults to the notebook-level ``tfidf``.

    Returns:
        A DataFrame with one row per row of ``df``, in the same order,
        indexed 0..n-1 whatever index ``df`` carries.
    """
    if vectorizer is None:
        # Fall back to the vectoriser fitted in an earlier cell.
        vectorizer = tfidf

    def _tfidf_block(texts, suffix):
        # Vectorise one text column and label each matrix column with its term.
        names = {position: term + suffix
                 for term, position in vectorizer.vocabulary_.items()}
        block = pd.DataFrame.sparse.from_spmatrix(vectorizer.transform(texts))
        return block.rename(columns=names)

    comment_features = _tfidf_block(df[comment_col], '_comment')
    parent_features = _tfidf_block(df[parent_col], '_parent')
    print('Transformed!')

    # Attach the metadata by position, not by index label: the TF-IDF blocks
    # always carry a fresh 0..n-1 index, so label-based alignment would fill
    # these columns with NaN for any ``df`` that has a different index.
    # A single concat also avoids six separate inserts into a wide frame.
    metadata = (
        df[['author', 'subreddit', 'score', 'ups', 'downs', 'label']]
        .reset_index(drop=True)
        .add_prefix('comment_')
    )
    combined = pd.concat([comment_features, parent_features, metadata],
                         axis=1)
    print('Columns assigned!')
    return combined

In [None]:
train_tfidf = generate_dataframe(df_train)

In [None]:
test_tfidf = generate_dataframe(df_test)

In [None]:
train_tfidf.info()

In [None]:
test_tfidf.info()

In [None]:
# Separate the target column from the feature columns for both splits.
X_train, y_train = (train_tfidf.drop(columns='comment_label'),
                    train_tfidf['comment_label'])
X_test, y_test = (test_tfidf.drop(columns='comment_label'),
                  test_tfidf['comment_label'])

### Random Forest Classifier

In [None]:
# 50-tree random forest trained on all cores. The notebook-wide seed is
# passed so that the fitted trees, and the feature importances inspected in
# the following cells, are the same on every run.
rf = RandomForestClassifier(n_estimators=50, random_state=random_state,
                            verbose=1, n_jobs=-1)
rf.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, rf.predict(X_test)))

In [None]:
eli5.explain_weights(rf, top=10)

In [None]:
# Look up the names of ten feature columns by position.
# NOTE(review): the indices look hand-copied from the eli5 output above and
# go stale whenever the model or vocabulary changes -- confirm before reuse.
# Assuming the vocabulary reached max_features=1000, positions 0-999 are
# comment terms, 1000-1999 parent terms and 2000+ the metadata columns.
train_tfidf.columns[[2000, 2001, 2002, 2003, 988, 336, 605, 896, 1492, 1642]]

In [None]:
eli5.explain_prediction(rf, doc=X_test.iloc[1000, :], top=10)

In [None]:
eli5.explain_prediction(rf, doc=X_test.iloc[2000, :], top=10)

### Gradient Boosting Classifier

In [None]:
# 10-stage gradient boosting model. The notebook-wide seed is passed so that
# repeated runs fit the same ensemble.
gb = GradientBoostingClassifier(n_estimators=10, random_state=random_state,
                                verbose=1)
gb.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, gb.predict(X_test)))

In [None]:
eli5.explain_weights(gb, top=10)

In [None]:
# Look up the names of ten feature columns by position.
# NOTE(review): the indices look hand-copied from the eli5 output above and
# go stale whenever the model or vocabulary changes -- confirm before reuse.
# Assuming the vocabulary reached max_features=1000, positions 0-999 are
# comment terms, 1000-1999 parent terms and 2000+ the metadata columns.
train_tfidf.columns[[988, 2002, 336, 605, 896, 2003, 163, 696, 677, 991]]

### Naïve Bayes Classifier

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the same feature
# table as the other classifiers in this section.
nb = GaussianNB()
nb.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, nb.predict(X_test)))

### Multi-Layer Perceptron

In [None]:
# Two hidden layers of 25 units, at most 50 iterations, with early stopping.
# The notebook-wide seed fixes the weight initialisation and the validation
# split that early stopping holds out, so runs are repeatable.
mlp = MLPClassifier(hidden_layer_sizes=(25, 25), max_iter=50, alpha=0.001,
                    early_stopping=True, random_state=random_state)
mlp.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, mlp.predict(X_test)))

### Neural Network

In [None]:
# Fully connected binary classifier: four ReLU layers of decreasing width
# feeding a single sigmoid output unit.
model = Sequential(
    [Dense(width, activation='relu') for width in (512, 256, 128, 64)]
    + [Dense(1, activation='sigmoid')]
)
model.compile(optimizer='adam', loss='binary_crossentropy')
# The inputs are converted to dense float32 arrays before training.
model.fit(
    np.asarray(X_train).astype(np.float32),
    np.asarray(y_train).astype(np.float32),
    epochs=5,
)

In [None]:
# The network's sigmoid outputs are rounded and cast to int to obtain hard
# class predictions before they are scored against the test labels.
print(classification_report(y_test,
                            model.predict(np.asarray(X_test).astype(np.float32)) \
                                 .round().astype(int)))

From the results above, we see that the Random Forest Classifier was the best. We will be using only this classifier in our next experiment.

## TF-IDF on Raw Text

In [None]:
# Stack the raw training comments on top of their raw parent comments to
# form the single corpus the vectoriser is fitted on.
content = pd.concat(
    [df_train[column] for column in ('comment', 'parent_comment')],
    ignore_index=True,
)

In [None]:
# Refit the vectoriser on the raw (not lemmatised) training corpus. Terms
# found in fewer than 2 documents are ignored and the vocabulary is capped
# at 5000 terms. This rebinds the notebook-level `tfidf`.
tfidf = TfidfVectorizer(min_df=2, max_features=5000)
tfidf.fit(content)

In [None]:
del content

In [None]:
def generate_dataframe(df, comment_col='comment',
                       parent_col='parent_comment', vectorizer=None):
    """Build the raw-text TF-IDF feature table (plus metadata) for ``df``.

    The comment and parent texts are vectorised separately with the same
    fitted vectoriser. Each term therefore yields two columns,
    ``<term>_comment`` and ``<term>_parent``, followed by the metadata
    columns ``comment_author``, ``comment_subreddit``, ``comment_score``,
    ``comment_ups``, ``comment_downs`` and the target ``comment_label``.

    Args:
        df: Frame holding the two text columns and the ``author``,
            ``subreddit``, ``score``, ``ups``, ``downs`` and ``label``
            columns.
        comment_col: Name of the column with the comment text.
        parent_col: Name of the column with the parent comment text.
        vectorizer: Fitted vectoriser exposing ``transform`` and
            ``vocabulary_``. Defaults to the notebook-level ``tfidf``.

    Returns:
        A DataFrame with one row per row of ``df``, in the same order,
        indexed 0..n-1 whatever index ``df`` carries.
    """
    if vectorizer is None:
        # Fall back to the vectoriser fitted in an earlier cell.
        vectorizer = tfidf

    def _tfidf_block(texts, suffix):
        # Vectorise one text column and label each matrix column with its term.
        names = {position: term + suffix
                 for term, position in vectorizer.vocabulary_.items()}
        block = pd.DataFrame.sparse.from_spmatrix(vectorizer.transform(texts))
        return block.rename(columns=names)

    comment_features = _tfidf_block(df[comment_col], '_comment')
    parent_features = _tfidf_block(df[parent_col], '_parent')
    print('Transformed!')

    # Attach the metadata by position, not by index label: the TF-IDF blocks
    # always carry a fresh 0..n-1 index, so label-based alignment would fill
    # these columns with NaN for any ``df`` that has a different index.
    # A single concat also avoids six separate inserts into a wide frame.
    metadata = (
        df[['author', 'subreddit', 'score', 'ups', 'downs', 'label']]
        .reset_index(drop=True)
        .add_prefix('comment_')
    )
    combined = pd.concat([comment_features, parent_features, metadata],
                         axis=1)
    print('Columns assigned!')
    return combined

In [None]:
train_tfidf = generate_dataframe(df_train)

In [None]:
test_tfidf = generate_dataframe(df_test)

In [None]:
train_tfidf.info()

In [None]:
test_tfidf.info()

In [None]:
# Separate the target column from the feature columns for both splits.
X_train, y_train = (train_tfidf.drop(columns='comment_label'),
                    train_tfidf['comment_label'])
X_test, y_test = (test_tfidf.drop(columns='comment_label'),
                  test_tfidf['comment_label'])

### Random Forest Classifier

In [None]:
# 50-tree random forest on the raw-text features. The notebook-wide seed is
# passed so the comparison with the other forests does not vary between
# runs.
rf_raw = RandomForestClassifier(n_estimators=50, random_state=random_state,
                                verbose=1, n_jobs=-1)
rf_raw.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, rf_raw.predict(X_test)))

In [None]:
eli5.explain_weights(rf_raw, top=10)

In [None]:
# Look up the names of ten feature columns by position.
# NOTE(review): the indices look hand-copied from the eli5 output above and
# go stale whenever the model or vocabulary changes -- confirm before reuse.
# Assuming the vocabulary reached max_features=5000, positions 0-4999 are
# comment terms, 5000-9999 parent terms and 10000+ the metadata columns.
train_tfidf.columns[[10002, 10001, 10000, 10003, 4974, 480, 693, 9451, 4451, 9530]]

## TF-IDF with n-grams

In [None]:
# Stack the raw training comments on top of their raw parent comments to
# form the single corpus the n-gram vectoriser is fitted on.
content = pd.concat(
    [df_train[column] for column in ('comment', 'parent_comment')],
    ignore_index=True,
)

In [None]:
# Refit the vectoriser on the raw training corpus using unigrams, bigrams
# and trigrams (ngram_range=(1, 3)). Terms found in fewer than 2 documents
# are ignored and the vocabulary is capped at 5000 entries.
# NOTE(review): the model trained on these features is named `rf_2gram`
# although trigrams are included -- confirm which range was intended.
tfidf = TfidfVectorizer(ngram_range=(1, 3), min_df=2, max_features=5000)
tfidf.fit(content)

In [None]:
del content

In [None]:
def generate_dataframe(df, comment_col='comment',
                       parent_col='parent_comment', vectorizer=None):
    """Build the n-gram TF-IDF feature table (plus metadata) for ``df``.

    The comment and parent texts are vectorised separately with the same
    fitted vectoriser. Each vocabulary entry therefore yields two columns,
    ``<term>_comment`` and ``<term>_parent``, followed by the metadata
    columns ``comment_author``, ``comment_subreddit``, ``comment_score``,
    ``comment_ups``, ``comment_downs`` and the target ``comment_label``.

    Args:
        df: Frame holding the two text columns and the ``author``,
            ``subreddit``, ``score``, ``ups``, ``downs`` and ``label``
            columns.
        comment_col: Name of the column with the comment text.
        parent_col: Name of the column with the parent comment text.
        vectorizer: Fitted vectoriser exposing ``transform`` and
            ``vocabulary_``. Defaults to the notebook-level ``tfidf``.

    Returns:
        A DataFrame with one row per row of ``df``, in the same order,
        indexed 0..n-1 whatever index ``df`` carries.
    """
    if vectorizer is None:
        # Fall back to the vectoriser fitted in an earlier cell.
        vectorizer = tfidf

    def _tfidf_block(texts, suffix):
        # Vectorise one text column and label each matrix column with its term.
        names = {position: term + suffix
                 for term, position in vectorizer.vocabulary_.items()}
        block = pd.DataFrame.sparse.from_spmatrix(vectorizer.transform(texts))
        return block.rename(columns=names)

    comment_features = _tfidf_block(df[comment_col], '_comment')
    parent_features = _tfidf_block(df[parent_col], '_parent')
    print('Transformed!')

    # Attach the metadata by position, not by index label: the TF-IDF blocks
    # always carry a fresh 0..n-1 index, so label-based alignment would fill
    # these columns with NaN for any ``df`` that has a different index.
    # A single concat also avoids six separate inserts into a wide frame.
    metadata = (
        df[['author', 'subreddit', 'score', 'ups', 'downs', 'label']]
        .reset_index(drop=True)
        .add_prefix('comment_')
    )
    combined = pd.concat([comment_features, parent_features, metadata],
                         axis=1)
    print('Columns assigned!')
    return combined

In [None]:
train_tfidf = generate_dataframe(df_train)

In [None]:
test_tfidf = generate_dataframe(df_test)

In [None]:
train_tfidf.info()

In [None]:
test_tfidf.info()

In [None]:
# Separate the target column from the feature columns for both splits.
X_train, y_train = (train_tfidf.drop(columns='comment_label'),
                    train_tfidf['comment_label'])
X_test, y_test = (test_tfidf.drop(columns='comment_label'),
                  test_tfidf['comment_label'])

### Random Forest Classifier

In [None]:
# 50-tree random forest on the n-gram features. The notebook-wide seed is
# passed so the comparison with the other forests does not vary between
# runs.
rf_2gram = RandomForestClassifier(n_estimators=50, random_state=random_state,
                                  verbose=1, n_jobs=-1)
rf_2gram.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, rf_2gram.predict(X_test)))

In [None]:
eli5.explain_weights(rf_2gram, top=10)

In [None]:
# Look up the names of ten feature columns by position.
# NOTE(review): the indices look hand-copied from the eli5 output above and
# go stale whenever the model or vocabulary changes -- confirm before reuse.
# Assuming the vocabulary reached max_features=5000, positions 0-4999 are
# comment terms, 5000-9999 parent terms and 10000+ the metadata columns.
train_tfidf.columns[[10001, 10002, 10000, 10003, 4876, 476, 621, 8882, 3882, 9227]]

## TF-IDF + PCA

In [None]:
# Keep as many principal components as are needed to retain 95% of the
# variance. The projection is fitted on the training rows only and then
# applied unchanged to the test rows.
# NOTE(review): X_train / X_test are whatever the previous section left
# bound (the n-gram features if the cells ran top to bottom). No scaling is
# applied first, so columns with a large numeric range (e.g. the
# integer-encoded author ids) may dominate the retained variance --
# consider standardising before PCA; TODO confirm.
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [None]:
X_train.shape

### Random Forest Classifier

In [None]:
# 50-tree random forest on the PCA-projected features. The notebook-wide
# seed is passed so the comparison with the other forests does not vary
# between runs.
rf_pca = RandomForestClassifier(n_estimators=50, random_state=random_state,
                                verbose=1, n_jobs=-1)
rf_pca.fit(X_train, y_train)

In [None]:
print(classification_report(y_test, rf_pca.predict(X_test)))

In [None]:
eli5.explain_weights(rf_pca, top=1)

The Random Forest Classifier trained on TF-IDF Features from Raw Text was the best model with respect to all metrics.