In [1]:

from collections import defaultdict

import numpy as np
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import naive_bayes, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

# Seed NumPy's legacy global random state so that anything drawing from it
# is repeatable when the notebook is re-run from a fresh kernel.
np.random.seed(seed=500)

In [3]:
import pandas as pd
# Load the comment-pair table; the path is relative to the notebook's working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was written together with its index - confirm whether
# index_col=0 is wanted here.
df = pd.read_csv(filepath_or_buffer="data/debagreement.csv")

In [4]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,label,msg_id_parent,msg_id_child,submission_id,body_parent,body_child,submission_text,subreddit,author_parent,exact_time,author_child,datetime,agreement_fraction,individual_kappa
0,0,gfvmv5x,gfvmzei,kd0se4,So now that they have elected Biden how can we...,"They haven't, it is a contested election. The ...","Forensic Audit: ""We conclude that the Dominion...",Republican,guildarts15,1607998349,03-Oct,15/12/2020 02:12,1.0,1.0
1,2,gyn3we6,gyn4ruu,nfrunb,"Tlaib, your family is Hamas. Funded by Iran a ...",She was not truthful when she swore her oath o...,Biden and The Squad's Tlaib stage heated confr...,Republican,oppositeofoutside,1621387168,cmcolfax,19/05/2021 01:19,1.0,1.0
2,0,gbh1w5x,gbh2dxt,jpv1jr,Most Republicans are happy with Biden as he wi...,I'm not happy about it but I'm not going out a...,President-elect Biden is projected to win Penn...,Republican,Rpdaca,1604773521,elang7390,07/11/2020 18:25,1.0,1.0
3,0,fv8hp2o,fv9lbzd,hbf73b,What is going to happen with this trend is tha...,Shouldn't they only fire use their weapon unle...,Not even an issue of politics it’s wrong this ...,Republican,wiseways9999,1592511116,Hotelier83,18/06/2020 20:11,0.666667,0.333333
4,0,g8ahfgp,g8ahzwk,j8crnu,We gotta start normalizing defending ourselves...,Nah women are about equal rights now a days so...,Young man wearing MAGA hat attacked at school ...,Republican,jwymes44,1602308654,notaglock,10/10/2020 05:44,1.0,1.0


In [None]:


def get_splits(df_group, train_len_pct=0.80, val_len_pct=0.10):
    """Cut a frame into consecutive train / validation / test slices.

    Rows are taken positionally and in their current order: the first
    ``train_len_pct`` share goes to train, the next ``val_len_pct`` share to
    validation, and whatever is left to test. Both shares are truncated to a
    whole number of rows, so any rounding remainder ends up in the test slice.

    Args:
        df_group: Frame to split; it is sliced with ``.iloc`` and not reordered.
        train_len_pct: Fraction of rows for the training slice.
        val_len_pct: Fraction of rows for the validation slice.

    Returns:
        A ``(train_set, val_set, test_set)`` tuple of slices of ``df_group``.

    Raises:
        AssertionError: If a slice does not have the computed length, which
            happens when the two fractions together ask for more rows than
            the frame has.
    """
    n_rows = len(df_group)
    n_train = int(train_len_pct * n_rows)
    n_val = int(val_len_pct * n_rows)
    val_end = n_train + n_val
    n_test = n_rows - val_end

    parts = (
        df_group.iloc[:n_train],
        df_group.iloc[n_train:val_end],
        df_group.iloc[val_end:],
    )
    # Sanity check: each slice must contain exactly the number of rows computed above.
    for part, expected_rows in zip(parts, (n_train, n_val, n_test)):
        assert len(part) == expected_rows
    return parts

# Build chronological train/val/test splits independently for every subreddit,
# then stack the per-subreddit pieces into three frames.
trains = []
vals = []
tests = []
for gname, g in df.groupby("subreddit"):
    # Parse only this group's own timestamps (not the whole frame's column on
    # every iteration). The strings are day-first, e.g. "15/12/2020 02:12", so
    # say so explicitly: otherwise an ambiguous value such as "07/11/2020" can
    # be read month-first and the chronological ordering below would be wrong.
    # assign() returns a new frame, so the groupby slice is never written to.
    g = g.assign(date_py=pd.to_datetime(g["datetime"], dayfirst=True))
    g = g.sort_values(by="date_py")
    # Oldest rows go to train, the next block to validation, the newest to test.
    train, val, test = get_splits(g)
    trains.append(train)
    vals.append(val)
    tests.append(test)

df_train = pd.concat(trains)
df_val = pd.concat(vals)
df_test = pd.concat(tests)

# Report the size of each split.
print(len(df_train))
print(len(df_val))
print(len(df_test))

df_train.head()

In [None]:
# Lazily filled cache of the resources tokenize() needs; see _get_tokenize_resources().
_TOKENIZE_RESOURCES = {}


def _get_tokenize_resources():
    """Return ``(stop_words, tag_map, lemmatizer)``, building them on first use only."""
    if not _TOKENIZE_RESOURCES:
        # Map the first letter of a POS tag to a WordNet POS; anything that is
        # not J / V / R falls back to noun.
        tag_map = defaultdict(lambda: wn.NOUN)
        tag_map['J'] = wn.ADJ
        tag_map['V'] = wn.VERB
        tag_map['R'] = wn.ADV
        # All values are computed before update() runs, so a failure while
        # loading a resource leaves the cache empty rather than half filled.
        _TOKENIZE_RESOURCES.update(
            stop_words=frozenset(stopwords.words('english')),
            tag_map=tag_map,
            lemmatizer=WordNetLemmatizer(),
        )
    return (
        _TOKENIZE_RESOURCES["stop_words"],
        _TOKENIZE_RESOURCES["tag_map"],
        _TOKENIZE_RESOURCES["lemmatizer"],
    )


def tokenize(row):
    """Turn one parent/child comment pair into a list of lemmatized tokens.

    The two comment bodies are joined with a ``"</s>"`` separator, lower-cased,
    word-tokenized and POS-tagged. Tokens that are English stop words or are
    not purely alphabetic are dropped; every remaining token is lemmatized
    using the WordNet part of speech derived from its tag.

    The stop-word list, the tag map and the lemmatizer do not depend on the
    row, so they are built once and reused. Previously the stop-word list was
    re-fetched and scanned linearly for every single token.

    Args:
        row: Object exposing string attributes ``body_parent`` and
            ``body_child`` (e.g. a DataFrame row).

    Returns:
        list[str]: The lemmas, in their original order.
    """
    stop_words, tag_map, lemmatizer = _get_tokenize_resources()
    classification_string = row.body_parent + "</s>" + row.body_child
    tokens = word_tokenize(classification_string.lower())
    return [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokens)
        if word.isalpha() and word not in stop_words
    ]

In [None]:
# takes around 7 min
# Tokenize every parent/child comment pair. The validation split is what is
# used as the evaluation ("test") data in the cells that follow.
small_train, small_test = df_train, df_val

X_train = list(
    small_train.loc[:, ['body_parent', 'body_child']]
    .apply(tokenize, axis=1, result_type='reduce')
)
Y_train = list(small_train.loc[:, 'label'])

X_test = list(
    small_test.loc[:, ['body_parent', 'body_child']]
    .apply(tokenize, axis=1, result_type='reduce')
)
Y_test = list(small_test.loc[:, 'label'])

In [None]:
# Re-join each token list into one space-separated document, fit the TF-IDF
# vectorizer on the training documents only, and reuse that fitted vectorizer
# to transform the evaluation documents.
vectorizer = TfidfVectorizer()
train_corpus = list(map(" ".join, X_train))
test_corpus = list(map(" ".join, X_test))
X_train_tfidf = vectorizer.fit_transform(train_corpus)
X_test_tfidf = vectorizer.transform(test_corpus)

In [None]:
# Took around 15 min

# Fit the two baseline classifiers on the TF-IDF training matrix and predict
# labels for the evaluation matrix.
Naive = naive_bayes.MultinomialNB()
svm_model = svm.SVC()
Naive.fit(X_train_tfidf, Y_train)
svm_model.fit(X_train_tfidf, Y_train)
predictions_NB = Naive.predict(X_test_tfidf)
predictions_SVM = svm_model.predict(X_test_tfidf)

# classification_report returns a multi-line text table, not a single accuracy
# number, so label it as a report and start it on its own line to keep the
# table's header aligned with its columns.
print("Naive Bayes classification report ->\n", classification_report(Y_test, predictions_NB), sep="")
print("SVM classification report ->\n", classification_report(Y_test, predictions_SVM), sep="")

In [None]:
from sklearn import tree
# Fit a decision tree on the TF-IDF training matrix and predict labels for the
# evaluation matrix.
clf = tree.DecisionTreeClassifier()
clf.fit(X_train_tfidf, Y_train)
predictions_tree = clf.predict(X_test_tfidf)

# classification_report returns a multi-line text table, not a single accuracy
# number, so label it as a report and start it on its own line to keep the
# table's header aligned with its columns.
print("Tree classification report ->\n", classification_report(Y_test, predictions_tree), sep="")
