In [1]:
import os
os.chdir(os.getcwd() + '/../../')

In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix

from scripts.utils import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
comments = pd.read_csv('data/comments/preprocessed_comments.csv', index_col=0)
comments = comments.dropna()
comments.head()

Unnamed: 0,toxic,comment_text
0,0,explanation why the edits make under my userna...
1,0,d'aww ! he match this background colour i be s...
2,0,"hey man , i be really not try to edit war . it..."
3,0,`` more i can not make any real suggestion on ...
4,0,"you , sir , be my hero . any chance you rememb..."


# 1. TF-IDF

In [4]:
# define vocabulary

vocab_size = 10000

toxic_rank = pd.read_csv('data/comments/toxic_tfidf_rank_unigram.csv')
display(toxic_rank.head())
toxic_vocab = toxic_rank.loc[:vocab_size, 'term']

non_toxic_rank = pd.read_csv('data/comments/non_toxic_tfidf_rank_unigram.csv')
display(non_toxic_rank.head())
non_toxic_vocab = non_toxic_rank.loc[:vocab_size, 'term']

vocab = toxic_vocab.append(non_toxic_vocab).unique()

Unnamed: 0.1,Unnamed: 0,term,mean
0,64975,fuck,0.064007
1,142653,shit,0.021041
2,150875,suck,0.018965
3,94297,like,0.016618
4,26602,bitch,0.016268


Unnamed: 0.1,Unnamed: 0,term,mean
0,19010,article,0.027063
1,117362,page,0.02451
2,153745,talk,0.022387
3,171694,wikipedia,0.018847
4,53932,edit,0.015978


In [12]:
print(len(vocab))

14245


In [5]:
# vectorization
vectorizer = TfidfVectorizer(vocabulary=vocab)
X = vectorizer.fit_transform(comments['comment_text'])
y = comments['toxic']

# train val, test split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)

## 1.1. Logistic regression

In [6]:
model = LogisticRegression(max_iter=10000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [7]:
get_evaluation_df(y_val, y_hat, 'logistic regression')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
logistic regression,0.937843,0.629284,0.855333,0.7251,0.900961


## 1.2. Linear SVM

In [8]:
model = LinearSVC(max_iter=50000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [9]:
get_evaluation_df(y_val, y_hat, 'linear SVM')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
linear SVM,0.939723,0.646641,0.818145,0.722353,0.885377


## 1.3. RBF-SVM

In [10]:
model = SVC(max_iter=100000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [11]:
get_evaluation_df(y_val, y_hat, 'RBF SVM')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
RBF SVM,0.959619,0.822404,0.738047,0.777945,0.860576
