In [1]:
import os
os.chdir(os.getcwd() + '/../../')

In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix

from scripts.utils import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
comments = pd.read_csv('data/comments/preprocessed_comments.csv', index_col=0)
comments = comments.dropna()
comments.head()

Unnamed: 0,toxic,comment_text
0,0,explanation why the edits make under my userna...
1,0,d'aww ! he match this background colour i be s...
2,0,"hey man , i be really not try to edit war . it..."
3,0,`` more i can not make any real suggestion on ...
4,0,"you , sir , be my hero . any chance you rememb..."


# 1. TF-IDF

## 1.1. with stop words

In [4]:
# vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(comments['comment_text'])
y = comments['toxic']

# train val, test split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)

In [9]:
# total number of features
print(len(vectorizer.get_feature_names()))

179429


### 1.1.1 Logistic regression

In [5]:
model = LogisticRegression(max_iter=10000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [6]:
get_evaluation_df(y_val, y_hat, 'logistic regression')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
logistic regression,0.942817,0.657238,0.843086,0.738651,0.898238


### 1.1.2. Linear SVM

In [7]:
model = LinearSVC(max_iter=50000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [8]:
get_evaluation_df(y_val, y_hat, 'linear SVM')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
linear SVM,0.945731,0.687376,0.795685,0.737576,0.878661


## 1.2. Unigram + bigram model

In [11]:
# vectorization
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(comments['comment_text'])
y = comments['toxic']

# train val, test split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)

In [12]:
# total number of features
print(len(vectorizer.get_feature_names()))

2470034


### 1.2.1. Logistic Regression

In [13]:
model = LogisticRegression(max_iter=10000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [14]:
get_evaluation_df(y_val, y_hat, 'logistic regression')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
logistic regression,0.948676,0.711521,0.781301,0.74478,0.87386


### 1.2.2. Linear SVM

In [15]:
model = LinearSVC(max_iter=50000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [16]:
get_evaluation_df(y_val, y_hat, 'linear SVM')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
linear SVM,0.958671,0.840109,0.702517,0.765177,0.844172


## 1.3. Effect of number of features

### 1.3.1. Logistic regression

In [17]:
N = [100000, 50000, 25000, 10000, 5000]

df_list = []
for n in N:
    
    # vectorization
    vectorizer = TfidfVectorizer(stop_words='english', max_features=n)
    X = vectorizer.fit_transform(comments['comment_text'])
    y = comments['toxic']

    # train val, test split
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)
    
    # model training and evaluation
    model = LogisticRegression(max_iter=10000, class_weight='balanced').fit(X_train, y_train)
    y_hat = model.predict(X_val)
    
    df_list.append(get_evaluation_df(y_val, y_hat, f'logistic regression n={n}'))

In [18]:
pd.concat(df_list)

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
logistic regression n=100000,0.942222,0.65283,0.848316,0.737845,0.900246
logistic regression n=50000,0.941344,0.647307,0.852566,0.735892,0.901661
logistic regression n=25000,0.938869,0.634011,0.856816,0.728764,0.902192
logistic regression n=10000,0.933636,0.609089,0.858777,0.712697,0.900175
logistic regression n=5000,0.926994,0.581198,0.852893,0.691309,0.893871


### 1.3.2 Linear SVM

In [19]:
N = [100000, 50000, 25000, 10000, 5000]

df_list = []
for n in N:
    
    # vectorization
    vectorizer = TfidfVectorizer(stop_words='english', max_features=n)
    X = vectorizer.fit_transform(comments['comment_text'])
    y = comments['toxic']

    # train val, test split
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)
    
    # model training and evaluation
    model = LinearSVC(max_iter=50000, class_weight='balanced').fit(X_train, y_train)
    y_hat = model.predict(X_val)
    
    df_list.append(get_evaluation_df(y_val, y_hat, f'linear SVM n={n}'))

In [20]:
pd.concat(df_list)

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
linear SVM n=100000,0.943193,0.668835,0.8068,0.731368,0.882226
linear SVM n=50000,0.940812,0.651869,0.820856,0.726668,0.887192
linear SVM n=25000,0.936488,0.627282,0.831317,0.715029,0.889477
linear SVM n=10000,0.928968,0.590742,0.842759,0.694598,0.890433
linear SVM n=5000,0.922388,0.563399,0.845374,0.676167,0.887963
