In [1]:
import os
os.chdir(os.getcwd() + '/../../')

In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix

from scripts.utils import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
comments = pd.read_csv('data/comments/preprocessed_comments.csv', index_col=0)
comments = comments.dropna()
comments.head()

Unnamed: 0,toxic,comment_text
0,0,explanation why the edits make under my userna...
1,0,d'aww ! he match this background colour i be s...
2,0,"hey man , i be really not try to edit war . it..."
3,0,`` more i can not make any real suggestion on ...
4,0,"you , sir , be my hero . any chance you rememb..."


# 1. TF-IDF

## 1.1. with stop words

In [4]:
# vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(comments['comment_text'])
y = comments['toxic']

# train val, test split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)

In [5]:
# total number of features
print(len(vectorizer.get_feature_names()))

179429


### 1.1.1 Logistic regression

In [6]:
model = LogisticRegression(max_iter=10000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [7]:
get_evaluation_df(y_val, y_hat, 'logistic regression')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
logistic regression,0.943365,0.661504,0.837761,0.739272,0.89616


### 1.1.2. Linear SVM

In [8]:
model = LinearSVC(max_iter=50000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [9]:
get_evaluation_df(y_val, y_hat, 'linear SVM')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
linear SVM,0.947282,0.696397,0.797711,0.743619,0.880424


## 1.2. Unigram + bigram model

In [10]:
# vectorization
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(comments['comment_text'])
y = comments['toxic']

# train val, test split
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)

In [11]:
# total number of features
print(len(vectorizer.get_feature_names()))

2470034


### 1.2.1. Logistic Regression

In [12]:
model = LogisticRegression(max_iter=10000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [13]:
get_evaluation_df(y_val, y_hat, 'logistic regression')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
logistic regression,0.95159,0.729443,0.786678,0.75698,0.877874


### 1.2.2. Linear SVM

In [14]:
model = LinearSVC(max_iter=50000, class_weight='balanced').fit(X_train, y_train)
y_hat = model.predict(X_val)

In [15]:
get_evaluation_df(y_val, y_hat, 'linear SVM')

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
linear SVM,0.959228,0.841926,0.707397,0.768821,0.846659


## 1.3. Effect of number of features

### 1.3.1. Logistic regression

In [16]:
N = [100000, 50000, 25000, 10000, 5000]

df_list = []
for n in N:
    
    # vectorization
    vectorizer = TfidfVectorizer(stop_words='english', max_features=n)
    X = vectorizer.fit_transform(comments['comment_text'])
    y = comments['toxic']

    # train val, test split
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)
    
    # model training and evaluation
    model = LogisticRegression(max_iter=10000, class_weight='balanced').fit(X_train, y_train)
    y_hat = model.predict(X_val)
    
    df_list.append(get_evaluation_df(y_val, y_hat, f'logistic regression n={n}'))

In [17]:
pd.concat(df_list)

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
logistic regression n=100000,0.941642,0.650994,0.843073,0.734687,0.897582
logistic regression n=50000,0.940702,0.645191,0.84716,0.732509,0.898889
logistic regression n=25000,0.938939,0.634872,0.854107,0.72835,0.901019
logistic regression n=10000,0.93373,0.609516,0.858602,0.712928,0.900148
logistic regression n=5000,0.927855,0.583356,0.865141,0.69684,0.899822


### 1.3.2 Linear SVM

In [18]:
N = [100000, 50000, 25000, 10000, 5000]

df_list = []
for n in N:
    
    # vectorization
    vectorizer = TfidfVectorizer(stop_words='english', max_features=n)
    X = vectorizer.fit_transform(comments['comment_text'])
    y = comments['toxic']

    # train val, test split
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)
    
    # model training and evaluation
    model = LinearSVC(max_iter=50000, class_weight='balanced').fit(X_train, y_train)
    y_hat = model.predict(X_val)
    
    df_list.append(get_evaluation_df(y_val, y_hat, f'linear SVM n={n}'))

In [19]:
pd.concat(df_list)

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
linear SVM n=100000,0.943835,0.671986,0.808745,0.73405,0.88345
linear SVM n=50000,0.940702,0.653001,0.813649,0.724527,0.883909
linear SVM n=25000,0.937843,0.633623,0.833265,0.719859,0.891097
linear SVM n=10000,0.930049,0.595163,0.844708,0.698311,0.891901
linear SVM n=5000,0.922842,0.564547,0.852472,0.679258,0.891387


### 1.3.2. RBF SVM

In [20]:
N = [100000, 50000, 25000, 10000, 5000]

df_list = []
for n in N:
    
    # vectorization
    vectorizer = TfidfVectorizer(stop_words='english', max_features=n)
    X = vectorizer.fit_transform(comments['comment_text'])
    y = comments['toxic']

    # train val, test split
    X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, stratify=y_temp, random_state=0)
    
    # model training and evaluation
    model = SVC(max_iter=100000, class_weight='balanced').fit(X_train, y_train)
    y_hat = model.predict(X_val)
    
    df_list.append(get_evaluation_df(y_val, y_hat, f'linear SVM n={n}'))

In [21]:
pd.concat(df_list)

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
linear SVM n=100000,0.959188,0.82904,0.723335,0.772588,0.853762
linear SVM n=50000,0.959384,0.820746,0.737229,0.776749,0.860081
linear SVM n=25000,0.95864,0.805714,0.749081,0.776366,0.864967
linear SVM n=10000,0.956408,0.778613,0.761749,0.770089,0.869395
linear SVM n=5000,0.952804,0.74741,0.766653,0.756909,0.869595
