In [2]:
import pandas as pd

data = pd.read_csv("dataset.csv")
# print("Num Samples:", data.shape[0])

# NOTE: Remove this line before final training
# data = data[0:20000]
print("Num Samples:", data.shape[0])

Num Samples: 159571


In [3]:
data.head()

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


## Data Cleanup

In [4]:
import re

def clean(text):
    text = str(text).lower()
    text = text.replace("\n", " ")
    text = re.sub("[^a-z A-Z]+", "", text) # only keep digits and spaces
    text = re.sub('\s+', " ", text) # clean up spaces
    return text.strip()
        
data["comment_text"] = data["comment_text"].apply(clean)

In [5]:
data.head()

Unnamed: 0,comment_text,toxic
0,explanation why the edits made under my userna...,0
1,daww he matches this background colour im seem...,0
2,hey man im really not trying to edit war its j...,0
3,more i cant make any real suggestions on impro...,0
4,you sir are my hero any chance you remember wh...,0


## Feature Selection

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words("english")

vectorizer = TfidfVectorizer(
    stop_words=stopwords,
    strip_accents='unicode', 
    ngram_range=(1,3),
    max_features=10000,
)

X = vectorizer.fit_transform(data["comment_text"])
y = data.drop(labels=["comment_text"], axis=1)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maharsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Train/Test Split

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.2)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(127656, 10000) (127656, 1)
(31915, 10000) (31915, 1)


## Method 1: Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import numpy as np

hyperparameters = dict(C=np.logspace(0, 2, 10), penalty=["l1", "l2"])
classifier = GridSearchCV(LogisticRegression(),
                          hyperparameters, 
                          cv=5, # k=5 fold cross validation
                          verbose=1)

best_model = classifier.fit(X_train, y_train)
y_predicted = best_model.predict(X_test)

report = classification_report(y_test, y_predicted)
print(report)
print(confusion_matrix(y_test, y_predicted))
print("accuracy:", accuracy_score(y_test, y_predicted))
print("precision:", precision_score(y_test, y_predicted))
print("recall:", recall_score(y_test, y_predicted))
print("f1 score:", f1_score(y_test, y_predicted))

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  2.6min finished


              precision    recall  f1-score   support

           0       0.97      0.99      0.98     28812
           1       0.88      0.67      0.76      3103

    accuracy                           0.96     31915
   macro avg       0.92      0.83      0.87     31915
weighted avg       0.96      0.96      0.96     31915

[[28534   278]
 [ 1019  2084]]
accuracy: 0.9593608021306596
precision: 0.882303132938188
recall: 0.6716081211730583
f1 score: 0.7626715462031107


## Method 2: SVM

In [11]:
from sklearn.svm import LinearSVC

hyperparameters = dict(C=[1, 10, 100, 1000])
classifier = GridSearchCV(LinearSVC(random_state=1),
                          hyperparameters, 
                          cv=5, # k=5 fold cross validation
                          verbose=1)

best_model = classifier.fit(X_train, y_train)
y_predicted = best_model.predict(X_test)

report = classification_report(y_test, y_predicted)
print(report)
print(confusion_matrix(y_test, y_predicted))
print("accuracy:", accuracy_score(y_test, y_predicted))
print("precision:", precision_score(y_test, y_predicted))
print("recall:", recall_score(y_test, y_predicted))
print("f1 score:", f1_score(y_test, y_predicted))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  1.8min finished


              precision    recall  f1-score   support

           0       0.96      0.99      0.98     28812
           1       0.87      0.66      0.75      3103

    accuracy                           0.96     31915
   macro avg       0.92      0.83      0.87     31915
weighted avg       0.96      0.96      0.96     31915

[[28513   299]
 [ 1047  2056]]
accuracy: 0.957825473915087
precision: 0.873036093418259
recall: 0.6625845955526909
f1 score: 0.7533895199706852


## Method 3: Naive Bayes

In [18]:
from sklearn.naive_bayes import MultinomialNB

hyperparameters = dict(alpha=np.logspace(0, 2, 30))
classifier = GridSearchCV(MultinomialNB(),
                          hyperparameters, 
                          cv=5, # k=5 fold cross validation
                          verbose=1)

best_model = classifier.fit(X_train, y_train)
y_predicted = best_model.predict(X_test)

report = classification_report(y_test, y_predicted)
print(report)
print(confusion_matrix(y_test, y_predicted))
print("accuracy:", accuracy_score(y_test, y_predicted))
print("precision:", precision_score(y_test, y_predicted))
print("recall:", recall_score(y_test, y_predicted))
print("f1 score:", f1_score(y_test, y_predicted))

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=T

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.95      1.00      0.97     28812
           1       0.92      0.52      0.66      3103

    accuracy                           0.95     31915
   macro avg       0.94      0.76      0.82     31915
weighted avg       0.95      0.95      0.94     31915

[[28677   135]
 [ 1501  1602]]
accuracy: 0.9487388375372082
precision: 0.9222797927461139
recall: 0.5162745729938769
f1 score: 0.6619834710743802


  y = column_or_1d(y, warn=True)
[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed:    7.2s finished
  y = column_or_1d(y, warn=True)
