In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
from sklearn.metrics import f1_score
import nltk
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score

# Load Data

In [2]:
# Load the labelled comment dataset; the path is relative to the current
# working directory.
DATA_PATH = 'toxic_classification.csv'
df = pd.read_csv(DATA_PATH)
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,target,comment_text,preprocessed_text
0,0,0,"This is so cool. It's like, 'would you want yo...",cool like would want mother read realli great ...
1,1,0,Thank you!! This would make my life a lot less...,thank would make life lot less anxieti induc k...
2,2,0,This is such an urgent design problem; kudos t...,urgent design problem kudo take impress
3,3,0,Is this something I'll be able to install on m...,someth abl instal site releas
4,4,1,haha you guys are a bunch of losers.,haha guy bunch loser


# Preprocessing

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer
import re
# English Snowball stemmer used by `preprocess` to reduce words to their stems.
stemmer = SnowballStemmer("english")

# NLTK distributes its corpora separately from the package, so on a fresh
# environment the stopword list may be missing and NLTK raises LookupError.
# Fetch it once in that case instead of failing the whole notebook run.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
def preprocess(text_string):
    """Normalise a raw comment into a space-separated string of stemmed tokens.

    The text is lower-cased, every run of characters other than ASCII letters
    and digits is replaced by a single space, and the result is split on
    whitespace. Tokens found in the module-level ``stop_words`` set are
    dropped; the remaining tokens are stemmed with the module-level
    ``stemmer``.

    Args:
        text_string: The raw text to clean; must provide ``.lower()``.

    Returns:
        The surviving stems joined by single spaces (an empty string when no
        token survives).
    """
    lowered = text_string.lower()
    # Anything outside [A-Za-z0-9] (punctuation, symbols, non-ASCII letters)
    # acts as a token separator.
    cleaned = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    stems = [stemmer.stem(token) for token in cleaned.split() if token not in stop_words]
    return ' '.join(stems)

In [4]:
# Clean every raw comment with `preprocess`, keeping the DataFrame's row order
# so that position i in the list corresponds to the i-th row of `df`.
bag_of_words = [preprocess(comment) for comment in df['comment_text']]
# Show the first five cleaned comments as a sanity check.
bag_of_words[:5]

['cool like would want mother read realli great idea well done',
 'thank would make life lot less anxieti induc keep let anyon get way',
 'urgent design problem kudo take impress',
 'someth abl instal site releas',
 'haha guy bunch loser']

In [7]:
def sample_data(df, texts=None, n_per_class=20000, seed=None):
    """Draw a class-balanced random sample of texts and their labels.

    Rows whose ``target`` equals 0 form the negative pool; every other row
    forms the positive pool. ``n_per_class`` rows are sampled without
    replacement from each pool.

    Args:
        df: DataFrame with a ``target`` column. Rows are addressed by
            position, so any index (not only the default 0..n-1) works.
        texts: Sequence of texts aligned position-by-position with the rows
            of ``df``. When omitted, the module-level ``bag_of_words`` list is
            used, which keeps ``sample_data(df)`` working as before.
        n_per_class: Number of rows to draw from each class.
        seed: Optional seed for a private random generator. When ``None`` the
            global ``random`` module state is used.

    Returns:
        A ``(labels, selected_texts)`` tuple: ``labels`` is ``n_per_class``
        zeros followed by ``n_per_class`` ones, and ``selected_texts`` holds
        the sampled texts in the same order.

    Raises:
        ValueError: If ``texts`` and ``df`` differ in length, or if a class
            has fewer than ``n_per_class`` rows (raised by ``random.sample``).
    """
    if texts is None:
        # Backward-compatible fallback to the notebook-level list.
        texts = bag_of_words

    # Plain list => positional access, independent of the DataFrame's index.
    targets = df['target'].tolist()
    if len(texts) != len(targets):
        # Typically means the text list was already replaced by a sampled
        # subset; indexing it with row positions would pick the wrong texts.
        raise ValueError(
            "texts has %d items but df has %d rows; they must be aligned"
            % (len(texts), len(targets))
        )

    zero_positions = [pos for pos, target in enumerate(targets) if target == 0]
    one_positions = [pos for pos, target in enumerate(targets) if target != 0]

    rng = random if seed is None else random.Random(seed)
    label_0 = rng.sample(zero_positions, k=n_per_class)
    label_1 = rng.sample(one_positions, k=n_per_class)

    labels = [0] * n_per_class + [1] * n_per_class
    bag_of_words_select = [texts[pos] for pos in label_0 + label_1]
    return labels, bag_of_words_select

In [8]:
# Seed Python's global RNG so the balanced sample drawn by `sample_data`
# (which relies on `random.sample`) is the same on every run.
random.seed(0)
# NOTE: this rebinds `bag_of_words` from the full corpus to the sampled
# subset, so the cell is not idempotent -- rebuild the full list with the
# preprocessing cell before running it again.
labels, bag_of_words = sample_data(df)

# Bag-Of-Words

In [35]:
# Split the cleaned texts and labels BEFORE building the vocabulary, so the
# vectorizer never sees the held-out comments (fitting on all data leaks
# test-set vocabulary into the features).
text_train, text_test, y_train, y_test = train_test_split(
    bag_of_words, labels, test_size=0.2, random_state=0
)

vectorizer = CountVectorizer()
# Learn the vocabulary from the training texts only; test texts are merely
# projected onto it, and words unseen during training are ignored.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Count matrix for the whole sample in the same train-derived feature space;
# kept because this cell previously exposed `X`.
X = vectorizer.transform(bag_of_words)


# Logistic Regression

In [36]:
# Tune the regularisation strength C and the iteration cap of a logistic
# regression with 5-fold cross-validation, selecting on the F1 score.
param_grid = {
    'max_iter': [1000, 2000],
    'C': [0.1, 1, 10],
}
model = LogisticRegression()
grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=5)
grid_search.fit(X_train, y_train)

In [37]:
# Pull the winning hyper-parameters and their cross-validated score out of
# the fitted search, then display both.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
best_params, best_score

({'C': 1, 'max_iter': 1000}, 0.8303664209272726)

In [38]:
# GridSearchCV refits the best parameter combination on the whole training
# set by default (refit=True), so reuse that fitted estimator instead of
# training an identical model a second time.
model = grid_search.best_estimator_
# Predict labels for the held-out comments.
y_pred = model.predict(X_test)

# Results

In [39]:
# Score the held-out predictions with F1, precision and recall (in that
# order) and display the three values.
f1, precision, recall = (
    metric(y_test, y_pred)
    for metric in (f1_score, precision_score, recall_score)
)
f1, precision, recall

(0.8268505268635359, 0.8610132755350853, 0.7952952952952953)

In [40]:
# Collect the test-set positions of the misclassified comments, split by the
# direction of the error.
y_test_list = list(y_test)
mismatched = [i for i in range(len(y_test)) if y_test_list[i] != y_pred[i]]
# Predicted 1 while the true label differs -> false positive.
wrong_idx_fp = [i for i in mismatched if y_pred[i] == 1]
# Any other mismatch -> false negative.
wrong_idx_fn = [i for i in mismatched if y_pred[i] != 1]

### False Positive

In [59]:
# Pick three distinct false-positive positions to inspect by hand.
random_sample_fp = random.sample(wrong_idx_fp, 3)

In [60]:
# Cleaned text of the first sampled false positive.
text = text_test[random_sample_fp[0]]
# Recover the raw comment by matching the cleaned text against the dataset's
# 'preprocessed_text' column; the first matching row is shown.
matching_rows = df[df['preprocessed_text'] == text]
print(matching_rows['comment_text'].iloc[0])
print("-------------------------------")
print(text)

Then you react with a very dishonest reply! "The Fort and Red Rocks"? That was absurd and way out of context.
-------------------------------
react dishonest repli fort red rock absurd way context


In [61]:
# Cleaned text of the second sampled false positive.
text = text_test[random_sample_fp[1]]
# Recover the raw comment by matching the cleaned text against the dataset's
# 'preprocessed_text' column; the first matching row is shown.
matching_rows = df[df['preprocessed_text'] == text]
print(matching_rows['comment_text'].iloc[0])
print("-------------------------------")
print(text)

Is eating bannock racist?
-------------------------------
eat bannock racist


In [62]:
# Cleaned text of the third sampled false positive.
text = text_test[random_sample_fp[2]]
# Recover the raw comment by matching the cleaned text against the dataset's
# 'preprocessed_text' column; the first matching row is shown.
matching_rows = df[df['preprocessed_text'] == text]
print(matching_rows['comment_text'].iloc[0])
print("-------------------------------")
print(text)

"... a whole hour and not a single troll has shown up."

Look again. Turns out you beat the patron saint of trolldom by six minutes. Apparently it's a working holiday for him.
-------------------------------
whole hour singl troll shown look turn beat patron saint trolldom six minut appar work holiday


### False Negative

In [63]:
# Pick three distinct false-negative positions to inspect by hand.
random_sample_fn = random.sample(wrong_idx_fn, 3)

In [64]:
# Cleaned text of the first sampled false negative.
text = text_test[random_sample_fn[0]]
# Recover the raw comment by matching the cleaned text against the dataset's
# 'preprocessed_text' column; the first matching row is shown.
matching_rows = df[df['preprocessed_text'] == text]
print(matching_rows['comment_text'].iloc[0])
print("-------------------------------")
print(text)

Trump has always been a complete failure as a "business guy". Everything he touches files for bankruptcy. he couldn't even make a profit running six casinos!

think before you post!
-------------------------------
trump alway complet failur busi guy everyth touch file bankruptci even make profit run six casino think post


In [65]:
# Cleaned text of the second sampled false negative.
text = text_test[random_sample_fn[1]]
# Recover the raw comment by matching the cleaned text against the dataset's
# 'preprocessed_text' column; the first matching row is shown.
matching_rows = df[df['preprocessed_text'] == text]
print(matching_rows['comment_text'].iloc[0])
print("-------------------------------")
print(text)

Thanks for nothing and don't let the door hit your fanny! Might get oil on it. The door, that is.
-------------------------------
thank noth let door hit fanni might get oil door


In [66]:
# Cleaned text of the third sampled false negative.
text = text_test[random_sample_fn[2]]
# Recover the raw comment by matching the cleaned text against the dataset's
# 'preprocessed_text' column; the first matching row is shown.
matching_rows = df[df['preprocessed_text'] == text]
print(matching_rows['comment_text'].iloc[0])
print("-------------------------------")
print(text)

Hey, Black folks, be on the lookout for a ban on malt liquor 40's next...brought to you by your own party!
-------------------------------
hey black folk lookout ban malt liquor 40 next brought parti
