#### Importing Libraries and Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import matplotlib
from matplotlib import pyplot as plt
# import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

#### Importing Training Sets, Test Sets and Output of Test Set

In [3]:
# Read the three competition CSVs (paths are relative to the working directory).
train_set, test_set, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
# Preview the first five rows of the training data.
train_set.head(n=5)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [5]:
# Preview the first five rows of the test data.
test_set.head(n=5)

Unnamed: 0,id,comment_text
0,7000000,Jeff Sessions is another one of Trump's Orwell...
1,7000001,I actually inspected the infrastructure on Gra...
2,7000002,No it won't . That's just wishful thinking on ...
3,7000003,Instead of wringing our hands and nibbling the...
4,7000004,how many of you commenters have garbage piled ...


In [6]:
# Preview the first five rows of the sample submission.
sample.head(n=5)

Unnamed: 0,id,prediction
0,7000000,0.0
1,7000001,0.0
2,7000002,0.0
3,7000003,0.0
4,7000004,0.0


#### Extracting out target toxicity and Comments from training data

In [7]:
# Keep only the toxicity score and the raw comment text.
selected_columns = ['target', 'comment_text']
train = train_set[selected_columns]
train.shape

(1804874, 2)

#### Combining test set comments with the prediction column of the sample submission file

In [8]:
# Inner-join the test comments with the sample-submission rows on their shared `id`.
test = pd.merge(test_set, sample, on='id', how='inner')
test.head(n=5)

Unnamed: 0,id,comment_text,prediction
0,7000000,Jeff Sessions is another one of Trump's Orwell...,0.0
1,7000001,I actually inspected the infrastructure on Gra...,0.0
2,7000002,No it won't . That's just wishful thinking on ...,0.0
3,7000003,Instead of wringing our hands and nibbling the...,0.0
4,7000004,how many of you commenters have garbage piled ...,0.0


#### Taking a random subset of 70,000 training samples for training and testing, as the full training set has about 1.8 million rows

In [55]:
# Draw a fixed-size random subset (without replacement). The fixed seed makes the
# draw repeatable across kernel restarts, so downstream numbers can be reproduced.
# reset_index() keeps the original row label as an `index` column, which leaves
# `target` at column position 1.
train_sample = train.sample(n=70000, replace=False, random_state=42, axis=0).reset_index()
train_sample.head()

Unnamed: 0,index,target,comment_text
0,1574830,0.0,The answer to lower housing cost is not regula...
1,1764703,0.0,It doesn't matter what the truth is.\n\nHe HAS...
2,285049,0.0,Perhaps you should listen to the words your de...
3,9180,0.142857,"I do, to the worthy."
4,495434,0.0,"I don't know about you, but I have two!"


#### Converting the target into toxic(toxicity>=0.5) and non-toxic(toxicity<0.5)

In [56]:

# Binarise the toxicity score in one vectorised pass: scores >= 0.5 become 1.0
# (toxic), everything else (including missing scores) becomes 0.0.
# Addressing the column by name avoids depending on its position in the frame
# and replaces a Python-level loop of per-row scalar writes.
train_sample['target'] = np.where(train_sample['target'] >= 0.5, 1.0, 0.0)
        

#### Removing punctuations from comments

In [57]:
import string

# Translation table that deletes every character in string.punctuation;
# built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    '''Return `text` with every character in `string.punctuation` removed.

    All other characters (letters, digits, whitespace, non-ASCII symbols)
    are left untouched.
    '''
    return text.translate(_PUNCTUATION_TABLE)

In [58]:
# Store a punctuation-free copy of every comment in a new `text` column,
# leaving the original `comment_text` column untouched.
punctuation_free = train_sample['comment_text'].apply(remove_punctuation)
train_sample['text'] = punctuation_free
train_sample.head(n=10)

Unnamed: 0,index,target,comment_text,text
0,1574830,0.0,The answer to lower housing cost is not regula...,The answer to lower housing cost is not regula...
1,1764703,0.0,It doesn't matter what the truth is.\n\nHe HAS...,It doesnt matter what the truth is\n\nHe HAS t...
2,285049,0.0,Perhaps you should listen to the words your de...,Perhaps you should listen to the words your de...
3,9180,0.0,"I do, to the worthy.",I do to the worthy
4,495434,0.0,"I don't know about you, but I have two!",I dont know about you but I have two
5,1362914,0.0,One sure wonders about the professionalism of ...,One sure wonders about the professionalism of ...
6,605992,0.0,Probably? The share price is $0.70 at the tim...,Probably The share price is 070 at the time y...
7,499356,0.0,""" didn't take any government handouts.""\n\nThe...",didnt take any government handouts\n\nThe Val...
8,888982,0.0,I found a site that identifies eleven (11) Sta...,I found a site that identifies eleven 11 State...
9,278966,1.0,"O my God, that's disgusting.",O my God thats disgusting


#### Applying stemming: reducing each word to its root form

In [59]:
stemmer = SnowballStemmer("english")


def stemming(text):
    '''Stem every whitespace-separated token of `text` and re-join them with single spaces.'''
    stemmed_tokens = map(stemmer.stem, text.split())
    return " ".join(stemmed_tokens)

In [60]:
# Replace the `text` column with its stemmed version.
stemmed_text = train_sample['text'].apply(stemming)
train_sample['text'] = stemmed_text
train_sample.head(n=10)

Unnamed: 0,index,target,comment_text,text
0,1574830,0.0,The answer to lower housing cost is not regula...,the answer to lower hous cost is not regul or ...
1,1764703,0.0,It doesn't matter what the truth is.\n\nHe HAS...,it doesnt matter what the truth is he has to b...
2,285049,0.0,Perhaps you should listen to the words your de...,perhap you should listen to the word your deme...
3,9180,0.0,"I do, to the worthy.",i do to the worthi
4,495434,0.0,"I don't know about you, but I have two!",i dont know about you but i have two
5,1362914,0.0,One sure wonders about the professionalism of ...,one sure wonder about the profession of the to...
6,605992,0.0,Probably? The share price is $0.70 at the tim...,probabl the share price is 070 at the time you...
7,499356,0.0,""" didn't take any government handouts.""\n\nThe...",didnt take ani govern handout the valley would...
8,888982,0.0,I found a site that identifies eleven (11) Sta...,i found a site that identifi eleven 11 state w...
9,278966,1.0,"O my God, that's disgusting.",o my god that disgust


In [61]:
# Count how many sampled comments fall into each class.
train_sample['target'].value_counts()

0.0    64381
1.0     5619
Name: target, dtype: int64

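##### Only about 8% of the sampled comments are toxic, so there is a clear class imbalance. Accuracy alone would be misleading here (predicting "non-toxic" for every comment already scores about 92%), so precision, recall and F1 score are more suitable evaluation metrics.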

#### Train Test splitting

In [62]:
from sklearn.model_selection import train_test_split

# Hold out half of the sampled comments for evaluation; the fixed seed keeps
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_sample['text'],
    train_sample['target'],
    test_size=0.5,
    random_state=42,
)

#### Using CountVectorizer to tokenise the comments, remove English stop words, keep only terms that appear in at least 3 documents (min_df=3), and build n-grams of up to 3 words

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-n-grams features: uni- to tri-grams, English stop words dropped, and
# only terms that occur in at least 3 training documents are kept.
vect = CountVectorizer(ngram_range=(1, 3), stop_words='english', min_df=3)
vect.fit(X_train)  # the vocabulary is learned from the training split only
X_train, X_test = vect.transform(X_train), vect.transform(X_test)


#### Using GridSearchCV and pipeline to find the best hyperparameters for Random Forest and Extreme Gradient Boosting

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

# F1 of the toxic class. The target holds 0.0/1.0, so the positive label is 1;
# a string label such as 'pos' does not occur in y.
f1_scorer = make_scorer(f1_score, pos_label=1)

# Single-step pipeline: the 'model' step is a placeholder that each entry of
# the grid below replaces with the estimator it wants to tune.
estimators = [('model', XGBClassifier())]
pipe = Pipeline(estimators)

# Two sub-grids, one per candidate algorithm. Both use random_state=0 so the
# search itself is repeatable.
param_grid = [
    {
        'model': [XGBClassifier(random_state=0)],
        'model__n_estimators': [100, 500, 1000, 2000],
        'model__learning_rate': [0.01, 0.05, 0.1, 0.3, 0.5],
    },
    {
        'model': [RandomForestClassifier(random_state=0)],
        'model__n_estimators': [100, 500, 1000, 2000],
    },
]

# 5-fold cross-validated search that ranks candidates by F1 of the toxic class.
grid = GridSearchCV(pipe, param_grid, scoring=f1_scorer, cv=5)
fitted_search = grid.fit(X_train, y_train)

#### Finding the best algorithm and best parameters based on the highest F1 score

In [25]:
# Only the last expression of a cell is rendered automatically, so the held-out
# score has to be printed explicitly or its value is silently discarded.
print('F1 on held-out split:', fitted_search.score(X_test, y_test))
fitted_search.best_estimator_

Pipeline(memory=None,
     steps=[('model', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.3,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=2000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1))])

#### Evaluating the results of XGB

In [64]:
from sklearn.metrics import classification_report  # needed below; not imported anywhere else
from xgboost import XGBClassifier

# Refit XGBoost with the learning rate and number of trees reported by the
# grid search's best estimator.
clf = XGBClassifier(learning_rate=0.3, n_estimators=2000, random_state=0)
model = clf.fit(X_train, y_train)

# Accuracy on both splits (a large gap hints at over-fitting), followed by
# per-class precision / recall / F1 on the held-out split.
training_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
y_pred = model.predict(X_test)
print(training_accuracy, test_accuracy)
print(classification_report(y_test, y_pred))

0.9758285714285714 0.9378
              precision    recall  f1-score   support

         0.0       0.95      0.98      0.97     32272
         1.0       0.66      0.42      0.51      2728

   micro avg       0.94      0.94      0.94     35000
   macro avg       0.81      0.70      0.74     35000
weighted avg       0.93      0.94      0.93     35000



##### Based on GridSearchCV, XGB yielded the best F1 score. Of all the toxic predictions made, 66% were correct (precision). Of the comments that are actually toxic, 42% were identified as toxic (recall).