In [1]:
import pandas as pd
import numpy as np

In [2]:
# Folder holding train.csv / test.csv (the /content/drive prefix suggests a
# Google Drive mount inside Colab).
path = '/content/drive/My Drive/Colab_data/Data_Mining/Toxic Comment Analysis/'
train_csv = path + "train.csv"
test_csv = path + "test.csv"

# Only the first 100000 training rows are kept for the rest of the notebook.
train = pd.read_csv(train_csv)[:100000]
test = pd.read_csv(test_csv)

In [3]:
# Preview the first five rows to check which columns were loaded.
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
import re

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Cache for the objects clean() needs on every call; filled lazily so that the
# stop-word list is read and the stemmer is constructed only once.
_CLEAN_CACHE = {}


def _clean_resources():
  """Return the (stop-word set, stemmer) pair, building it on first use only."""
  if not _CLEAN_CACHE:
    _CLEAN_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    _CLEAN_CACHE['stemmer'] = PorterStemmer()
  return _CLEAN_CACHE['stop_words'], _CLEAN_CACHE['stemmer']


def clean(comment):
  """Normalise one comment into a space-separated string of stemmed tokens.

  Steps: lower-case the text, replace every non-letter character with a
  space, split on whitespace, drop English stop words and tokens of 30 or
  more characters, Porter-stem what is left and join it back with spaces.

  Args:
    comment: the raw comment text (a ``str``).

  Returns:
    The cleaned comment as a single string; empty if no token survives.

  Raises:
    AttributeError: if ``comment`` is not a string (e.g. a NaN read by
      pandas), because it has no ``.lower()`` method.
  """
  stop_words, stemmer = _clean_resources()
  # split() with no argument already ignores leading/trailing whitespace,
  # so no separate strip() is needed.
  tokens = re.sub('[^a-zA-Z]', ' ', comment.lower()).split()
  kept = [
    stemmer.stem(word)
    for word in tokens
    # The length cap keeps very long tokens away from the stemmer.
    if word not in stop_words and len(word) < 30
  ]
  return ' '.join(kept)

In [7]:
def vectorise(comments, max_features=5000, dense=True):
  """Build a bag-of-words count matrix from an iterable of text documents.

  A new CountVectorizer is fitted on ``comments`` each time this is called.
  The fitted vectoriser is not returned, so its vocabulary cannot be reused
  to transform other data afterwards.

  Args:
    comments: iterable of strings, one per document.
    max_features: upper bound on the vocabulary size passed to
      CountVectorizer (default 5000, the value used previously).
    dense: when True (default, the previous behaviour) return a dense NumPy
      array; when False return the sparse matrix produced by
      ``fit_transform``, which avoids materialising every zero count.

  Returns:
    The document-term count matrix, with one row per document.
  """
  vectorizer = CountVectorizer(max_features=max_features)
  X = vectorizer.fit_transform(comments)
  return X.toarray() if dense else X

In [8]:
# Pull the comment_text column out of the frame as a plain Python list.
raw_comments = train['comment_text'].tolist()
len(raw_comments)

100000

In [9]:
# raw_comments[5606+110000]  # NOTE: this comment led to a RecursionError while stemming (the index is only valid when more than 100000 rows are loaded).

In [10]:
# Clean every raw comment. `comments` must stay the same length and order as
# `raw_comments`, because it is later paired row-by-row with the labels.
comments=[]
failed_rows=[]  # indices of comments that clean() could not process
for i,com in enumerate(raw_comments):
  if(i%10000==0): print(i)  # coarse progress indicator
  try:
    comments.append(clean(com))
  except Exception as err:
    # Catch Exception rather than using a bare except, so that interrupting
    # the kernel still stops the loop. Report the failing row, then append an
    # empty placeholder so later rows keep their positions.
    print(i,com,repr(err))
    failed_rows.append(i)
    comments.append('')

0
10000
20000
30000
40000
50000
60000
70000
80000
90000


In [20]:
# Turn the cleaned comments into a bag-of-words count matrix (one row per
# comment, one column per vocabulary term) and check its dimensions.
x=vectorise(comments)
x.shape

(100000, 5000)

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

In [13]:
# Label columns: everything from the third column onwards. This is a positional
# slice, so it relies on the column order of the loaded frame.
labels=train.iloc[:,2:]
labels.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [14]:
# Names of the label columns, in the same order as the columns of `labels`.
classes = labels.columns.tolist()
classes

['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [15]:
# Same preview as in the earlier cell; `labels` has not been modified in between.
labels.head(10)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,1,1,1,0,1,0
7,0,0,0,0,0,0
8,0,0,0,0,0,0
9,0,0,0,0,0,0


In [16]:
# Label matrix as a NumPy array: one row per comment, one column per class, in
# the same order as `labels`. Converting the frame in a single call avoids a
# Python-level loop over every row and keeps a 2-D shape for an empty frame.
y = labels.to_numpy()

In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(x,y,test_size=0.2,random_state=1)

In [18]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [19]:
# Fit one binary Multinomial Naive Bayes classifier per label column and report
# accuracy, the confusion matrix and precision / recall / F1 on the validation
# split. The same estimator object is refitted for each class; only its
# predictions and scores are kept.
model = MultinomialNB()

# Per-class scores, in the order of `classes`.
train_acc = []
val_acc = []

# Per-class prediction vectors, in the order of `classes`.
preds_train = []
preds_val = []

f1_val = []

for i, label_name in enumerate(classes):
    print('Class: ',label_name)
    # Column i of the label matrix is the binary target for this class.
    model.fit(X_train,y_train[:,i])

    preds_train_class = model.predict(X_train)
    train_acc_class = accuracy_score(y_train[:,i], preds_train_class)
    print('Train Accuracy:', train_acc_class)
    train_acc.append(train_acc_class)
    preds_train.append(preds_train_class)

    preds_val_class = model.predict(X_val)
    val_acc_class = accuracy_score(y_val[:,i], preds_val_class)
    print('Val Accuracy:', val_acc_class)
    val_acc.append(val_acc_class)
    preds_val.append(preds_val_class)

    # Passing labels=[0, 1] forces a 2x2 matrix even when one of the two
    # values never occurs in the validation labels or predictions; without it
    # the four-way unpacking below fails for such a class.
    # Assumes the targets are encoded as 0/1, as in the label preview.
    cm = confusion_matrix(y_val[:,i], preds_val_class, labels=[0, 1])
    print(cm)

    tn, fp, fn, tp = cm.ravel()
    # The small epsilon avoids division by zero when a denominator is empty.
    precision= tp/(tp+fp+1e-5)
    recall= tp/(tp+fn+1e-5)
    # Named f1_class so it does not shadow sklearn.metrics.f1_score if that
    # function is imported later.
    f1_class= (2*precision*recall)/(precision+recall+1e-5)
    print('precision {:.4f} recall {:.4f} F1 score {:.4f}:'.format(precision,recall,f1_class))
    f1_val.append(f1_class)

    print()

# Unweighted means over the classes.
print('mean train accuracy : ', np.mean(train_acc))
print('mean val accuracy :', np.mean(val_acc))
print('mean val F1 score :', np.mean(f1_val))

Class:  toxic
Train Accuracy: 0.95235
Val Accuracy: 0.94945
[[17648   402]
 [  609  1341]]
precision 0.7694 recall 0.6877 F1 score 0.7262:

Class:  severe_toxic
Train Accuracy: 0.984325
Val Accuracy: 0.9826
[[19508   287]
 [   61   144]]
precision 0.3341 recall 0.7024 F1 score 0.4528:

Class:  obscene
Train Accuracy: 0.9713875
Val Accuracy: 0.9685
[[18559   380]
 [  250   811]]
precision 0.6809 recall 0.7644 F1 score 0.7202:

Class:  threat
Train Accuracy: 0.99025
Val Accuracy: 0.9891
[[19754   185]
 [   33    28]]
precision 0.1315 recall 0.4590 F1 score 0.2044:

Class:  insult
Train Accuracy: 0.964775
Val Accuracy: 0.96215
[[18561   465]
 [  292   682]]
precision 0.5946 recall 0.7002 F1 score 0.6431:

Class:  identity_hate
Train Accuracy: 0.98105
Val Accuracy: 0.97945
[[19490   352]
 [   59    99]]
precision 0.2195 recall 0.6266 F1 score 0.3251:

mean train accuracy :  0.9740229166666667
mean val accuracy : 0.9718749999999999
mean val F1 score : 0.5119811032175334
