<a href="https://colab.research.google.com/github/neilgautam/toxic_comments_classification/blob/master/toxic_comments_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize,word_tokenize
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding,Dropout

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at the relative path "gdrive"; the dataset is read from there below.
drive.mount("gdrive")

Mounted at gdrive


In [None]:
# os.listdir("../input/jigsaw-toxic-comment-classification-challenge")

In [None]:
# List the files available in the dataset folder on the mounted Drive.
os.listdir("gdrive/My Drive/toxic_comments_data")

['train.csv', 'test.csv', 'sample_submission.csv', 'test_labels.csv']

In [None]:
# !unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
# !unzip ../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
# !unzip ../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
# !unzip ../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip

In [None]:
# Read the four dataset CSVs from the mounted Drive folder, in the same order
# as the names on the left-hand side.
train_data, test, sample_submission, test_labels = map(
    pd.read_csv,
    (
        "gdrive/My Drive/toxic_comments_data/train.csv",
        "gdrive/My Drive/toxic_comments_data/test.csv",
        "gdrive/My Drive/toxic_comments_data/sample_submission.csv",
        "gdrive/My Drive/toxic_comments_data/test_labels.csv",
    ),
)

In [None]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# The "id" column carries no signal for the classifier, so discard it.
# errors="ignore" keeps this cell idempotent: re-running it after "id" has
# already been removed is a no-op instead of a KeyError.
train_data = train_data.drop(columns=["id"], errors="ignore")

In [None]:
# Preview the first ten rows after removing the "id" column.
train_data.head(n = 10)

Unnamed: 0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,alignment on this subject and which are contra...,0,0,0,0,0,0


In [None]:
# Column 0 becomes the list of inputs; every remaining column goes into the
# target array (the preview above shows comment_text first, then six labels).
x_data = train_data.iloc[:, 0].reset_index(drop=True).tolist()
y_data = np.array(train_data.iloc[:, 1:].reset_index(drop=True))

In [None]:
def clean_data(corpus):
    """Normalise raw comments into space-joined, stemmed tokens.

    Each comment is lower-cased, split into runs of ASCII letters and
    apostrophes, stripped of English stopwords, and stemmed with the
    Lancaster stemmer.

    Args:
        corpus: Iterable of comment strings.

    Returns:
        A list with one cleaned string per input comment, in input order.
        A comment that contains only stopwords/non-letters yields "".
    """
    tokenizer = RegexpTokenizer("[a-zA-Z']+")
    try:
        # Only reach for the downloader when the corpus is genuinely missing,
        # instead of invoking it on every call.
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords")
    # A set gives O(1) membership tests; the list form was scanned per token.
    stopword_set = set(stopwords.words("english"))
    l_stemmer = LancasterStemmer()
    cleaned_corpus = []
    for sent in corpus:
        tokens = tokenizer.tokenize(sent.lower())
        stems = [l_stemmer.stem(tok) for tok in tokens if tok not in stopword_set]
        cleaned_corpus.append(' '.join(stems))
    return cleaned_corpus

In [None]:
# Clean every training comment (lower-casing, stopword removal, stemming).
filtered_x_data = clean_data(x_data)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def count_builder(corpus):
    """Tally how often each token occurs across a corpus.

    Tokens are runs of ASCII letters and apostrophes. The returned dict is
    pre-seeded with two whitespace-only keys carrying a count of 9999999;
    the token pattern cannot produce those keys, so their counts are never
    incremented here.

    Args:
        corpus: Iterable of strings.

    Returns:
        Dict mapping token -> occurrence count (plus the two seeded keys).
    """
    counts = {" ": 9999999, "  ": 9999999}
    splitter = RegexpTokenizer("[a-zA-Z']+")
    for sentence in corpus:
        for token in splitter.tokenize(sentence):
            counts[token] = counts.get(token, 0) + 1
    return counts

In [None]:
# Token -> count table over the cleaned training comments.
vocabulary  = count_builder(filtered_x_data)

In [None]:
# Re-order the count table from most to least frequent (dicts keep insertion order).
vocab_with_count = dict(
    sorted(vocabulary.items(), key=lambda pair: pair[1], reverse=True)
)

In [None]:
# Keep the 30000 highest-count entries; an entry's rank is its integer id.
# 30000 is also the vocabulary size passed to the Embedding layer.
new_w2id = {word: rank for rank, word in enumerate(list(vocab_with_count)[:30000])}
new_id2w = {rank: word for word, rank in new_w2id.items()}

In [None]:
# w2id = vocabulary
# id2w = {value:key for key,value in w2id.items()}

In [None]:
# cv = CountVectorizer()
# vectorized_corpus = cv.fit_transform(filtered_x_data)
# word_to_id = cv.vocabulary_
# id_to_word = {value:key for key,value in word_to_id.items()}
# print(vectorized_corpus.shape)

In [None]:
def corpus_to_vectors(corpus, id_to_word=None, word_to_id=None):
    """Convert cleaned comments into lists of vocabulary ids.

    Comments are split with the same letters-and-apostrophes pattern that
    count_builder uses, so tokens line up with the vocabulary keys. Words
    that are not in ``word_to_id`` are encoded as 0.

    Args:
        corpus: Iterable of cleaned comment strings.
        id_to_word: Unused; accepted for backward compatibility.
        word_to_id: Mapping token -> integer id. When omitted, the notebook
            global ``new_w2id`` is looked up at call time, so re-running the
            vocabulary cell is picked up without redefining this function.

    Returns:
        A list with one list of integer ids per input comment.
    """
    if word_to_id is None:
        word_to_id = new_w2id
    tokenizer = RegexpTokenizer("[a-zA-Z']+")
    return [
        # .get() replaces the bare ``except`` that would also hide real errors.
        [word_to_id.get(word, 0) for word in tokenizer.tokenize(sent)]
        for sent in corpus
    ]

In [None]:
# Fetch the Punkt tokenizer models (needed by nltk's word_tokenize), then
# encode every cleaned comment as a list of vocabulary ids.
nltk.download("punkt")
data_x = corpus_to_vectors(filtered_x_data)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Bring every id sequence to a fixed length of 500, padding at the end ("post").
data_X  = sequence.pad_sequences(data_x,maxlen = 500,padding = "post")

In [None]:
# Sanity check: expect (number of comments, 500).
data_X.shape

(159571, 500)

In [None]:
# Confirm which distinct values occur in the label array.
np.unique(y_data)

array([0, 1])

In [None]:
# Two stacked LSTMs over learned 96-d embeddings of the 30000-word vocabulary.
model = Sequential()
model.add(Embedding(30000, 96))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(256, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.2))
# The six labels are not mutually exclusive (one comment can carry several),
# so each output is an independent probability: sigmoid, not softmax, which
# would force the six outputs to sum to 1.
model.add(Dense(6, activation="sigmoid"))
# Per-label binary cross-entropy is the matching loss for multi-hot targets;
# MSE is kept only as a reported metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy", "mse"])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 96)          2880000   
_________________________________________________________________
lstm_6 (LSTM)                (None, None, 256)         361472    
_________________________________________________________________
dropout_9 (Dropout)          (None, None, 256)         0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_11 (Dropout)         (None, 128)              

In [None]:
# Keep only the best-so-far weights (lowest validation loss), checked every
# epoch. The deprecated `period=1` argument is dropped: it equals the default.
modelcheckpoint = ModelCheckpoint(
    "gdrive/My Drive/toxic_comments_data/model_checkpoint.h5",
    monitor="val_loss",
    save_best_only=True,
    mode="auto",
)
# Stop after 3 epochs without improvement. "val_loss" is always logged when a
# validation split is used, whereas the accuracy key is spelled differently
# across Keras versions ("val_acc" vs "val_accuracy"); a missing key would
# silently disable early stopping.
earlystopping = EarlyStopping(monitor="val_loss", patience=3)



In [None]:
# Train for up to 10 epochs, holding out 20% of the data for validation;
# the callbacks handle checkpointing and early stopping.
model.fit(
    data_X,
    y_data,
    batch_size=512,
    epochs=10,
    validation_split=0.2,
    callbacks=[modelcheckpoint, earlystopping],
)

In [None]:
# Persist the trained model to the Drive folder in HDF5 format.
model.save("gdrive/My Drive/toxic_comments_data/saved_model.h5")

In [None]:
test_corpus = ["you are a fucking bitch , you bloody whore , you dick sucking fucker , you black nigga ."]

In [None]:
# Run the hand-written sample through the same cleaning used for training.
test_data = clean_data(test_corpus)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Display the cleaned sample.
test_data

['fuck bitch bloody whor dick suck fuck black nigg']

In [None]:
# Encode the cleaned sample as vocabulary ids.
test_x = corpus_to_vectors(test_data)

In [None]:
# Pad to length 500 at the end, matching the settings used for the training matrix.
test_x = sequence.pad_sequences(test_x,maxlen = 500,padding = "post")

In [None]:
# Evaluate on the padded id matrix (test_x); test_data holds the cleaned
# *strings*, which the Embedding layer cannot consume.
model.evaluate(test_x, np.array([[1,1,1,0,1,0]]))



[0.051063451915979385, 0.9938586950302124, 0.051063451915979385]