## **Project**:  Toxic Content Classifier

* Jignesh Madhani
* Pankaj Patil
* Manish Lokhande


### **OBJECTIVES**: 
* Train a model on the Toxic Comment Classification Challenge dataset.
* Save the trained models and vectors, and deploy them in an application to classify live data.


In [None]:
import numpy as np 
import pandas as pd 

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user, then build a PyDrive client from the
# application-default credentials. Later cells use `drive` to fetch files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# https://drive.google.com/file/d/1nBRMYj5sYADjRoY5ZXH71v-02tjWThP3/view?usp=sharing
# Fetch the Drive file with the id above, save it locally as 'train.csv',
# and load it into the `dataset` DataFrame.
downloaded = drive.CreateFile({'id': '1nBRMYj5sYADjRoY5ZXH71v-02tjWThP3'})
downloaded.GetContentFile('train.csv')
dataset = pd.read_csv('train.csv')

# Preview the first rows as the cell output.
dataset.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# Count missing values per column (isna is the pandas alias of isnull).
dataset.isna().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [None]:
# Label columns, in the order used for the target matrix.
y_list = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw comment strings (model input) and the matching multi-label targets.
sentiment = dataset['comment_text'].values
y = dataset[y_list].values

sentiment

array(["Explanation\r\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
       "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
       "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
       ...,
       'Spitzer \r\n\r\nUmm, theres no actual article for prostitution ring.  - Crunch Captain.',
       'And it looks like it was actually you who put on the speedy to have the first version deleted now that I look at it.',
       '"\r\nAnd ... I really don\'t think you understand.  I came here and my idea was bad right away.  What kind of 

In [None]:
from keras.preprocessing.text import Tokenizer
# Fit a Keras Tokenizer (num_words=20000) on the training comments. The same
# fitted `tokenizer` is reused below to encode both train and test text.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(list(sentiment))

In [None]:
from keras.preprocessing import text, sequence

# Turn the training comments into sequences with the fitted tokenizer, then
# run them through pad_sequences with maxlen=100 so every row matches the
# model's Input(shape=(100, )).
seq = tokenizer.texts_to_sequences(sentiment)
pad = sequence.pad_sequences(seq, maxlen=100)

In [None]:
# https://drive.google.com/file/d/10aa9MhkLjtBRmAkNuXloxQvvzzd9lnzQ/view?usp=sharing
downloaded = drive.CreateFile({'id': '10aa9MhkLjtBRmAkNuXloxQvvzzd9lnzQ'})
downloaded.GetContentFile('test.csv')

# Keep only the comment text of the test file, then encode it the same way as
# the training comments (same fitted tokenizer, same maxlen).
test = pd.read_csv('test.csv')['comment_text'].values
test_seq = tokenizer.texts_to_sequences(test)
test_pad = sequence.pad_sequences(test_seq, maxlen=100)


In [None]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers


def model_add(maxlen=100, max_features=20000, embed_size=128, lstm_units=50,
              dense_units=50, dropout_rate=0.1, num_labels=6):
    """Build and compile the bidirectional-LSTM multi-label classifier.

    Every argument defaults to the value that used to be hard-coded, so a
    bare ``model_add()`` call builds the same architecture as before.

    Args:
        maxlen: Length of each padded input sequence. Must match the
            ``maxlen`` given to ``pad_sequences``.
        max_features: Size of the embedding vocabulary. Must match the
            tokenizer's ``num_words``.
        embed_size: Dimension of each embedding vector.
        lstm_units: Units per direction in the bidirectional LSTM.
        dense_units: Units in the hidden fully connected layer.
        dropout_rate: Dropout rate applied after the LSTM and after the
            hidden dense layer.
        num_labels: Number of independent sigmoid outputs (one per label).

    Returns:
        A compiled Keras ``Model`` mapping ``(batch, maxlen)`` token ids to
        ``(batch, num_labels)`` per-label probabilities.
    """
    inputs = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inputs)
    x = Bidirectional(LSTM(lstm_units))(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(dense_units, activation="relu")(x)
    x = Dropout(dropout_rate)(x)
    # One sigmoid per label: the labels are not mutually exclusive.
    outputs = Dense(num_labels, activation="sigmoid")(x)
    model = Model(inputs=inputs, outputs=outputs)
    # The generic 'accuracy' string is resolved from the output shape; for a
    # multi-unit output it becomes categorical (argmax) accuracy, which is
    # meaningless for multi-label targets. Request per-label binary accuracy
    # explicitly instead.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_accuracy'])
    return model
model = model_add()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 100)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 100, 128)          2560000   
_________________________________________________________________
bidirectional (Bidirectional (None, 100)               71600     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# NOTE(review): patience=20 exceeds epochs=2, so as configured this callback
# can never stop the run early; it only takes effect if epochs is raised.
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)

# `callbacks` is documented as a list of Callback instances, so pass a list.
# The returned History is kept so the per-epoch metrics can be inspected.
history = model.fit(pad, y, batch_size=32, epochs=2, validation_split=0.1,
                    callbacks=[early])


Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7faaa93ff5c0>

In [None]:
# https://drive.google.com/file/d/1zeSXYbLJydwbOdF_M0MT6ZZCQ-6OVP5y/view?usp=sharing
# NOTE: despite its name, `y_test` holds the model's predictions for the test
# comments, not ground-truth labels.
y_test = model.predict([test_pad], batch_size=1024, verbose=1)
# Fetch the submission template (Drive id from the link above), overwrite its
# label columns with the predictions, and write the result to 'submission.csv'.
downloaded = drive.CreateFile({'id': '1zeSXYbLJydwbOdF_M0MT6ZZCQ-6OVP5y'})
downloaded.GetContentFile('sample_submission.csv')
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission[y_list] = y_test
sample_submission.to_csv('submission.csv', index=False)



In [None]:
# `y_test` holds the model's own predictions, so evaluating against it only
# compares the model with itself and says nothing about quality. Evaluate on
# the held-out validation slice instead: with validation_split=0.1, fit()
# reserves the last 10% of `pad`/`y` (taken before shuffling) and never trains
# on it, so those rows have real labels the model has not seen.
val_fraction = 0.1  # must match validation_split passed to model.fit
val_start = int(len(pad) * (1.0 - val_fraction))
model.evaluate(pad[val_start:], y[val_start:], batch_size=32, verbose=2)


4787/4787 - 102s - loss: 0.0949 - accuracy: 1.0000


[0.09492810070514679, 1.0]