# Model:

We will use an LSTM model for our multi-label predictions. We chose this deep learning model because it can achieve good accuracy on this kind of text classification task.

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import codecs
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings
import pickle
from bs4 import BeautifulSoup
warnings.filterwarnings("ignore")

# Load and Split data

In [26]:
# Load the cleaned comments together with their binary label columns.
df = pd.read_csv('preprocessed_text.csv')
df.head(n=5)


Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
0,0,0,0,0,0,0,explanation edits made username hardcore metal...
1,0,0,0,0,0,0,daww matches background colour seemingly stuck...
2,0,0,0,0,0,0,hey man really trying edit war guy constantly ...
3,0,0,0,0,0,0,make real suggestions improvement wondered sec...
4,0,0,0,0,0,0,sir hero chance remember page


In [68]:
# Preview a few comments that carry the `threat` label.
df.loc[df['threat'] == 1].head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
79,1,0,0,1,0,0,hi back last warning stop undoing edits die
176,1,0,1,1,1,1,think fagget get oife burn hell hate sorry can...
600,1,0,0,1,0,0,also sock puppet account suprise sincerely man...
802,1,0,1,1,1,0,fuck smith please notified die want dance grave
1017,1,1,1,1,1,1,wouldnt first time bitch fuck ill find live so...


In [28]:
# List the column names of the loaded frame.
print(df.columns)

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate',
       'preprocessed_text'],
      dtype='object')


In [29]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split,
# and with it the fitted tokenizer and the trained model, can be reproduced
# after a kernel restart.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate,preprocessed_text
23228,1,0,0,0,0,0,aids account also closed due pool aids sorry v...
11448,0,0,0,0,0,0,really even obvious fine hope know head heart ...
12986,1,0,0,0,1,0,report want call vandalism idiocy begin whethe...
37182,0,0,0,0,0,0,nomeansno move hello disagree greatly decision...
120292,0,0,0,0,0,0,december according webpage jay hi name jay kno...


# Tokenization of words for LSTM

In [30]:
# Vocabulary cap handed to the Tokenizer as `num_words`.
MAX_NB_WORDS = 50000
# Length that every comment's id sequence is brought to by `pad_sequences`.
MAX_SEQUENCE_LENGTH = 400

In [31]:
# Build the vocabulary from the training split only; the test split is encoded
# with that same vocabulary. Lower-casing and character filtering are switched
# off (presumably because the text column is already preprocessed).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='', lower=False)
tokenizer.fit_on_texts(train['preprocessed_text'])

# Encode every comment as a list of word ids.
sequences = tokenizer.texts_to_sequences(train['preprocessed_text'])
test_sequences = tokenizer.texts_to_sequences(test['preprocessed_text'])

# Bring all sequences to one common length.
train_data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of train data tensor:', train_data.shape)

# Largest word id present in the padded training data, plus one; used as the
# Embedding layer's input size.
nb_words = train_data.max() + 1

Shape of train data tensor: (127567, 400)


In [32]:
# Persist the fitted tokenizer so that tokenize() can reload it later.
# (`pickle` is already imported in the first cell of the notebook.)
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [33]:
# Show the vocabulary size that is passed to the Embedding layer below.
print(nb_words)

50000


# Our Model

In [34]:
# Network: embedding -> LSTM -> global max-pool -> dense head with six sigmoid outputs.
# All layer classes (including LSTM) come from the `keras.layers` import in the
# first cell; the extra import from the internal `keras.layers.recurrent`
# module was redundant and has been dropped.
inp = Input(shape=(MAX_SEQUENCE_LENGTH, ))
# length of the vector learned for each word id
embed_size = 128
x = Embedding(nb_words, embed_size)(inp)
# number of LSTM units, i.e. the size of the output at every time step
output_dimention = 60
x = LSTM(output_dimention, return_sequences=True, name='lstm_layer')(x)
# collapse the time axis by keeping the maximum of each feature
x = GlobalMaxPool1D()(x)
# drop 10% of the activations during training
x = Dropout(0.1)(x)
# fully connected layer with a ReLU activation
x = Dense(50, activation="relu")(x)
# another 10% dropout
x = Dropout(0.1)(x)
# one sigmoid unit per label: the six labels are not mutually exclusive,
# so each output is an independent score between 0 and 1
x = Dense(6, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)

model.summary()
# binary cross-entropy scores each of the six outputs as its own yes/no decision
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 400)               0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 400, 128)          6400000   
_________________________________________________________________
lstm_layer (LSTM)            (None, 400, 60)           45360     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 60)                0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 50)                3050      
_________________________________________________________________
dropout_8 (Dropout)          (None, 50)                0   

In [35]:
# The order of `labels` fixes the order of the six model outputs;
# get_prediction() maps its display names in exactly this order.
labels = ['identity_hate', 'insult', 'obscene', 'severe_toxic', 'threat', 'toxic']
y = train.loc[:, labels].values
batch_size = 32
epochs = 2
# Keras sets aside 10% of the training rows for validation.
model.fit(
    x=train_data,
    y=y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Train on 114810 samples, validate on 12757 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x1f8addb4198>

In [36]:
# Score the held-out comments; each row holds the six sigmoid outputs,
# in the same order as `labels`.
pred = model.predict(test_data)
# Display only the first ten rows.
pred[:10]

array([[1.0373957e-04, 3.3771992e-04, 1.6841292e-04, 6.0232010e-06,
        2.0135340e-05, 1.9673705e-03],
       [5.4430373e-05, 1.9407272e-04, 1.1147541e-04, 4.3026052e-06,
        1.2279527e-05, 1.2706220e-03],
       [6.6503460e-05, 6.5809488e-04, 2.2116303e-04, 1.6159886e-06,
        7.6981214e-06, 4.7862530e-03],
       [1.3959408e-04, 4.2742491e-04, 2.4601817e-04, 1.1346216e-05,
        3.8469028e-05, 2.6203096e-03],
       [1.5312433e-04, 4.5421720e-04, 2.6294589e-04, 1.2034592e-05,
        3.6830697e-05, 2.6741624e-03],
       [8.2526312e-05, 9.4771385e-04, 3.0827522e-04, 1.5139155e-06,
        7.6391834e-06, 6.9566667e-03],
       [5.5186510e-02, 1.1440787e-01, 3.7994921e-02, 1.8342137e-03,
        1.0418594e-02, 3.5571337e-01],
       [6.6113472e-04, 3.0002892e-03, 1.4289320e-03, 5.2680261e-05,
        2.8517842e-04, 1.9589305e-02],
       [1.4137788e-05, 9.2093644e-05, 3.7664293e-05, 3.4146836e-07,
        8.7306250e-07, 6.2993169e-04],
       [2.7304499e-05, 1.3545156e-04,

In [37]:
# Write the trained model to disk; model_predict() loads it back from this path.
model.save('LSTM_toxic_prediction_model.h5') 

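**Note:** We save the trained model to `LSTM_toxic_prediction_model.h5` so that it can be reloaded later (see `model_predict` below) without retraining.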

In [5]:
'''##########[!] Functions reused in Django###############'''
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The two irregular forms ("won't", "can't") are rewritten first, then the
    generic suffix rules are applied in the order listed. Matching is
    case-sensitive.
    """
    expansions = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in expansions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
def cleanPunc(sentence):
    """Remove punctuation and special characters from ``sentence``.

    ``?``, ``!``, both quote characters, ``#`` and ``|`` are deleted; ``.``,
    ``,``, parentheses and ``/`` are replaced by a space. The result is then
    trimmed at both ends and any remaining newline becomes a space.
    """
    # '|' inside a character class is a literal pipe, so pipes are removed here too.
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    spaced = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
    return spaced.strip().replace("\n", " ")
def clear_sentance(sentance):
    """Clean one raw comment before it is tokenized.

    Steps, in order: remove URLs, strip HTML markup, expand contractions,
    remove punctuation, drop every token that contains a digit, replace each
    run of non-letters with a single space, then lower-case the words and
    remove the NLTK English stop words.

    Args:
        sentance: raw comment text.

    Returns:
        The cleaned, lower-case, space-separated text (may be empty).
    """
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = cleanPunc(sentance)
    # Raw string: "\S" and "\d" are regex escapes, not Python string escapes.
    sentance = re.sub(r"\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    # https://towardsdatascience.com/journey-to-the-center-of-multi-label-classification-384c40229bff
    # Fetch the stop-word list once and keep it in a set; previously the list
    # was re-read and scanned linearly for every single token.
    english_stop_words = set(stopwords.words('english'))
    # NOTE(review): an extended stop-word set ('zero', 'one', ..., 'also',
    # 'however', 'yet', 'within') used to be built here but was never used by
    # the filter. The sample rows of preprocessed_text.csv shown earlier in
    # this notebook still contain words such as 'also', so the training text
    # evidently was not filtered with it either; it is therefore left out to
    # keep inference-time cleaning consistent with the training data.
    lowered = (token.lower() for token in sentance.split())
    kept = [word for word in lowered if word not in english_stop_words]
    return ' '.join(kept).strip()
def tokenize(sentance, tokenizer_path='tokenizer.pickle', max_sequence_length=400):
    """Convert one cleaned sentence into a padded one-row batch of word ids.

    Args:
        sentance: cleaned text, e.g. the output of clear_sentance().
        tokenizer_path: pickle file holding the fitted tokenizer. Defaults to
            the file written by this notebook.
        max_sequence_length: length the id sequence is padded/truncated to.
            Defaults to 400, the value used when the training data was padded.

    Returns:
        The result of ``pad_sequences`` for the single encoded sentence.
    """
    # SECURITY: unpickling executes code embedded in the file. Only load a
    # tokenizer file that this project produced itself.
    with open(tokenizer_path, 'rb') as handle:
        tokenizer = pickle.load(handle)
    test_sequences = tokenizer.texts_to_sequences([sentance])
    return pad_sequences(test_sequences, maxlen=max_sequence_length)
# Models that have already been loaded, keyed by file path, so that each model
# file is read from disk only once per process.
_LOADED_MODELS = {}


def model_predict(test_data, model_path='LSTM_toxic_prediction_model.h5'):
    """Run the saved model on ``test_data`` and return its raw output.

    The model is loaded on the first call for a given path and reused on later
    calls instead of being re-read from disk every time. If the file is
    overwritten within the same process, call ``_LOADED_MODELS.clear()`` to
    force a reload.

    Args:
        test_data: model input, e.g. the padded batch returned by tokenize().
        model_path: path of the saved Keras model. Defaults to the file written
            by this notebook.

    Returns:
        Whatever ``model.predict`` returns for ``test_data``.
    """
    model = _LOADED_MODELS.get(model_path)
    if model is None:
        # Imported lazily so that merely defining this function does not
        # require the model-loading machinery.
        from keras.models import load_model
        model = load_model(model_path)
        _LOADED_MODELS[model_path] = model
    # NOTE(review): if this runs inside a multi-threaded server, confirm that
    # sharing one loaded model across threads is safe for the backend in use.
    return model.predict(test_data)
def get_prediction(sentance):
    """Classify one raw comment and print the labels assigned to it.

    The text is cleaned, tokenized and scored by the saved model. Every label
    whose score rounds to 1 is printed, each followed by a space, on a single
    line. Nothing is returned.
    """
    scores = model_predict(tokenize(clear_sentance(sentance)))[0]
    # Display names, listed in the model's output order:
    # 'identity_hate', 'insult', 'obscene', 'severe_toxic', 'threat', 'toxic'
    display_names = ('Hate', 'Insult', 'Obscene', 'Severe Toxic', 'Threat', 'Toxic')
    rounded = [round(scores[position]) for position in range(len(display_names))]
    result = ''.join(
        name + ' '
        for name, flag in zip(display_names, rounded)
        if flag == 1.0
    )
    print(result)
    
    

In [6]:
# Smoke test with a hateful comment; the recorded run printed: Hate Insult Toxic
get_prediction("pair jew hating weiner nazi schmucks")

Hate Insult Toxic 


In [7]:
# Smoke test with raw, uncleaned text (mixed case, punctuation);
# the recorded run printed: Insult Obscene Toxic
get_prediction("Fuck OFF man , you peace of cunt. Mother fucker")

Insult Obscene Toxic 


In [8]:
# Smoke test with a neutral greeting; the recorded run printed no labels.
get_prediction("Hello")




In [9]:
# Smoke test with a single mixed-case word; the recorded run printed: Insult Obscene Toxic
get_prediction("BAstard")

Insult Obscene Toxic 
