In [88]:
# uncomment to install keras
# !pip install --user keras

In [89]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU,Conv1D,MaxPooling1D
from keras.layers import Bidirectional, GlobalMaxPool1D,Bidirectional, GlobalMaxPooling1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
import gc
from sklearn.model_selection import train_test_split
from keras.models import load_model

In [90]:
# Don't display too many rows/cols of DataFrames
pd.options.display.max_rows = 15
pd.options.display.max_columns = 100

# Round decimals when displaying DataFrames.
# The fully qualified key is used on purpose: the bare 'precision' pattern is
# ambiguous (and therefore rejected) in newer pandas releases.
pd.set_option('display.precision', 2)

In [91]:
# Read train.csv from the working directory and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='./train.csv')
train.head(n=5)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:47.601894+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.89,haha you guys are a bunch of losers.,0.02,0.0,0.02,0.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2015-09-29 10:50:48.488476+00,2,,2006,rejected,0,0,0,1,0,0.0,4,47


In [150]:
# Split into training and test set.
# Only the first 50,000 rows are used to keep the run short; point `subset`
# at `train` itself to train on everything.
label_cols = ["severe_toxicity", "obscene", "identity_attack", "insult", "threat"]

# Slice once instead of deep-copying the full frame twice. train_test_split
# hands back new row subsets, so `train` itself is never modified.
subset = train.iloc[:50000]
X_train, X_test, y_train, y_test = train_test_split(
    subset, subset[label_cols], test_size=0.10, random_state=42
)

In [151]:
# X_train

In [152]:
# Keep the comment text of each split in its own variable for further processing.
list_sentences_train = X_train.loc[:, "comment_text"]
list_sentences_test = X_test.loc[:, "comment_text"]

In [153]:
# Character-level tokenizer (char_level=True): every character becomes one token.
# max_features is passed as the tokenizer's num_words limit.
max_features = 20000
tokenizer = Tokenizer(char_level=True, num_words=max_features)

In [154]:
# Build the token index from the training comments only.
tokenizer.fit_on_texts(list_sentences_train.tolist())


In [155]:
# Tokenize straight from the split DataFrames rather than from the
# `list_sentences_*` variables: the second assignment below overwrites
# `list_sentences_test` with token ids, so reading from that name would make
# this cell misbehave when it is re-run. The target name is kept because the
# padding cell further down reads `list_sentences_test`.
list_tokenized_train = tokenizer.texts_to_sequences(X_train["comment_text"])
list_sentences_test = tokenizer.texts_to_sequences(X_test["comment_text"])


In [156]:
# list_tokenized_train[:1]

In [157]:
# Find the median length (in tokens) of the tokenized training comments.
# numpy is already imported at the top of the notebook, so no extra
# mid-notebook import is needed; float() keeps the displayed value a plain float.
length_sentences = [len(comment) for comment in list_tokenized_train]
float(np.median(length_sentences))

216.0

In [158]:
# Bring every sequence to exactly 200 tokens (characters, given the char-level
# tokenizer). Shorter sequences get zeros in front; 'pre' is spelled out here
# but is also what pad_sequences does by default for padding and truncating.
maxlen = 200
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen, padding='pre', truncating='pre')
X_te = pad_sequences(list_sentences_test, maxlen=maxlen, padding='pre', truncating='pre')

In [159]:
# Peek at the first padded training sequence (a 1-row slice of X_t).
X_t[:1]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 17,  9,
         4, 22,  5, 22, 11, 16,  1,  8, 13,  6,  3,  8,  1, 15, 14, 39,
        13,  2,  5,  9, 16,  1, 19,  6,  7,  2, 25,  1, 10,  2,  1, 14,
         5,  7,  1, 17,  5, 14, 24,  1,  3, 10,  2,  1, 17,  5,  9,  3,
        16,  1, 14, 10,  5,  6,  9,  1,  5, 18,  5,  6,  7, 21, 21, 21,
        19,  4,  9,  1, 20, 10,  5,  3,  1,  3, 10,  2,  1,  3, 10,  6,
         9, 12,  1,  3,  6, 15,  2,  1,  7,  4, 20, 32, 21, 21,  4,  9,
         1, 19,  4, 13,  9,  3, 10, 32]], dtype=int32)

In [160]:
def CNN_model(seq_len=None, vocab_size=None, embed_size=240, n_outputs=5):
    """Build and compile the Conv1D + bidirectional-GRU multi-label classifier.

    Every argument is optional, so the original zero-argument call
    ``CNN_model()`` builds exactly the same network as before.

    Args:
        seq_len: Length of each padded input sequence. Defaults to the
            notebook-level ``maxlen``.
        vocab_size: Number of rows in the embedding table. Defaults to
            ``len(tokenizer.word_index) + 1`` from the notebook-level
            ``tokenizer`` (the extra row covers index 0, which the padded
            sequences use as filler).
        embed_size: Dimensionality of the learned token embeddings.
        n_outputs: Number of sigmoid output units (one per label column).

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy and Adam.
    """
    # Fall back to the notebook globals only when the caller gave nothing.
    if seq_len is None:
        seq_len = maxlen
    if vocab_size is None:
        vocab_size = len(tokenizer.word_index) + 1

    inp = Input(shape=(seq_len, ))

    x = Embedding(vocab_size, embed_size)(inp)

    # Local pattern detector over windows of 4 tokens, then 4x down-sampling
    # along the sequence axis before the recurrent layer.
    x = Conv1D(filters=100, kernel_size=4, padding='same', activation='relu')(x)
    x = MaxPooling1D(pool_size=4)(x)

    # The layer name 'lstm_layer' is kept unchanged even though the cell is a
    # GRU, so layer names stay the same as in earlier runs of this notebook.
    x = Bidirectional(GRU(60, return_sequences=True, name='lstm_layer',
                          dropout=0.2, recurrent_dropout=0.2))(x)

    # Collapse the time axis by keeping the per-feature maximum.
    x = GlobalMaxPooling1D()(x)

    x = Dense(50, activation="relu")(x)
    x = Dropout(0.2)(x)
    # Independent sigmoid per label: labels are not mutually exclusive.
    x = Dense(n_outputs, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=x)
    # NOTE(review): the label columns appear to hold fractional scores
    # (e.g. 0.87 in train.head()), so 'accuracy' may be hard to interpret —
    # confirm whether a different metric is wanted.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [161]:
# Instantiate the network with its default settings and print the
# layer-by-layer summary (output shapes and parameter counts).
model = CNN_model()
model.summary()


Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 200)               0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 200, 240)          64800     
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 200, 100)          96100     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 50, 100)           0         
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 50, 120)           57960     
_________________________________________________________________
global_max_pooling1d_9 (Glob (None, 120)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 50)                6050

In [162]:
# from tensorflow.python.client import device_lib
# from keras import backend as K
# K.tensorflow_backend._get_available_gpus()

In [165]:
# Training settings: mini-batch size and number of epochs passed to model.fit.
batch_size, epochs = 32, 2

In [166]:
# Train the model; the held-out 10% split is evaluated after every epoch.
hist = model.fit(
    X_t,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_te, y_test),
)

Train on 45000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


In [167]:
# Read test.csv from the working directory.
test_data = pd.read_csv(filepath_or_buffer='./test.csv')

In [168]:
# Comment text of the test file, as a Series.
list_sentences_test_data = test_data.loc[:, "comment_text"]


In [169]:
# Encode the test comments with the tokenizer that was fitted on the training
# comments, so both use the same token index.
list_tokenized_test_data = tokenizer.texts_to_sequences(list_sentences_test_data)

In [170]:
# Pad/truncate to the same length the model was built for; 'pre' is written
# out explicitly but matches the pad_sequences defaults.
X_test_data = pad_sequences(
    list_tokenized_test_data,
    maxlen=maxlen,
    padding='pre',
    truncating='pre',
)

In [171]:
# Score every test comment; one row of sigmoid outputs per comment.
y_test_data = model.predict(
    X_test_data,
    batch_size=batch_size,
    verbose=1,
)



In [172]:
# Wrap the raw predictions in a DataFrame, one named column per model output
# (same order as the label columns used for training).
prediction_columns = ["severe_toxicity", "obscene", "identity_attack", "insult", "threat"]
df_results = pd.DataFrame(data=y_test_data, columns=prediction_columns)

In [173]:
# df_results

In [174]:
# test_data

In [175]:
# Put the predictions next to the test rows. Both frames get a fresh 0..n-1
# index first so the column-wise concat lines rows up by position.
df = pd.concat(
    [frame.reset_index(drop=True) for frame in (test_data, df_results)],
    axis=1,
)


In [176]:
# df

In [180]:
# Show complete comment text instead of truncating it. `None` is the supported
# "no limit" value (pandas >= 1.0); the legacy -1 is deprecated and rejected by
# newer pandas releases.
pd.set_option('display.max_colwidth', None)

# The ten test comments with the highest predicted 'obscene' score.
top10 = df.sort_values(by='obscene', ascending=False).head(10)
top10

Unnamed: 0,id,comment_text,severe_toxicity,obscene,identity_attack,insult,threat
70110,7167430,"""Rude"" to a union goon or a lefty is saying anything you don't like. You work for NEA, the absolutely worst union in collective bargaining terms, but the best at getting your mind-numbed lefty Ed-School grads to the polls and buying school boards. Damn I'd go back just to sit across the table from an utter a** like you; I'd eat your lunch and maybe have some fun with your wife.",0.07,0.32,0.08,0.55,0.04
64520,7161840,"Well, naturally Flowers is against immigration being blocked - that's her bread & butter. Plus, it's woefully obvious that Trump and his ""advisers"" don't know their as*** from holes in the ground and anyone who doesn't see this EO is wrong is an idiot. Otherwise, she's just as nasty as ever.",0.05,0.32,0.05,0.71,0.01
64837,7162157,it's a bit rich that Bostonians - the most racist city in NA - are telling people to get the f*ck out of the city - hypocrisy be thy name Massholes,0.05,0.31,0.05,0.65,0.02
90469,7187789,What is there to debate\nTrudeau is an idiot and a dick,0.05,0.29,0.07,0.69,0.03
32576,7129896,AM is an idiotic jock who doesn't know about the issues.,0.04,0.28,0.05,0.73,0.02
44755,7142075,Stupid is as stupid does.,0.04,0.28,0.05,0.74,0.02
4184,7101504,Your an idiot,0.06,0.28,0.07,0.67,0.03
26959,7124279,Your an idiot,0.06,0.28,0.07,0.67,0.03
7409,7104729,Stupid👇,0.05,0.28,0.05,0.77,0.02
97305,7194625,Well the Dumper Post has a Indian idiot writing with this guy!,0.05,0.28,0.06,0.68,0.02


In [141]:
# # serialize model to JSON
# model_json = model.to_json()
# with open("model.json", "w") as json_file:
#     json_file.write(model_json)
# # serialize weights to HDF5
# model.save_weights("model.h5")
# print("Saved model to disk")

In [None]:
# # load json and create model
# # (model_from_json is not imported at the top of the notebook, so import it here)
# from keras.models import model_from_json
# with open('model.json', 'r') as json_file:
#     loaded_model_json = json_file.read()
# loaded_model = model_from_json(loaded_model_json)
# # load weights into new model
# loaded_model.load_weights("model.h5")
# print("Loaded model from disk")