# EECS 738 Comment Toxicity Prediction Model

In [6]:
import sys
sys.path.insert(0, '../src/')
import tools

import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping

In [7]:
# establish file toolkit
t = tools.tools('../data/train.csv')
df = t.data

Processing training data...
Removing Punctuation...
Building Stop Word Dictionary...
About to get all good comments


In [8]:
import CommentPredictor as CP
mod = CP.CommentPredictor(t)

## Data Prep/Weight Mapping
Here we prep the text corpus for neural network training. getProcessedComments gives us all comments, lowercased with punctuation and Unicode characters stripped. We then replace each word in each comment with the calculated preliminary toxicity weight we have calculated for that word. We pad/truncate comments so that each comment is a 200-word sequence, and create a list of target toxicity labels corresponding to these comments. These two lists (input and output) will form the training set for our network.

In [9]:
procComms = (t.getProcessedComments())
labels = df['target']
wordDict = mod.word_weight_dict
print("Training set size: " + str(len(procComms)))
for index, comment in enumerate(procComms):
    split = text_to_word_sequence(comment)
    for j, word in enumerate(split):
        split[j] = wordDict[word]
    procComms[index] = split
uniqueWords = len(wordDict)
commentLen = 200
x = pad_sequences(procComms, maxlen=commentLen, value=0.0, dtype='float32', padding="post", truncating="post")
y = labels


print("Unique words: " + str(uniqueWords))


Removing Punctuation...
Training set size: 1804874
Unique words: 287025


In [7]:
print(x[0])
print(procComms[0])

[0.  0.  0.  0.5 0.  0.  0.5 0.  0.  0.5 0.  0.5 0.  0.5 0.  0.5 0.5 0.5
 0.5 0.5 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
 0.  0. ]
[0, 0, 0, 0.5, 0, 0, 0.5, 0, 0, 0.5, 0, 0.5, 0, 0.5, 0, 0.5, 0.5, 0.5, 0.5, 0.5]


In [19]:
tdf = pd.read_csv('../data/test.csv')
testProcComms = t.processCommentList(tdf['comment_text'])
for index, comment in enumerate(testProcComms):
    split = text_to_word_sequence(comment)
    for j, word in enumerate(split):
        if word in wordDict:
            split[j] = wordDict[word]
        else: split[j] = 0.0
    testProcComms[index] = split
xTest = pad_sequences(testProcComms, maxlen=commentLen)


Removing Punctuation...


## Neural Network Construction
We now initialize a very simple neural network to predict the toxicity value of each comment, represented as a sequence of 200 word-toxicity weights. These serve as the inputs to 200 input neurons. We utilize two standard, 50-neuron dense layers and two dropout layers, which each drop 1/10 of their nodes from the network at random. This encourages the network to generalize and avoid overfitting the training set.

In [10]:
#input layer for an arbitrary number of comments of length commentLen
inputLayer = Input(shape=(commentLen, ))    
#drop 1/10 nodes to improve generalization
n = Dropout(rate=.9)(inputLayer)
#standard nn
n = Dense(50, activation='relu')(inputLayer)
#drop again
n = Dropout(rate=.9)(n)
#standard nn, sigmoid for values between 0 and 1
n = Dense(50, activation='relu')(n)
n = Dense(1, activation='sigmoid')(n)
model = Model(inputs=inputLayer, outputs=n)
sgdOpt = optimizers.SGD(lr=0.0001, momentum=0.0, decay=0.0, nesterov=False)
model.compile(loss='binary_crossentropy', optimizer=sgdOpt, metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                10050     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 51        
Total params: 12,651
Trainable params: 12,651
Non-trainable params: 0
_________________

### Neural Network Training
Train the neural network on the word-weight lists for each comment, trying to predict toxicity score.
Use SGD mini-batches of size 32 and 2 epochs. Remove 1/10 of the training data for validation.

In [11]:
model.fit(x, y, batch_size=10, epochs=2, validation_split=.1)
preds = model.predict(x)

Instructions for updating:
Use tf.cast instead.
Train on 1624386 samples, validate on 180488 samples
Epoch 1/1


## Prediction
The neu

In [12]:
trainComments = df["comment_text"]
print(trainComments[0])
print(preds[0])
print(trainComments[4])
print(preds[4])

This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!
[0.12665713]
haha you guys are a bunch of losers.
[0.12853405]


In [13]:
print(y[0])
print(y[4])

0.0
0.8936170212765957
