# EECS 738 Comment Toxicity Prediction Model

In [1]:
import sys
sys.path.insert(0, '../src/')
import tools

import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [4]:
# establish file toolkit
# Construct the project toolkit on the training CSV; per this cell's output the
# constructor prints progress messages while it processes the comments.
t = tools.tools('../data/train.csv')
# Shorthand for the DataFrame the toolkit exposes (its 'target' column is read in later cells).
df = t.data

Processing training data...
Removing Punctuation...
Building Stop Word Dictionary...
About to get all good comments


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  comments_scores['comment_text'] = self.processCommentList(comments_scores['comment_text'])


In [5]:
# Inspect the per-comment values in the 'target' column (used later as the training labels).
# NOTE(review): printing the whole Series produces a long truncated dump;
# df['target'].describe() or .head() may be more informative.
print(df['target'])

0          0.000000
1          0.000000
2          0.000000
3          0.000000
4          0.893617
5          0.666667
6          0.457627
7          0.000000
8          0.000000
9          0.000000
10         0.000000
11         0.440000
12         0.000000
13         0.600000
14         0.500000
15         0.000000
16         0.000000
17         0.000000
18         0.000000
19         0.500000
20         0.000000
21         0.000000
22         0.000000
23         0.000000
24         0.000000
25         0.000000
26         0.000000
27         0.000000
28         0.000000
29         0.000000
             ...   
1804844    0.000000
1804845    0.000000
1804846    0.000000
1804847    0.000000
1804848    0.000000
1804849    0.400000
1804850    0.000000
1804851    0.000000
1804852    0.300000
1804853    0.000000
1804854    0.000000
1804855    0.166667
1804856    0.500000
1804857    0.700000
1804858    0.200000
1804859    0.000000
1804860    0.000000
1804861    0.000000
1804862    0.000000


In [6]:
# NOTE(review): project-local import placed mid-notebook; consider moving it to the import cell at the top.
import CommentPredictor as CP
# Build the predictor from the toolkit; its word_weight_dict attribute is read in the data-prep cell.
mod = CP.CommentPredictor(t)

## Data Prep/Weight Mapping
Here we prepare the text corpus for neural network training. `getProcessedComments` returns every comment, lowercased and with punctuation and Unicode characters stripped. We then replace each word in each comment with the preliminary toxicity weight previously calculated for that word. Comments are padded or truncated so that each becomes a 200-word sequence, and we build a list of target toxicity labels corresponding to these comments. These two lists (input and output) form the training set for our network.

In [12]:
# Fetch the cleaned comments, their toxicity targets, and the per-word weight table.
procComms = t.getProcessedComments()
labels = df['target']
wordDict = mod.word_weight_dict
print("Training set size: " + str(len(procComms)))

# Swap every comment (in place) for the list of weights of its tokens.
for index, comment in enumerate(procComms):
    procComms[index] = [wordDict[word] for word in text_to_word_sequence(comment)]

uniqueWords = len(wordDict)
commentLen = 200

# Fixed-width float matrix: zero-pad or cut each comment at the tail so that
# every row holds exactly commentLen weights.
x = pad_sequences(
    procComms,
    maxlen=commentLen,
    value=0.0,
    dtype='float32',
    padding="post",
    truncating="post",
)
y = labels


print("Unique words: " + str(uniqueWords))


Removing Punctuation...
Training set size: 1804874
Unique words: 287025


In [13]:
# Sanity check: show the first padded input row next to the first weight list it was built from.
print(x[0])
print(procComms[0])

[0.   0.   0.   0.66 0.   0.   0.66 0.   0.   0.66 0.   0.66 0.   0.66
 0.   0.66 0.66 0.66 0.66 0.66 0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
 0.   0.   0.   0.  ]
[0.0, 0.0,

In [19]:
# Load the test comments and clean them with the toolkit's processCommentList.
tdf = pd.read_csv('../data/test.csv')
testProcComms = t.processCommentList(tdf['comment_text'])
# Replace each comment (in place) with its per-word weights; words missing from
# the weight table fall back to 0.0.
for index, comment in enumerate(testProcComms):
    testProcComms[index] = [wordDict.get(word, 0.0) for word in text_to_word_sequence(comment)]
# Pad/truncate exactly like the training matrix. The pad_sequences defaults
# (dtype='int32', padding='pre', truncating='pre') would cast the fractional
# weights to integers and align the words differently from the training inputs.
xTest = pad_sequences(
    testProcComms,
    maxlen=commentLen,
    value=0.0,
    dtype='float32',
    padding="post",
    truncating="post",
)


Removing Punctuation...


## Neural Network Construction
We now initialize a very simple neural network to predict the toxicity value of each comment, represented as a sequence of 200 word-toxicity weights. These serve as the inputs to 200 input neurons. We utilize two standard, 50-neuron dense layers and two dropout layers, which each drop 1/10 of their nodes from the network at random. This encourages the network to generalize and avoid overfitting the training set.

In [25]:
#input layer: each sample is a vector of commentLen word weights
inputLayer = Input(shape=(commentLen, ))
#drop 1/10 of the inputs during training to improve generalization
#(Dropout's rate is the fraction of units dropped, not the fraction kept)
n = Dropout(rate=.1)(inputLayer)
#first 50-unit hidden layer
n = Dense(50, activation='relu')(n)
#drop 1/10 again
n = Dropout(rate=.1)(n)
#second 50-unit hidden layer
n = Dense(50, activation='relu')(n)
#single sigmoid output so predictions fall between 0 and 1
n = Dense(1, activation='sigmoid')(n)
model = Model(inputs=inputLayer, outputs=n)
adamOpt = optimizers.Adam(lr=.0001)
model.compile(loss='binary_crossentropy', optimizer=adamOpt, metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 200)               0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 50)                10050     
_________________________________________________________________
dropout_10 (Dropout)         (None, 50)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 51        
Total params: 12,651
Trainable params: 12,651
Non-trainable params: 0
_________________________________________________________________


### Neural Network Training
Train the neural network on the word-weight lists for each comment, trying to predict the toxicity score.
Use mini-batches of size 32 with the Adam optimizer for 1 epoch. Hold out 1/10 of the training data for validation.

In [None]:
# Fit for one epoch in mini-batches of 32, holding out 10% of the samples for validation.
model.fit(x, y, batch_size=32, epochs=1, validation_split=.1)
# NOTE(review): predictions are computed on the training inputs x, not on xTest — confirm this is intended.
preds = model.predict(x)

Train on 1624386 samples, validate on 180488 samples
Epoch 1/1