**Attibution**: This code was adapted from the following 'Text Classification using Neural Networks' tutorial by gk_:
https://machinelearnings.co/text-classification-using-neural-networks-f5cd7b8765c6

In [1]:
# use natural language toolkit
import nltk
from nltk.stem.lancaster import LancasterStemmer
ls = LancasterStemmer()

import os
import json
import datetime
import pandas as pd
import numpy as np
import time

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.shape

(1804874, 45)

In [4]:
def assign_toxicity(target):
    if target > 0.8:
        return "super toxic"
    elif target > 0.5:
        return "toxic"
    else:
        return "not toxic"

In [5]:
train['cat'] = train.target.apply(assign_toxicity,)

In [6]:
holdout = train[100000:150000]
for_model = train[:100000]

In [7]:
super_toxic = for_model[for_model.cat == 'super toxic']

toxic = for_model[for_model.cat == 'toxic']
nontoxic = for_model[for_model.cat == 'not toxic']

In [8]:
super_toxic.shape, toxic.shape, nontoxic.shape

((1190, 46), (3944, 46), (94866, 46))

In [9]:
nontoxic.shape[0] / for_model.shape[0]

0.94866

In [10]:
sample = super_toxic.head(10).append(toxic.head(20)).append(nontoxic.head(70))

## Add stopwords

In [11]:
words = []
classes = []
documents = []
stops = ['?']

In [12]:
# loop through each sentence in our training data
for i,r in sample.iterrows():
    # tokenize each word in the sentence
    w = nltk.word_tokenize(r['comment_text'])
    # add to our words list
    words.extend(w)
    # add to documents in our corpus
    documents.append((w, r['cat']))
    # add to our classes list
    if r['cat'] not in classes:
        classes.append(r['cat'])

# stem and lower each word and remove duplicates
words = [ls.stem(w.lower()) for w in words if w not in stops]
words = list(set(words))

# remove duplicates
classes = list(set(classes))

print (len(documents), "documents")
print (len(classes), "classes", classes)

100 documents
3 classes ['toxic', 'super toxic', 'not toxic']


In [15]:
# compute sigmoid nonlinearity
def sigmoid(x):
    output = 1/(1+np.exp(-x))
    return output

# convert output of sigmoid function to its derivative
def sigmoid_output_to_derivative(output):
    return output*(1-output)
 
def clean_up_sentence(sentence):
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [ls.stem(word.lower()) for word in sentence_words]
    return sentence_words

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=False):
    # tokenize the pattern
    sentence_words = clean_up_sentence(sentence)
    # bag of words
    bag = [0]*len(words)  
    for s in sentence_words:
        for i,w in enumerate(words):
            if w == s: 
                bag[i] = 1
                if show_details:
                    print ("found in bag: %s" % w)

    return(np.array(bag))

def think(sentence, show_details=False):
    x = bow(sentence.lower(), words, show_details)
    if show_details:
        print ("sentence:", sentence, "\n bow:", x)
    # input layer is our bag of words
    l0 = x
    # matrix multiplication of input and hidden layer
    l1 = sigmoid(np.dot(l0, synapse_0))
    # output layer
    l2 = sigmoid(np.dot(l1, synapse_1))
    return l2

## Vary parameters

In [35]:

def train(X, y, hidden_neurons=10, alpha=1, epochs=5000, dropout=True, dropout_percent=0.5):

    print ("Number of Neurons: {}, alpha: {}, epochs: {}, dropout: {}, dropout_percent: {}".format(hidden_neurons, alpha, epochs, dropout, dropout_percent))
    np.random.seed(1)

    last_mean_error = 1
    # randomly initialize our weights with mean 0
    synapse_0 = 2*np.random.random((len(X[0]), hidden_neurons)) - 1
    synapse_1 = 2*np.random.random((hidden_neurons, len(classes))) - 1

    prev_synapse_0_weight_update = np.zeros_like(synapse_0)
    prev_synapse_1_weight_update = np.zeros_like(synapse_1)

    synapse_0_direction_count = np.zeros_like(synapse_0)
    synapse_1_direction_count = np.zeros_like(synapse_1)
        
    for j in iter(range(epochs+1)):

        # Feed forward through layers 0, 1, and 2
        layer_0 = X
        layer_1 = sigmoid(np.dot(layer_0, synapse_0))
                
        if(dropout):
            layer_1 *= np.random.binomial([np.ones((len(X),hidden_neurons))],1-dropout_percent)[0] * (1.0/(1-dropout_percent))

        layer_2 = sigmoid(np.dot(layer_1, synapse_1))

        # how much did we miss the target value?
        layer_2_error = y - layer_2

        if (j% 1000) == 0 and j > 5000:
            # if this 10k iteration's error is greater than the last iteration, break out
            if np.mean(np.abs(layer_2_error)) < last_mean_error:
                print ("delta after "+str(j)+" iterations:" + str(np.mean(np.abs(layer_2_error))) )
                last_mean_error = np.mean(np.abs(layer_2_error))
            else:
                print ("break:", np.mean(np.abs(layer_2_error)), ">", last_mean_error )
                break
                
        # in what direction is the target value?
        # were we really sure? if so, don't change too much.
        layer_2_delta = layer_2_error * sigmoid_output_to_derivative(layer_2)

        # how much did each l1 value contribute to the l2 error (according to the weights)?
        layer_1_error = layer_2_delta.dot(synapse_1.T)

        # in what direction is the target l1?
        # were we really sure? if so, don't change too much.
        layer_1_delta = layer_1_error * sigmoid_output_to_derivative(layer_1)
        
        synapse_1_weight_update = (layer_1.T.dot(layer_2_delta))
        synapse_0_weight_update = (layer_0.T.dot(layer_1_delta))
        
        if(j > 0):
            synapse_0_direction_count += np.abs(((synapse_0_weight_update > 0)+0) - ((prev_synapse_0_weight_update > 0) + 0))
            synapse_1_direction_count += np.abs(((synapse_1_weight_update > 0)+0) - ((prev_synapse_1_weight_update > 0) + 0))        
        
        synapse_1 += alpha * synapse_1_weight_update
        synapse_0 += alpha * synapse_0_weight_update
        
        prev_synapse_0_weight_update = synapse_0_weight_update
        prev_synapse_1_weight_update = synapse_1_weight_update

    now = datetime.datetime.now()

    # persist synapses
    synapse = {'synapse0': synapse_0.tolist(), 'synapse1': synapse_1.tolist(),
               'datetime': now.strftime("%Y-%m-%d %H:%M"),
               'words': words,
               'classes': classes
              }
    synapse_file = "synapses.json"

    with open(synapse_file, 'w') as outfile:
        json.dump(synapse, outfile, indent=4, sort_keys=True)
    print ("saved synapses to:", synapse_file)


In [49]:
def classify(sentence, show_details=False):
    results = think(sentence, show_details)

    results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD ] 
    results.sort(key=lambda x: x[1], reverse=True) 
    return_results =[[classes[r[0]],r[1]] for r in results]

    return return_results

def test_nn():
    sample_holdout = holdout.sample(n=1000)
    print(sample_holdout.cat.value_counts())
    sample_holdout['classified'] = sample_holdout.comment_text.apply(classify,)
    
    output = sample_holdout[['target', 'comment_text', 'cat', 'classified']]
    output['model_result'] = output.classified.apply(extract_result,)
    print()
    print("extracted results")
    print()

    output['accuracy'] = output.cat == output.model_result
    print(output.accuracy.value_counts())

In [56]:
# create our training data
training, output = [], []
# create an empty array for our output
output_empty = [0] * len(classes)
n = 0 

# training set, bag of words for each sentence
for doc in documents:
    n += 1
    if n % 100 == 0:
        print(n)
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # stem each word
    pattern_words = [ls.stem(word.lower()) for word in pattern_words]
    # create our bag of words array
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    training.append(bag)
    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)



100


In [57]:
X = np.array(training)
y = np.array(output)

start_time = time.time()

train(X, y, dropout=False, epochs = 6000)
print()
elapsed_time = time.time() - start_time
print ("processing time:", elapsed_time, "seconds")
# probability threshold
ERROR_THRESHOLD = 0.2
# load our calculated synapse values
synapse_file = 'synapses.json' 
with open(synapse_file) as data_file: 
    synapse = json.load(data_file) 
    synapse_0 = np.asarray(synapse['synapse0']) 
    synapse_1 = np.asarray(synapse['synapse1'])

print()
test_nn()


training, output = [], []


Number of Neurons: 10, alpha: 1, epochs: 6000, dropout: False, dropout_percent: 0.5
delta after 6000 iterations:0.0009924492963152196
saved synapses to: synapses.json

processing time: 3.6982030868530273 seconds

not toxic      926
toxic           61
super toxic     13
Name: cat, dtype: int64

extracted results

True     651
False    349
Name: accuracy, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


n = 100

Number of Neurons: 10, alpha: 1, epochs: 6000, dropout: True, dropout_percent: 0.5

True     932
False     68

n = 100

Number of Neurons: 10, alpha: 1, epochs: 6000, dropout: False, dropout_percent: 0.5

True     651
False    349