In [None]:
# Display plots directly in the notebook instead of in a new window
%matplotlib inline

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import Counter
import re
import nltk
import numpy as np
nltk.download('punkt')

In [None]:
# Load the labelled comments from train.csv (path is relative to the notebook's
# working directory). Later cells read its `comment_text` and `target` columns.
wholeTrainingSet = pd.read_csv('./train.csv')

In [None]:
# These first few cells are us following along with the project at the following link, just adjusting to our data.
# https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/

# TODO: once the model works end to end, evaluate on the actual test set instead.
# For now, carve small subsets out of train.csv so that iterating on a basic
# model stays fast. Rows [0, 10000) are used for training and rows
# [10001, 18001) are held out; row 10000 falls in neither slice.
training_set = wholeTrainingSet[:10000].copy()
held_out = wholeTrainingSet[10001:18001].copy()

# Split the held-out rows in half: the first half is used for validation during
# training, the second half for testing after training.
split_frac = .5
split_id = int(split_frac * len(held_out))
validation_set = held_out[:split_id]
test_set = held_out[split_id:]

# Raw comment strings for each subset.
train_comments = training_set["comment_text"].values
test_comments = test_set["comment_text"].values
validation_comments = validation_set["comment_text"].values

# Report the subset sizes (test, train, validation).
for subset in (test_comments, train_comments, validation_comments):
    print(len(subset))

In [54]:
%%time
# Tokenizer shared by the three tokenization cells (thank you Duy).
toktok = nltk.ToktokTokenizer()

# Lower-case and tokenize every test comment. The token lists have different
# lengths, so the array is created with dtype=object: NumPy >= 1.24 raises a
# ValueError when asked to infer an array from ragged nested sequences.
test_comments = np.array(
    [toktok.tokenize(sent.lower()) for sent in test_comments], dtype=object
)

print("Processed test_comments")

print(test_comments[:2])

Processed test_comments
[list(['<<they', 'are', 'working', 'together>>', 'lol'])
 list(['"', 'individuals', 'with', 'pre-existing', 'conditions', 'didn', "'", 't', 'pay', 'the', 'same.', '"', 'wrong.', 'that', 'is', 'what', 'happens', 'in', 'a', 'real', 'insurance', 'system', ',', 'because', 'insurance', 'is', 'based', 'on', 'risk', 'to', 'the', 'company.', 'with', 'obamacare', 'there', 'is', 'no', 'risk', 'to', 'the', 'insurance', 'companies', 'because', 'they', 'are', 'acting', 'as', 'nothing', 'more', 'than', 'a', 'government', 'agency', 'with', 'endless', 'capital.', 'the', 'only', 'other', 'option', 'they', 'will', 'have', 'is', 'rationing', 'their', 'services.', 'obamacare', 'will', 'either', 'bankrupt', 'the', 'country', 'or', 'we', 'will', 'be', 'saddled', 'with', 'a', 'healthcare', 'system', 'that', 'has', 'to', 'pick', 'who', 'and', 'when', 'the', 'individual', 'gets', 'treated', '.'])]
CPU times: user 17.5 s, sys: 190 ms, total: 17.7 s
Wall time: 17.7 s


In [55]:
%%time

# Lower-case and tokenize every training comment. The token lists have
# different lengths, so the array is created with dtype=object: NumPy >= 1.24
# raises a ValueError when asked to infer an array from ragged nested sequences.
train_comments = np.array(
    [toktok.tokenize(sent.lower()) for sent in train_comments], dtype=object
)

print("Processed train_comments")
print(train_comments[:2])

Processed train_comments
[list(['this', 'is', 'so', 'cool.', 'it', "'", 's', 'like', ',', "'", 'would', 'you', 'want', 'your', 'mother', 'to', 'read', 'this??', "'", 'really', 'great', 'idea', ',', 'well', 'done', '!'])
 list(['thank', 'you', '!', '!', 'this', 'would', 'make', 'my', 'life', 'a', 'lot', 'less', 'anxiety-inducing.', 'keep', 'it', 'up', ',', 'and', 'don', "'", 't', 'let', 'anyone', 'get', 'in', 'your', 'way', '!'])]
CPU times: user 2min 25s, sys: 3.29 s, total: 2min 28s
Wall time: 2min 28s


In [56]:
# Lower-case and tokenize every validation comment. The token lists have
# different lengths, so the array is created with dtype=object: NumPy >= 1.24
# raises a ValueError when asked to infer an array from ragged nested sequences.
validation_comments = np.array(
    [toktok.tokenize(sent.lower()) for sent in validation_comments], dtype=object
)

print("Processed validation_comments")
print(validation_comments[:2])

Processed validation_comments
[list(['i', "'", 'd', 'prefer', 'my', 'child', 'to', 'be', 'using', 'pot', 'over', 'alcohol', 'or', 'cigarettes', 'any', 'day', 'of', 'the', 'week.', 'rather', 'than', 'passing', 'legislation', 'to', 'do', 'this', 'or', 'that', ',', 'why', 'not', 'just', 'de-list', 'it', ',', 'making', 'it', 'of', 'about', 'the', 'same', 'stature', 'as', 'dandelions', '?', 'by', 'doing', 'so', 'the', 'it', 'becomes', 'common', 'place', 'and', 'even', 'boring', ',', 'the', 'already', 'downward', 'trend', 'in', 'adolescent', 'use', 'would', 'probably', 'accelerate', '.'])
 list(['perhaps', 'just', 'a', 'war', 'on', 'real', 'estate', 'agents', ',', 'brokers', 'and', 'realtors', 'that', 'have', 'won', 'big', 'time', 'with', 'current', 'fee', 'structure.', 'found', 'much', 'more', 'value', 'in', 'real', 'estate', 'lawyer', 'for', '$', '2k', 'than', '$', '50k+', 'for', 'agent', 'fees', '.'])]


In [57]:
# Tally how often each token occurs across the tokenized training comments.
words = Counter()
for sentence in train_comments:
    # Counter.update takes any iterable of tokens, so pass a whole sentence at once.
    words.update(sentence)

In [58]:
# Drop every token that occurs exactly once, printing the vocabulary size
# before and after the pruning.

print(len(words))
singletons = [token for token, count in words.items() if count == 1]
for token in singletons:
    del words[token]

print(len(words))

492197
205618


In [59]:
# Order the vocabulary from most to least frequent and put the "_UNKNOWN"
# placeholder in front of it, so that it ends up at index 0.
words = ["_UNKNOWN", *sorted(words, key=words.get, reverse=True)]

In [60]:
# Forward lookup: word -> position in the frequency-sorted vocabulary.
word2idx = {word: index for index, word in enumerate(words)}

# Reverse lookup: position -> word.
idx2word = dict(enumerate(words))

In [61]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

In [62]:
# Convert tokenized sentence so that its words are replaced by the indices of its word from the word2idx dictionary.
def convertTokenizedSentenceToIDX(sentence, vocab=None):
    """Map each token of a tokenized sentence to its vocabulary index.

    Args:
        sentence: Sequence of tokens.
        vocab: Optional token -> index mapping. When omitted, the notebook-level
            ``word2idx`` dictionary is used.

    Returns:
        A new list with one integer per token. Tokens missing from the
        vocabulary map to 0, the index the notebook gives to "_UNKNOWN".
        The input sequence is not modified, so calling this on the same
        tokens twice gives the same answer.
    """
    if vocab is None:
        vocab = word2idx

    # dict.get covers both cases: known tokens return their index, unknown ones 0.
    return [vocab.get(token, 0) for token in sentence]
            

# Quick sanity check. The vocabulary is built from lower-cased comments, so the
# capitalised 'Is' is not in it and comes back as 0 (the "_UNKNOWN" index).
print(convertTokenizedSentenceToIDX(['Is', 'that', 'the', 'law', '?']))

[0, 11, 1, 204, 19]


In [63]:
# Replace every token with its vocabulary index, one dataset at a time.

for done_message, comments in (
    ("Processed test_comments", test_comments),
    ("Processed train_comments", train_comments),
    ("Processed validation_comments", validation_comments),
):
    for position in range(len(comments)):
        comments[position] = convertTokenizedSentenceToIDX(comments[position])

    print(done_message)

Processed test_comments
Processed train_comments
Processed validation_comments


In [64]:
# Every set is now an array of index lists; show the first five of each.
for heading, comments in (
    ("Training:", train_comments),
    ("Testing:", test_comments),
    ("Validation:", validation_comments),
):
    print(heading)
    print(comments[:5])

Training:
[list([27, 8, 45, 12305, 14, 7, 17, 61, 2, 7, 50, 15, 118, 43, 1106, 3, 205, 31464, 7, 138, 182, 385, 2, 120, 248, 34])
 list([384, 15, 34, 34, 27, 50, 101, 69, 221, 6, 236, 243, 0, 189, 14, 70, 2, 4, 67, 7, 24, 159, 218, 71, 9, 43, 114, 34])
 list([27, 8, 163, 55, 10554, 3226, 212, 142, 4456, 3, 15, 16, 412, 14, 1035, 111, 5723, 34])
 list([8, 27, 184, 13, 7, 214, 21, 390, 3, 6486, 23, 69, 1194, 19, 65, 37, 15, 21, 6317, 14, 19])
 list([5997, 15, 778, 18, 6, 1086, 5, 2987, 10])]
Testing:
[list([82219, 18, 341, 0, 740])
 list([12, 1133, 26, 8177, 1713, 174, 7, 24, 147, 1, 2661, 12, 1146, 11, 8, 36, 850, 9, 6, 180, 544, 239, 2, 83, 544, 8, 343, 23, 826, 3, 1, 4956, 26, 1537, 54, 8, 47, 826, 3, 1, 544, 448, 83, 25, 18, 1895, 28, 157, 56, 74, 6, 116, 2068, 26, 3119, 11859, 1, 81, 84, 1587, 25, 37, 22, 8, 21522, 39, 3285, 1537, 37, 342, 3173, 1, 172, 31, 30, 37, 21, 14738, 26, 6, 1019, 239, 11, 49, 3, 1102, 42, 4, 65, 1, 849, 423, 1617, 10])
 list([13, 7, 179, 248, 11, 933, 32, 1

In [65]:
# Bring every sentence to a fixed length (200 by default): the word indices come
# first, followed by 0's. NOTE: 0 is also the "_UNKNOWN" index, so padding and
# unknown words look the same to the model, and ToxicityNet scores a comment
# from its last time step, i.e. after any trailing padding.

def padSentence(sentence, max_len=200):
    """Return ``sentence`` as an integer array of exactly ``max_len`` entries.

    Args:
        sentence: Sequence of word indices (list or 1-D array).
        max_len: Length of the returned array; defaults to 200.

    Returns:
        A new ``np.ndarray`` of dtype int. Sentences longer than ``max_len``
        are truncated; shorter ones are right-padded with 0.
    """
    padded = np.zeros(max_len, dtype=int)

    kept = sentence[:max_len]
    # Skip the copy for empty sentences; the all-zero array is already the answer.
    if len(kept) > 0:
        padded[:len(kept)] = kept

    return padded

In [66]:
# Pad (or truncate) every index list in the three sets to the fixed length.

for comments in (test_comments, train_comments, validation_comments):
    for position in range(len(comments)):
        comments[position] = padSentence(comments[position])

In [67]:
# Every set now holds fixed-length (200) index arrays; show the first of each.

for heading, comments in (
    ("Training:", train_comments),
    ("Testing:", test_comments),
    ("Validation:", validation_comments),
):
    print(heading)
    print(comments[:1])

Training:
[array([   27,     8,    45, 12305,    14,     7,    17,    61,     2,
           7,    50,    15,   118,    43,  1106,     3,   205, 31464,
           7,   138,   182,   385,     2,   120,   248,    34,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
         

In [None]:
# Stack the per-comment arrays of each set into a single 2-D NumPy array and
# make sure its dtype is int instead of object.

train_comments, validation_comments, test_comments = (
    np.vstack(rows).astype(int)
    for rows in (train_comments, validation_comments, test_comments)
)

In [None]:
# Create our datasets. Each vectorized comment is paired with a binary label
# derived from its "target" column: values >= .5 become 1, all others 0.
#
# NOTE: the training/evaluation loops create the LSTM hidden state for exactly
# `batch_size` rows, so each dataset size must be a multiple of batch_size.

batch_size = 50


# Create training data DataLoader (shuffled, so every epoch sees a new order).

tensor_train_x = torch.from_numpy(train_comments)
tensor_train_y = torch.from_numpy(np.where(training_set.target >= .5, 1, 0))

train_data = TensorDataset(tensor_train_x, tensor_train_y)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)


# Create validation data DataLoader. Evaluation data is NOT shuffled: the
# evaluation loops carry the hidden state from one batch into the next, so a
# fixed batch order is needed for the validation loss (which decides when a
# checkpoint is saved) to be comparable between evaluations.

tensor_validation_x = torch.from_numpy(validation_comments)
tensor_validation_y = torch.from_numpy(np.where(validation_set.target >= .5, 1, 0))

validation_data = TensorDataset(tensor_validation_x, tensor_validation_y)
validation_loader = DataLoader(validation_data, shuffle=False, batch_size=batch_size)


# Create test data DataLoader (fixed order, for the same reason as validation).

tensor_test_x = torch.from_numpy(test_comments)
tensor_test_y = torch.from_numpy(np.where(test_set.target >= .5, 1, 0))

test_data = TensorDataset(tensor_test_x, tensor_test_y)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
# Example showing comment #5's representation in our dataset.
# The input data is train_comments[4], a vectorized sentence.
# The output data is tensor_train_y[4], a 0 or 1.
# (The previous version printed `train_y`, which is never defined in this
# notebook and raised a NameError on a fresh kernel.)

print(training_set.head(6))
print("Vectorized comment: ", train_comments[4])
print("Target: ", tensor_train_y[4].item())

In [None]:
# Number of comments in each set ...
for comments in (train_comments, validation_comments, test_comments):
    print(len(comments))

print()

# ... and number of batches each loader yields.
for loader in (train_loader, validation_loader, test_loader):
    print(len(loader))

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class ToxicityNet(nn.Module):
    """LSTM classifier: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.

    The score for a sentence is computed from the LSTM output at the last time
    step only.

    Args:
        vocab_size: Number of rows in the embedding table.
        output_size: Number of outputs of the final linear layer.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability, used both between LSTM layers and
            before the output layer.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super().__init__()

        # Specify how many outputs our linear layer (output layer) should have.
        self.output_size = output_size

        # How many times our LSTM is stacked.
        self.n_layers = n_layers

        # How many dimensions our hidden state representations should have.
        self.hidden_dim = hidden_dim

        # Embedding layer: maps word indices to dense vectors.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Actual LSTM that does all the work (inputs are batch-first).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)

        # Dropout randomly zeroes activations while training, which helps
        # prevent overfitting; it is a no-op in eval mode.
        self.dropout = nn.Dropout(drop_prob)

        # Output layer.
        self.fc = nn.Linear(hidden_dim, output_size)

        # Squashes the output scores into the range (0, 1).
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Score a batch of index-encoded sentences.

        Args:
            x: Tensor of word indices, one sentence per row (batch first).
            hidden: ``(h_0, c_0)`` tuple for the LSTM, e.g. from ``init_hidden``.

        Returns:
            ``(out, hidden)`` where ``out`` has one score in (0, 1) per
            sentence and ``hidden`` is the LSTM state after the batch.
        """
        # Makes sure the datatype of the word representations is long.
        x = x.long()

        # Embed the words.
        embeds = self.embedding(x)

        # Run the lstm on the input.
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last time step is used for the score, so select it before
        # the dropout / linear / sigmoid layers instead of running those layers
        # on every time step and discarding all but one result.
        last_step = lstm_out[:, -1, :]

        out = self.dropout(last_step)
        out = self.fc(out)
        out = self.sigmoid(out)

        # Keep the last output unit, giving one score per sentence
        # (with output_size == 1 this is the only unit).
        out = out[:, -1]

        return out, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero ``(h_0, c_0)`` LSTM state for ``batch_size`` rows.

        The tensors are created with the dtype and device of the model's own
        parameters, so they follow the model wherever it has been moved.
        """
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        return (weight.new_zeros(shape), weight.new_zeros(shape))


In [None]:
# Architecture hyper-parameters.
# NOTE(review): word2idx already contains "_UNKNOWN" at index 0, so the "+ 1"
# adds one embedding row that no index produced by word2idx refers to.
vocab_size = len(word2idx) + 1
output_size = 1
embedding_dim = 24
hidden_dim = 64
n_layers = 2

# Build the network and move its parameters to the selected device.
model = ToxicityNet(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
).to(device)

# Binary cross-entropy on the sigmoid outputs, optimised with Adam.
lr = 0.005
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# How many times to loop over training the dataset.
epochs = 30

counter = 0
print_every = 100
clip = 5

# Lowest validation loss seen so far. Starting at infinity makes the first
# evaluation always count as an improvement. (np.inf, because the np.Inf
# alias was removed in NumPy 2.0.)
valid_loss_min = np.inf

# Training mode: enables dropout.
model.train()
for i in range(epochs):

    # Start every epoch from an all-zero hidden state.
    h = model.init_hidden(batch_size)

    # For each batch of inputs, paired with its labels, in the training data
    for inputs, labels in train_loader:
        counter += 1

        # Detach the carried-over hidden state so backpropagation stops at the
        # batch boundary instead of reaching into earlier batches.
        h = tuple(e.detach() for e in h)

        # Send input and label to the device.
        inputs, labels = inputs.to(device), labels.to(device)

        # Reset the gradient so the old gradient doesn't interfere with the new one.
        model.zero_grad()

        # Get the output of our model
        output, h = model(inputs, h)

        # Calculate the loss
        loss = criterion(output, labels.float())

        # Calculate the gradients
        loss.backward()

        # No exploding gradient. Clipping has to happen AFTER backward() and
        # BEFORE step(): before backward() the gradients have just been reset,
        # so there is nothing to clip.
        nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Move the weights toward their optimum
        optimizer.step()

        if counter % print_every == 0:
            val_h = model.init_hidden(batch_size)
            val_losses = []

            # Evaluation mode disables dropout; torch.no_grad() is what
            # actually stops autograd from tracking these operations.
            model.eval()
            with torch.no_grad():
                for val_inp, val_lab in validation_loader:
                    # Send to the device
                    val_inp, val_lab = val_inp.to(device), val_lab.to(device)

                    # Output of running on the validation batch.
                    val_out, val_h = model(val_inp, val_h)

                    # Get the loss from the loss function.
                    val_loss = criterion(val_out, val_lab.float())

                    # Record the loss on this batch.
                    val_losses.append(val_loss.item())

            # Back to training mode.
            model.train()

            # The value printed as "Loss" is the mean validation loss.
            mean_val_loss = np.mean(val_losses)
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}".format(mean_val_loss))

            # If these weights improved our validation loss, then save them.
            if mean_val_loss <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min, mean_val_loss))
                valid_loss_min = mean_val_loss


print(counter)

In [None]:
# Load the best checkpoint saved during training. map_location makes the load
# work even when the checkpoint was written on a different device.
model.load_state_dict(torch.load('./state_dict.pt', map_location=device))

test_losses = []
num_correct = 0
h = model.init_hidden(batch_size)

# Evaluation mode disables dropout; no_grad() skips building autograd graphs,
# which are not needed because nothing is back-propagated here.
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        output, h = model(inputs, h)
        output = output.squeeze()

        test_loss = criterion(output, labels.float())
        test_losses.append(test_loss.item())

        # Round the score to 0/1 and count how many match the labels.
        pred = torch.round(output)
        correct_tensor = pred.eq(labels.float().view_as(pred))
        num_correct += int(correct_tensor.sum().item())

print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

In [None]:
# Score a single hand-written, already tokenized sentence.
# NOTE(review): training comments were padded to 200 indices, while this
# sentence is fed unpadded - confirm that this is the intended comparison.
tokens = ["i", "love", "you"]

# Make sure dropout is off, whatever mode an earlier cell left the model in.
model.eval()

h = model.init_hidden(1)

# Pass a copy of the tokens and shape the indices as one batch row; -1 lets the
# row length follow the sentence instead of being hard-coded to 3.
x = torch.tensor(convertTokenizedSentenceToIDX(list(tokens)))
x = x.view(1, -1)
x = x.to(device)

# Inference only, so no gradients need to be tracked.
with torch.no_grad():
    out, hidden = model(x, h)

print(out)

if(out[0] >= .5):
    print("This is probably toxic.")

In [58]:
# This is just the tutorial from https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

# Turn each word into an index and put it in a tensor.
def prepare_sequence(seq, to_ix):
    """Look up every item of ``seq`` in ``to_ix`` and return the indices as a long tensor."""
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)


# Toy corpus: (tokenized sentence, one part-of-speech tag per token).
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Give every distinct word the next free integer index, in order of first
# appearance ("The" and "the" are different words here).
word_to_ix = {}
for sentence_words, _sentence_tags in training_data:
    for token in sentence_words:
        word_to_ix.setdefault(token, len(word_to_ix))

# Show the resulting word -> index mapping.
print(word_to_ix)

# Tag -> index mapping for the tagger's targets. (Our own data is just raw
# scores, so we probably shouldn't need an equivalent of this there.)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

class LSTMTagger(nn.Module):
    """Tutorial tagger: embedding -> LSTM -> linear -> log-softmax over tags."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()

        # Size of the LSTM hidden states.
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into embedding vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Consumes the word embeddings and emits hidden states of size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects each hidden state onto the tag space.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return one row of log-probabilities over the tags per word of ``sentence``."""
        n_words = len(sentence)

        # Embed the sentence and reshape to (words, batch of 1, features),
        # the layout handed to the LSTM.
        embedded = self.word_embeddings(sentence)
        lstm_input = embedded.view(n_words, 1, -1)

        lstm_out, _ = self.lstm(lstm_input)

        # Flatten back to one row per word, then score the tags.
        per_word = lstm_out.view(n_words, -1)
        tag_space = self.hidden2tag(per_word)
        return F.log_softmax(tag_space, dim=1)

# Build the tutorial tagger with the EMBEDDING_DIM and HIDDEN_DIM defined above.
# The vocab size is however many distinct words we saw in our corpus, and the
# tagset size is 3, as we're choosing between DET, NN and V.
# NOTE: this rebinds the notebook-level names `model` and `optimizer`, which the
# toxicity cells also use - re-run those cells before using the toxicity model again.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training.
# Note that element i,j of the output is the score for tag j for word i.
# Nothing is trained here, so the code is wrapped in torch.no_grad().
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # normally you would NOT do 300 epochs; this is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance.
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # tensors of word indices (and the tags into tensors of tag indices).
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss and the gradients, then update the
        # parameters by calling optimizer.step().
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training.
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "The dog ate the apple". Element i,j is the score for
    # tag j for word i, and the predicted tag is the maximum scoring tag.
    # In the output below the predicted sequence is 0 1 2 0 1,
    # since 0 is the index of the maximum value of row 1,
    # 1 is the index of the maximum value of row 2, etc.
    # That is DET NN V DET NN, the correct sequence!
    print("Print after training: ", tag_scores)
    


{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
tensor([[-0.7633, -1.4841, -1.1804],
        [-0.7582, -1.4659, -1.2018],
        [-0.7564, -1.5394, -1.1516],
        [-0.8099, -1.4821, -1.1149],
        [-0.7384, -1.5232, -1.1903]])
Print after training:  tensor([[-0.1197, -2.7209, -3.0572],
        [-3.3684, -0.0393, -5.5054],
        [-2.7692, -4.7681, -0.0739],
        [-0.0474, -3.6342, -3.9161],
        [-3.2497, -0.0447, -5.3062]])
