In [2]:
# Display plots directly in the notebook instead of in a new window
%matplotlib inline

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import Counter
import re
import nltk
import numpy as np
# Download NLTK's 'punkt' package. The comments are tokenized further down with
# nltk.word_tokenize, which presumably relies on it -- TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jtuxhorn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Load the complete data set from train.csv in the current working directory.
wholeTrainingSet = pd.read_csv('./train.csv')

In [56]:
# These first few cells follow the tutorial linked below, adapted to our data.
# https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/

# Work on a subset so we don't waste time while getting a basic model running.
# Slice first and copy second: copying the whole frame twice just to keep a
# fraction of it wastes memory, while copying the slice still gives each subset
# its own data, so later column assignments never touch wholeTrainingSet.
# NOTE(review): row 100000 ends up in neither subset (training stops at row
# 99999, the held-out rows start at 100001) -- confirm whether that is intended.
test_set = wholeTrainingSet.iloc[100001:120001].copy()
training_set = wholeTrainingSet.iloc[:100000].copy()

print(len(test_set))
print(len(training_set))

20000
100000


In [57]:
# Lower-case every comment so that capitalisation variants of a word end up as
# the same token.
training_set["comment_text"] = training_set["comment_text"].str.lower()
test_set["comment_text"] = test_set["comment_text"].str.lower()

In [58]:
# Replace each comment string with the list of tokens produced by
# nltk.word_tokenize.
tokenize = nltk.word_tokenize
training_set["comment_text"] = training_set.comment_text.apply(tokenize)
test_set["comment_text"] = test_set.comment_text.apply(tokenize)

In [59]:
# Token frequencies, counted over the training comments only.
words = Counter()

# Series.iteritems() was removed in pandas 2.0 and the index was never used, so
# iterate over the token lists directly. Counter.update counts every element of
# the iterable it receives, so one call per comment replaces one call per word.
for comment in training_set.comment_text:
    words.update(comment)

In [60]:
# Number of distinct tokens counted in the training comments.
len(words)

95004

In [61]:
# comment_text now holds a list of tokens per row. Show the row count and the
# first few rows to confirm.
print(len(training_set))
training_set.head()

100000


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"[this, is, so, cool, ., it, 's, like, ,, 'woul...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,"[thank, you, !, !, this, would, make, my, life...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,"[this, is, such, an, urgent, design, problem, ...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,"[is, this, something, i, 'll, be, able, to, in...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,"[haha, you, guys, are, a, bunch, of, losers, .]",0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [62]:
# Drop every token that was seen exactly once; the vocabulary size is printed
# before and after so the effect of the pruning is visible.

print(len(words))

# Collect the keys first so the Counter is not resized while it is iterated.
singletons = [token for token, count in words.items() if count == 1]
for token in singletons:
    del words[token]

print(len(words))

95004
47370


In [63]:
# Turn the Counter into a list ordered from most to least frequent token and
# put the "_UNKNOWN" placeholder in front so it receives index 0.
# NOTE: `words` changes from a Counter to a list here, so this cell cannot be
# run twice in a row.
words = ["_UNKNOWN", *(token for token, _count in words.most_common())]

In [64]:
# Position in the ranked vocabulary -> word.
idx2word = dict(enumerate(words))

# Word -> position in the ranked vocabulary (the inverse lookup).
word2idx = {word: position for position, word in idx2word.items()}

In [65]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

In [66]:
# Split the held-out rows in two: the first `split_frac` share becomes the
# validation set, the remainder stays as the test set.
split_frac = .5
split_id = int(split_frac * len(test_set))
# .copy() makes each half an independent frame. Later cells add a
# 'comment_text_idx' column to both, and assigning a column to a plain slice of
# another DataFrame triggers pandas' SettingWithCopyWarning.
# NOTE: test_set is rebound here, so re-running this cell halves it again.
validation_set, test_set = test_set[:split_id].copy(), test_set[split_id:].copy()

In [67]:
# Sanity-check the validation half: row count plus a look at the first rows.
print(len(validation_set))
validation_set.head()

10000


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
100001,364508,0.0,"[is, there, any, research, to, tell, us, wheth...",0.0,0.0,0.0,0.0,0.0,,,...,139639,approved,0,0,0,1,0,0.0,0,4
100002,364509,0.0,"[ot, ,, you, are, definitely, on, to, somethin...",0.0,0.0,0.0,0.0,0.0,,,...,139771,approved,0,0,0,1,0,0.0,0,4
100003,364511,0.4,"[not, once, does, anyone, mention, that, the, ...",0.0,0.2,0.4,0.3,0.0,0.0,0.0,...,139749,approved,0,0,0,0,0,0.0,5,10
100004,364512,0.0,"[os, ,, i, think, the, $, 12,000.00, proves, t...",0.0,0.0,0.0,0.0,0.0,,,...,139764,approved,0,0,0,1,0,0.0,0,4
100005,364513,0.0,"[i, 'm, a, fan, of, the, show, ,, because, i, ...",0.0,0.0,0.0,0.0,0.0,,,...,78709,approved,2,0,1,1,0,0.0,0,4


In [68]:
def convertTokenizedSentenceToIDX(sentence, vocab=None, unknown_idx=0):
    """Replace each token of ``sentence`` with its vocabulary index, in place.

    Args:
        sentence: Mutable sequence (list) of tokens. It is modified in place
            and also returned. The in-place behaviour is kept on purpose: the
            padding cell further down reads the converted values back out of
            the original ``comment_text`` lists.
        vocab: Mapping from token string to index. Defaults to the notebook's
            global ``word2idx``.
        unknown_idx: Index stored for string tokens that are missing from
            ``vocab``. Defaults to 0, the position of "_UNKNOWN".

    Returns:
        The same list object, now holding indices.

    Entries that are not strings are treated as already converted and are left
    untouched. Before this guard, a second call on the same list (for example
    when the cell applying this function was re-run) found none of the integers
    among the vocabulary's string keys and overwrote every index with 0.
    """
    if vocab is None:
        vocab = word2idx

    for position, token in enumerate(sentence):
        if isinstance(token, str):
            sentence[position] = vocab.get(token, unknown_idx)

    return sentence
            

# Smoke test. 'Is' is capitalised while the vocabulary was built from
# lower-cased text, so it is expected to come back as 0 (unknown).
print(convertTokenizedSentenceToIDX(['Is', 'that', 'the', 'law', '?']))

[0, 9, 2, 169, 16]


In [87]:
# Give every split a 'comment_text_idx' column holding the vocabulary indices
# of its tokens.
for split_frame in (training_set, test_set, validation_set):
    split_frame['comment_text_idx'] = split_frame.comment_text.apply(convertTokenizedSentenceToIDX)

In [88]:
def padSentence(sentence, length=200):
    """Return ``sentence`` as a fixed-length integer array, zero-padded on the right.

    Args:
        sentence: Sequence of integer token indices.
        length: Size of the returned array. Defaults to 200, the value that
            used to be hard-coded.

    Returns:
        A new NumPy array of dtype ``int`` and shape ``(length,)`` holding the
        first ``length`` entries of ``sentence`` followed by zeros.

    NOTE(review): the pad value 0 is also the index used for unknown words, so
    padding and unknown tokens are indistinguishable downstream -- confirm that
    this is acceptable.
    """
    padded = np.zeros(length, dtype=int)
    kept = min(len(sentence), length)

    # One slice assignment replaces the element-by-element Python loop.
    if kept:
        padded[:kept] = sentence[:kept]

    return padded

In [90]:
# Pad or truncate every index sequence to one fixed length so the rows can be
# stacked into a single 2-D integer array afterwards.
#
# The source is 'comment_text_idx', the column that was created to hold the
# indices. The previous version read 'comment_text', which contains indices
# only because convertTokenizedSentenceToIDX mutates its argument in place.
training_set['comment_text_idx'] = training_set['comment_text_idx'].apply(padSentence)
validation_set['comment_text_idx'] = validation_set['comment_text_idx'].apply(padSentence)
test_set['comment_text_idx'] = test_set['comment_text_idx'].apply(padSentence)

In [92]:
# Pandas keeps the per-row index arrays in an object-dtype column even though
# every element is an int. Because all rows were padded to the same length,
# stacking them yields one regular 2-D integer matrix per split, with one row
# per comment. (Despite the "_float_arr" names, the matrices hold ints.)
training_rows = training_set.comment_text_idx.tolist()
validation_rows = validation_set.comment_text_idx.tolist()
testing_rows = test_set.comment_text_idx.tolist()

training_float_arr = np.vstack(training_rows).astype(int)
validation_float_arr = np.vstack(validation_rows).astype(int)
testing_float_arr = np.vstack(testing_rows).astype(int)

In [93]:
# Number of comments per mini-batch, shared by all three loaders.
batch_size = 100


def _build_loader(features, targets):
    """Pair a feature matrix with its label vector and wrap both in a shuffled DataLoader.

    Returns the TensorDataset together with the DataLoader built on top of it.
    """
    dataset = TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    return dataset, DataLoader(dataset, batch_size=batch_size, shuffle=True)


# The label for every split is the 'severe_toxicity' column.
train_data, train_loader = _build_loader(training_float_arr, training_set.severe_toxicity.values)

validation_data, validation_loader = _build_loader(validation_float_arr, validation_set.severe_toxicity.values)

test_data, test_loader = _build_loader(testing_float_arr, test_set.severe_toxicity.values)

In [97]:
# Rows per split ...
for matrix in (training_float_arr, validation_float_arr, testing_float_arr):
    print(len(matrix))

print()

# ... and the resulting number of mini-batches per loader.
for loader in (train_loader, validation_loader, test_loader):
    print(len(loader))

100000
10000
10000

1000
100
100


In [75]:
# Use the GPU when one is present and fall back to the CPU otherwise, so that
# moving the model and batches to `device` does not fail on machines without
# CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [76]:
class ToxicityAnalyzer(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid scorer.

    Args:
        vocab_size: Number of rows in the embedding table.
        output_size: Width of the final linear layer.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        n_layers: Number of stacked LSTM layers.
        drop_prob: Dropout probability, used both between LSTM layers and in
            the dropout layer in front of the linear layer.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob = .5):
        super(ToxicityAnalyzer, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first = True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Score a batch of index sequences.

        Args:
            x: Integer tensor of token indices with the batch on dimension 0
                (the LSTM is built with ``batch_first=True``).
            hidden: ``(h, c)`` tuple as produced by ``init_hidden``.

        Returns:
            ``(out, hidden)``: ``out`` has one value per sequence in the batch,
            namely the last element of that sequence's flattened per-step
            outputs (with ``output_size == 1`` this is the sigmoid score of the
            final time step). ``hidden`` is the updated LSTM state.
        """
        # Take the batch dimension from the input itself. Relying on a global
        # `batch_size` breaks for any batch of a different size, for instance a
        # smaller final batch or single-comment inference.
        num_sequences = x.size(0)

        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # .view() needs a contiguous memory layout; .contiguous() returns the
        # tensor itself when it already is contiguous and a compact copy
        # otherwise.
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)

        # Regroup the flattened per-step outputs by sequence and keep the last one.
        out = out.view(num_sequences, -1)
        out = out[:, -1]
        return out, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled ``(h, c)`` states of shape (n_layers, batch_size, hidden_dim).

        The states are created with ``new_zeros`` on one of the model's own
        parameters, so they share its dtype and device instead of depending on
        a global ``device`` variable.
        """
        weight = next(self.parameters()).data
        state_shape = (self.n_layers, batch_size, self.hidden_dim)
        hidden = (weight.new_zeros(state_shape), weight.new_zeros(state_shape))
        return hidden

In [77]:
# Model dimensions. The embedding table gets one more row than there are
# vocabulary entries.
vocab_size = len(word2idx) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

# Build the network and move it to the selected device in one step
# (nn.Module.to returns the module itself).
model = ToxicityAnalyzer(
    vocab_size=vocab_size,
    output_size=output_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    n_layers=n_layers,
).to(device)

# Binary cross-entropy on the sigmoid output, optimised with Adam.
# NOTE(review): the recorded validation loss barely moves from ~0.3533, so a
# learning rate of 0.05 may be too high for Adam -- worth confirming.
lr = .05
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [86]:
# Number of passes over the training set.
epochs = 2

# Optimisation steps taken so far, counted across epochs.
counter = 0
# Run validation (and possibly checkpoint) every `print_every` steps.
print_every = 100
# Upper bound on the gradient norm, passed to clip_grad_norm_.
clip = 5
# Lowest mean validation loss seen so far. np.inf is used because the np.Inf
# alias was removed in NumPy 2.0.
valid_loss_min = np.inf

model.train()
for i in range(epochs):
    for inputs, labels in train_loader:
        counter += 1
        inputs, labels = inputs.to(device), labels.to(device)

        # Each batch contains unrelated, shuffled comments, so every batch
        # starts from a fresh zero hidden state instead of inheriting the final
        # state of the previous batch. Sizing it from the batch itself also
        # keeps it valid if a batch is smaller than `batch_size`.
        h = model.init_hidden(inputs.size(0))

        # Clear gradients left over from the previous step.
        model.zero_grad()

        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # Clip the gradient norm before the update.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        if counter % print_every == 0:
            val_losses = []
            model.eval()

            # No gradients are needed for validation; no_grad avoids building
            # the autograd graph for every validation batch.
            with torch.no_grad():
                for val_inp, val_lab in validation_loader:
                    val_inp, val_lab = val_inp.to(device), val_lab.to(device)
                    val_h = model.init_hidden(val_inp.size(0))
                    val_out, val_h = model(val_inp, val_h)
                    val_loss = criterion(val_out.squeeze(), val_lab.float())
                    val_losses.append(val_loss.item())

            model.train()

            # Compute the mean once and reuse it for logging and checkpointing.
            mean_val_loss = np.mean(val_losses)
            print("Epoch: {}/{}...".format(i+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}".format(mean_val_loss))

            # Keep only the weights with the best validation loss so far.
            if mean_val_loss <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min, mean_val_loss))
                valid_loss_min = mean_val_loss


print(counter)

Epoch: 1/2... Step: 100... Loss: 0.353300
Validation loss decreased (inf --> 0.353300). Saving model ...
Epoch: 1/2... Step: 200... Loss: 0.353207
Validation loss decreased (0.353300 --> 0.353207). Saving model ...
Epoch: 1/2... Step: 300... Loss: 0.353153
Validation loss decreased (0.353207 --> 0.353153). Saving model ...
Epoch: 1/2... Step: 400... Loss: 0.353152
Validation loss decreased (0.353153 --> 0.353152). Saving model ...
Epoch: 1/2... Step: 500... Loss: 0.353201
Epoch: 1/2... Step: 600... Loss: 0.353342
Epoch: 1/2... Step: 700... Loss: 0.353233
Epoch: 1/2... Step: 800... Loss: 0.353280
Epoch: 1/2... Step: 900... Loss: 0.353022
Validation loss decreased (0.353152 --> 0.353022). Saving model ...
Epoch: 1/2... Step: 1000... Loss: 0.353082
Epoch: 2/2... Step: 1100... Loss: 0.353267
Epoch: 2/2... Step: 1200... Loss: 0.353204
Epoch: 2/2... Step: 1300... Loss: 0.353399
Epoch: 2/2... Step: 1400... Loss: 0.353324
Epoch: 2/2... Step: 1500... Loss: 0.353332
Epoch: 2/2... Step: 1600... L

In [None]:
# Restore the checkpoint written by the training loop (best validation loss).
# map_location makes the load work on whichever device this session uses.
model.load_state_dict(torch.load('./state_dict.pt', map_location=device))

test_losses = []
num_correct = 0

model.eval()
# Evaluation needs no gradients; no_grad avoids building the autograd graph.
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Fresh zero state per batch, sized from the batch itself.
        h = model.init_hidden(inputs.size(0))
        output, h = model(inputs, h)

        scores = output.squeeze()
        targets = labels.float()
        test_loss = criterion(scores, targets)
        test_losses.append(test_loss.item())

        # The labels are fractional scores (the data preview shows values such
        # as 0.021277), so comparing a rounded prediction with the raw label
        # would count nearly every non-zero label as a miss. Round both sides
        # to 0/1 before comparing.
        pred = torch.round(scores)
        truth = torch.round(targets.view_as(pred))
        num_correct += int(pred.eq(truth).sum().item())

print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

In [14]:
# This is just the tutorial from https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

# Turn each word into an index and put it in a tensor.
def prepare_sequence(seq, to_ix):
    """Look up every item of ``seq`` in ``to_ix`` and return the indices as a long tensor.

    Raises ``KeyError`` if an item is missing from ``to_ix``.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return torch.tensor(indices, dtype=torch.long)


# Toy corpus from the tutorial: each entry pairs a tokenised sentence with one
# part-of-speech tag per token. For our own data the second element would be
# the offensiveness scores instead.
training_data = [
    (["The", "dog", "ate", "the", "apple"], ["DET", "NN", "V", "DET", "NN"]),
    (["Everybody", "read", "that", "book"], ["NN", "V", "DET", "NN"]),
]

# The tutorial does not use pretrained embeddings. We probably should not need
# this step either, but it may be the easier route: GloVe and Word2Vec seem to
# be easiest to use through libraries that are not available on Datahub.

# Give every distinct word the next free integer index, in order of first
# appearance.
word_to_ix = {}
for tokens, _tags in training_data:
    for token in tokens:
        word_to_ix.setdefault(token, len(word_to_ix))

# Show the resulting word -> index mapping.
print(word_to_ix)

# Tag -> class index. Our targets are raw scores, so an equivalent mapping
# should not be necessary for our data (to be confirmed).
tag_to_ix = dict(DET=0, NN=1, V=2)

# Real models usually use something like 32 or 64 dimensions. They are kept
# tiny here so the change in the weights during training stays readable.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

class LSTMTagger(nn.Module):
    """Tutorial tagger: embedding -> single LSTM -> linear layer -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()

        # Size of the LSTM hidden state.
        self.hidden_dim = hidden_dim

        # Index -> vector lookup table.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Consumes embeddings and emits hidden states of size hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # Projects each hidden state onto one score per tag.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return one row of log-probabilities over the tags per element of ``sentence``."""
        n_tokens = len(sentence)

        # Embed the tokens and reshape to (n_tokens, 1, -1): the sentence is
        # fed to the LSTM as a single sequence with a middle dimension of 1.
        embedded = self.word_embeddings(sentence)
        hidden_states, _ = self.lstm(embedded.view(n_tokens, 1, -1))

        # Drop the middle dimension again, score every token, and normalise
        # the scores across the tag dimension.
        per_token = hidden_states.view(n_tokens, -1)
        return F.log_softmax(self.hidden2tag(per_token), dim=1)

# Build the tagger with the dimensions defined above. The vocabulary size is
# the number of distinct words in the toy corpus and the tag set has three
# entries (DET, NN, V).
# NOTE: this rebinds `model` and `optimizer`, the same names the
# ToxicityAnalyzer cells use, so running this cell replaces those objects.
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Scores for the first sentence before any training. Row i holds the scores of
# word i, column j belongs to tag j. Nothing is trained here, hence no_grad.
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

# 300 epochs would be far too many for real data; this is a toy corpus.
for epoch in range(300):
    for sentence, tags in training_data:
        # PyTorch accumulates gradients, so reset them for every example.
        model.zero_grad()

        # Encode the words and their tags as index tensors.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Forward pass, loss, backward pass, parameter update.
        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Scores for the same sentence after training. The predicted tag of a word is
# the column with the highest score in its row; for "The dog ate the apple"
# the expected sequence is 0 1 2 0 1, i.e. DET NN V DET NN.
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    print("Print after training: ", tag_scores)
    


{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
tensor([[-1.3852, -0.9624, -1.0003],
        [-1.3570, -1.1749, -0.8354],
        [-1.3794, -1.2678, -0.7618],
        [-1.3699, -1.1893, -0.8177],
        [-1.3667, -1.2508, -0.7792]])
Print after training:  tensor([[-0.0341, -4.1614, -4.0227],
        [-4.5262, -0.0158, -5.3256],
        [-3.7815, -4.2559, -0.0377],
        [-0.0226, -4.4558, -4.5316],
        [-4.6019, -0.0128, -5.9298]])
