In [98]:
# Display plots directly in the notebook instead of in a new window
%matplotlib inline

# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import Counter
import re
import nltk
import numpy as np
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jtuxhorn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [99]:
train = pd.read_csv('./train.csv')

In [100]:
# These first few cells are us following along with the project at the following link, just adjusting to our data.
# https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/

# Get a subset so that we don't waste time while just trying to get a basic model to work.
train = train[:100]
train.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [101]:
words = Counter()    # Count the occurences of words.

# Count the words in our sentences.
for index, comment in train.comment_text.iteritems():
    train_sentences.append([])
    
    for word in nltk.word_tokenize(comment):
        words.update([word.lower()])

In [102]:
print(words)


Counter({'.': 224, 'the': 184, 'to': 178, ',': 167, 'a': 114, 'and': 107, 'i': 92, 'of': 91, 'that': 66, 'is': 62, 'it': 58, 'you': 55, 'in': 55, 'on': 55, '!': 52, 'this': 49, 'for': 48, '?': 43, "'s": 42, 'be': 42, 'have': 39, 'are': 38, 'comments': 38, 'we': 37, "n't": 31, 'with': 30, 'but': 27, '``': 27, "''": 27, 'or': 27, 'do': 25, 'more': 25, 'they': 25, 'their': 23, 'as': 23, 'civil': 23, 'so': 21, 'if': 20, 'not': 18, 'other': 18, 'comment': 17, 'who': 17, 'people': 17, ':': 17, ')': 17, 'like': 16, 'will': 16, 'by': 16, 'one': 16, 'there': 16, "'re": 16, 'all': 16, 'up': 15, 'system': 15, '(': 15, 'would': 14, "'m": 14, 'my': 13, 'an': 13, 'at': 13, 'them': 13, 'has': 13, 'was': 12, 'what': 12, 'your': 11, ';': 11, "'ll": 11, 'out': 11, 'from': 11, 'just': 11, '-': 11, 'want': 10, 'great': 10, 'very': 10, 'no': 10, 'being': 10, 'only': 10, 'new': 10, 'think': 9, 'any': 9, 'than': 9, 'about': 9, 'see': 9, 'could': 9, '...': 9, 'read': 8, 'make': 8, 'when': 8, 'which': 8, 'even

In [103]:
# Tokenize our comments.
train.comment_text = train["comment_text"].apply(nltk.word_tokenize)

In [104]:
# We now have a tokenized comment_text column in dataframe.
train.head()

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"[This, is, so, cool, ., It, 's, like, ,, 'woul...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,"[Thank, you, !, !, This, would, make, my, life...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,"[This, is, such, an, urgent, design, problem, ...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,"[Is, this, something, I, 'll, be, able, to, in...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,"[haha, you, guys, are, a, bunch, of, losers, .]",0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [14]:
# This is just the tutorial from https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

# Turn each word into an index and put it in a tensor.
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Four our data: "Sentence" "Offensive scores" 
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

# They are not using pretrained embeddings. I don't think we should have to do this.
# But it might be easier since it seems like Glove and Word2Vec are easiest to use with libraries
# which are not included on Datahub.

word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

# Every word is assigned a one-hot vector representation.
print(word_to_ix)

# Since our data is just raw scores, we shouldn't have to do this. (?)
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

class LSTMTagger(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        
        # Dimension of the hidden states.
        self.hidden_dim = hidden_dim

        # A lookup table from indices to vectors.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        # Embed our sentence
        embeds = self.word_embeddings(sentence)
        
        # 
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

# Our model in this case has our EMBEDDING_DIM and HIDDEN_DIM as defined above.
# Then our vocab size is just however many words we saw in our corpus. Our tagset
# is 3, as we're choosing either Noun, Verb, or Det.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

    # The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
    # for word i. The predicted tag is the maximum scoring tag.
    # Here, we can see the predicted sequence below is 0 1 2 0 1
    # since 0 is index of the maximum value of row 1,
    # 1 is the index of maximum value of row 2, etc.
    # Which is DET NOUN VERB DET NOUN, the correct sequence!
    print("Print after training: ", tag_scores)
    


{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
tensor([[-1.3852, -0.9624, -1.0003],
        [-1.3570, -1.1749, -0.8354],
        [-1.3794, -1.2678, -0.7618],
        [-1.3699, -1.1893, -0.8177],
        [-1.3667, -1.2508, -0.7792]])
Print after training:  tensor([[-0.0341, -4.1614, -4.0227],
        [-4.5262, -0.0158, -5.3256],
        [-3.7815, -4.2559, -0.0377],
        [-0.0226, -4.4558, -4.5316],
        [-4.6019, -0.0128, -5.9298]])
