In [2]:
# libraries
import numpy as np
import torch
from torch import nn as nn
import torch.nn.functional as F

In [120]:
# loading data
import csv
import itertools
import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print("Reading CSV file...")
with open('/Users/dimitaryasenovoparlakov/Documents/ML/RNN_reddit_comments/reddit-comments-2015-08.csv', 'r') as f:
    reader = csv.reader(f, skipinitialspace=True)
    next(reader)
    # Split full comments into sentences
    sentences = itertools.chain(*[nltk.sent_tokenize(x[0].lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print("Parsed %d sentences." % (len(sentences)))
    
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print("Found %d unique words tokens." % len(word_freq.items()))

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

# Create the training data
X_train = [np.array([word_to_index[w] for w in sent[:-1]]) for sent in tokenized_sentences][0:200]
y_train = [np.array([word_to_index[w] for w in sent[1:]]) for sent in tokenized_sentences]




[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dimitaryasenovoparlakov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dimitaryasenovoparlakov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/dimitaryasenovoparlakov/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Reading CSV file...
Parsed 79170 sentences.
Found 63024 unique words tokens.
Using vocabulary size 8000.
The least frequent word in our vocabulary is 'appointments' and appeared 10 times.

Example sentence: 'SENTENCE_START i joined a new league this year and they have different scoring rules than i'm used to. SENTENCE_END'

Example sentence after Pre-processing: '['SENTENCE_START', 'i', 'joined', 'a', 'new', 'league', 'this', 'year', 'and', 'they', 'have', 'different', 'scoring', 'rules', 'than', 'i', "'m", 'used', 'to', '.', 'SENTENCE_END']'


In [25]:
# hyperparameters

hyperparams = {
    'vocab_size': 8000,
    'hidden_dim': 8000,
    'truncation': 5,
    'num_epochs': 4,
    'lr': 0.001
}

In [83]:
class RNN(nn.Module):
    def __init__(self, hyperparams):
        super().__init__()
        
        vocab_size = hyperparams['vocab_size']
        hidden_dim = hyperparams['hidden_dim']
        max_u = np.sqrt(vocab_size)           
        max_w = np.sqrt(hidden_dim)
        
        U_init = torch.rand(
            hidden_dim, vocab_size,
            requires_grad=False
        )
        W_init = torch.rand(
            hidden_dim, hidden_dim,
            requires_grad=False
        )
        U_init = U_init / (max_u / 2.0) - (2.0 / max_u)
        W_init = W_init / (max_w / 2.0) - (2.0 / max_w)
        b_init = 4.0 * torch.rand(hidden_dim, requires_grad=False) / max_w
        
        self.U = nn.parameter.Parameter(data=U_init, requires_grad=True)
        self.W = nn.parameter.Parameter(data=W_init, requires_grad=True)
        self.b = nn.parameter.Parameter(data=b_init, requires_grad=True)
            
        
    def forward(self, s, x):
        out = F.linear(s, self.W)
        out += F.linear(x, self.U)
        out += self.b
        
        return F.relu(out)

In [127]:
def train_bptt(model, sentences, hyperparams):
    loss_fn = nn.CrossEntropyLoss()
    adam = torch.optim.Adam(model.parameters(), lr=hyperparams['lr'])

    for i in range(hyperparams['num_epochs']):
        for k, sent in enumerate(sentences):
            adam.zero_grad()
            
            s = torch.zeros(hyperparams['hidden_dim'])
            ws = F.one_hot(torch.tensor(sent), num_classes=hyperparams['vocab_size'])
            total_loss = torch.tensor(0.0)
            
            # print(sent.shape[0])
            for j in range(sent.shape[0] - 1):
                # print(j)
                x = ws[j].float()
                s = model(s, x)

                if j % hyperparams['truncation'] == 0:
                    s.detach()
                
                pred_distr = torch.unsqueeze(s, 0)
                total_loss += loss_fn(pred_distr, torch.tensor([sent[j+1]]))
            
            total_loss /= sent.shape[0]
            if k % 20 == 0:
                print(k, total_loss)
            total_loss.backward()
            adam.step()

In [None]:
model = RNN(hyperparams)

train_bptt(model, X_train, hyperparams)

0 tensor(8.5377, grad_fn=<DivBackward0>)
20 tensor(8.5552, grad_fn=<DivBackward0>)
40 tensor(8.6233, grad_fn=<DivBackward0>)
60 tensor(8.7755, grad_fn=<DivBackward0>)
80 tensor(7.7035, grad_fn=<DivBackward0>)
