In [1]:
from fromscratchtoml.neural_network.models import Sequential
from fromscratchtoml.neural_network.optimizers import StochasticGradientDescent
from fromscratchtoml.neural_network.layers import Dense, Activation
from fromscratchtoml.neural_network.layers import RNN

import numpy as np
from sklearn.model_selection import train_test_split

from fromscratchtoml.toolbox.random import Distribution
from fromscratchtoml.toolbox.preprocess import to_onehot

In [2]:
import sys
sys.executable

'/home/suchith/anaconda3/bin/python'

In [3]:
%%script false 
import csv
import nltk
nltk.download('punkt')
import itertools

vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print ("Reading CSV file...")
with open('mldata/reddit-comments-2015-08.csv', 'r') as f:
#     reader = csv.reader(f, skipinitialspace=True)
    reader = csv.DictReader(f)
    # Split full comments into sentences
    sentences = itertools.chain(*[nltk.sent_tokenize(x['body'].lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print ("Parsed %d sentences." % (len(sentences)))
    
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print ("Found %d unique words tokens." % len(word_freq.items()))

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

print ("Using vocabulary size %d." % vocabulary_size)
print ("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print ("\nExample sentence: '%s'" % sentences[0])
print ("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

In [4]:
import pickle

# with open('company_data.pkl', 'wb') as output:
#     pickle.dump(X_train, output, pickle.HIGHEST_PROTOCOL)
#     pickle.dump(y_train, output, pickle.HIGHEST_PROTOCOL)

# del X_train
# del y_train

with open('company_data.pkl', 'rb') as input1:
    X_train = pickle.load(input1)
    y_train = pickle.load(input1)

In [5]:
%%script false 
import numpy as np
# Create the training data
X_train = np.array([np.array(([word_to_index[w] for w in sent[:-1]])) for sent in tokenized_sentences])
y_train = np.array([np.array(([word_to_index[w] for w in sent[1:]])) for sent in tokenized_sentences])

In [6]:
len(X_train[10])

45

In [7]:
X_train[0], len(X_train[0])

(array([   0,    6, 3495,    7,  155,  796,   25,  222,    8,   32,   20,
         202, 4954,  350,   91,    6,   66,  207,    5,    2]), 20)

In [8]:
vocabulary_size = 8000

In [9]:
rnn = RNN(vocab_size=vocabulary_size)

In [10]:
temp = []
for x in X_train[0]:
    t = np.zeros(8000)
    t[x] = 1
    temp.append(t)
temp = np.array(temp)

In [11]:
rnn.forward(temp)

array([[  3.68801378,  -1.49106148,  -7.88593317, ...,   5.77177346,
         -5.19204936,  -3.01698411],
       [-10.22022362,   7.73612848,  -2.15200299, ...,  19.90785448,
          7.66017754,  14.66501426],
       [ -1.85256732,   5.85912267,   8.92890565, ...,   3.87456653,
        -12.98066922,  11.24654976],
       ...,
       [ 15.70266536,  -0.51664131, -12.08723437, ...,  13.18701326,
          2.47160501,   5.37469642],
       [  0.82642968,   1.09868945,  -9.43084108, ...,  13.41471326,
         -0.1239448 ,   0.66873969],
       [ -0.2815718 ,   1.88558196,   1.63544789, ..., -13.68677605,
        -18.23612251,   2.96564714]])

In [12]:
temp.shape

(20, 8000)

In [13]:
model1 = Sequential(verbose=1, vis_each_epoch=True)
model1.add(RNN(vocab_size=vocabulary_size, seed=1))
model1.add(Activation('sigmoid'))
sgd = StochasticGradientDescent(learning_rate=0.05)
model1.compile(optimizer=sgd, loss="mean_squared_error")

In [27]:
model1.fit(np.array([temp]), np.array([y_train[0]]), epochs=2)




epoch: 1/2  loss: 1883192.000 
[████████████████████                    ] 50% 




epoch: 2/2  loss: 1883192.000 
[████████████████████████████████████████] 100% 

In [28]:
model1.predict(np.array([temp[0]]))

ValueError: shapes (1,) and (8000,100) not aligned: 1 (dim 0) != 8000 (dim 0)

In [25]:
temp[1]

array([0., 0., 0., ..., 0., 0., 0.])

In [16]:
temp

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [17]:
p = model1.forwardpass(temp)

In [18]:
for a,b in zip(temp, p):
    print(np.argmax(a), np.argmax(b))

0 1
6 1
3495 5
7 1
155 5
796 1
25 5
222 10
8 1
32 1
20 5
202 10
4954 5
350 1
91 5
6 1
66 5
207 1
5 5
2 1
