In [1]:
from fromscratchtoml.neural_network.models import Sequential
from fromscratchtoml.neural_network.optimizers import StochasticGradientDescent
from fromscratchtoml.neural_network.layers import Dense, Activation

import numpy as np
from sklearn.model_selection import train_test_split

from fromscratchtoml.toolbox.random import Distribution
from fromscratchtoml.toolbox.preprocess import to_onehot

In [2]:
%%script false 
import csv
import nltk
nltk.download('punkt')
import itertools

vocabulary_size = 8000
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

# Read the data and append SENTENCE_START and SENTENCE_END tokens
print ("Reading CSV file...")
with open('mldata/reddit-comments-2015-08.csv', 'r') as f:
#     reader = csv.reader(f, skipinitialspace=True)
    reader = csv.DictReader(f)
    # Split full comments into sentences
    sentences = itertools.chain(*[nltk.sent_tokenize(x['body'].lower()) for x in reader])
    # Append SENTENCE_START and SENTENCE_END
    sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
print ("Parsed %d sentences." % (len(sentences)))
    
# Tokenize the sentences into words
tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

# Count the word frequencies
word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
print ("Found %d unique words tokens." % len(word_freq.items()))

# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

print ("Using vocabulary size %d." % vocabulary_size)
print ("The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print ("\nExample sentence: '%s'" % sentences[0])
print ("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

In [3]:
import pickle

# The commented-out snippet below shows how the file was presumably produced:
# two consecutive dumps into the same file, X_train first and y_train second.
#
#     with open('company_data.pkl', 'wb') as output:
#         pickle.dump(X_train, output, pickle.HIGHEST_PROTOCOL)
#         pickle.dump(y_train, output, pickle.HIGHEST_PROTOCOL)
#
# The two loads therefore have to be issued in that same order.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this against a company_data.pkl that comes from a trusted source.
with open('company_data.pkl', 'rb') as pickle_file:
    X_train, y_train = pickle.load(pickle_file), pickle.load(pickle_file)

In [4]:
%%script false 
import numpy as np
# Create the training data
X_train = np.array([np.array(([word_to_index[w] for w in sent[:-1]])) for sent in tokenized_sentences])
y_train = np.array([np.array(([word_to_index[w] for w in sent[1:]])) for sent in tokenized_sentences])

In [5]:
# Number of entries in the training sequence at index 10.
len(X_train[10])

45

In [6]:
from fromscratchtoml.neural_network.layers import RNN

In [7]:
# Show the first training sequence together with its length.
X_train[0], len(X_train[0])

(array([   0,    6, 3495,    7,  155,  796,   25,  222,    8,   32,   20,
         202, 4954,  350,   91,    6,   66,  207,    5,    2]), 20)

In [8]:
# Defined again here because the preprocessing cell that also sets it is
# disabled with %%script false.
# NOTE(review): presumably has to match the vocabulary the pickled data was
# built with — confirm.
vocabulary_size = 8000

In [9]:
# Stand-alone RNN layer, used to try a forward pass outside of a model.
# NOTE(review): no seed is passed here, unlike the RNN added to model1 further
# down — if the layer initialises randomly, its output will differ between
# kernel runs; confirm and consider passing a seed.
rnn = RNN(vocab_size=vocabulary_size)

In [10]:
# One-hot encode the first training sentence: row i holds a single 1 in the
# column given by the i-th token id, and the row width is the vocabulary size
# (taken from vocabulary_size rather than a repeated literal).
token_ids = np.asarray(X_train[0])
temp = np.zeros((len(token_ids), vocabulary_size))
# Paired integer indexing sets exactly one element per row.
temp[np.arange(len(token_ids)), token_ids] = 1

In [11]:
# Run the stand-alone layer on the one-hot encoded sentence and display the result.
rnn.forward(temp)

array([[ 11.00923787,  -5.05153523,   0.60755201, ...,  -2.86051066,
         -0.37358112,   1.3635817 ],
       [  9.08559153,  -3.8789021 , -10.43932832, ...,  -4.20035587,
         -5.20696358,  10.19458699],
       [-15.33182835,  -0.23303309,  -2.2664427 , ...,  -5.55254774,
        -17.32968314,  -3.07118871],
       ...,
       [  6.8288537 ,  -7.90612146,  11.49100927, ...,  -5.35288656,
         -9.62249715,   5.57048193],
       [ -6.59301018, -12.18209592, -10.62050043, ...,   0.11968294,
         -2.95958577,   8.51961997],
       [ -2.93784144, -20.72289253,   5.40047561, ...,  15.85503248,
         10.21757892,  -7.15739808]])

In [12]:
# Shape of the one-hot input matrix.
temp.shape

(20, 8000)

In [13]:
# Assemble the network: an RNN layer followed by a sigmoid activation,
# compiled with stochastic gradient descent (learning rate 0.05) and a
# mean-squared-error loss.
model1 = Sequential(verbose=1, vis_each_epoch=True)
# seed=1 is passed to the layer — presumably fixes its random initialisation; confirm in RNN.
model1.add(RNN(vocab_size=vocabulary_size, seed=1))
model1.add(Activation('sigmoid'))
sgd = StochasticGradientDescent(learning_rate=0.05)
model1.compile(optimizer=sgd, loss="mean_squared_error")

In [14]:
# Attempt one training epoch on a batch that holds only the first sentence.
# The output recorded for this cell ends in a ValueError ("operands could not
# be broadcast together"), so the fit does not complete as written.
# NOTE(review): the inputs are one-hot rows while y_train[0] is passed as-is —
# presumably raw token ids; confirm which target shape the loss and the RNN
# backward pass expect.
model1.fit(np.array([temp]), np.array([y_train[0]]), epochs=1)

16


ValueError: operands could not be broadcast together with shapes (8000,100) (8000,) (8000,100) 

In [None]:
# Run the model on the same single-sentence batch that was passed to fit.
model1.predict(np.array([temp]))

In [None]:
# Display the one-hot input matrix.
temp

In [None]:
# Forward pass on the un-batched one-hot matrix; the result is kept in p for
# the row-by-row comparison that follows.
p = model1.forwardpass(temp)

In [None]:
# For every row, print the position of the largest input entry next to the
# position of the largest model output entry.
for input_row, output_row in zip(temp, p):
    input_index = np.argmax(input_row)
    output_index = np.argmax(output_row)
    print(input_index, output_index)