<a href="https://colab.research.google.com/github/michaeleby1/variational-autoencoder/blob/master/glove.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.preprocessing.text import Tokenizer
import spacy
import numpy as np
import sys
import io

import warnings
warnings.filterwarnings('ignore')

In [5]:
!python -m spacy download en_core_web_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [0]:
import en_core_web_md
nlp = en_core_web_md.load()


In [0]:
nlp.max_length = 16582681

In [0]:
# glove upload
from google.colab import files
uploaded = files.upload()

In [13]:
# text file upload
from google.colab import files
uploaded = files.upload()

Saving text.txt to text.txt


In [14]:
embeddings = {}
f = open('glove.6B.50d.txt', encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings[word] = coefs
f.close()

print(f'Found {len(embeddings)} word vectors.')

Found 49623 word vectors.


In [15]:
embeddings['president'] 

array([-0.11875  ,  0.6722   ,  0.19444  ,  0.55269  ,  0.53698  ,
       -0.37237  , -0.73494  , -0.30575  , -0.92601  , -0.43276  ,
        0.026956 ,  0.66861  , -0.79097  , -0.015932 ,  0.53918  ,
        0.30341  , -0.67042  ,  0.0051129,  0.62272  , -0.55823  ,
       -0.10887  ,  0.57305  , -0.016149 , -1.1889   , -0.24318  ,
       -2.6289   ,  0.41262  , -0.12904  , -1.3238   ,  0.64731  ,
        2.3595   ,  0.34048  , -1.9889   , -0.79084  , -0.79739  ,
       -0.87998  , -0.72991  ,  0.011697 ,  0.090612 , -0.17287  ,
       -0.83274  ,  1.1932   , -0.75211  , -1.1603   , -0.10074  ,
        0.60224  , -1.3739   ,  0.33674  , -0.31224  ,  0.097583 ],
      dtype=float32)

In [0]:
with open('text.txt', 'r') as f:
  text = f.read()

In [0]:
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(text, disable=['ner', 'parser'])

In [0]:
sentences = [sent.string.strip() for sent in doc.sents]

In [19]:
sentences[0:20]

['Transcribed from the 1915 Martin Secker edition by David Price, email ccx074@pglaf.org                            [Picture: Book cover]                                    THE LESSON OF                                 THE MASTER                                 BY HENRY JAMES                        [Picture: Decorative graphic]                                  * * * * *                            LONDON: MARTIN SECKER                      NUMBER FIVE JOHN STREET ADELPHI                                  * * * * *                       This edition first printed 1915                                  * * * * *     I   HE had been told the ladies were at church, but this was corrected by what he saw from the top of the steps—they descended from a great height in two arms, with a circular sweep of the most charming effect—at the threshold of the door which, from the long bright gallery, overlooked the immense lawn.',
 'Three gentlemen, on the grass, at a distance, sat under the great trees,

In [0]:
maxlen = 20
step = 1
max_num_word = 10000

samples_sentences = []
next_word = []

In [0]:
# tokenze and transform each sentence into a sequence of ints
tokenizer = Tokenizer(num_words=max_num_word)
tokenizer.fit_on_texts(sentences)
list_tokenized_train = tokenizer.texts_to_sequences(sentences)

In [22]:
len(tokenizer.word_index)

45332

In [23]:
len(sentences)

150009

In [24]:
len(list_tokenized_train)

150009

In [0]:
token_word = []
for line in range(0, len(sentences)):
  that_sentences = list_tokenized_train[line]
  for i in range(0, len(that_sentences)):
    token_word.append(that_sentences[i])

In [26]:
len(token_word)

2890983

In [27]:
for i in range(0, len(token_word) - maxlen, step):
  samples_sentences.append(token_word[i:i + maxlen])
  next_word.append(token_word[i + maxlen])
print('Number of sentences:', len(samples_sentences))

Number of sentences: 2890963


In [0]:
# normalize sequences
x = np.asarray(samples_sentences).astype('float32')/max_num_word


In [0]:
# one hot encode y
y = np.zeros((len(sentences), max_num_word), dtype=np.bool)

for i in range(0, len(sentences)):
  for j in range(0, maxlen):
    y[i, next_word[j]] = 1

Building Neural Net

In [38]:
model = Sequential()

model.add(layers.Embedding(max_num_word, 200, input_length=maxlen))
model.add(layers.LSTM(256))
model.add(layers.Dense(max_num_word, activation='softmax'))

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 20, 200)           2000000   
_________________________________________________________________
lstm_4 (LSTM)                (None, 256)               467968    
_________________________________________________________________
dense_4 (Dense)              (None, 10000)             2570000   
Total params: 5,037,968
Trainable params: 5,037,968
Non-trainable params: 0
_________________________________________________________________


In [47]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
model.fit(x, y, batch_size=256, epochs=1)

ValueError: ignored