In [129]:
import tensorflow as tf
import numpy as np
import numpy.random as rng
import pandas as pd
import os, pdb, re
import string
import stats
import keras.preprocessing.text as text
from keras.preprocessing import sequence
from keras import utils
import sys  # needed only for sys.maxsize in the print options below


def float_formatter(x):
    """Format a float with exactly two decimal places (e.g. 0.5 -> "0.50")."""
    return "%.2f" % x


# Print arrays in full, 200 characters wide, with two-decimal floats.
# threshold=np.nan is rejected by current NumPy releases ("threshold must be
# numeric and non-NAN"); sys.maxsize is the documented way to disable
# summarisation of large arrays.
np.set_printoptions(linewidth=200,threshold=sys.maxsize,formatter={'float_kind':float_formatter})
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option("display.max_colwidth",200)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
#Preprocessing
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/matt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Notebook-wide hyperparameters.
# MAX_WORD_COUNT, N_OUT, NUM_LAYERS and INIT_SCALE are not referenced by any
# cell shown in this part of the notebook.
MAX_WORD_COUNT = 60
N_OUT = 5
# Number of (context, target) pairs per training batch; passed to Data_obj.
BATCH_SIZE = 5
# Reused further down as the width of the word-embedding vectors.
HIDDEN_SIZE = 8
NUM_LAYERS = 2
INIT_SCALE = 0.05

In [3]:
def fx(x):
    """Read the tab-separated file at path `x`, using its first row as the header."""
    return pd.read_csv(x,delimiter="\t",header=0)


# Load both splits with the same reader.
train_df = fx("train.tsv")
test_df = fx("test.tsv")
feat_names = train_df.columns.values
#train_df.head()

# Get full sentences from data set

In [4]:
def group_max(group):
    """Return the longest entry of the group's "Phrase" column.

    When several phrases share the maximum length, the one that appears
    first in the group is returned.
    """
    phrases = group["Phrase"]
    lengths = phrases.apply(len)
    # Positions (not index labels) of every phrase that reaches the maximum length.
    longest_positions = np.where(lengths == lengths.max())[0]
    return phrases.iloc[longest_positions[0]]
# For every SentenceId keep only its longest phrase (presumably the full
# sentence), giving a NumPy array with one entry per SentenceId.
# NOTE(review): the name `df` is rebound when the poem is loaded further down,
# so this array is not used by any later cell shown here.
df = train_df.groupby("SentenceId").apply(group_max).values
#remove_stop = lambda sentence: [word for word in sentence if word not in stopwords.words('english')]

# The Love Song of J. Alfred Prufrock

In [162]:
# Load the poem into a single string column named "Line".
df = pd.read_table(
    "The Love Song of J. Alfred Prufrock.txt",
    header=None,
    names=["Line"],
    dtype={"Line": str},
)
# Glue the lines back together with ' \n ' between them and lower-case the result.
txt = ' \n '.join(''.join(row) for row in df["Line"].values).lower()
# Replace every character that is neither a word character nor whitespace with a space.
txt = re.sub(r'[^\w\s]', ' ', txt)
# Despite the df_ prefix this is a plain list of cleaned line strings.
df_clean = txt.split("\n")

In [163]:
# Keep only the lines that contain "yellow" or "will" (substring match), with
# the "will" lines placed first. A line containing both appears twice.
df_clean_yellow = [row for row in df_clean if "yellow" in row]
df_clean_will = [row for row in df_clean if "will" in row]
df_clean = np.concatenate((df_clean_will, df_clean_yellow))
#df_clean = df_clean_yellow

In [164]:
class Data_obj():
    """Tokenise a corpus and serve (context, target) word-index batches.

    Every entry of ``clean_data`` is turned into a list of integer word
    indices with a Keras ``Tokenizer``. ``generator`` then walks those lists
    and pairs each word with one of its immediate neighbours.

    Attributes:
        epoch: pass counter over the corpus, starting at 1 and advanced by
            ``generator`` every time the last sentence has been consumed.
        i: position in ``encoded_text`` of the sentence being consumed.
        k: next row to fill in the batch under construction.
        total_examples_seen: number of pairs in the batches completed so far.
        batch_size: number of (context, target) pairs per batch.
        clean_data: the texts handed to the constructor.
        Tokenizer: the fitted Keras tokenizer.
        words: view of the words known to the tokenizer.
        encoded_text: ``clean_data`` as lists of word indices.
        current_sentence: the encoded sentence ``generator`` is reading.
        vocab_size: ``len(word_index) + 1``, i.e. one more than the number
            of distinct words, so that the largest word index is a valid row.
    """

    def __init__(self,batch_size,clean_data):
        """Fit the tokenizer on ``clean_data`` and encode it.

        Args:
            batch_size: number of (context, target) pairs per batch.
            clean_data: iterable of text lines to learn the vocabulary from.
        """
        self.epoch = 1
        self.i = self.k = 0
        # Defined up front so the attribute exists before the generator starts.
        self.total_examples_seen = 0
        self.batch_size = batch_size
        self.clean_data = clean_data

        self.Tokenizer = text.Tokenizer()
        self.Tokenizer.fit_on_texts(self.clean_data)
        self.words = self.Tokenizer.word_index.keys()
        self.encoded_text = self.Tokenizer.texts_to_sequences(self.clean_data)

        # Reverse mapping built once, instead of scanning word_index on every lookup.
        self._index_to_word = {index: word for word, index in self.Tokenizer.word_index.items()}

        self.vocab_size = len(self.Tokenizer.word_index) + 1
        print("There are {0} unique words in data set.".format(self.vocab_size))

    def inverse_tokenizer(self, num):
        """Return the word whose tokenizer index equals ``num``.

        Raises:
            ValueError: if no word has index ``num``.
        """
        try:
            return self._index_to_word[num]
        except KeyError:
            raise ValueError("{0} is not a known word index".format(num)) from None

    def inverse_tokenizer_sentence(self, sentence):
        """Return the list of words for an iterable of word indices."""
        return [self.inverse_tokenizer(num) for num in sentence]

    def shuffle(self):
        """Shuffle the encoded sentences in place."""
        rng.shuffle(self.encoded_text)

    def new_batch(self):
        """Return a zero-filled ``(batch_size, 2)`` int32 array."""
        return np.zeros((self.batch_size,2)).astype(np.int32)

    def generator(self):
        """Yield ``(batch_size, 2)`` int32 arrays of word-index pairs, forever.

        Column 0 holds a word and column 1 one of its direct neighbours: the
        only neighbour for the first and last word of a sentence, otherwise
        the previous or next word with equal probability. Sentences with
        fewer than two words are skipped. Batches may span several sentences.
        The sentence order is reshuffled at the start and after every epoch.

        ``total_examples_seen`` is updated when the generator is resumed, so
        right after a ``next()`` call it counts the batches yielded before
        the one just returned.

        Raises:
            ValueError: if no sentence has at least two words, because no
                pair could ever be produced.
        """
        if not any(len(sentence) >= 2 for sentence in self.encoded_text):
            raise ValueError("clean_data must contain at least one sentence with two or more known words")

        self.shuffle()
        batch = self.new_batch()
        self.total_examples_seen = 0
        while True:
            self.current_sentence = self.encoded_text[self.i]
            sentence_len = len(self.current_sentence)
            # A sentence needs two words to have a neighbour. Short ones fall
            # through to the bookkeeping below so the epoch rollover still runs.
            if sentence_len >= 2:
                for j in range(sentence_len):
                    context = self.current_sentence[j]

                    if j == 0:
                        target = self.current_sentence[j+1]
                    elif j == sentence_len - 1:
                        target = self.current_sentence[j-1]
                    elif rng.uniform() < 0.5:
                        target = self.current_sentence[j-1]
                    else:
                        target = self.current_sentence[j+1]

                    batch[self.k,0] = context
                    batch[self.k,1] = target
                    # Compare against this object's batch size, not the notebook global.
                    if self.k == self.batch_size - 1:
                        self.k = 0
                        yield batch
                        # Fresh array so the batch just handed out is not overwritten.
                        batch = self.new_batch()
                        self.total_examples_seen += self.batch_size
                    else:
                        self.k += 1
            self.i += 1
            if self.i == len(self.encoded_text):
                self.epoch += 1
                self.i = 0
                self.shuffle()  #shuffle after epoch
                

In [165]:
# Tokenise the filtered poem lines and create the endless batch generator,
# using the notebook-wide batch size.
data_obj = Data_obj(batch_size=BATCH_SIZE,clean_data=df_clean)
generate_batch = data_obj.generator()

There are 56 unique words in data set.


In [166]:
# Sanity check: pull a few batches and print each next to its decoded words.
# The sentence printed first is the one the generator was reading when the
# batch filled up, so a batch can also hold pairs from the sentence before it.
for batch_no in range(5):
    data = next(generate_batch)
    print(data_obj.current_sentence,data_obj.inverse_tokenizer_sentence(data_obj.current_sentence))
    # `pair` is one (context, target) row; it has its own name so the inner
    # loop does not clobber the outer loop variable.
    for pair in data:
        print(pair,data_obj.inverse_tokenizer_sentence(pair))

[4, 1, 5, 6, 4, 1, 5, 6] ['there', 'will', 'be', 'time', 'there', 'will', 'be', 'time']
[4 1] ['there', 'will']
[1 4] ['will', 'there']
[5 1] ['be', 'will']
[6 4] ['time', 'there']
[4 1] ['there', 'will']
[16, 32, 3, 33, 34, 35, 36, 1, 37] ['for', 'decisions', 'and', 'revisions', 'which', 'a', 'minute', 'will', 'reverse']
[1 5] ['will', 'be']
[5 1] ['be', 'will']
[6 5] ['time', 'be']
[16 32] ['for', 'decisions']
[32  3] ['decisions', 'and']
[16, 32, 3, 33, 34, 35, 36, 1, 37] ['for', 'decisions', 'and', 'revisions', 'which', 'a', 'minute', 'will', 'reverse']
[ 3 33] ['and', 'revisions']
[33 34] ['revisions', 'which']
[34 33] ['which', 'revisions']
[35 34] ['a', 'which']
[36 35] ['minute', 'a']
[8, 1, 12, 13, 14, 25, 26, 27, 15] ['they', 'will', 'say', 'how', 'his', 'hair', 'is', 'growing', 'thin']
[ 1 37] ['will', 'reverse']
[37  1] ['reverse', 'will']
[8 1] ['they', 'will']
[ 1 12] ['will', 'say']
[12  1] ['say', 'will']
[8, 1, 12, 13, 14, 25, 26, 27, 15] ['they', 'will', 'say', 'how',

## Embedding layer: turns positive integers (word indices) into dense vectors of fixed size

In [167]:
# Table height comes from the fitted tokenizer; vector width reuses HIDDEN_SIZE.
vocabulary_size = data_obj.vocab_size
embedding_size = HIDDEN_SIZE
# Word index of each example's input word (fed from column 0 of a batch).
train_inputs = tf.placeholder(tf.int32, shape=[None])
#train_labels = tf.placeholder(tf.int32, shape=[None, 1])
# Word index of the neighbour to predict, one per row (fed from column 1 of a batch).
train_context = tf.placeholder(tf.int32, shape=[None, 1])
# Trainable embedding table of shape [vocabulary_size, embedding_size],
# initialised uniformly between -1.0 and 1.0.
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
# Embedding rows selected by the input word indices.
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

In [168]:
# Output-side weights and biases, one row / entry per word index.
nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
# Scores over the whole vocabulary and their softmax. These two tensors are
# not part of the loss below; they are evaluated later to inspect predictions.
hidden_out = tf.matmul(embed, tf.transpose(nce_weights)) + nce_biases
soft_max = tf.nn.softmax(hidden_out)
# Training objective: batch mean of the noise-contrastive estimation loss,
# drawing a single sampled class per batch (num_sampled=1).
loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_context,
                       inputs=embed,
                       num_sampled=1,
                       num_classes=vocabulary_size))

In [169]:
# The learning rate is a placeholder so its value can be supplied on every training step.
learning_rate = tf.placeholder(tf.float32)
# Plain gradient descent on the NCE loss.
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
# Saver for the graph's variables; the checkpoint is written to / read from model_path.
saver = tf.train.Saver()
model_path = "/tmp/model.ckpt"

In [170]:
# Train the embeddings with plain gradient descent, then checkpoint them.
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    # Snapshot of the randomly initialised table, kept for comparison.
    embeddings_before = embeddings.eval()
    # Fresh data object so the epoch and example counters start from scratch.
    data_obj = Data_obj(batch_size=BATCH_SIZE,clean_data=df_clean)
    generate_batch = data_obj.generator()
    cur_losses = []
    lr = 0.3
    lr_decay = 1.0      # lr is divided by this at every progress report (1.0 = constant)
    log_every = 2000    # examples between two progress reports
    max_epochs = 500    # stop once the generator reaches this epoch
    next_log_at = 0
    while True:
        data = next(generate_batch)
        feed_dict = {train_inputs: data[:,0],train_context:data[:,[1]],learning_rate:lr}
        cur_loss = sess.run([optimizer, loss], feed_dict=feed_dict)[1]
        cur_losses.append(cur_loss)
        # Threshold instead of `% log_every == 0`, so reports still appear
        # when the batch size does not divide log_every.
        if data_obj.total_examples_seen >= next_log_at:
            print("{0} seen with running loss of {1:.3f}. Current epoch = {2}. Current lr = {3:.3f}".format(data_obj.total_examples_seen,np.mean(cur_losses),data_obj.epoch,lr))
            cur_losses = []
            lr /= lr_decay
            next_log_at += log_every
        # `>=` because the epoch counter can advance more than once between
        # two batches; an exact match could be stepped over.
        if data_obj.epoch >= max_epochs:
            print("Finished.")
            break
    save_path = saver.save(sess,model_path )
    learnt_embeddings = embeddings.eval()

There are 56 unique words in data set.
0 seen with running loss of 5.552. Current epoch = 1. Current lr = 0.300
2000 seen with running loss of 2.543. Current epoch = 19. Current lr = 0.300
4000 seen with running loss of 1.522. Current epoch = 38. Current lr = 0.300
6000 seen with running loss of 1.156. Current epoch = 57. Current lr = 0.300
8000 seen with running loss of 0.812. Current epoch = 76. Current lr = 0.300
10000 seen with running loss of 0.643. Current epoch = 95. Current lr = 0.300
12000 seen with running loss of 0.637. Current epoch = 114. Current lr = 0.300
14000 seen with running loss of 0.546. Current epoch = 133. Current lr = 0.300
16000 seen with running loss of 0.488. Current epoch = 151. Current lr = 0.300
18000 seen with running loss of 0.491. Current epoch = 170. Current lr = 0.300
20000 seen with running loss of 0.462. Current epoch = 189. Current lr = 0.300
22000 seen with running loss of 0.377. Current epoch = 208. Current lr = 0.300
24000 seen with running loss

In [171]:
# Show the trained embedding table: one row per word index, embedding_size columns.
learnt_embeddings

array([[0.27, -0.23, -0.09, 0.83, 0.05, -0.15, 0.77, -0.93],
       [-1.41, -2.00, -1.02, 0.58, -2.03, -0.82, -0.99, 0.84],
       [2.12, 0.81, -2.88, 1.05, 0.54, 1.68, -1.72, 1.43],
       [-2.43, -0.47, 0.58, 1.12, 1.01, 0.67, -0.47, 3.85],
       [-1.80, 1.23, 0.43, 0.89, 2.07, 0.07, -2.02, 1.54],
       [-0.74, 0.40, 0.50, 1.01, 1.83, 0.06, -3.16, -0.15],
       [0.59, 0.21, -2.73, -0.14, -2.60, 0.27, -2.62, 0.23],
       [-1.69, 1.86, 0.77, 2.50, -0.58, -1.50, -1.79, 0.58],
       [-0.79, 0.87, 1.38, -0.49, -0.06, 2.05, -3.03, 0.84],
       [-0.99, 2.68, -0.41, 0.22, -2.49, -1.33, -1.61, 1.78],
       [-1.27, 0.54, -2.20, 1.29, -1.27, -0.58, -2.49, -1.71],
       [-1.18, -1.67, 0.04, 1.56, 1.48, -1.59, -2.57, 0.63],
       [1.28, 1.04, 0.30, -0.03, 1.78, -0.25, -2.37, 2.20],
       [-1.21, -0.89, -2.89, -0.93, 1.72, 1.01, -0.96, 2.04],
       [1.25, -0.96, 1.09, 2.99, -0.27, -0.46, -2.38, 2.00],
       [-0.27, -2.04, -2.23, 0.77, 0.94, 0.27, -3.13, 0.08],
       [-1.84, 1.23, -0.6

In [173]:
# Reload the trained variables and, for every word, print the softmax over
# the vocabulary together with the words that receive the highest scores.
with tf.Session() as sess:
    saver.restore(sess,model_path)
    top_n_words = 3
    # Starts at 1: presumably index 0 is not assigned to any word (vocab_size
    # is len(word_index) + 1) -- TODO confirm against the tokenizer.
    for word_no in range(1,vocabulary_size):
        word = data_obj.inverse_tokenizer(word_no)
        feed_dict={train_inputs:np.array([word_no])}
        word_embed, word_pred = sess.run([embed,soft_max],feed_dict)
        # Drop the batch dimension of size 1.
        word_pred = word_pred.squeeze()
        # Indices of the top_n_words largest probabilities, least likely first.
        top_n_args = word_pred.argsort()[-top_n_words:]
        print("\n")
        print(word,word_no)
        print(word_pred)
        # NOTE(review): if index 0 ever ranks among the top entries this lookup
        # raises, since 0 has no entry in word_index -- verify.
        print(data_obj.inverse_tokenizer_sentence(top_n_args))

INFO:tensorflow:Restoring parameters from /tmp/model.ckpt


will 1
[0.00 0.00 0.00 0.00 0.13 0.34 0.00 0.03 0.20 0.00 0.00 0.00 0.10 0.00 0.00 0.00 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.04 0.00
 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.07 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00]
['there', 'they', 'be']


the 2
[0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.50 0.00 0.00 0.00 0.00 0.00 0.00 0.30 0.00 0.00 0.00 0.03 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.00 0.02 0.00 0.10 0.03]
['along', 'for', 'yellow']


and 3
[0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.58 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.09 0.07 0.00 0.00 0.00 0.00 0.04 0.05 0.00 0.07 0.10 0.00 0.00 0.00 0.00 0.00
 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00]
['murder', 'revis

In [176]:
# Show the training corpus (the poem lines containing "will" or "yellow") as a one-column table.
pd.DataFrame(df_clean)

Unnamed: 0,0
0,and indeed there will be time
1,there will be time there will be time
2,there will be time to murder and create
3,and indeed there will be time
4,they will say how his hair is growing thin
5,they will say but how his arms and legs are thin
6,for decisions and revisions which a minute will reverse
7,am an attendant lord one that will do
8,i do not think that they will sing to me
9,the yellow fog that rubs its back upon the window panes
