[View in Colaboratory](https://colab.research.google.com/github/nrkfeller/learn_ml/blob/master/tfexamples.ipynb)

# GRU recurrent network for univariate time series

In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
tf.reset_default_graph()

from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline

# Load the series from CSV; the 'Month' column becomes the index.
df = pd.read_csv('monthly-milk-production.csv', index_col='Month')

# Parse the index into timestamps so the data plots on a real time axis.
df.index = pd.to_datetime(df.index)

# Positional split: the first 150 rows train the model, the last 18 are held out.
train_set = df.head(150)
test_set = df.tail(18)

scaler = MinMaxScaler()

# Fit the scaler on the training rows ONLY and reuse that mapping for the
# held-out rows. Calling fit_transform on the test rows would leak test
# statistics and overwrite the training fit, so a later
# scaler.inverse_transform would map model outputs (which live in the
# training scale) back to the wrong units.
train_scaled = scaler.fit_transform(train_set)
test_scaled = scaler.transform(test_set)

def next_batch(training_data, batch_size, steps):
    """Sample random training windows from a univariate series.

    Each sample is a contiguous run of ``steps + 1`` values starting at a
    uniformly random offset. The inputs are the first ``steps`` values of the
    run and the targets are the same run shifted one step ahead.

    Args:
        training_data: Array-like holding one series, shape (n,) or (n, 1).
        batch_size: Number of windows to draw. (This argument used to be
            ignored: exactly one window was returned whatever its value.)
        steps: Number of time steps in each input/target sequence.

    Returns:
        Tuple ``(x, y)`` of arrays, each of shape (batch_size, steps, 1).

    Raises:
        ValueError: If the data is not univariate, or has ``steps`` or fewer
            rows (no full window of ``steps + 1`` values fits).
    """
    series = np.asarray(training_data)
    series = series.reshape(len(series), -1)
    if series.shape[1] != 1:
        raise ValueError("next_batch expects a univariate series of shape (n,) or (n, 1)")
    series = series[:, 0]
    if len(series) <= steps:
        raise ValueError(
            "need more than {} rows to cut a window, got {}".format(steps, len(series)))

    # One random start offset per sample; the exclusive upper bound guarantees
    # that start + steps + 1 never runs past the end of the series.
    starts = np.random.randint(0, len(series) - steps, size=batch_size)
    # Broadcasting builds a (batch_size, steps + 1) index grid, one row per window.
    window_idx = starts[:, np.newaxis] + np.arange(steps + 1)
    windows = series[window_idx]

    return (windows[:, :-1].reshape(-1, steps, 1),
            windows[:, 1:].reshape(-1, steps, 1))

# Hyperparameters for the recurrent model and its training run.
num_inputs = 1
num_time_steps = 12
num_neurons = 100
num_outputs = 1
learning_rate = 0.01
num_train_iterations = 8000
batch_size = 1

# Input and target sequences; the leading None leaves the batch dimension open.
X = tf.placeholder(tf.float32, [None, num_time_steps, num_inputs])
y = tf.placeholder(tf.float32, [None, num_time_steps, num_outputs])

# GRU cell with ReLU activation, wrapped so that every step's output is
# projected down to `num_outputs` values.
cell = tf.contrib.rnn.OutputProjectionWrapper(
    tf.contrib.rnn.GRUCell(num_units=num_neurons, activation=tf.nn.relu),
    output_size=num_outputs) 

outputs, states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)

# Mean squared error between the per-step outputs and the targets.
loss = tf.reduce_mean(tf.square(outputs - y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train = optimizer.minimize(loss)

init = tf.global_variables_initializer()

# Created after the graph is built; used below to checkpoint the trained model.
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    
    for iteration in range(num_train_iterations):
        X_batch, y_batch = next_batch(train_scaled, batch_size, num_time_steps)
        sess.run(train, feed_dict={X:X_batch, y:y_batch})
        # Every 100 iterations, report the loss on the batch that was just
        # trained on (evaluated after the update step).
        if iteration % 100 == 0:
            mse = loss.eval(feed_dict={X:X_batch, y:y_batch})
            print ("{} \t MSE: {}".format(iteration, mse))
    # Persist the trained variables once the loop has finished.
    saver.save(sess, './ex_time_series_model')

In [0]:
# Number of future points to generate. Kept separate from the model's input
# window length (num_time_steps) so that either can change independently.
forecast_steps = 12

with tf.Session() as sess:
    saver.restore(sess, "./ex_time_series_model")
    # Seed with the last training window, flattened to plain scalars. The
    # predictions appended below are scalars too, so the list stays
    # homogeneous; mixing shape-(1,) arrays with scalars makes np.array()
    # fail on recent NumPy versions.
    train_seed = list(train_scaled[-num_time_steps:].reshape(-1))
    for iteration in range(forecast_steps):
        # Always feed the most recent num_time_steps values, predictions included.
        X_batch = np.array(train_seed[-num_time_steps:]).reshape(1, num_time_steps, 1)
        y_pred = sess.run(outputs, feed_dict={X: X_batch})
        # Keep only the final time step of the output: the one-step-ahead forecast.
        train_seed.append(y_pred[0, -1, 0])

# Everything after the seed window is generated data; map it back to the
# original units. NOTE: this is only meaningful while `scaler` holds the fit
# that produced `train_scaled`.
results = scaler.inverse_transform(np.array(train_seed[num_time_steps:]).reshape(-1, 1))

# Take an explicit copy before adding a column so pandas does not write into
# (or warn about) a slice of the original frame.
test_set = test_set.iloc[:forecast_steps].copy()
test_set['Generated'] = results

plt.plot(test_set['Generated'])
plt.plot(df)

# Word2Vec (skip-gram word embeddings)

In [0]:
import collections
import math
import os
import errno
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  
from collections import Counter
import tensorflow as tf

data_dir = 'word2vec_data/words'
# text8 corpus archive. The host is "mattmahoney.net"; the previous value
# ("matthoney.net") was a misspelling of it.
data_url = 'http://mattmahoney.net/dc/text8.zip'

def fetch_words_data(url=data_url, words_data=data_dir):
    """Return the corpus as a list of whitespace-separated words.

    The zip archive is cached as ``<words_data>/words.zip`` and downloaded
    from ``url`` only when that file does not exist yet.

    Args:
        url: Location of the zip archive to download.
        words_data: Directory used to cache the archive (created if missing).

    Returns:
        List of words read from the first member of the archive, decoded as
        ASCII and split on whitespace.
    """
    os.makedirs(words_data, exist_ok=True)
    zip_path = os.path.join(words_data, 'words.zip')

    if not os.path.exists(zip_path):
        # Download under a temporary name and move it into place only after
        # the transfer succeeds. Otherwise an interrupted download would leave
        # a truncated words.zip that every later run tries to reuse.
        partial_path = zip_path + '.part'
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, zip_path)
        finally:
            if os.path.exists(partial_path):
                os.remove(partial_path)

    with zipfile.ZipFile(zip_path) as f:
        # The archive is expected to hold a single text file; read the first member.
        data = f.read(f.namelist()[0])

    return data.decode('ascii').split()

def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the module-level corpus.

    Reads the global ``data`` (sequence of word ids) and reads/advances the
    global cursor ``data_index``. A window of ``2 * skip_window + 1`` ids
    slides over the corpus one position at a time. For each window position
    the centre id is emitted ``num_skips`` times, each time paired with a
    different randomly chosen context id from the same window.

    Args:
        batch_size: Number of (centre, context) pairs; must be a multiple of
            ``num_skips``.
        num_skips: Pairs generated per centre word; at most ``2 * skip_window``.
        skip_window: Number of context words on each side of the centre.

    Returns:
        Tuple ``(batch, labels)``: ``batch`` is an int32 array of shape
        (batch_size,) holding centre ids, and ``labels`` is an int32 array of
        shape (batch_size, 1) holding the paired context ids.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    assert len(data) >= span, "corpus is shorter than one context window"
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    # Every window position except the centre is a candidate context word.
    context_positions = [pos for pos in range(span) if pos != skip_window]
    for i in range(batch_size // num_skips):
        # Draw num_skips distinct context positions for this centre word.
        for j, target in enumerate(random.sample(context_positions, num_skips)):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window by one word for the NEXT centre. This must happen
        # inside the loop; otherwise every pair in the batch repeats the same
        # centre word.
        if data_index == len(data):
            # Wrap around to the start of the corpus. A deque does not support
            # slice assignment, so refill it with extend(); maxlen evicts the
            # old contents.
            buffer.extend(data[:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

def create_counts(vocab_size=50000):
    """Encode the module-level ``words`` list as integer ids.

    Id 0 is reserved for the out-of-vocabulary token ``'UNK'``; ids 1 and up
    are the most frequent words in descending frequency order. Previously
    unknown words were mapped to id 0 while id 0 was also the most frequent
    word, so every rare word was silently treated as that word.

    Args:
        vocab_size: Total number of ids, including the ``'UNK'`` slot.

    Returns:
        Tuple ``(data, vocab)``: ``data`` is an array with one id per entry of
        ``words``, and ``vocab`` is an array of strings where ``vocab[i]`` is
        the word for id ``i``.
    """
    # Reserve slot 0 for unknown words and fill the rest by frequency.
    most_common = Counter(words).most_common(max(vocab_size - 1, 0))
    vocab = [('UNK', -1)] + most_common
    vocab = np.array([word for word, _ in vocab])
    dictionary = {word: code for code, word in enumerate(vocab)}
    # Words outside the vocabulary fall back to the 'UNK' id (0).
    data = np.array([dictionary.get(word, 0) for word in words])
    return data, vocab
  
  
words = fetch_words_data()

# --- Hyperparameters -------------------------------------------------------
batch_size = 128
embedding_size = 150
skip_window = 1
num_skips = 2

# Validation ids: valid_size distinct ids drawn from range(valid_window).
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

num_sampled = 64
learning_rate = 0.01
vocabulary_size = 50000

# Encode the corpus with the same vocabulary size that the embedding and NCE
# matrices below are allocated for.
data, vocabulary = create_counts(vocab_size=vocabulary_size)

# --- Graph -----------------------------------------------------------------
tf.reset_default_graph()

train_inputs = tf.placeholder(tf.int32, shape=[None])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
# One embedding_size-dimensional vector per word id, initialised uniformly in [-1, 1).
init_embeds = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
embeddings = tf.Variable(init_embeds)
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0/np.sqrt(embedding_size)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

# Noise-contrastive estimation loss, averaged over the batch.
loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed, num_sampled, vocabulary_size))
# Use the declared learning_rate. The optimizer was previously hard-coded to
# 1.0, which left the `learning_rate` hyperparameter above unused.
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
trainer = optimizer.minimize(loss)

# L2-normalise every embedding row, so the matmul below yields the cosine
# similarity between each validation word and every word in the vocabulary.
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

# Corpus cursor consumed and advanced by generate_batch().
data_index = 0
init = tf.global_variables_initializer()

num_steps = 5000
# Number of steps between progress reports (and the averaging window).
report_every = 1000

with tf.Session() as sess:
    sess.run(init)
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs:batch_inputs, train_labels:batch_labels}
        _, loss_val = sess.run([trainer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % report_every == 0:
            # At step 0 the accumulator holds a single loss value, so it is
            # printed as-is; afterwards it is averaged over the report window.
            if step > 0:
                average_loss = average_loss/report_every
            print("average loss at step: {} is {}".format(step, average_loss))
            average_loss = 0

    # Materialise the normalised embedding matrix ONCE, after training. This
    # used to run inside the loop, re-evaluating the whole matrix on every
    # step even though only the final value is ever used.
    final_embeddings = normalized_embeddings.eval()

In [0]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

# t-SNE configured to reduce the embeddings to 2 components for plotting.
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)

# Only the first `plot_only` rows of the embedding matrix are projected.
plot_only = 5000
low_dim_embed = tsne.fit_transform(final_embeddings[:plot_only, :])

# labels[i] is the vocabulary entry with index i, i.e. the word for row i of
# the slice that was projected above.
labels = [vocabulary[i] for i in range(plot_only)]

def plot_with_labels(low_dim_embs, labels):
    """Scatter 2-D points and annotate each one with its own label.

    Args:
        low_dim_embs: Array of shape (n, 2); row ``i`` is the (x, y) position
            drawn for ``labels[i]``.
        labels: Sequence of at most ``n`` labels, one per plotted point.

    Raises:
        AssertionError: If there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18,18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i,:]
        plt.scatter(x, y)
        # Annotate with THIS point's label. The whole `labels` list used to be
        # passed here, which printed the entire list next to every point.
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5,2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
        
# Draw the 2-D t-SNE projection with one text label per plotted word.
plot_with_labels(low_dim_embed, labels)

# 