# Section 1:  Removal of bias in embedding (`Word2Vec`)

In [None]:
import gensim.downloader as api

# Load the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader API. The following cell indexes `wv` by word and queries it for
# similarities.
# NOTE(review): presumably the model is downloaded on first use and cached
# locally, which can take a while -- confirm against the gensim documentation.
wv = api.load("word2vec-google-news-300")

In [5]:
# Fetch the embedding of a single word and report its dimensionality.
king = wv["king"]
print(king.shape)

# Ten words closest to the combined "king" + "queen" direction.
royal_neighbours = wv.most_similar(positive=["king", "queen"], topn=10)
print(royal_neighbours)

# Similarity score between the two words themselves.
royal_similarity = wv.similarity("king", "queen")
print(royal_similarity)

# Vector arithmetic "man" + "rule" - "woman": inspecting the neighbours of the
# result is a quick way to surface gender bias stored in the embedding.
gendered_neighbours = wv.most_similar(positive=["man", "rule"], negative=["woman"])
print(gendered_neighbours)


## Section 2: Sentiment Analysis

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras import preprocessing
from tensorflow.keras.datasets import imdb

# Experiment settings.
max_features = 10_000  # vocabulary size requested from imdb.load_data
maxlen = 100           # every review is padded / clipped to this many tokens
embedded_size = 8      # width of the word vectors learned by the Embedding

# Load the IMDB dataset and print a few samples to check.
#
# IMDB: sentence (x) -> positive/negative (y)
#
# "The food was really good"                              -> pos
# "The chicken crossed the road because it was uncooked"  -> neg

train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

# x_train has a size (training_size, ). Because the sentences have variable
# length, it cannot be represented as a matrix yet.

print(x_train.shape)

# The first step is to make the column size constant by "padding" the
# sentences: longer sentences are clipped to maxlen tokens, shorter ones are
# filled up with a "no word" padding id (0 is the Keras default value).

x_train, x_test = (
    preprocessing.sequence.pad_sequences(reviews, maxlen=maxlen)
    for reviews in (x_train, x_test)
)

print(x_train.shape)

# Let's see the first sentence

print(x_train[0])

# The input shape is now (training_size, maxlen)

# Let's use an embedding to help estimate the sentiment of each review.

# One integer token id per time step: each sample has shape (maxlen,).
xi = Input(x_train.shape[1:])

# Embedding input is (training_size, maxlen)
# Embedding output is (training_size, maxlen, embedded_size)
#
# `input_length` is deliberately not passed: it is deprecated in current Keras
# releases and redundant here, because the Input above already fixes the
# sequence length that Flatten needs.

x = Embedding(max_features, embedded_size)(xi)
x = Flatten()(x)
x = Dense(1, activation="sigmoid")(x)

model = Model(inputs=xi, outputs=x)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
model.summary()

# 20% of the training data is held out to monitor validation accuracy.
history = model.fit(
        x_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Per-review probabilities of the positive class on the test set.
p = model.predict(x_test)

# Score the model on the held-out test set; without this the test labels are
# never used and the accuracy question below cannot be answered.
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print("test loss: %.4f - test accuracy: %.4f" % (test_loss, test_acc))

#
# What's the current accuracy for this model?
#
# Try adding a pre-trained GloVe embedding to this model; see the
# suggestion in
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#
# Question: why might a pre-trained embedding help you get better accuracy?
#
# Try changing maxlen or embedded_size and plot a 3D graph of
# embedded_size x maxlen x accuracy in a Python Jupyter notebook.
#
# Also try Conv1D + MaxPooling1D to improve the results:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html



## Section 3: Sentiment Analysis with `LSTM`

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras import preprocessing
from tensorflow.keras.datasets import imdb

# Experiment settings.
max_features = 10_000  # vocabulary size requested from imdb.load_data
maxlen = 100           # every review is padded / clipped to this many tokens
embedded_size = 20     # width of the word vectors learned by the Embedding

# Load the IMDB dataset and print a few samples to check.
#
# IMDB: sentence (x) -> positive/negative (y)
#
# "The food was really good"                              -> pos
# "The chicken crossed the road because it was uncooked"  -> neg

train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

# x_train has a size (training_size, ). Because the sentences have variable
# length, it cannot be represented as a matrix yet.

print(x_train.shape)

# The first step is to make the column size constant by "padding" the
# sentences: longer sentences are clipped to maxlen tokens, shorter ones are
# filled up with a "no word" padding id (0 is the Keras default value).

x_train, x_test = (
    preprocessing.sequence.pad_sequences(reviews, maxlen=maxlen)
    for reviews in (x_train, x_test)
)

print(x_train.shape)

# Let's see the first sentence

print(x_train[0])

# The input shape is now (training_size, maxlen)

# Let's use an embedding to help estimate the sentiment of each review.

# One integer token id per time step: each sample has shape (maxlen,).
xi = Input(x_train.shape[1:])

# Embedding input is (training_size, maxlen)
# Embedding output is (training_size, maxlen, embedded_size)
#
# `input_length` is deliberately not passed: it is deprecated in current Keras
# releases and redundant here, because the Input above already fixes the
# sequence length.

x = Embedding(max_features, embedded_size)(xi)

# Using LSTM to classify a sentence as positive or negative
#
# "The chicken crossed the road because it was uncooked"
#
# h0 -> The       -> h1
# h1 -> chicken   -> h2
# h2 -> crossed   -> h3
# h3 -> the       -> h4
# h4 -> road      -> h5
# h5 -> because   -> h6
# h6 -> it        -> h7
# h7 -> was       -> h8
# h8 -> uncooked  -> h9
# h9 -> pos/neg   (the last hidden state feeds the classifier below)

# return_sequences: Boolean. Whether to return the last output in the output
#     sequence, or the full sequence.

# return_state: Boolean. Whether to return the last state in addition to the
#     output. The returned elements of the states list are the hidden state
#     and the cell state, respectively.

#
# What's the difference between return_sequences and return_state?
#

# With the defaults used here only the last output of the sequence is returned.
x = LSTM(32)(x)

#
# Try to get accuracy on validation set over 90%.
#

x = Dense(1, activation="sigmoid")(x)

model = Model(inputs=xi, outputs=x)

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["acc"])
model.summary()

# 20% of the training data is held out to monitor validation accuracy.
history = model.fit(
        x_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Per-review probabilities of the positive class on the test set.
p = model.predict(x_test)

# Score the model on the held-out test set; without this the test labels are
# never used and the accuracy question below cannot be answered.
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print("test loss: %.4f - test accuracy: %.4f" % (test_loss, test_acc))

#
# What's the current accuracy for this model?
#
# Try changing maxlen or embedded_size and plot a 3D graph of
# embedded_size x maxlen x accuracy in a Python Jupyter notebook.
#


## Section 4: Generating Text

In [None]:
'''Example script to generate text from Nietzsche's writings.

At least 20 epochs are required before the generated text
starts sounding coherent.

It is recommended to run this script on GPU, as recurrent
networks are quite computationally intensive.

If you try this script on new data, make sure your corpus
has at least ~100k characters. ~1M is better.
'''

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Activation, Dropout, Flatten, Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D
from tensorflow.keras.layers import GRU
from tensorflow.keras.optimizers import Adam
# `keras.utils.data_utils` no longer exists in current Keras releases; the
# public location of get_file, consistent with the other imports here, is
# tensorflow.keras.utils.
from tensorflow.keras.utils import get_file
import numpy as np
import random
import sys

# Download the corpus (get_file returns the local path of the fetched file).
path = get_file("nietzsche.txt",
        origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")

# Read with an explicit encoding and close the handle deterministically, so
# the result does not depend on the platform's default locale.
with open(path, encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()
print("corpus length:", len(text))

# Sort the vocabulary: iterating a bare set of strings yields a different
# order on every interpreter run, which would silently change the meaning of
# each index and make previously saved weights unusable after a restart.
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Slide a window of `maxlen` characters over the corpus, moving `step`
# characters at a time. Each window is one training sample and the character
# immediately after it is the target to predict.
maxlen = 100
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print("nb sequences:", len(sentences))

print("Vectorization...")
# One-hot encode the data:
#   X[i, t, c] is True when character c sits at position t of sentence i
#   y[i, c]    is True when character c follows sentence i
# The builtin `bool` replaces `np.bool`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24 (it now raises AttributeError); the resulting
# dtype is the same.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True


# build the model: two stacked GRU layers, each followed by dropout, and a
# softmax over the character vocabulary
print("Build model...")
xi = Input((maxlen, len(chars)))

# Layers are listed in the order they are applied to the input.
layer_stack = (
    GRU(256, return_sequences=True),   # full sequence, consumed by the next GRU
    Dropout(0.2),
    GRU(256, return_sequences=False),  # only the last output is kept
    Dropout(0.2),
    Dense(len(chars)),
    Activation("softmax"),
)

x = xi
for layer in layer_stack:
    x = layer(x)

model = Model(inputs=xi, outputs=x)

model.summary()

adam = Adam(0.003)

model.compile(loss="categorical_crossentropy", optimizer=adam)


def sample(a, temperature=1.0):
    """Sample an index from a probability array, reshaped by a temperature.

    The probabilities are turned into log-space, divided by `temperature`
    and re-normalised with a softmax. Temperatures below 1 sharpen the
    distribution (more conservative picks), temperatures above 1 flatten it
    (more surprising picks).

    Args:
        a: 1-D array-like of non-negative probabilities.
        temperature: positive scaling factor applied to the log-probabilities.

    Returns:
        The sampled index (NumPy integer).

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got %r" % (temperature,))

    # Work in float64 from the start; 1e-8 keeps log() finite for zero entries.
    logits = np.log(np.asarray(a, dtype=np.float64) + 1e-8) / temperature
    # Shift by the maximum before exponentiating: the softmax is unchanged,
    # but at low temperatures this prevents every term from underflowing to
    # zero, which would otherwise produce 0/0 = NaN probabilities.
    logits -= np.max(logits)
    weights = np.exp(logits)
    a = weights / np.sum(weights)
    try:
        sample_result = np.argmax(np.random.multinomial(1, a, 1))
    except ValueError:
        # multinomial rejects probabilities whose sum drifts above 1 through
        # rounding. Absorb the residual into the largest entry, which cannot
        # become negative because of it, and retry once.
        a[np.argmax(a)] += 1.0 - np.sum(a)
        sample_result = np.argmax(np.random.multinomial(1, a, 1))
    return sample_result

# Alternate between training and sampling: after every round of fitting,
# print text generated from a random seed at several diversity levels.
for iteration in range(1, 60):
    print()
    print("-" * 50)
    print("Iteration", iteration)

    model.fit(X, y, batch_size=4096, epochs=4)
    # NOTE(review): recent Keras releases may require weight files to end in
    # ".weights.h5" -- confirm against the installed version.
    model.save_weights("weights.hdf5")

    # One seed window per iteration, shared by every diversity setting.
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[start_index: start_index + maxlen]

    for diversity in (0.2, 0.5, 1.0, 1.2):
        print()
        print("----- diversity:", diversity)

        sentence = seed_text
        generated = sentence
        print("----- Generating with seed: '" + sentence + "'")
        sys.stdout.write(generated)

        for _ in range(200):
            # one-hot encode the current window of characters
            window_ids = [char_indices[char] for char in sentence]
            x = np.zeros((1, maxlen, len(chars)))
            x[0, np.arange(len(window_ids)), window_ids] = 1.

            # predict next char
            preds = model.predict(x, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            # full sentence being generated
            generated += next_char

            # shift the window: drop the oldest char, append the new one
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()

            # let's consider only one sentence
            if next_char == ".":
                break
        print()

# Now each group will perform the following tasks.
#
#   Part 1)
#
#   - Each group will pick one set of data samples:
#     * assembly code (machine code: Z80, x86, ...)
#     * LaTeX corpus
#     * HTML pages
#     * Linux kernel source code (https://github.com/torvalds/linux)
#     * patents
#     * ...
#   - Modify the model so that it is trained on the corpus you chose
#   - Present the results
#
#   Part 2)
#
#   - Pick a book from Project Gutenberg (https://www.gutenberg.org/).
#   - Extract tokens from the book. You will need to keep the Tokenizer map
#     to generate the text.
#   - Use embeddings + GloVe as the first layer (https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)
#   - Train a model to try to predict the next word of the book.
#   - Be careful with starting tokens and invalid tokens.
#   - Read a seed word.
#   - Generate text based on the seed word.
#



## Section 5: Translation with `seq2seq` 

In [None]:
# 1. Implement seq2seq based on
#
# https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
# to translate from English to Portuguese. The Portuguese dictionary can be
# found at http://www.manythings.org/anki/

# 2. Change seq2seq to generate automated English-to-Portuguese responses at
# word level.

