# Deep Learning for NLP - Project

RULES:

* Do not create any additional cell

* Fill in the blanks

* All cells should be runnable (modulo trivial compatibility bugs that we'd fix)

* 4 / 20 points will be allocated to the clarity of your code

* Efficient code will have a bonus

DELIVERABLE:

* this notebook
* the predictions of the SST test set

DO NOT INCLUDE THE DATASETS IN THE DELIVERABLE..

In [1]:
import io
import os
import numpy as np
import scipy
import json

In [2]:
PATH_TO_DATA = "data/"

# 1) Monolingual (English) word embeddings 

In [3]:
class Word2vec():
    def __init__(self, fname, nmax=100000):
        self.load_wordvec(fname, nmax)
        self.word2id = {w:i for i, w in  enumerate(self.word2vec.keys())}
        self.id2word = {v: k for k, v in self.word2id.items()}
        self.embeddings = np.array([vec for vec in self.word2vec.values()], ndmin = 2)
    
    def load_wordvec(self, fname, nmax):
        self.word2vec = {}
        with io.open(fname, encoding='utf-8') as f:
            next(f)
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                self.word2vec[word] = np.fromstring(vec, sep=' ')
                if i == (nmax - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(self.word2vec)))

    def most_similar(self, w, K=5):
        # K most similar words: self.score  -  np.argsort 
        if w not in self.word2id.keys():
            print("The word '{}' is not contained into the vocabulary".format(w))
            return False
        else :
            
            vector = self.word2vec[w]
            similarities = vector.T * self.embeddings / np.linalg.norm(vector)/np.sqrt(np.sum(self.embeddings**2, axis = 0))
            indexes = np.argsort(np.sum(similarities, axis = 1))[-K:]
            
            return [self.id2word[index] for index in indexes]

    def score(self, w1, w2, display = True):
        # cosine similarity: np.dot  -  np.linalg.norm
        try :
            vector1 = self.word2vec[w1]
            vector2 = self.word2vec[w2]
        except KeyError as e:
            print("The word '{}' is not contained into the vocabulary.".format(e.args[0]))
            return False

        return np.dot(vector1, vector2)/np.linalg.norm(vector1)/np.linalg.norm(vector2)


In [117]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'english.vec'), nmax=50000)#Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=25000)

# You will be evaluated on the output of the following:
for w1, w2 in zip(('cat', 'dog', 'dogs', 'paris', 'germany'), ('dog', 'pet', 'cats', 'france', 'berlin')):
    print(w1, w2, w2v.score(w1, w2))
for w1 in ['cat', 'dog', 'dogs', 'paris', 'germany']:
    print(w2v.most_similar(w1))

Loaded 49999 pretrained word vectors
cat dog 0.6380517245741391
dog pet 0.5802521571141541
dogs cats 0.6474995466659008
paris france 0.6153761202499448
germany berlin 0.591637835087271
['kitten', 'feline', 'timezone', 'cats', 'cat']
['dogs', 'lr/lc', 'spaniel', 'terrier', 'dog']
['breeds', 'dog', 'spaniel', 'terrier', 'dogs']
['montparnasse', 'm\\xc3\\xa9tro', 'military_decorations', 'paris', 'ordre_de_la_lib\\xc3\\xa3\\xc2\\xa9ration']
['westphalia', 'saxony', 'landkreis', 'vorpommern', 'germany']


In [5]:
class BoV():
    def __init__(self, w2v):
        self.w2v = w2v
    
    def encode(self, sentences, idf=False):
        # takes a list of sentences, outputs a numpy array of sentence embeddings
        # see TP1 for help
        sentemb = []
                
        for sent in sentences:
            if idf is False:
                # mean of word vectors
                embedded_sentence = [self.w2v.word2vec[word]
                                     if word in self.w2v.word2vec.keys() else np.zeros((300,))
                                     for word in sent]
                sentemb += [np.mean(embedded_sentence, axis = 0)]
                
            else:
                # idf-weighted mean of word vectors
                embedded_sentence = [self.w2v.word2vec[word]*idf[word]
                                     if word in self.w2v.word2vec.keys() else np.zeros((300,))
                                     for word in sent]
                sentemb += [np.mean(embedded_sentence, axis = 0)]
                
        return np.vstack(sentemb)

    def most_similar(self, s, sentences, idf=False, K=5):
        # get most similar sentences and **print** them
        keys = self.encode(sentences, idf)
        query = self.encode([s], idf)
        
        similarity_vec = query * keys/np.linalg.norm(query)/np.sqrt(np.sum(keys**2, axis = 0))
        indexes = np.argsort(np.sum(similarity_vec, axis = 1))[-K:]
        print("{} most similar sentences :\n - target sentence : '{}'".format(K, " ".join(s)))
        for index in indexes : 
            print(" - '" + " ".join(sentences[index])+"'")
        return [sentences[index] for index in indexes]

    def score(self, s1, s2, idf=False):
        # cosine similarity: use   np.dot  and  np.linalg.norm
        s1_encoded = self.encode([s1], idf)
        s2_encoded = self.encode([s2], idf)
        similarity = np.sum(s1_encoded* s2_encoded)/np.linalg.norm(s1_encoded)/np.linalg.norm(s2_encoded)
        
        print("Similarity between :\n - sentence 1: '{}'\n - sentence 2: '{}'\n => {}".format(' '.join(s1),
                                                                                          ' '.join(s2),
                                                                                          similarity))
        return similarity
    
    def build_idf(self, sentences):
        # build the idf dictionary: associate each word to its idf value
        idf = {}
        for sent in sentences:
            for w in set(sent):
                idf[w] = idf.get(w, 0) + 1
        
        for word in idf.keys():
            idf[word] = max(1, np.log10(len(sentences) / (idf[word])))
        return idf

In [6]:
def load_sentences(path):
    sentences = []
    with open(path, 'r', encoding = 'utf-8') as file:
        for line in file:
            sentences += [line.split(' ')[:-1]]# we get rid of the \n
    file.close()
    return sentences

In [7]:
w2v = Word2vec(os.path.join(PATH_TO_DATA, 'crawl-300d-200k.vec'), nmax=5000)
s2v = BoV(w2v)

# Load sentences in "PATH_TO_DATA/sentences.txt"
sentences = load_sentences(os.path.join(PATH_TO_DATA, "sentences.txt"))

# Build idf scores for each word
idf = {} if False else s2v.build_idf(sentences)

# You will be evaluated on the output of the following:
s2v.most_similar('' if not sentences else sentences[10], sentences)  # BoV-mean
s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13])


#idf = {}  
s2v.most_similar('' if not sentences else sentences[10], sentences, idf)  # BoV-idf
_ = s2v.score('' if not sentences else sentences[7], '' if not sentences else sentences[13], idf)

Loaded 5000 pretrained word vectors
5 most similar sentences :
 - target sentence : '1 smiling african american boy .'
 - 'a boy jumps on another boy .'
 - 'a boy skateboarding'
 - 'teen boy playing billiards .'
 - 'boy riding a horse .'
 - 'boy plays baseball .'
Similarity between :
 - sentence 1: '1 man singing and 1 man playing a saxophone in a concert .'
 - sentence 2: '10 people venture out to go crosscountry skiing .'
 => 0.6089445116147134
5 most similar sentences :
 - target sentence : '1 smiling african american boy .'
 - '4 people play soccer'
 - 'a small boy following 4 geese .'
 - 'a train station platform at 11 : 27 in the morning .'
 - '1 smiling african american boy .'
 - 'horse number 8 is racing'
Similarity between :
 - sentence 1: '1 man singing and 1 man playing a saxophone in a concert .'
 - sentence 2: '10 people venture out to go crosscountry skiing .'
 => 0.5963335652885747


# 2) Multilingual (English-French) word embeddings

Let's consider a bilingual dictionary of size V_a (e.g French-English).

Let's define **X** and **Y** the **French** and **English** matrices.

They contain the embeddings associated to the words in the bilingual dictionary.

We want to find a **mapping W** that will project the source word space (e.g French) to the target word space (e.g English).

Procrustes : **W\* = argmin || W.X - Y ||  s.t  W^T.W = Id**
has a closed form solution:
**W = U.V^T  where  U.Sig.V^T = SVD(Y.X^T)**

In what follows, you are asked to: 

#### 1 - Download and load 50k first vectors of fasttext data for french and english

In [18]:
import requests

english_vec_url = "https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec"
french_vec_url = "https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.fr.vec"

def load_data_vec(url, max_count, output_file):
    r = requests.get(url, stream = True)
    data = {}
    counter =0
    with open(output_file, 'w', encoding = "utf-8") as output_file:
        for line in r.iter_lines():
            if counter >= max_count:
                break
            splits = str(line).split(" ")
            if len(splits) == 302:
                counter += 1
                line = str(line).replace("b'", "").replace("'b", "")
            
                output_file.write(str(line)+ "\n")
    output_file.close()
    return data

In [19]:
download = False
if download : french_vec = load_data_vec(french_vec_url, 50000, os.path.join(PATH_TO_DATA, "french.vec"))
if download : english_vec = load_data_vec(english_vec_url, 50000, os.path.join(PATH_TO_DATA, "english.vec"))

#### 2 - Get words that appear in both vocabs (= identical character strings)
     Use it to create the matrix X and Y (of aligned embeddings for these words)

In [127]:
w2v_fr = Word2vec(os.path.join(PATH_TO_DATA, 'french.vec'), nmax=50000)
w2v_en = Word2vec(os.path.join(PATH_TO_DATA, 'english.vec'), nmax=50000)

Loaded 49999 pretrained word vectors
Loaded 49999 pretrained word vectors


In [128]:
french_words = w2v_fr.word2id
english_words = w2v_en.word2id

In [138]:
common_words = set(french_words.keys()).intersection(set(english_words.keys()))

In [139]:
len(common_words)

18968

#### 2 - Get words that appear in both vocabs (= identical character strings)
     Use it to create the matrix X and Y (of aligned embeddings for these words)

In [130]:
X = np.array([w2v_fr.word2vec[word_fr] for word_fr in common_words]).T
Y = np.array([w2v_en.word2vec[word_en] for word_en in common_words]).T

In [131]:
X.shape

(300, 18968)

In [140]:
matrix = np.dot(Y, X.T)

In [141]:
U, sigma, V = np.linalg.svd(matrix)

In [142]:
W = np.dot(U,V)

#### 4 - After alignment with W, give examples of English nearest neighbors of some French words (and vice versa)
     You will be evaluated on that part and the code above

In [171]:
def find_closest_translation(word_to_translate, translation_matrix,
                             destination_w2v, starting_w2v, nearest_neighbours = 5):
    # embedding the english word
    vector_start = starting_w2v.word2vec[word_to_translate].reshape(-1, )
    # translating the embedding
    vector_dest = np.dot(translation_matrix.T, vector_start).reshape(-1,)
    # looking for the closest vectors in french
    similarities = np.sum(vector_dest.T * destination_w2v.embeddings / np.linalg.norm(vector_dest)/np.sqrt(np.sum(destination_w2v.embeddings**2, axis = 0)), axis = 1)

    indexes = np.argsort(similarities)[-nearest_neighbours:]
    
    print("The nearest {} neighbours of '{}' are : ".format(nearest_neighbours, word_to_translate))
    for index, index_vec in enumerate(indexes):
        print("{} - '{}'".format(nearest_neighbours-index, destination_w2v.id2word[index_vec]))
    #return [destination_w2v.id2word[index] for index in indexes]
    return similarities

In [173]:
test_words_en = ['cat', 'dog', 'boy', 'girl', 'car', 'computer', 'king', 'natural', 'language', 'processing']
for eng_word in test_words_en:
    _ = find_closest_translation(eng_word, W, w2v_fr, w2v_en, 10)

The nearest 10 neighbours of 'cat' are : 
10 - 'hound'
9 - 'dog'
8 - 'tail'
7 - 'mammif\xc3\xa8re'
6 - 'cats'
5 - 'grizzly'
4 - 'chat'
3 - 'felis'
2 - 'canis'
1 - 'cat'
The nearest 10 neighbours of 'dog' are : 
10 - 'dogs'
9 - 'canis'
8 - 'poney'
7 - 'sheep'
6 - 'porky'
5 - 'pig'
4 - 'dingo'
3 - 'chien'
2 - 'dog'
1 - 'hound'
The nearest 10 neighbours of 'boy' are : 
10 - 'adventures'
9 - 'teenage'
8 - 'boys'
7 - 'wife'
6 - 'boyz'
5 - 'daughter'
4 - 'dogg'
3 - 'sailor'
2 - 'girl'
1 - 'boy'
The nearest 10 neighbours of 'girl' are : 
10 - 'm\xc3\xa4dchen'
9 - 'teenage'
8 - '\xe2\x94\x9c\xe2\x94\x80'
7 - 'musume'
6 - 'girlfriend'
5 - 'actress'
4 - 'woman'
3 - 'sailor'
2 - 'girls'
1 - 'girl'
The nearest 10 neighbours of 'car' are : 
10 - 'audi'
9 - 'peugeot'
8 - 'pilote_record_tour'
7 - 'wrc'
6 - 'roadster'
5 - 'motorsport'
4 - 'wtcc'
3 - 'porsche'
2 - '\xc3\xa9curie_record_tour'
1 - '\xc3\xa9curie_pole'
The nearest 10 neighbours of 'computer' are : 
10 - 'windows'
9 - 'amiga'
8 - 'informat

In [177]:
test_words_fr = ['chat', 'chien', 'homme', 'fille', 'voiture', 'ordinateur', 'roi', 'traitement', 'automatique', 'langage', 'automatique']
for fre_word in test_words_fr:
    _ = find_closest_translation(fre_word, W.T, w2v_en, w2v_fr, 10)

The nearest 10 neighbours of 'chat' are : 
10 - 'bellied'
9 - 'rat'
8 - 'raccoons'
7 - 'timezone'
6 - 'hamster'
5 - 'lemur'
4 - 'squirrel'
3 - 'vole'
2 - 'macaque'
1 - 'lr/lc'
The nearest 10 neighbours of 'chien' are : 
10 - 'breeds'
9 - 'canis'
8 - 'kennel'
7 - 'fci'
6 - 'dog'
5 - 'poodle'
4 - 'vole'
3 - 'spaniel'
2 - 'terrier'
1 - 'lr/lc'
The nearest 10 neighbours of 'homme' are : 
10 - 'anatomist'
9 - 'industrialist'
8 - '_at_tn'
7 - 'politiques'
6 - 'businessman'
5 - 'politique'
4 - 'latns'
3 - 'diplomat'
2 - 'homme'
1 - 'politician'
The nearest 10 neighbours of 'fille' are : 
10 - 'married'
9 - 'eldest'
8 - 'predeceased'
7 - '\xe2\x94\x82'
6 - 'granddaughter'
5 - 'n\xc3\xa9e'
4 - 'remarried'
3 - 'daughter'
2 - 'heiress'
1 - 'noblewoman'
The nearest 10 neighbours of 'voiture' are : 
10 - 'sedans'
9 - 'porsche'
8 - 'coup\xc3\xa9'
7 - 'car'
6 - 'hatchback'
5 - 'chevrolet'
4 - 'wheelbase'
3 - 'citro\xc3\xabn'
2 - 'suv'
1 - 'peugeot'
The nearest 10 neighbours of 'ordinateur' are : 
10 

If you want to dive deeper on this subject: https://github.com/facebookresearch/MUSE

# 3) Sentence classification with BoV and scikit-learn

#### 1 - Load train/dev/test of Stanford Sentiment TreeBank (SST)
    (https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf)

In [180]:
SST_PATH = os.path.join(PATH_TO_DATA, "SST")
sentences_train = load_sentences(path = os.path.join(SST_PATH, "stsa.fine.train"))
sentences_dev = load_sentences(path = os.path.join(SST_PATH, "stsa.fine.dev"))
sentences_test = load_sentences(path = os.path.join(SST_PATH, "stsa.fine.test.X"))

#### 2 - Encode sentences with the BoV model above

In [181]:
# we will consider the w2V model built at the previous question
bov = BoV(w2v_en)
idf_train = bov.build_idf(sentences_train)
idf_dev = bov.build_idf(sentences_dev)
idf_test = bov.build_idf(sentences_test)

In [182]:
emb_train = bov.encode(sentences_train, idf_train)
emb_dev = bov.encode(sentences_dev, idf_dev)
emb_test = bov.encode(sentences_test, idf_test)

In [183]:
emb_train = np.array(emb_train, ndmin=2)
emb_dev = np.array(emb_dev, ndmin=2)
emb_test = np.array(emb_test, ndmin=2)

#### 3 - Learn Logistic Regression on top of sentence embeddings using scikit-learn
     (consider tuning the L2 regularization on the dev set)

In [184]:
labels_train = np.array([int(sentence[0]) for sentence in sentences_train], ndmin=1)
labels_dev = np.array([int(sentence[0]) for sentence in sentences_dev], ndmin=1)

In [185]:
import pandas as pd
target_train = pd.get_dummies(labels_train).values
target_dev = pd.get_dummies(labels_dev).values

In [189]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [190]:
model = OneVsRestClassifier(LogisticRegression(C=1))

In [191]:
model.fit(emb_train, labels_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [192]:
pred_train = model.predict(emb_train)

In [193]:
print("Training score : {}".format(sum(pred_train == labels_train)/len(pred_train)))

Training score : 0.46301498127340823


In [196]:
print("details on training data")
print(classification_report(labels_train, pred_train))

details on training data
             precision    recall  f1-score   support

          0       0.51      0.25      0.34      1092
          1       0.44      0.62      0.52      2218
          2       0.44      0.21      0.28      1624
          3       0.45      0.62      0.52      2322
          4       0.58      0.41      0.48      1288

avg / total       0.47      0.46      0.44      8544



In [197]:
pred_dev = model.predict(emb_dev)

In [199]:
print("details on development data")
print(classification_report(labels_dev, pred_dev))

details on development data
             precision    recall  f1-score   support

          0       0.37      0.20      0.26       139
          1       0.40      0.56      0.47       289
          2       0.27      0.10      0.14       229
          3       0.36      0.56      0.44       279
          4       0.45      0.30      0.36       165

avg / total       0.37      0.38      0.35      1101



In [198]:
print("Training score : {}".format(sum(pred_dev == labels_dev)/len(pred_dev)))

Training score : 0.3787465940054496


#### 4 - Produce 2210 predictions for the test set (in the same order). One line = one prediction (=0,1,2,3,4).
     Attach the output file "logreg_bov_y_test_sst.txt" to your deliverable.
     You will be evaluated on the results of the test set.

In [146]:
pred_test = model.predict(emb_test)
np.savetxt(fname = os.path.join(PATH_TO_DATA, "logreg_bov_y_test_sst.txt"), X = pred_test)

#### 5 - Try to improve performance with another classifier
     Attach the output file "XXX_bov_y_test_sst.txt" to your deliverable (where XXX = the name of the classifier)


In [200]:
import keras.backend as K
from keras.layers import Dense, Dropout
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [201]:
K.clear_session()
nn_model = Sequential()
nn_model.add(Dense(50, input_shape = (300, ), activation = 'relu'))
nn_model.add(Dropout(.3))
nn_model.add(Dense(30, activation = "relu"))
nn_model.add(Dense(5, activation = "softmax"))

In [202]:
nn_model.compile(loss = "categorical_crossentropy", metrics = ['accuracy'], optimizer = 'adam')
nn_model.fit(x = emb_train, 
             y=target_train, 
             validation_data=(emb_dev, target_dev), 
             batch_size = 32,
             epochs = 50, 
             verbose = 0) 

<keras.callbacks.History at 0x7fdac35d6e80>

In [204]:
nn_model.evaluate(emb_dev, target_dev)



[1.6138260589741664, 0.3678474114576759]

# 4) Sentence classification with LSTMs in Keras

## 4.1 - Preprocessing

In [10]:
import keras

Using Theano backend.


#### 1 - Load train/dev/test sets of SST

In [223]:
def load_raw_sentences(path):
    sentences = []
    with open(path, 'r', encoding = 'utf-8') as input_file:
        for line in input_file:
            sentences += [str(line).replace("'b", "")
                          .replace("\n", "")[2:]]
    input_file.close()
    return sentences

In [226]:
raw_train_sentences = load_raw_sentences(os.path.join(SST_PATH, "stsa.fine.train"))
raw_dev_sentences = load_raw_sentences(os.path.join(SST_PATH, "stsa.fine.dev"))
raw_test_sentences = load_raw_sentences(os.path.join(SST_PATH, "stsa.fine.test.X"))

#### 2 - Transform text to integers using keras.preprocessing.text.one_hot function
     https://keras.io/preprocessing/text/

In [206]:
from keras.preprocessing.text import one_hot

In [229]:
train_vocabulary_size = len(set([word for sent in raw_train_sentences for word in sent.split(" ")]))

In [245]:
encoded_train_sentences = [one_hot(sentence, round(train_vocabulary_size*1.3)) for sentence in raw_train_sentences]
encoded_dev_sentences = [one_hot(sentence, round(train_vocabulary_size*1.3)) for sentence in raw_dev_sentences]
encoded_test_sentences = [one_hot(sentence, round(train_vocabulary_size*1.3)) for sentence in raw_test_sentences]

**Padding input data**

Models in Keras (and elsewhere) take batches of sentences of the same length as input. It is because Deep Learning framework have been designed to handle well Tensors, which are particularly suited for fast computation on the GPU.

Since sentences have different sizes, we "pad" them. That is, we add dummy "padding" tokens so that they all have the same length.

The input to a Keras model thus has this size : (batchsize, maxseqlen) where maxseqlen is the maximum length of a sentence in the batch.

#### 3 - Pad your sequences using keras.preprocessing.sequence.pad_sequences
     https://keras.io/preprocessing/sequence/

In [246]:
from keras.preprocessing.sequence import pad_sequences

In [247]:
padded_encoded_train_sentences = pad_sequences(encoded_train_sentences)

In [249]:
max_len_train = padded_encoded_train_sentences.shape[1]
padded_encoded_dev_sentences = pad_sequences(encoded_dev_sentences, max_len_train)
padded_encoded_test_sentences = pad_sequences(encoded_test_sentences, max_len_train)

## 4.2 - Design and train your model

#### 4 - Design your encoder + classifier using keras.layers
     In Keras, Torch and other deep learning framework, we create a "container" which is the Sequential() module.
     Then we add components to this contained : the lookuptable, the LSTM, the classifier etc.
     All of these components are contained in the Sequential() and are trained together.

In [265]:
# ADAPT CODE BELOW

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Activation
K.clear_session()

embed_dim  = 300  # word embedding dimension
nhid       = 64  # number of hidden units in the LSTM
vocab_size = round(train_vocabulary_size*1.3)  # size of the vocabulary
n_classes  = 5

model = Sequential()
model.add(Embedding(vocab_size, embed_dim))
model.add(LSTM(nhid, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(n_classes, activation='sigmoid'))

#### 5 - Define your loss/optimizer/metrics

In [266]:
loss_classif     =  'categorical_crossentropy' # find the right loss for multi-class classification
optimizer        =  'rmsprop' # find the right optimizer
metrics_classif  =  ['accuracy']

# Observe how easy (but blackboxed) this is in Keras
model.compile(loss=loss_classif,
              optimizer=optimizer,
              metrics=metrics_classif)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         6466500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                93440     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 325       
Total params: 6,560,265
Trainable params: 6,560,265
Non-trainable params: 0
_________________________________________________________________


#### 6 - Train your model and find the best hyperparameters for your dev set
     you will be evaluated on the quality of your predictions on the test set

In [267]:
x_train = padded_encoded_train_sentences
x_val = padded_encoded_dev_sentences
y_train = target_train
y_val = target_dev

In [268]:
# ADAPT CODE BELOW
bs = 64
n_epochs = 6

history = model.fit(x_train, y_train, batch_size=bs, epochs=n_epochs, validation_data=(x_val, y_val))

Train on 8544 samples, validate on 1101 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# 7 - Generate your predictions on the test set using model.predict(x_test)
#     https://keras.io/models/model/
#     Log your predictions in a file (one line = one integer: 0,1,2,3,4)
#     Attach the output file "logreg_lstm_y_test_sst.txt" to your deliverable.

# TYPE CODE HERE


## 4.3 -- innovate !

In [None]:
# 8 - Open question: find a model that is better on your dev set
#     (e.g: use a 1D ConvNet, use a better classifier, pretrain your lookup tables ..)
#     you will get point if the results on the test set are better: be careful of not overfitting your dev set too much..
#     Attach the output file "XXX_XXX_y_test_sst.txt" to your deliverable.

# TYPE CODE HERE
