In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals


import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

from helpers import count_unique_words, count_unique_ngrams, \
            build_unique_ngrams, create_sentence_vectors, create_sentence_vectors_submission

import sys

import tensorflow as tf
from tensorflow import keras

import gensim   # Not sure whether it is better to use gensim or tensorflow :/
import logging
from gensim.models.phrases import Phrases, Phraser

import multiprocessing

from gensim.models import Word2Vec

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Embedding, Input, Conv1D, Dense, MaxPooling1D, Flatten
from tensorflow.keras.models import Model
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

sys.path.append('../')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# DataFrame used for training; its `sentence` and `label` columns are read in later cells.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you produced or trust.
df = pd.read_pickle("dataframes/full_df_cleaned_train_0_8.pickle")

In [3]:
# DataFrame used as the held-out / validation data in later cells.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you produced or trust.
df_test = pd.read_pickle("dataframes/full_df_cleaned_test_0_2.pickle")

In [4]:
count_unique_words(df)

452521

In [5]:
w2v_model = Word2Vec.load("models/w2v_model_epochs_5_win_5_cbow_250.model")

2019-12-18 15:20:57,602 : INFO : loading Word2Vec object from models/w2v_model_epochs_5_win_5_cbow_250.model
2019-12-18 15:20:58,275 : INFO : loading wv recursively from models/w2v_model_epochs_5_win_5_cbow_250.model.wv.* with mmap=None
2019-12-18 15:20:58,276 : INFO : loading vectors from models/w2v_model_epochs_5_win_5_cbow_250.model.wv.vectors.npy with mmap=None
2019-12-18 15:20:59,135 : INFO : setting ignored attribute vectors_norm to None
2019-12-18 15:20:59,136 : INFO : loading vocabulary recursively from models/w2v_model_epochs_5_win_5_cbow_250.model.vocabulary.* with mmap=None
2019-12-18 15:20:59,137 : INFO : loading trainables recursively from models/w2v_model_epochs_5_win_5_cbow_250.model.trainables.* with mmap=None
2019-12-18 15:20:59,138 : INFO : loading syn1neg from models/w2v_model_epochs_5_win_5_cbow_250.model.trainables.syn1neg.npy with mmap=None
2019-12-18 15:20:59,978 : INFO : setting ignored attribute cum_table to None
2019-12-18 15:20:59,978 : INFO : loaded models/w

In [6]:
w2v_model.wv.word_vec("love").shape

(250,)

In [7]:
import numpy as np

def create_embedding_matrix(w2v_model, word_index):
    """Build the initial weight matrix for an embedding layer.

    Row ``i`` of the result holds the Word2Vec vector of the word whose index
    in ``word_index`` is ``i``. Rows of words that have no Word2Vec vector, and
    any index not used by ``word_index`` (such as the reserved index 0), stay
    all-zero.

    Parameters
    ----------
    w2v_model
        Trained Word2Vec model; only ``w2v_model.wv`` is accessed.
    word_index : dict
        Mapping ``word -> integer index`` with indices in
        ``[0, len(word_index)]`` (e.g. ``Tokenizer.word_index``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, w2v_model.wv.vector_size)``.
    """
    keyed_vectors = w2v_model.wv
    vocab_size = len(word_index) + 1  # +1 because index 0 is reserved

    # Read the dimensionality from the model instead of probing a specific
    # word, so the function also works when that word is not in the vocabulary.
    embedding_matrix = np.zeros((vocab_size, keyed_vectors.vector_size))

    # Walk the tokenizer vocabulary and look each word up in the keyed vectors;
    # `in` / `[]` are used so that no vector is fetched for words we never need.
    for word, idx in word_index.items():
        if word in keyed_vectors:
            embedding_matrix[idx] = keyed_vectors[word]

    return embedding_matrix

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [9]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.sentence)

In [10]:
X_train = tokenizer.texts_to_sequences(df.sentence)

In [11]:
X_test = tokenizer.texts_to_sequences(df_test.sentence)

In [12]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

396986

In [16]:
# NOTE(review): while X_train is still the plain list of token-id lists,
# `X_train[0] == '#'` compares a list with a string and evaluates to False, so
# this expression is X_train[False], i.e. simply X_train[0] (see the output
# below). It does not look up the '#' token.
X_train[X_train[0] == '#']

[15, 1810, 8634, 2884, 3383, 7]

In [17]:
# NOTE(review): 8634 shows up in the previous output as a token id inside
# X_train[0]; here it is used as a row position in df, which is presumably
# unrelated to that token -- confirm this lookup is what was intended.
df.iloc[8634].sentence

"want pottermore let ! can't obsessed let #nerdproblems"

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
def max_len(X):
    """Return the length of the longest element of ``X`` (0 if ``X`` is empty)."""
    return max((len(el) for el in X), default=0)
# Sequence length used for padding: half of the longest tokenised training
# sequence, rounded down.
maxlen = max_len(X_train) // 2
print(maxlen)

22


In [15]:
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)

In [16]:
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [17]:
print(X_train[0, :])

[  15 1810 8634 2884 3383    7    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [18]:
embedding_dim = 250

In [19]:
embedding_matrix = create_embedding_matrix(
    w2v_model,
    tokenizer.word_index)

In [20]:
# Coverage check: count the embedding rows that contain at least one non-zero
# value and report them as a fraction of vocab_size.
nonzero_per_row = np.count_nonzero(embedding_matrix, axis=1)
nonzero_elements = np.count_nonzero(nonzero_per_row)
print(nonzero_elements / vocab_size)
len(embedding_matrix)

0.7895593295481452


396986

In [21]:
# Map the labels to {0, 1}: label 1 stays 1 and every other value (the
# original labels use -1) becomes 0. The models below end in a sigmoid trained
# with binary_crossentropy, which does not work with -1 targets.
y_train = np.where(df.label == 1, 1, 0)
y_test = np.where(df_test.label == 1, 1, 0)

In [None]:
# Sweep over the number of Conv1D filters (50, 100, ..., 250). Every iteration
# builds a fresh model on top of the fixed, pre-computed embedding matrix and
# checkpoints the epoch with the best validation accuracy.
dense_units = 60      # width of both hidden Dense layers; also part of the checkpoint name
conv_histories = {}   # per-epoch metrics of every run, keyed by the number of filters

for i in range(50, 251, 50):
    model = Sequential()
    # The embedding weights come from embedding_matrix and are not trained.
    model.add(layers.Embedding(vocab_size, embedding_dim,
                               input_length=maxlen,
                               weights=[embedding_matrix],
                               trainable=False))
    # i filters, each spanning a window of 5 tokens.
    # TODO: try a larger kernel window (currently only 5).
    model.add(layers.Conv1D(i, 5, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(dense_units, activation='relu'))
    model.add(layers.Dense(dense_units, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    # The metric is requested as 'acc' so that the logged validation key is
    # 'val_acc' on every tf.keras version. With 'accuracy', tf.keras 2.x logs
    # 'val_accuracy' and the checkpoint below would never find its monitor.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc'])
    model.summary()
    filepath = "models/convolutional_nn_layers_of_size_{}_{}".format(i, dense_units)
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    # NOTE(review): the test split doubles as validation data, so the best
    # val_acc picked by the checkpoint is an optimistic estimate of test accuracy.
    history = model.fit(X_train, y_train,
                        epochs=20,
                        verbose=True,
                        callbacks=callbacks_list,
                        validation_data=(X_test, y_test),
                        batch_size=128)
    # Keep the curves of every run; `history` alone only holds the last one.
    conv_histories[i] = history.history

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 22, 250)           99246500  
_________________________________________________________________
conv1d (Conv1D)              (None, 18, 50)            62550     
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 60)                3060      
_________________________________________________________________
dense_1 (Dense)              (None, 60)                3660      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 61        
Total params: 99,315,831
Trainable params: 69,331
Non-trainable params: 99,246,500
___________________________________________________________

Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Epoch 1/20
Epoch 00001: val_acc improved from -inf to 0.81024, saving model to models/convolutional_nn_layers_of_size_50_60
Epoch 2/20
Epoch 00002: val_acc improved from 0.81024 to 0.81226, saving model to models/convolutional_nn_layers_of_size_50_60
Epoch 3/20
Epoch 00003: val_acc improved from 0.81226 to 0.81570, saving model to models/convolutional_nn_layers_of_size_50_60
Epoch 4/20
  93440/2000000 [>.............................] - ETA: 44s - loss: 0.3765 - acc: 0.8242

In [33]:
# NOTE(review): this cell does not build a model; it trains whichever `model`
# object currently lives in the kernel. The 256 / 30 in the checkpoint name are
# hard-coded and are not checked against that model's actual layer sizes.
filepath="models/convolutional_nn_layers_of_size_{}_{}".format(256, 30)
# Save only when the monitored validation accuracy improves on the best value so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# The test split is passed as validation data, so it drives checkpoint selection.
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=True,
                    callbacks=callbacks_list,
                    validation_data=(X_test, y_test),
                    batch_size=128)

Train on 2000000 samples, validate on 500000 samples
Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.81485, saving model to models/convolutional_nn_layers_of_size_256_30
Epoch 2/10
Epoch 00002: val_acc improved from 0.81485 to 0.81871, saving model to models/convolutional_nn_layers_of_size_256_30
Epoch 3/10
Epoch 00003: val_acc improved from 0.81871 to 0.81954, saving model to models/convolutional_nn_layers_of_size_256_30
Epoch 4/10
Epoch 00004: val_acc did not improve from 0.81954
Epoch 5/10
Epoch 00005: val_acc did not improve from 0.81954
Epoch 6/10
Epoch 00006: val_acc did not improve from 0.81954
Epoch 7/10

KeyboardInterrupt: 

In [None]:
#########################
### don't add this

In [21]:
# Sweep over the width of the single hidden Dense layer (30, 40, ..., 100) for
# a model without convolution: embedding -> global max pooling -> dense -> sigmoid.
# NOTE(review): the saved output below this cell shows a (None, 44, 200)
# embedding whose weights are trainable, which does not match this code
# (trainable=False); it appears to come from an earlier version of the cell.
dense_histories = {}  # per-epoch metrics of every run, keyed by hidden layer width

for i in range(30, 101, 10):
    model = Sequential()
    # The embedding weights come from embedding_matrix and are not trained.
    model.add(layers.Embedding(vocab_size, embedding_dim,
                               weights=[embedding_matrix],
                               input_length=maxlen,
                               trainable=False))
    model.add(layers.GlobalMaxPool1D())
    model.add(layers.Dense(i, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))
    # The metric is requested as 'acc' so that the logged validation key is
    # 'val_acc' on every tf.keras version. With 'accuracy', tf.keras 2.x logs
    # 'val_accuracy' and the checkpoint below would never find its monitor.
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc'])
    model.summary()

    filepath = "models/random_nn_layers_of_size_{}_big".format(i)
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    # NOTE(review): the test split doubles as validation data, so the best
    # val_acc picked by the checkpoint is an optimistic estimate of test accuracy.
    history = model.fit(X_train, y_train,
                        epochs=30,
                        verbose=True,
                        validation_data=(X_test, y_test),
                        callbacks=callbacks_list,
                        batch_size=128)
    # Keep the curves of every run; `history` alone only holds the last one.
    dense_histories[i] = history.history

    # Accuracy of the final-epoch weights (not necessarily the checkpointed best).
    loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
    print("Training Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy))
    print("#############################\n############################\n##################################")
    print("#############################\n############################\n##################################")


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 44, 200)           79397200  
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 30)                6030      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
Total params: 79,403,261
Trainable params: 79,403,261
Non-trainable params: 0
_________________________________________________________________
Train on 2000000 samples, validate on 500000 samples
Epoch 1/30
Epoch 00001: val_acc improved from -inf to 0.82468, saving model to models/random_nn_layers_of_size_30_big
Epoch 2/30
Epoch 00002: val_acc improved from 0.82468 to 0.82823, saving model to models/ra

KeyboardInterrupt: 