In [53]:
from __future__ import absolute_import, division, print_function, unicode_literals


import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

from helpers import count_unique_words, count_unique_ngrams, \
            build_unique_ngrams, create_sentence_vectors, create_sentence_vectors_submission

import sys

import tensorflow as tf
from tensorflow import keras

import gensim   # Not sure whether it is better to use gensim or tensorflow :/
import logging
from gensim.models.phrases import Phrases, Phraser

import multiprocessing

from gensim.models import Word2Vec

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
 
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load the cleaned training frame (provides the `sentence` and `label` columns
# used below). read_pickle unpickles arbitrary Python objects, so only load
# files from a trusted source.
df = pd.read_pickle("dataframes/full_df_cleaned_train_0_8.pickle")

In [16]:
# Load the cleaned held-out frame (same `sentence` / `label` columns as `df`).
# read_pickle unpickles arbitrary Python objects, so only load trusted files.
df_test = pd.read_pickle("dataframes/full_df_cleaned_test_0_2.pickle")

In [11]:
# Display the value returned by the project helper for the training frame.
# NOTE(review): the helper lives in helpers.py; its exact definition of a
# "unique word" is not visible here -- confirm before comparing with the
# tokenizer vocabulary size computed later.
count_unique_words(df)

452521

In [7]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a whitespace-separated word-vector file.

    Every non-empty line of ``filepath`` is read as ``word v1 v2 ...``. When
    ``word`` is a key of ``word_index``, the first ``embedding_dim`` vector
    components are copied into the matrix row given by ``word_index[word]``.

    Args:
        filepath: Path to a UTF-8 text file of word vectors (GloVe format).
        word_index: Mapping from word to integer row index, e.g.
            ``Tokenizer.word_index``.
        embedding_dim: Number of leading vector components to keep per word.

    Returns:
        ``numpy.ndarray`` of shape ``(len(word_index) + 1, embedding_dim)``.
        Rows of words that never appear in the file stay all-zero.
    """
    vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # GloVe files are UTF-8; relying on the platform default encoding breaks
    # on systems whose locale is not UTF-8.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline at end of file): nothing to parse.
                continue
            word, vector = parts[0], parts[1:]
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [13]:
# Build the word -> integer index from the training sentences only; the same
# fitted tokenizer is reused for the test sentences further down.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.sentence)

In [15]:
# Encode every training sentence as a list of integer token ids.
X_train = tokenizer.texts_to_sequences(df.sentence)

In [17]:
# Encode the test sentences with the tokenizer that was fitted on the training data.
X_test = tokenizer.texts_to_sequences(df_test.sentence)

In [19]:
# Number of embedding rows: one per indexed word plus one extra row for index 0.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

396986

In [26]:
# Show the token ids of the first training sentence.
# Note: `X_train[X_train[0] == '#']` is not a token search -- comparing a list
# with a string is always False, and False indexes as 0, so that expression
# only ever returned X_train[0]. The explicit form below gives the same output.
X_train[0]

[15, 1810, 8634, 2884, 3383, 7]

In [28]:
# NOTE(review): 8634 is one of the token ids in the first encoded sentence,
# but .iloc treats it as a row position in `df`, not as a vocabulary index.
# If the goal was to see the word behind that id, tokenizer.index_word[8634]
# is presumably the intended lookup -- confirm.
df.iloc[8634]

sentence    want pottermore let ! can't obsessed let #nerd...
label                                                      -1
Name: 2109505, dtype: object

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [41]:
def max_len(X):
    """Return the length of the longest element of ``X`` (0 if ``X`` is empty)."""
    return max((len(seq) for seq in X), default=0)
# Longest encoded training sentence; used below as the padded length for both
# the training and the test sequences.
maxlen = max_len(X_train)

In [43]:
# Pad at the end (padding='post') to a common length of `maxlen`.
# Rebinds X_train from a list of lists to the padded result, so this cell is
# not idempotent with respect to the earlier list-based cells.
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)

In [44]:
# Same padding for the test sequences, using the length derived from the
# training data so both inputs share one width.
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [45]:
# Sanity check: first padded training row (token ids followed by zero padding).
print(X_train[0, :])

[  15 1810 8634 2884 3383    7    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [46]:
# Width of each word vector; matches the 50-dimensional GloVe file loaded below.
embedding_dim = 50

In [47]:
# Fill one row per tokenizer word from the pretrained GloVe vectors; words
# missing from the GloVe file keep an all-zero row.
embedding_matrix = create_embedding_matrix(
    'models/glove.6B/glove.6B.50d.txt',
    tokenizer.word_index, embedding_dim)

In [51]:
# Count the rows that have at least one non-zero component, i.e. words that
# received a pretrained vector.
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
# The coverage ratio used to be computed and silently dropped because it was
# not the cell's last expression; print it so it is actually reported.
print("Embedding coverage: {:.4f}".format(nonzero_elements / vocab_size))
len(embedding_matrix)

396986

In [54]:
# Classifier: embeddings initialised from `embedding_matrix` (and fine-tuned,
# trainable=True) -> max over the sequence axis -> small dense head with a
# sigmoid output for the binary label.
model = Sequential([
    layers.Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=True,
    ),
    layers.GlobalMaxPool1D(),
    layers.Dense(10, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 44, 50)            19849300  
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
dense (Dense)                (None, 10)                510       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 19,849,821
Trainable params: 19,849,821
Non-trainable params: 0
_________________________________________________________________


In [66]:
# Binary targets for the sigmoid output: 1 where label == 1, 0 for every
# other label value.
y_train = (df.label == 1).to_numpy().astype(int)
y_test = (df_test.label == 1).to_numpy().astype(int)

In [None]:
# Train for 50 epochs with batches of 512 samples.
# NOTE(review): X_test / y_test are passed as validation_data, so the
# per-epoch validation metrics and the final "Testing Accuracy" below are
# computed on the same data -- consider a separate validation split.
history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=512)
# Report final accuracy on the training data, then on the held-out data.
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
# NOTE(review): plot_history is neither defined nor imported in the visible
# part of this notebook -- confirm it exists before this cell runs.
plot_history(history)


Train on 2000000 samples, validate on 500000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
  97792/2000000 [>.............................] - ETA: 23:37 - loss: 0.3642 - accuracy: 0.8348