In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals


import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

from helpers import count_unique_words, count_unique_ngrams, \
            build_unique_ngrams, create_sentence_vectors, create_sentence_vectors_submission

import sys

import tensorflow as tf
from tensorflow import keras

import gensim   # Not sure whether it is better to use gensim or tensorflow :/
import logging
from gensim.models.phrases import Phrases, Phraser

import multiprocessing

from gensim.models import Word2Vec

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
 
# Configure the root logger to emit timestamped INFO-level records
# (this is what makes gensim's loading progress show up in cell output).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

# Add the parent directory to the module search path.
# NOTE(review): this runs after `from helpers import ...` earlier in this cell,
# so it cannot help that import on a fresh kernel -- confirm whether it should
# be moved above the import (or is needed at all).
sys.path.append('../')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the cleaned training DataFrame (presumably the 80% split, judging by the
# file name -- confirm). Later cells read its `sentence` and `label` columns.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("dataframes/full_df_cleaned_train_0_8.pickle")

In [3]:
# Load the cleaned held-out DataFrame (presumably the 20% split, judging by the
# file name -- confirm). Later cells read its `sentence` and `label` columns.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df_test = pd.read_pickle("dataframes/full_df_cleaned_test_0_2.pickle")

In [4]:
# Presumably the number of distinct words in the training frame; the helper
# lives in helpers.py and its exact counting rules are not visible here.
count_unique_words(df)

452521

In [4]:
def create_embedding_matrix_2(w2v_model, word_index):
    """Build an embedding matrix aligned with a tokenizer's word index.

    Row ``i`` of the result holds the pretrained vector of the word whose
    index is ``i``. Every row that is still all-zero afterwards -- words
    without a pretrained vector and the reserved row 0 -- is filled with
    uniform random values in [0, 1) drawn from NumPy's global RNG.

    Parameters
    ----------
    w2v_model : object
        Either a model that exposes its vectors as ``.wv`` or the keyed
        vectors themselves. The vectors must support ``word in kv``,
        ``kv[word]`` and ``kv.vector_size``.
    word_index : dict
        Mapping ``word -> row index`` with indices in ``1..len(word_index)``
        (e.g. ``Tokenizer.word_index``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, vector_size)``.
    """
    # Accept both a full model (vectors under `.wv`) and bare keyed vectors.
    keyed_vectors = getattr(w2v_model, "wv", w2v_model)
    # Take the dimension from the vectors themselves rather than from a
    # notebook-level global or from probing a hard-coded word.
    vector_dim = keyed_vectors.vector_size

    vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
    embedding_matrix = np.zeros((vocab_size, vector_dim))

    # Iterate over the tokenizer vocabulary (usually far smaller than the
    # pretrained vocabulary) and only fetch the vectors that are needed.
    for word, idx in word_index.items():
        if word in keyed_vectors:
            embedding_matrix[idx] = np.asarray(keyed_vectors[word], dtype=np.float32)

    # Randomly initialise every row that received no pretrained vector.
    # Drawing one (k, dim) block consumes the RNG stream in the same order as
    # k successive rand(dim) calls, so seeded results are unchanged.
    missing_rows = ~embedding_matrix.any(axis=1)
    embedding_matrix[missing_rows] = np.random.rand(int(missing_rows.sum()), vector_dim)

    return embedding_matrix

In [6]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix from a text file of pretrained word vectors.

    Each non-empty line of the file is expected to be
    ``word v1 v2 ... vn`` separated by whitespace. For every word present in
    ``word_index`` the first ``embedding_dim`` components are copied into the
    row given by its index. Every row that is still all-zero afterwards --
    words missing from the file and the reserved row 0 -- is filled with
    uniform random values in [0, 1) drawn from NumPy's global RNG.

    Parameters
    ----------
    filepath : str
        Path to the vector file.
    word_index : dict
        Mapping ``word -> row index`` with indices in ``1..len(word_index)``
        (e.g. ``Tokenizer.word_index``).
    embedding_dim : int
        Number of components to keep per word; the vectors in the file must
        have at least this many components.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    # Read as UTF-8 explicitly: the platform default encoding may not be able
    # to decode non-ASCII tokens in the vector file.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines instead of failing to unpack
            word, vector = fields[0], fields[1:]
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    # Randomly initialise every row that received no pretrained vector.
    # Drawing one (k, dim) block consumes the RNG stream in the same order as
    # k successive rand(dim) calls, so seeded results are unchanged.
    missing_rows = ~embedding_matrix.any(axis=1)
    embedding_matrix[missing_rows] = np.random.rand(int(missing_rows.sum()), embedding_dim)

    return embedding_matrix

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the word index on the training sentences only, then encode both splits
# as lists of integer word indices using that same tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.sentence)
X_train = tokenizer.texts_to_sequences(df.sentence)
X_test = tokenizer.texts_to_sequences(df_test.sentence)

def max_len(X):
    """Return the length of the longest element of ``X`` (0 if ``X`` is empty)."""
    return max((len(sequence) for sequence in X), default=0)
# Fixed sequence length for every sample. It is hard-coded rather than derived
# from the data; max_len() can be used to inspect the true maximum.
maxlen = 16   # magic number

# Zero-pad at the end ('post') up to maxlen. Longer sequences are truncated to
# maxlen (Keras' default truncating='pre' drops tokens from the front).
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Binary targets: label == 1 becomes 1, every other label value becomes 0.
y_train = np.where(df.label == 1, 1, 0)
y_test = np.where(df_test.label == 1, 1, 0)

In [6]:
# Number of embedding rows: one per word known to the tokenizer, plus one for
# the reserved index 0.
vocab_size = len(tokenizer.word_index) + 1
vocab_size

396986

In [9]:
# Inspect a single training row by position.
# NOTE(review): 8634 also shows up as a token id in X_train[0]; .iloc selects a
# row of df, not a tokenizer word index -- confirm which lookup was intended.
df.iloc[8634]

sentence    want pottermore let ! can't obsessed let #nerd...
label                                                      -1
Name: 2109505, dtype: object

In [10]:
# Show the first encoded training sentence: word indices followed by zero padding.
print(X_train[0, :])

[  15 1810 8634 2884 3383    7    0    0    0    0    0    0    0    0
    0    0]


In [7]:
# Word-vector dimensionality: matches the 300-d GoogleNews vectors used in this
# notebook and is passed to the Embedding layer as output_dim.
embedding_dim = 300

In [8]:
# Load the pretrained GoogleNews vectors stored in word2vec binary format.
# Despite its name, `w2v_model` holds a KeyedVectors object (vectors only),
# not a full Word2Vec model; the earlier Word2Vec.load attempt is kept below
# for reference.
#w2v_model = Word2Vec.load("models/GoogleNews-vectors-negative300.bin")
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('models/GoogleNews-vectors-negative300.bin', binary=True)
# Align the pretrained vectors with the tokenizer's word indices.
embedding_matrix = create_embedding_matrix_2(
    w2v_model,
    tokenizer.word_index)

2019-12-18 21:12:52,754 : INFO : loading projection weights from models/GoogleNews-vectors-negative300.bin
2019-12-18 21:13:41,578 : INFO : loaded (3000000, 300) matrix from models/GoogleNews-vectors-negative300.bin
  """
  import sys
  


In [None]:
# Quick visual check of the embedding matrix contents.
print(embedding_matrix)

In [9]:
# Fraction of embedding rows that contain at least one non-zero value,
# followed by the total number of rows.
# NOTE(review): create_embedding_matrix_2 overwrites every all-zero row with
# random values, so this ratio is effectively 1.0 by construction and does not
# show how many words were actually found in the pretrained vectors.
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
print(nonzero_elements / vocab_size)
len(embedding_matrix)

1.0


396986

In [10]:
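# CNN sentence classifier. NOTE: the "glove twitter" / `_tw` naming is historical --
# the embedding_matrix used below is built from the GoogleNews word2vec vectors.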

from tensorflow.keras.layers import GlobalMaxPooling1D, concatenate, Dropout, Dense, Embedding, Input, Conv1D
from tensorflow.keras.models import Model

# Embedding layer initialised from the pretrained matrix; trainable=True lets
# the vectors be fine-tuned during training.
embedding_layer = Embedding(vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=maxlen, 
                            trainable=True)
# Input: one sentence encoded as maxlen integer word indices
sequence_input = Input(shape=(maxlen,), dtype='int32')
# Look up the embedding vector of every token of the input sequence
embedded_sequences = embedding_layer(sequence_input)
convs = []
filter_sizes = [3,5]
for filter_size in filter_sizes:
    # One convolutional branch per window size:
    #    "filters" is the number of output channels of a branch (100 each),
    #    so the two branches together produce 200 feature maps
    conv_layer = Conv1D(filters=100, 
                    kernel_size=filter_size, 
                    activation='relu')(embedded_sequences)
    # Global max pooling keeps, for each filter, its largest activation over the sequence
    pool_layer = GlobalMaxPooling1D()(conv_layer)
    convs.append(pool_layer)
# Concatenate the pooled features of all branches into one vector per sample
merged_layers = concatenate(convs, axis=1)
# Dropout layer: randomly sets a fraction of input units to 0, which helps prevent overfitting
x = Dropout(0.3)(merged_layers)  
# Two (regular) densely-connected hidden layers, followed by a lighter dropout
x = Dense(50, activation='relu')(x)
x = Dense(50, activation='relu')(x)
x = Dropout(0.2)(x)
# Single sigmoid unit to match the 0/1 targets and the binary cross-entropy loss
preds = Dense(1, activation='sigmoid')(x)
model_tw = Model(sequence_input, preds)
model_tw.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model_tw.summary()

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 16)           0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 16, 300)      119095800   input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 14, 100)      90100       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 12, 100)      150100      embedding[0][0]                  
__________________________________________________________________________________________________
global_max

In [11]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Checkpoint the model to `filepath` whenever the monitored validation accuracy improves.
# NOTE(review): the metric key is 'val_acc' with the TF version used for this
# run; newer tf.keras releases name it 'val_accuracy' -- confirm before upgrading.
filepath="models/cnn_glove_tw"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# NOTE(review): the held-out split (X_test, y_test) is also used as validation
# data, so checkpoint selection is informed by the same data reported as
# "Testing Accuracy" below.
history = model_tw.fit(X_train, y_train, epochs=50, verbose=True, validation_data=(X_test, y_test), callbacks=callbacks_list, batch_size=512)
# These evaluations use the in-memory weights from the last completed epoch,
# not the best checkpoint written to `filepath`.
loss, accuracy = model_tw.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model_tw.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
# NOTE(review): plot_history is neither defined nor imported in the visible
# part of this notebook -- confirm it is available before running this cell.
plot_history(history)

Train on 2000000 samples, validate on 500000 samples
Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
  num_elements)


Epoch 1/50
Epoch 00001: val_acc improved from -inf to 0.82475, saving model to models/cnn_glove_tw
Epoch 2/50
Epoch 00002: val_acc improved from 0.82475 to 0.82915, saving model to models/cnn_glove_tw
Epoch 3/50
Epoch 00003: val_acc improved from 0.82915 to 0.82937, saving model to models/cnn_glove_tw
Epoch 4/50
Epoch 00004: val_acc did not improve from 0.82937
Epoch 5/50
Epoch 00005: val_acc did not improve from 0.82937
Epoch 6/50

KeyboardInterrupt: 