In [9]:
from __future__ import absolute_import, division, print_function, unicode_literals


import numpy as np
import pandas as pd
%load_ext autoreload
%autoreload 2
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

from helpers import count_unique_words, count_unique_ngrams, \
            build_unique_ngrams, create_sentence_vectors, create_sentence_vectors_submission

import sys

import tensorflow as tf
from tensorflow import keras

import gensim   # Not sure whether it is better to use gensim or tensorflow :/
import logging
from gensim.models.phrases import Phrases, Phraser

import multiprocessing

from gensim.models import Word2Vec

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
 
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
df = pd.read_pickle("dataframes/full_df_cleaned_train_0_8.pickle")

In [11]:
df_test = pd.read_pickle("dataframes/full_df_cleaned_test_0_2.pickle")

In [12]:
count_unique_words(df)

452521

In [24]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    for row in range(embedding_matrix.shape[0]):
        if not np.any(embedding_matrix[row,:]):
            embedding_matrix[row,:] = np.random.rand(embedding_dim)

    return embedding_matrix

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.sentence)
X_train = tokenizer.texts_to_sequences(df.sentence)
X_test = tokenizer.texts_to_sequences(df_test.sentence)

#def max_len(X):
#    maxlen = 0
#    for el in X:
#        maxlen = maxlen if len(el) < maxlen else len(el)
#    return maxlen
#maxlen = max_len(X_train)
maxlen=16

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

y_train = np.where(df.label == 1, 1, 0)
y_test = np.where(df_test.label == 1, 1, 0)

In [15]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

396986

In [16]:
df.iloc[8634]

sentence    want pottermore let ! can't obsessed let #nerd...
label                                                      -1
Name: 2109505, dtype: object

In [17]:
print(X_train[0, :])

[  15 1810 8634 2884 3383    7    0    0    0    0    0    0    0    0
    0    0]


In [18]:
embedding_dim = 50

In [25]:
embedding_matrix = create_embedding_matrix(
    'glove/glove.twitter.27B.50d.txt',
    tokenizer.word_index, embedding_dim)

In [20]:
print(X_train.shape)

(2000000, 16)


In [21]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
print(nonzero_elements / vocab_size)
len(embedding_matrix)

0.3841621618898399


396986

In [21]:
#without embedding matrix
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 44, 50)            19849300  
_________________________________________________________________
conv1d (Conv1D)              (None, 40, 128)           32128     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 19,882,729
Trainable params: 19,882,729
Non-trainable params: 0
_________________________________________________________________


In [23]:
from tensorflow.keras.callbacks import ModelCheckpoint

filepath="models/cnn"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    callbacks=callbacks_list,
                    batch_size=512)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)


Train on 2000000 samples, validate on 500000 samples
Epoch 1/50




Epoch 2/50




Epoch 3/50




Epoch 4/50




Epoch 5/50




Epoch 6/50




Epoch 7/50




Epoch 8/50




Epoch 9/50




Epoch 10/50




Epoch 11/50




Epoch 12/50




Epoch 13/50




Epoch 14/50




Epoch 15/50




KeyboardInterrupt: 

In [34]:
#with embedding matrix
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 44, 50)            19849300  
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 40, 128)           32128     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                1290      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 19,882,729
Trainable params: 19,882,729
Non-trainable params: 0
_________________________________________________________________


In [36]:
from tensorflow.keras.callbacks import ModelCheckpoint

filepath="models/cnn"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    callbacks=callbacks_list,
                    batch_size=512)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

Train on 2000000 samples, validate on 500000 samples
Epoch 1/50
Epoch 00001: val_accuracy improved from -inf to 0.82180, saving model to models/cnn
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: models/cnn/assets


2019-12-17 18:25:16,971 : INFO : Assets written to: models/cnn/assets


Epoch 2/50
Epoch 00002: val_accuracy improved from 0.82180 to 0.82544, saving model to models/cnn
INFO:tensorflow:Assets written to: models/cnn/assets


2019-12-17 19:25:43,319 : INFO : Assets written to: models/cnn/assets


Epoch 3/50
Epoch 00003: val_accuracy did not improve from 0.82544
Epoch 4/50
Epoch 00004: val_accuracy did not improve from 0.82544
Epoch 5/50




KeyboardInterrupt: 

In [24]:
#using Stefano's link
from keras.layers import Embedding

embedding_layer = Embedding(vocab_size, 
                            embedding_dim, 
                            weights=[embedding_matrix], 
                            input_length=maxlen, 
                            trainable=False)

Using TensorFlow backend.


In [32]:
model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=False))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.MaxPooling1D(35))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(1, activation='softmax'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

ValueError: Negative dimension size caused by subtracting 5 from 4 for 'max_pooling1d_1_1/MaxPool' (op: 'MaxPool') with input shapes: [?,4,1,128].

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

filepath="models/cnn"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

history = model.fit(X_train, y_train,
                    epochs=50,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    callbacks=callbacks_list,
                    batch_size=512)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

In [29]:
#glove twitter

from keras.layers import GlobalMaxPooling1D, concatenate, Dropout, Dense, Embedding, Input, Conv1D
from keras.models import Model

# Specifying the input shape: the input is a sentence of maxlen words
embedding_layer = Embedding(vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=maxlen)
sequence_input = Input(shape=(maxlen,), dtype='int32')
# Creating the embedding using the previously constructed embedding matrix
embedded_sequences = embedding_layer(sequence_input)
convs = []
filter_sizes = [3,5]
for filter_size in filter_sizes:
    # Creating the convolutional layer:
    #    "filters" represents the number of different windows we want (i.e. how many channels to produce),
    #    therefore in our case we will end up with 200 different convolutions
    conv_layer = Conv1D(filters=100, 
                    kernel_size=filter_size, 
                    activation='relu')(embedded_sequences)
    # Creating the global max pooling layer
    pool_layer = GlobalMaxPooling1D()(conv_layer)
    convs.append(pool_layer)
merged_layers = concatenate(convs, axis=1)
# Create dropout leayer: randomly set a fraction of input units to 0, which helps prevent overfitting
x = Dropout(0.3)(merged_layers)  
# Create (regular) densely-connected layer
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
preds = Dense(1, activation='sigmoid')(x)
model_tw = Model(sequence_input, preds)
model_tw.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model_tw.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 16)           0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 16, 50)       19849300    input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, 14, 100)      15100       embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_12 (Conv1D)              (None, 12, 100)      25100       embedding_3[0][0]                
____________________________________________________________________________________________

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

filepath="models/cnn_glove_tw"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

history = model_tw.fit(X_train, y_train, epochs=50, verbose=True, validation_data=(X_test, y_test), callbacks=callbacks_list, batch_size=512)
loss, accuracy = model_tw.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model_tw.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plot_history(history)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2000000 samples, validate on 500000 samples
Epoch 1/50

Epoch 00001: val_accuracy improved from -inf to 0.82353, saving model to models/cnn_glove_tw
Epoch 2/50

Epoch 00002: val_accuracy improved from 0.82353 to 0.82879, saving model to models/cnn_glove_tw
Epoch 3/50

Epoch 00003: val_accuracy improved from 0.82879 to 0.82983, saving model to models/cnn_glove_tw
Epoch 4/50

Epoch 00004: val_accuracy did not improve from 0.82983
Epoch 5/50