In [1]:
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as metrics

from keras import models
from keras import layers
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

from preprocess_tweets import preprocess_tweet, remove_stopwords

Using TensorFlow backend.


In [2]:
MAX_LEN = 30 # Maximum number of words in a sequence

### Model #1

In [3]:
df_train = pd.read_csv("../data/train.csv")

df_train.text = df_train.text.apply(preprocess_tweet).apply(remove_stopwords)

In [4]:
X_train, X_val, y_train, y_val = train_test_split(df_train.text, df_train.target, test_size=0.1, random_state=37)

In [5]:
tk = Tokenizer(num_words = 10000,
               filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               split = " ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_val_seq = tk.texts_to_sequences(X_val)

In [6]:
seq_lengths = df_train.text.apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

count    7613.000000
mean       10.174044
std         4.060505
min         1.000000
25%         7.000000
50%        10.000000
75%        13.000000
max        28.000000
Name: text, dtype: float64

In [7]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen = MAX_LEN)
X_val_seq_trunc = pad_sequences(X_val_seq, maxlen = MAX_LEN)

In [8]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_val_le = le.transform(y_val)
y_train_oh = to_categorical(y_train_le)
y_val_oh = to_categorical(y_val_le)

In [9]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(input_dim = 10000, output_dim = 8, input_length = MAX_LEN))
emb_model.add(layers.Flatten())
emb_model.add(layers.Dense(2, activation='softmax'))
emb_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 240)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 482       
Total params: 80,482
Trainable params: 80,482
Non-trainable params: 0
_________________________________________________________________


In [10]:
emb_model.compile(optimizer = 'adam', 
                  loss = 'binary_crossentropy', 
                  metrics = ['accuracy'])

In [11]:
emb_model.fit(X_train_seq_trunc, 
                y_train_oh, 
                epochs = 30,
                batch_size = 512,
                validation_data = (X_val_seq_trunc, y_val_oh),
                verbose = 1)

Train on 6851 samples, validate on 762 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x7fbf103f9e90>

**Create submission file**

In [12]:
df_test = pd.read_csv("../data/test.csv")

In [13]:
ids = df_test['id'].to_numpy()

X_test = df_test.text.apply(preprocess_tweet).apply(remove_stopwords)
tk = Tokenizer(num_words = 10000,
               filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               split = " ")
tk.fit_on_texts(X_test)

X_test_seq = tk.texts_to_sequences(X_test)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen = MAX_LEN)

In [14]:
predictions = emb_model.predict_classes(X_test_seq_trunc)

In [15]:
pd.DataFrame({'id': ids, 'target': predictions}).to_csv('../data/9. Submission_Keras_Embeddings.csv', index=False)

**Submission result**

Score: 0.56032

### Model #2 

**implementing early stopping to prevent overfitiing**

In [16]:
df_train = pd.read_csv("../data/train.csv")

In [17]:
X_train = df_train.text.apply(preprocess_tweet).apply(remove_stopwords)
y_train = df_train.target

In [18]:
X_train, X_val, y_train, y_val = train_test_split(df_train.text, df_train.target, test_size=0.1, random_state=37)

In [19]:
tk = Tokenizer(num_words = 10000,
               filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               split = " ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_val_seq = tk.texts_to_sequences(X_val)

In [20]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen = MAX_LEN)
X_val_seq_trunc = pad_sequences(X_val_seq, maxlen = MAX_LEN)

In [21]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_val_le = le.transform(y_val)
y_train_oh = to_categorical(y_train_le)
y_val_oh = to_categorical(y_val_le)

In [22]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(input_dim = 10000, output_dim = 64, input_length = MAX_LEN))
emb_model.add(layers.Flatten())
emb_model.add(layers.Dense(2, activation='softmax'))
emb_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 64)            640000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 3842      
Total params: 643,842
Trainable params: 643,842
Non-trainable params: 0
_________________________________________________________________


In [23]:
emb_model.compile(optimizer = 'adam', 
                  loss = 'binary_crossentropy', 
                  metrics = ['accuracy'])

In [24]:
# simple early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200)
mc = ModelCheckpoint('../data/model_2.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [25]:
# fit model
emb_history = emb_model.fit(X_train_seq_trunc, 
                        y_train_oh, 
                        epochs = 30,
                        batch_size = 512,
                        validation_data = (X_val_seq_trunc, y_val_oh),
                        verbose = 0, 
                        callbacks=[es, mc])


Epoch 00001: val_accuracy improved from -inf to 0.63386, saving model to ../data/model_2.h5

Epoch 00002: val_accuracy improved from 0.63386 to 0.69160, saving model to ../data/model_2.h5

Epoch 00003: val_accuracy improved from 0.69160 to 0.71391, saving model to ../data/model_2.h5

Epoch 00004: val_accuracy improved from 0.71391 to 0.74803, saving model to ../data/model_2.h5

Epoch 00005: val_accuracy improved from 0.74803 to 0.77165, saving model to ../data/model_2.h5

Epoch 00006: val_accuracy improved from 0.77165 to 0.78740, saving model to ../data/model_2.h5

Epoch 00007: val_accuracy improved from 0.78740 to 0.79921, saving model to ../data/model_2.h5

Epoch 00008: val_accuracy improved from 0.79921 to 0.80315, saving model to ../data/model_2.h5

Epoch 00009: val_accuracy improved from 0.80315 to 0.81102, saving model to ../data/model_2.h5

Epoch 00010: val_accuracy did not improve from 0.81102

Epoch 00011: val_accuracy improved from 0.81102 to 0.81496, saving model to ../dat

In [26]:
# load the saved model
saved_model = load_model('../data/model_2.h5')

**Create submission file**

In [27]:
predictions = saved_model.predict_classes(X_test_seq_trunc)

In [28]:
pd.DataFrame({'id': ids, 'target': predictions}).to_csv('../data/10. Submission_Keras_Embeddings.csv', index=False)

**Submission result**

Score: 0.55828

### Model #3

In [None]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(input_dim = 10000, output_dim = 64, input_length = MAX_LEN))
emb_model.add(layers.Flatten())

emb_model.add(layers.Dense(64, activation='relu'))
emb_model.add(layers.Dropout(0.5))
emb_model.add(layers.Dense(64, activation='relu'))
emb_model.add(layers.Dropout(0.5))
emb_model.add(layers.Dense(64, activation='relu'))
emb_model.add(layers.Dropout(0.5))

emb_model.add(layers.Dense(2, activation='softmax'))
emb_model.summary()

In [None]:
emb_model.compile(optimizer = 'adam', 
                  loss = 'binary_crossentropy', 
                  metrics = ['accuracy'])

In [None]:
# simple early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200)
mc = ModelCheckpoint('../data/model_3.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [None]:
# fit model
emb_history = emb_model.fit(X_train_emb, 
                            y_train_emb, 
                            epochs = 30,
                            validation_data = (X_valid_emb, y_valid_emb),
                            verbose=0, callbacks=[es, mc])

In [None]:
# load the saved model
saved_model = load_model('../data/model_3.h5')

In [None]:
# evaluate the model
_, train_acc = saved_model.evaluate(X_train_seq_trunc, y_train_oh, verbose=0)
_, test_acc = saved_model.evaluate(X_test_seq_trunc, y_test_oh, verbose=0)
print('Train accuracy: %.3f, Test accuracy: %.3f' % (train_acc, test_acc))

### Model #4

In [None]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test_labeled.csv")

df_train.text = df_train.text.apply(preprocess_tweet).apply(remove_stopwords)
df_test.text = df_test.text.apply(preprocess_tweet).apply(remove_stopwords)

In [None]:
X_train = df_train.text
y_train = df_train.target

X_test = df_test.text
y_test = df_test.target

In [None]:
tk = Tokenizer(num_words = 10000,
               filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               split = " ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [None]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen = MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen = MAX_LEN)

In [None]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [None]:
# define 5-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
cvscores = []

for train, test in kfold.split(X_train_seq_trunc, y_train_oh.argmax(1)):
  # create model
    emb_model = models.Sequential()
    emb_model.add(layers.Embedding(input_dim = 10000, output_dim = 64, input_length = MAX_LEN))
    emb_model.add(layers.Flatten())

    emb_model.add(layers.Dense(8, activation='relu'))
    emb_model.add(layers.Dropout(0.1))
    emb_model.add(layers.Dense(8, activation='relu'))
    emb_model.add(layers.Dropout(0.1))

    emb_model.add(layers.Dense(2, activation='softmax'))
    
    # Compile model
    emb_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])    
    
    # Fit the model
    emb_model.fit(X_train_seq_trunc[train], y_train_oh[train], epochs = 30, verbose=0)

    # evaluate the model
    scores = emb_model.evaluate(X_train_seq_trunc[test], y_train_oh[test], verbose=0)
    
    print("%s: %.2f%%" % (emb_model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [None]:
# evaluate the model
_, test_acc = emb_model.evaluate(X_test_seq_trunc, y_test_oh, verbose=0)
print('Test accuracy: %.3f' % (test_acc))