In [1]:
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as metrics

from keras import models
from keras import layers
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

from preprocess_tweets import preprocess_tweet

Using TensorFlow backend.


In [2]:
MAX_LEN = 30 # Maximum number of words in a sequence

In [3]:
'''Function to remove English stopwords from a Pandas Series.'''
def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words) 

### Model #1

In [4]:
df_train = pd.read_csv("../data/train.csv")

df_train.text = df_train.text.apply(preprocess_tweet).apply(remove_stopwords)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df_train.text, df_train.target, test_size=0.1, random_state=37)

In [6]:
tk = Tokenizer(num_words = 10000,
               filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               split = " ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [7]:
seq_lengths = df_train.text.apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

count    7613.000000
mean       10.306581
std         4.052247
min         1.000000
25%         7.000000
50%        10.000000
75%        13.000000
max        30.000000
Name: text, dtype: float64

In [8]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen = MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen = MAX_LEN)

In [9]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [10]:
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train_oh, test_size=0.1, random_state=37)

In [11]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(input_dim = 10000, output_dim = 8, input_length = MAX_LEN))
emb_model.add(layers.Flatten())
emb_model.add(layers.Dense(2, activation='softmax'))
emb_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 240)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 482       
Total params: 80,482
Trainable params: 80,482
Non-trainable params: 0
_________________________________________________________________


In [12]:
emb_model.compile(optimizer = 'adam', 
                  loss = 'binary_crossentropy', 
                  metrics = ['accuracy'])
    
emb_history = emb_model.fit(X_train_emb, 
                            y_train_emb, 
                            epochs = 30,
                            batch_size = 512,
                            validation_data = (X_valid_emb, y_valid_emb),
                            verbose = 1)

Train on 6165 samples, validate on 686 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [13]:
emb_model.fit(X_train_seq_trunc
              , y_train_oh
              , epochs=6
              , batch_size = 30
              , verbose=0)

emb_results = emb_model.evaluate(X_test_seq_trunc, y_test_oh)



In [14]:
print('Test accuracy: {0:.2f}%'.format(emb_results[1]*100))

Test accuracy: 79.79%


### Model #2 

**implementing early stopping to prevent overfitiing**

In [15]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test_labeled.csv")

df_train.text = df_train.text.apply(preprocess_tweet).apply(remove_stopwords)
df_test.text = df_test.text.apply(preprocess_tweet).apply(remove_stopwords)

In [16]:
X_train = df_train.text
y_train = df_train.target

X_test = df_test.text
y_test = df_test.target

In [17]:
tk = Tokenizer(num_words = 10000,
               filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               split = " ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [18]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen = MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen = MAX_LEN)

In [19]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [20]:
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train_oh, test_size=0.1, random_state=37)

In [21]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(input_dim = 10000, output_dim = 64, input_length = MAX_LEN))
emb_model.add(layers.Flatten())
emb_model.add(layers.Dense(2, activation='softmax'))
emb_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 64)            640000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 3842      
Total params: 643,842
Trainable params: 643,842
Non-trainable params: 0
_________________________________________________________________


In [22]:
emb_model.compile(optimizer = 'adam', 
                  loss = 'binary_crossentropy', 
                  metrics = ['accuracy'])

In [23]:
# simple early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200)
mc = ModelCheckpoint('../data/best_model_2.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [24]:
# fit model
emb_history = emb_model.fit(X_train_emb, 
                            y_train_emb, 
                            epochs = 30,
                            validation_data = (X_valid_emb, y_valid_emb),
                            verbose=0, callbacks=[es, mc])


Epoch 00001: val_accuracy improved from -inf to 0.78084, saving model to ../data/best_model_2.h5

Epoch 00002: val_accuracy improved from 0.78084 to 0.80184, saving model to ../data/best_model_2.h5

Epoch 00003: val_accuracy did not improve from 0.80184

Epoch 00004: val_accuracy did not improve from 0.80184

Epoch 00005: val_accuracy did not improve from 0.80184

Epoch 00006: val_accuracy did not improve from 0.80184

Epoch 00007: val_accuracy did not improve from 0.80184

Epoch 00008: val_accuracy did not improve from 0.80184

Epoch 00009: val_accuracy did not improve from 0.80184

Epoch 00010: val_accuracy did not improve from 0.80184

Epoch 00011: val_accuracy did not improve from 0.80184

Epoch 00012: val_accuracy did not improve from 0.80184

Epoch 00013: val_accuracy did not improve from 0.80184

Epoch 00014: val_accuracy did not improve from 0.80184

Epoch 00015: val_accuracy did not improve from 0.80184

Epoch 00016: val_accuracy did not improve from 0.80184

Epoch 00017: val

In [25]:
# load the saved model
saved_model = load_model('../data/best_model_2.h5')

In [26]:
# evaluate the model
_, train_acc = saved_model.evaluate(X_train_seq_trunc, y_train_oh, verbose=0)
_, test_acc = saved_model.evaluate(X_test_seq_trunc, y_test_oh, verbose=0)
print('Train accuracy: %.3f, Test accuracy: %.3f' % (train_acc, test_acc))

print('Test accuracy: {0:.2f}%'.format(emb_results[1]*100))

Train accuracy: 0.926, Test accuracy: 0.790
Test accuracy: 79.79%


### Model #3

In [27]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(input_dim = 10000, output_dim = 64, input_length = MAX_LEN))
emb_model.add(layers.Flatten())

emb_model.add(layers.Dense(64, activation='relu'))
emb_model.add(layers.Dropout(0.5))
emb_model.add(layers.Dense(64, activation='relu'))
emb_model.add(layers.Dropout(0.5))
emb_model.add(layers.Dense(64, activation='relu'))
emb_model.add(layers.Dropout(0.5))

emb_model.add(layers.Dense(2, activation='softmax'))
emb_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 64)            640000    
_________________________________________________________________
flatten_3 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                122944    
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 64)               

In [28]:
emb_model.compile(optimizer = 'adam', 
                  loss = 'binary_crossentropy', 
                  metrics = ['accuracy'])

In [31]:
# simple early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200)
mc = ModelCheckpoint('../data/best_model_3.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [32]:
# fit model
emb_history = emb_model.fit(X_train_emb, 
                            y_train_emb, 
                            epochs = 30,
                            validation_data = (X_valid_emb, y_valid_emb),
                            verbose=0, callbacks=[es, mc])


Epoch 00001: val_accuracy improved from -inf to 0.80971, saving model to ../data/best_model_3.h5

Epoch 00002: val_accuracy did not improve from 0.80971

Epoch 00003: val_accuracy did not improve from 0.80971

Epoch 00004: val_accuracy did not improve from 0.80971

Epoch 00005: val_accuracy did not improve from 0.80971

Epoch 00006: val_accuracy did not improve from 0.80971

Epoch 00007: val_accuracy did not improve from 0.80971

Epoch 00008: val_accuracy did not improve from 0.80971

Epoch 00009: val_accuracy did not improve from 0.80971

Epoch 00010: val_accuracy did not improve from 0.80971

Epoch 00011: val_accuracy did not improve from 0.80971

Epoch 00012: val_accuracy did not improve from 0.80971

Epoch 00013: val_accuracy did not improve from 0.80971

Epoch 00014: val_accuracy did not improve from 0.80971

Epoch 00015: val_accuracy did not improve from 0.80971

Epoch 00016: val_accuracy did not improve from 0.80971

Epoch 00017: val_accuracy did not improve from 0.80971

Epoch

In [33]:
# load the saved model
saved_model = load_model('../data/best_model_3.h5')

In [34]:
# evaluate the model
_, train_acc = saved_model.evaluate(X_train_seq_trunc, y_train_oh, verbose=0)
_, test_acc = saved_model.evaluate(X_test_seq_trunc, y_test_oh, verbose=0)
print('Train accuracy: %.3f, Test accuracy: %.3f' % (train_acc, test_acc))

Train accuracy: 0.920, Test accuracy: 0.795


### Model #4

In [35]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test_labeled.csv")

df_train.text = df_train.text.apply(preprocess_tweet).apply(remove_stopwords)
df_test.text = df_test.text.apply(preprocess_tweet).apply(remove_stopwords)

In [36]:
X_train = df_train.text
y_train = df_train.target

X_test = df_test.text
y_test = df_test.target

In [37]:
tk = Tokenizer(num_words = 10000,
               filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               split = " ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [38]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen = MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen = MAX_LEN)

In [39]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [40]:
# define 5-fold cross validation test harness
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
cvscores = []

for train, test in kfold.split(X_train_seq_trunc, y_train_oh.argmax(1)):
  # create model
    emb_model = models.Sequential()
    emb_model.add(layers.Embedding(input_dim = 10000, output_dim = 64, input_length = MAX_LEN))
    emb_model.add(layers.Flatten())

    emb_model.add(layers.Dense(8, activation='relu'))
    emb_model.add(layers.Dropout(0.1))
    emb_model.add(layers.Dense(8, activation='relu'))
    emb_model.add(layers.Dropout(0.1))

    emb_model.add(layers.Dense(2, activation='softmax'))
    
    # Compile model
    emb_model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])    
    
    # Fit the model
    emb_model.fit(X_train_seq_trunc[train], y_train_oh[train], epochs = 30, verbose=0)

    # evaluate the model
    scores = emb_model.evaluate(X_train_seq_trunc[test], y_train_oh[test], verbose=0)
    
    print("%s: %.2f%%" % (emb_model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

accuracy: 76.17%
accuracy: 73.74%
accuracy: 77.61%
accuracy: 77.00%
accuracy: 75.76%
76.05% (+/- 1.33%)


In [41]:
# evaluate the model
_, test_acc = emb_model.evaluate(X_test_seq_trunc, y_test_oh, verbose=0)
print('Test accuracy: %.3f' % (test_acc))

Test accuracy: 0.764
