In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os, string, collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import utils
from utils import *

import snowballstemmer
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn import metrics
from keras.wrappers.scikit_learn import KerasClassifier

from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, load_model
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, Dense, Dropout, Convolution1D, MaxPooling1D, SpatialDropout1D, Input 
from keras.layers import GlobalMaxPooling1D, concatenate, LSTM, Bidirectional
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
PATH = os.getcwd()

In [3]:
df = pd.read_csv(f'{PATH}/data/Airline-Sentiment-2-w-AA.csv', usecols=['text', 'airline_sentiment'], encoding='ISO-8859-1')

In [4]:
df.shape

(14640, 2)

Encode categorical label class into numerical

In [5]:
le = LabelEncoder()
df['target'] = le.fit_transform(df['airline_sentiment'])

Text cleaning

In [6]:
tc = TextCleaner()
df['clean_text'] = tc.transform(df['text'])

Tokenization

In [7]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s): 
    return re_tok.sub(r' \1 ', s).split()

In [8]:
df['tokenized'] = df['clean_text'].apply(lambda row: tokenize(row))

Stopwords removing

In [9]:
stop = set(stopwords.words('english'))
stop.update(['amp', 'rt', 'cc'])
stop = stop - set(['no', 'not'])

In [10]:
def remove_stopwords(row):
    return [t for t in row if t not in stop]

In [11]:
df['tokenized'] = df['tokenized'].apply(lambda row: remove_stopwords(row))

In [12]:
pd.set_option('display.max_colwidth', -1)

In [13]:
df[['text', 'tokenized']].head()

Unnamed: 0,text,tokenized
0,@VirginAmerica What @dhepburn said.,[said]
1,@VirginAmerica plus you've added commercials to the experience... tacky.,"[plus, youve, added, commercials, experience, tacky]"
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,"[didnt, today, must, mean, need, take, another, trip]"
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse","[really, aggressive, blast, obnoxious, entertainment, guests, faces, little, recourse]"
4,@VirginAmerica and it's a really big bad thing about it,"[really, big, bad, thing]"


Vocabulary creation

In [14]:
def update_vocab_counter(row):
    for word in row:
        vocab_counter[word] += 1

In [15]:
vocab_counter = collections.Counter()
df['tokenized'].apply(update_vocab_counter);
vocab = sorted(vocab_counter, key=vocab_counter.get, reverse=True)

In [16]:
len(vocab)

12390

We limit the dictionary size to the top 5000 most frequent tokens

In [17]:
max_words = 5000

Dictionary that map each token with their id

In [18]:
w2id = {w:i for i, w in enumerate(vocab[:max_words])}

We will replace each token out of top 5000 with 'unk'

In [19]:
w2id['unk'] = -1

We transform each token by their id

In [20]:
def transform_to_ids(row):
    return [w2id[w] if w in w2id else w2id['unk'] for w in row]

In [21]:
df['tokenized_int'] = df['tokenized'].apply(lambda x: transform_to_ids(x))

Tweets length

In [22]:
lens = df['tokenized_int'].apply(lambda x: len(x))

In [23]:
min(lens), max(lens), np.mean(lens)

(0, 21, 8.987636612021857)

We set 20 as max length

In [24]:
maxlen = 20

Train, test split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(df['tokenized_int'].values, df['target'].values, test_size=0.25, random_state=0)

Since we need that each document contains a fixed number of tokens (20), we fill with -1 (id that represents 'unk') every token with size < 20

In [26]:
x_train = pad_sequences(X_train, maxlen=maxlen, value=-1)
x_test = pad_sequences(X_test, maxlen=maxlen, value=-1)

We one-hot encode target classes

In [27]:
dummy_y = np_utils.to_categorical(y_train)
dummy_y_test = np_utils.to_categorical(y_test)

## Linear model

The first approach is to create a neural network with a 50 dimension embedding layer as input and no hidden layers. This is equivalent as apply logistic regression over word vectors rather than one hot encoded vectors.

In [30]:
def baseline_model():
    model = Sequential([Embedding(input_dim=max_words, output_dim=50, input_length=maxlen),
                        Flatten(),
                        Dense(3, activation='softmax')])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

For all the models we use cross-validation with 5 folds

In [31]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [34]:
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [35]:
results.mean()*100, results.std()*100

(78.96134572269361, 0.815186961410359)

We now train over full training set evaluating on test set. Best epoch is saved

In [45]:
filepath = f'{PATH}/results/linear.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [46]:
model = baseline_model()
model.fit(x_train, dummy_y, validation_data=(x_test, dummy_y_test), epochs=5, batch_size=100, callbacks=callbacks_list)

Train on 10980 samples, validate on 3660 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7de81a4f98>

## Convolutional Neural Network

Simple convolutional neural network with 1 conv layer (10 filters of size 3)

In [66]:
def conv_model():
    model = Sequential([Embedding(input_dim=max_words, output_dim=32, input_length=maxlen),
                        Convolution1D(10, 3, padding='same', activation='relu'),
                        MaxPooling1D(),
                        Flatten(),
                        Dense(50, activation='relu'),
                        Dense(3, activation='softmax')])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [67]:
estimator = KerasClassifier(build_fn=conv_model, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [68]:
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [69]:
results.mean()*100, results.std()*100

(76.43082918098617, 2.256687530027816)

Increasing number of filters to 64

In [90]:
def conv_model():
    model = Sequential([Embedding(input_dim=max_words, output_dim=32, input_length=maxlen),
                        Convolution1D(64, 3, padding='same', activation='relu'),
                        MaxPooling1D(),
                        Flatten(),
                        Dense(25, activation='relu'),
                        Dense(3, activation='softmax')])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [93]:
estimator = KerasClassifier(build_fn=conv_model, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [94]:
results.mean()*100, results.std()*100

(77.98683471377974, 0.9380869930210952)

We now create a neural network that applies different filter sizes (2, 3 and 4) to word vectors, then concatenate the outputs and apply a classifier on top on that.

In [113]:
def mult_conv():
    graph_in = Input(shape=(max_words, 50))

    convs = []
    for filter_size in range(2, 5):
        x = Convolution1D(64, filter_size, padding='same', activation='relu')(graph_in)
        convs.append(x)

    graph_out = concatenate(convs, axis=1)
    graph_out = GlobalMaxPooling1D()(graph_out)
    graph = Model(graph_in, graph_out)
    
    model = Sequential([Embedding(max_words, 50, input_length=maxlen),
                    graph,
                    Dropout(0.5),
                    Dense(25, activation='relu'),
                    Dense(3, activation='softmax')])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [114]:
estimator = KerasClassifier(build_fn=mult_conv, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [115]:
results.mean()*100, results.std()*100

(79.02512735902401, 0.7688410157779406)

In [116]:
filepath = f'{PATH}/results/mult_conv.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [117]:
model = mult_conv()
model.fit(x_train, dummy_y, validation_data=(x_test, dummy_y_test), epochs=5, batch_size=100, callbacks=callbacks_list)

Train on 10980 samples, validate on 3660 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7d64be6518>

In [29]:
model = load_model(f'{PATH}/results/mult_conv.hdf5')

In [31]:
preds = model.predict(x_test)

In [39]:
metrics.accuracy_score(y_test, np.argmax(preds, axis=1))

0.7969945355191257

In [38]:
print(metrics.classification_report(y_test, np.argmax(preds, axis=1)))

             precision    recall  f1-score   support

          0       0.83      0.92      0.87      2327
          1       0.66      0.51      0.58       772
          2       0.79      0.67      0.73       561

avg / total       0.79      0.80      0.79      3660



## LSTM

In [30]:
def simple_lstm():
    model = Sequential([Embedding(max_words, 50, input_length=maxlen),
                        LSTM(25),
                        Dense(3, activation='softmax')])

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [31]:
estimator = KerasClassifier(build_fn=simple_lstm, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [33]:
results.mean()*100, results.std()*100

(78.06884338406488, 0.8922790226571174)

## Pre-trained word embeddings

We load pre-trained word embeddings from GloVe

In [34]:
glove_dir = '/home/martinpella/Downloads/GloVe/Wikipedia2014'

In [35]:
embeddings_index = {}
f = open(glove_dir + '/glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [36]:
len(embeddings_index)

400000

In [37]:
def create_emb_matrix(max_words, embedding_dim):
    embedding_matrix = np.zeros((max_words, embedding_dim))
    found = 0
    for word, i in w2id.items():
        embedding_vector = embeddings_index.get(word)
        if i < max_words:
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                found += 1
            else:
                embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return embedding_matrix, found

In [38]:
embedding_matrix, found = create_emb_matrix(max_words, 50)

In [39]:
found

4676

#### Linear Model

In [124]:
def baseline_model():
    model = Sequential([Embedding(input_dim=max_words, output_dim=50, input_length=maxlen),
                        Flatten(),
                        Dense(3, activation='softmax')])

    model.layers[0].set_weights = [embedding_matrix]
    model.layers[0].trainable = True
    
    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [131]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [132]:
results.mean()*100, results.std()*100

(78.79746936570686, 0.7629237463005589)

In [133]:
filepath = f'{PATH}/results/linear_glove.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [134]:
model = baseline_model()
model.fit(x_train, dummy_y, validation_data=(x_test, dummy_y_test), epochs=5, batch_size=100, callbacks=callbacks_list)

Train on 10980 samples, validate on 3660 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7d5aef5a90>

#### Multiple Conv layers

In [146]:
def mult_conv():
    graph_in = Input(shape=(max_words, 100))

    convs = []
    for filter_size in range(2, 5):
        x = Convolution1D(64, filter_size, padding='same', activation='relu')(graph_in)
        convs.append(x)

    graph_out = concatenate(convs, axis=1)
    graph_out = GlobalMaxPooling1D()(graph_out)
    graph = Model(graph_in, graph_out)
    
    model = Sequential([Embedding(max_words, 100, input_length=maxlen),
                    graph,
                    Dropout(0.5),
                    Dense(25, activation='relu'),
                    Dense(3, activation='softmax')])
    
    model.layers[0].set_weights = [embedding_matrix]
    model.layers[0].trainable = True

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [149]:
estimator = KerasClassifier(build_fn=mult_conv, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [150]:
results.mean()*100, results.std()*100

(78.87934992381346, 0.8683407303678339)

#### LSTM

In [180]:
def simple_lstm():
    model = Sequential([Embedding(max_words, 50, input_length=maxlen),
                        LSTM(25),
                        Dense(3, activation='softmax')])

    model.layers[0].set_weights = [embedding_matrix]
    model.layers[0].trainable = True

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [182]:
estimator = KerasClassifier(build_fn=simple_lstm, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [183]:
results.mean()*100, results.std()*100

(78.37855569354937, 0.6784259807207694)

In [184]:
filepath = f'{PATH}/results/lstm_glove.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [185]:
model = simple_lstm()
model.fit(x_train, dummy_y, validation_data=(x_test, dummy_y_test), epochs=5, batch_size=100, callbacks=callbacks_list)

Train on 10980 samples, validate on 3660 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f7d1ed61f28>

In [48]:
model = load_model(f'{PATH}/results/lstm_glove.hdf5')

In [52]:
preds = model.predict(x_test)

In [55]:
metrics.accuracy_score(y_test, np.argmax(preds, axis=1))

0.7833333333333333

In [56]:
print(metrics.classification_report(y_test, np.argmax(preds, axis=1)))

             precision    recall  f1-score   support

          0       0.85      0.88      0.86      2327
          1       0.60      0.58      0.59       772
          2       0.74      0.67      0.70       561

avg / total       0.78      0.78      0.78      3660



In [58]:
def bi_lstm():
    model = Sequential([Embedding(max_words, 50, input_length=maxlen),
                        Bidirectional(LSTM(25, dropout=0.2, recurrent_dropout=0.2)),
                        Dense(3, activation='softmax')])

    model.layers[0].set_weights = [embedding_matrix]
    model.layers[0].trainable = True

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [59]:
estimator = KerasClassifier(build_fn=bi_lstm, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [60]:
results.mean()*100, results.std()*100

(78.4420255733871, 1.164233643944509)

In [61]:
preds = model.predict(x_test)

In [62]:
metrics.accuracy_score(y_test, np.argmax(preds, axis=1))

0.7833333333333333

In [63]:
print(metrics.classification_report(y_test, np.argmax(preds, axis=1)))

             precision    recall  f1-score   support

          0       0.85      0.88      0.86      2327
          1       0.60      0.58      0.59       772
          2       0.74      0.67      0.70       561

avg / total       0.78      0.78      0.78      3660



In [49]:
def double_lstm():
    model = Sequential([Embedding(max_words, 50, input_length=maxlen),
                        LSTM(50, dropout=0.2, recurrent_dropout=0.2),
                        Dense(3, activation='softmax')])

    model.layers[0].set_weights = [embedding_matrix]
    model.layers[0].trainable = True

    model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])
    
    return model

In [50]:
estimator = KerasClassifier(build_fn=double_lstm, epochs=5, batch_size=100, verbose=0)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)

In [51]:
results.mean()*100, results.std()*100

(78.28734835713527, 1.04602968811997)