In [31]:
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import sklearn.metrics as metrics

from keras import models
from keras import layers
from keras import regularizers

import warnings
warnings.filterwarnings('ignore')

In [39]:
NB_WORDS = 10000  # Parameter indicating the number of words we'll put in the dictionary
VAL_SIZE = 1000  # Size of the validation set
NB_START_EPOCHS = 30  # Number of epochs we usually start to train with
BATCH_SIZE = 512  # Size of the batches used in the mini-batch gradient descent
MAX_LEN = 28  # Maximum number of words in a sequence

In [40]:
'''Function to remove English stopwords from a Pandas Series.'''
def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["n't", "not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words) 
    
'''Function to remove mentions, preceded by @, in a Pandas Series'''
def remove_mentions(input_text):
    return re.sub(r'@\w+', '', input_text)

In [41]:
df = pd.read_csv("../data/train.csv")

In [42]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [43]:
df.text = df.text.apply(remove_stopwords).apply(remove_mentions)

In [44]:
X_train, X_test, y_train, y_test = train_test_split(df.text, df.target, test_size=0.1, random_state=37)

In [45]:
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")
tk.fit_on_texts(X_train)

X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [46]:
seq_lengths = X_train.apply(lambda x: len(x.split(' ')))
seq_lengths.describe()

count    6851.000000
mean       10.749088
std         3.881148
min         1.000000
25%         8.000000
50%        11.000000
75%        14.000000
max        28.000000
Name: text, dtype: float64

In [47]:
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [48]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [49]:
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train_oh, test_size=0.1, random_state=37)

In [50]:
emb_model = models.Sequential()
emb_model.add(layers.Embedding(NB_WORDS, 8, input_length=MAX_LEN))
emb_model.add(layers.Flatten())
emb_model.add(layers.Dense(2, activation='softmax'))
emb_model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 28, 8)             80000     
_________________________________________________________________
flatten_6 (Flatten)          (None, 224)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 450       
Total params: 80,450
Trainable params: 80,450
Non-trainable params: 0
_________________________________________________________________


In [51]:
emb_model.compile(optimizer = 'adam', 
                  loss = 'binary_crossentropy', 
                  metrics = ['accuracy'])
    
emb_history = emb_model.fit(X_train_emb, 
                            y_train_emb, 
                            epochs = NB_START_EPOCHS,
                            batch_size = BATCH_SIZE,
                            validation_data = (X_valid_emb, y_valid_emb),
                            verbose = 1)

Train on 6165 samples, validate on 686 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [52]:
emb_model.fit(X_train_seq_trunc
              , y_train_oh
              , epochs=6
              , batch_size=BATCH_SIZE
              , verbose=0)

emb_results = emb_model.evaluate(X_test_seq_trunc, y_test_oh)
    
print('Test accuracy of word embeddings model: {0:.2f}%'.format(emb_results[1]*100))

Test accuracy of word embeddings model: 79.13%


In [53]:
y_pred_le = emb_model.predict_classes(X_test_seq_trunc)

In [54]:
print(metrics.classification_report(y_test_le, y_pred_le, labels=[0, 1, 2], digits=3))

              precision    recall  f1-score   support

           0      0.802     0.851     0.826       443
           1      0.774     0.708     0.740       319
           2      0.000     0.000     0.000         0

   micro avg      0.791     0.791     0.791       762
   macro avg      0.525     0.520     0.522       762
weighted avg      0.790     0.791     0.790       762

