In [2]:
from nltk.tokenize import word_tokenize
import nltk
from pprint import pprint  # pretty-printer
import gensim
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from gensim.models.doc2vec import Doc2Vec

import numpy as np

# import test train split
from sklearn.model_selection import train_test_split

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/madis/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/madis/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/madis/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# import the data

In [3]:
import csv
def data_to_lists(folder_path):
    assert folder_path[-1] == '/'
    train_list_of_texts = []
    train_list_of_targets = []

    with open(folder_path+'train.csv', 'r') as file:
        my_reader = csv.reader(file, delimiter=',')

        # skip header
        next(my_reader)

        for row in my_reader:
            train_list_of_texts.append(row[3])
            train_list_of_targets.append(row[4])

    test_list_of_texts = []

    with open(folder_path+'test.csv', 'r') as file:
        my_reader = csv.reader(file, delimiter=',')

        # skip header
        next(my_reader)

        for row in my_reader:
            test_list_of_texts.append(row[3])

    return train_list_of_texts, train_list_of_targets, test_list_of_texts


In [4]:
# import data
train_list_of_texts, train_list_of_targets, test_list_of_texts = data_to_lists('data/')

# Preprocessing

During the preprocessing, we will remove the stop words, the punctuation and the numbers. We will also lemmatize the words. the output will be a list of words. Once we have the list of lists, we will split into train and validation sets.

ex:
```
clean_text = [['deed', 'reason', 'earthquake', 'allah', 'forgive'],
 ['forest', 'ronge', 'canada'],
 ['resident','asked','shelter','place','notified','officer','evacuation','shelter','place','order','expected'] ...]
 ```

In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora

en_stop = set(stopwords.words('english'))

def get_lemma2(word):
    from nltk.stem.wordnet import WordNetLemmatizer
    """
    This function takes a word and returns its rootword
    """
    return WordNetLemmatizer().lemmatize(word)
# remove extra characters
import re
def clean_chr(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

def nltk_preprocessing(text):
    """
    This function takes a text and returns a list of tokens
    - lowercase
    - remove short words
    - remove stopwords
    - remove extra characters
    - gets root word (lemma)

    """
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if len(token) > 4]      # remove short words
    tokens = [token for token in tokens if token not in en_stop] # remove stopwords
    tokens = [clean_chr(token) for token in tokens]              # remove extra characters
    tokens = [get_lemma2(token) for token in tokens]
    return tokens

In [6]:
# clean data
clean_train_text = [nltk_preprocessing(text) for text in train_list_of_texts]
clean_test_text = [nltk_preprocessing(text) for text in test_list_of_texts]

In [7]:
# split data into train and test
X_train, X_test, y_train, y_val = train_test_split(clean_train_text, train_list_of_targets, test_size=0.3, random_state=123)

# Create the word2vec model

We will train a Word2Vec model that will learn the word embeddings for our training data.

Word2Vec is a shallow, two-layer neural network that is trained to reconstruct linguistic contexts of words. The model can either be trained on the skip-gram architecture or on the continuous bag of words (CBOW) architecture. The skip-gram architecture is used to predict the context given a word while the CBOW architecture is used to predict the word given its context.

For each tweet, we will create a vector that will be the average of the word embeddings of the words in the tweet.

In [8]:
from gensim.models import word2vec

n_space = 500
# train word2vec model
model_w2v = word2vec.Word2Vec(clean_train_text, size=n_space, window=5, min_count=1, workers=4)
model_w2v.build_vocab(clean_train_text, update=True)

# train using w2v
model_w2v.train(clean_train_text, total_examples=len(X_train), epochs=1)

# save model
model_w2v.save('models/model_w2v')

In [9]:
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [10]:
# scale the data for training
from sklearn.preprocessing import scale
train_vecs = np.concatenate([buildWordVector(z, n_space) for z in X_train])
train_vecs = scale(train_vecs)

val_vecs = np.concatenate([buildWordVector(z, n_space) for z in X_test])
val_vecs = scale(val_vecs)


  vec += model_w2v[word].reshape((1, size))
  vec += model_w2v[word].reshape((1, size))


the `train_vecs` array is a 2D array where each row is a vector representation of a tweet (the average of the word vectors in the tweet).

# Train the data

In [11]:
# train using ada boost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


# make a function to train and test the model
def train_and_test_model(model, train_vecs, train_labels, val_vecs, val_labels):
    # train model
    model.fit(train_vecs, train_labels)

    # get training accuracy
    train_preds = model.predict(train_vecs)
    train_acc = accuracy_score(train_labels, train_preds)

    print('Training accuracy: {}'.format(train_acc))
    # get validation accuracy
    y_pred = model.predict(val_vecs)
    val_acc = accuracy_score(val_labels, y_pred)

    print('Val accuracy: {}'.format(val_acc))
    return model

In [12]:
# train very basic classification

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
model_nb = GaussianNB()
model_nb = train_and_test_model(model_nb, train_vecs, y_train, val_vecs, y_val)

Training accuracy: 0.6100581722649653
Val accuracy: 0.6077057793345009


In [13]:
clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200
)

clf = train_and_test_model(clf, train_vecs, y_train, val_vecs, y_val)

Training accuracy: 0.8070932632764121
Val accuracy: 0.6808231173380035


ada boost seems to have a pretty good training score, however the validation score is not that good. This suggests the model is overfitting. We will try to improve the model by tuning the hyperparameters.