In [1]:
import numpy as np
import pprint
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Shared pretty-printer for ad-hoc inspection: 4-space indent, and
# sort_dicts=False so dict keys are printed in insertion order, not sorted.
pp = pprint.PrettyPrinter(indent=4, sort_dicts=False)


In [2]:
def preprocess(data):
    """Lower-case the ``text`` column and normalise its whitespace.

    Every run of whitespace characters and/or literal ``\\n`` sequences
    (a backslash followed by ``n``) is collapsed into a single space, and
    leading/trailing spaces are stripped. Word boundaries are preserved so
    that a word-level tokenizer can still split the text afterwards.

    Args:
        data: DataFrame with a string column named ``text``.

    Returns:
        The same DataFrame object, with its ``text`` column overwritten
        in place by the cleaned strings.
    """
    data["text"] = (
        data["text"]
        # .str.lower() passes missing values through instead of raising.
        .str.lower()
        # Replace with a space, not the empty string: an empty replacement
        # would glue neighbouring words together into one token.
        .str.replace(r'(?:\s|\\n)+', ' ', regex=True)
        .str.strip()
    )
    return data


In [3]:
# Read each split from disk and clean its text column straight away,
# then print the first rows as a quick sanity check.
train_data = preprocess(pd.read_csv("data/train_data.csv"))
print(train_data.head())

test_data = preprocess(pd.read_csv("data/test_data.csv"))
print(test_data.head())


  language                                               text    label
0    dansk   dette er et fremragende initiativ, og jeg stø...  Ireland
1    dansk   hr. formand, jeg er sikker på, at alle her er...  Ireland
2    dansk   hr. formand, folk på den nordlige halvkugle t...  England
3    dansk   hr. formand, med forbehold af nogle få ændrin...  England
4    dansk   - hr. formand, jeg må protestere mod den lemf...  England
                                                text
0   hr. formand, selv om vi i høj grad sympatiser...
1   quiero dejar constancia de mi apoyo a este in...
2   . – el comercio ilegal de riñones humanos se ...
3   signor presidente, per introdurre una nota di...
4   jeg stemte for meddelelsen af decharge til fæ...


In [4]:
# Report the TensorFlow build and the GPU device name (empty if none).
print(tf.__version__)
print(tf.test.gpu_device_name())

# Replace the string labels by their integer category codes, then expand
# the codes into 3-wide one-hot rows for categorical cross-entropy.
train_data["label"] = train_data["label"].astype("category").cat.codes
train_features = train_data["text"]
train_labels = tf.one_hot(train_data["label"], 3)

test_features = test_data["text"]


2.10.0
/device:GPU:0


In [5]:
import nltk
from nltk.tokenize import word_tokenize

# Split every document into a list of tokens with NLTK's word_tokenize.
tokenized_train_features = list(map(word_tokenize, train_features))
tokenized_test_features = list(map(word_tokenize, test_features))



In [7]:
from gensim.models import word2vec

# Dimensionality of the learned word vectors (reused by the Embedding layer).
vector_size = 100

# Train skip-gram Word2Vec embeddings on the tokenized training texts only.
w2v_model = word2vec.Word2Vec(
    sentences=tokenized_train_features,
    sg=1,  # 1 selects skip-gram; any other value selects CBOW
    min_count=1,
    window=20,
    vector_size=vector_size,
)


In [8]:
# All tokens known to the trained Word2Vec model, as a plain list.
vocab_list = list(w2v_model.wv.key_to_index)


def remove_OOV_vocab(sample: list, list_vocab):
    """Drop the tokens of ``sample`` that are not in the vocabulary.

    Args:
        sample: Tokenized text as a list of tokens.
        list_vocab: Known tokens. Any iterable of hashable tokens is
            accepted (list, set, dict, generator, ...).

    Returns:
        A new list holding the in-vocabulary tokens of ``sample`` in their
        original order, duplicates included.
    """
    # Membership tests on a list scan the whole vocabulary for every token.
    # Build a hash-based lookup once per call instead; containers that
    # already hash (set / frozenset / dict) are reused without copying.
    if isinstance(list_vocab, (set, frozenset, dict)):
        known_tokens = list_vocab
    else:
        known_tokens = set(list_vocab)
    return [each_token for each_token in sample if each_token in known_tokens]


# Keep only the test tokens that the Word2Vec model has a vector for.
tokenized_test_features = [
    remove_OOV_vocab(test_tokens, vocab_list)
    for test_tokens in tokenized_test_features
]


# Stack the vector of every vocabulary token into one matrix; rows follow
# the iteration order of key_to_index.
vocab = w2v_model.wv.key_to_index.keys()
embedding_matrix = w2v_model.wv[vocab]



In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Length that every token-index sequence is padded or truncated to by
# pad_sequences. The model cell also passes it as the Embedding
# input_length and as the first LSTM layer's size argument.
max_seq_len = 100


def w2v_indexed_token_sequences(w2v_model, list_features):
    """Translate token sequences into sequences of Word2Vec vocabulary indices.

    Args:
        w2v_model: Object exposing ``wv.key_to_index``, a token -> index mapping.
        list_features: Iterable of token sequences.

    Returns:
        One list of integer indices per input sequence. Tokens missing from
        the mapping are silently skipped, so a result may be shorter than
        its input sequence.
    """
    def _known_indices(tokens):
        # Yield the index of each token the model knows; ignore the rest.
        for token in tokens:
            try:
                yield w2v_model.wv.key_to_index[token]
            except KeyError:
                continue

    return [list(_known_indices(tokens)) for tokens in list_features]


# Map tokens to vocabulary indices for both splits.
indexed_train_features = w2v_indexed_token_sequences(w2v_model, tokenized_train_features)
indexed_test_features = w2v_indexed_token_sequences(w2v_model, tokenized_test_features)

# Force every sequence to exactly max_seq_len entries: longer ones lose
# their tail, shorter ones are padded at the end.
# NOTE(review): the pad value is presumably 0, which would also be a real
# vocabulary index — confirm whether an index offset or masking is needed.
padded_train = pad_sequences(
    indexed_train_features,
    maxlen=max_seq_len,
    padding='post',
    truncating='post',
)
padded_test = pad_sequences(
    indexed_test_features,
    maxlen=max_seq_len,
    padding='post',
    truncating='post',
)


In [40]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout, LSTM


def get_model():
    """Build the (uncompiled) LSTM classifier on top of the Word2Vec embeddings.

    Architecture: Embedding initialised from the module-level
    ``embedding_matrix`` -> Dropout(0.5) -> LSTM returning full sequences
    -> LSTM(15) -> 3-way softmax.

    Returns:
        A Keras ``Sequential`` model; the caller is expected to compile it.
    """
    # Take the Embedding dimensions from the matrix itself rather than a
    # hard-coded vocabulary size, so the layer always matches the weights
    # it is initialised with, whatever vocabulary Word2Vec produced.
    vocab_size, embedding_dim = embedding_matrix.shape

    model = Sequential()
    model.add(
        Embedding(input_dim=vocab_size,
                  output_dim=embedding_dim,
                  weights=[embedding_matrix],
                  input_length=max_seq_len))
    model.add(Dropout(0.5))
    # NOTE(review): max_seq_len is reused here as the LSTM layer size;
    # presumably a coincidence of values rather than a real coupling — confirm.
    model.add(LSTM(max_seq_len, return_sequences=True))
    model.add(LSTM(15))
    model.add(Dense(3, activation='softmax'))
    return model


# Training callbacks: stop once val_loss has not improved for 4 epochs
# (restoring the best weights), and write the best model seen so far to disk.
callbacks = [
    keras.callbacks.EarlyStopping(
        monitor="val_loss",
        patience=4,
        restore_best_weights=True,
        verbose=1,
    ),
    keras.callbacks.ModelCheckpoint(
        filepath='models/lstm_with_w2v.hdf5',
        save_best_only=True,
        verbose=1,
    ),
]

model = get_model()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): eager execution is forced globally here; this looks like a
# debugging leftover and presumably slows training — confirm it is needed.
tf.config.run_functions_eagerly(True)

# Keep the History object so the training curves can be analysed later.
# NOTE(review): validation_split presumably takes the tail of the arrays
# without shuffling; the rows shown by train_data.head() are all 'dansk',
# so if the CSV is ordered by language the validation slice may not be
# representative — confirm.
history = model.fit(
    x=padded_train,
    y=train_labels,
    epochs=10,
    validation_split=0.33,
    callbacks=callbacks,
)


Epoch 1/10




Epoch 1: val_loss improved from inf to 1.05458, saving model to models\lstm_with_w2v.hdf5
Epoch 2/10
Epoch 2: val_loss improved from 1.05458 to 1.01935, saving model to models\lstm_with_w2v.hdf5
Epoch 3/10
Epoch 3: val_loss did not improve from 1.01935
Epoch 4/10
Epoch 4: val_loss did not improve from 1.01935
Epoch 5/10
Epoch 5: val_loss did not improve from 1.01935
Epoch 6/10

Epoch 6: val_loss did not improve from 1.01935
Epoch 6: early stopping


In [41]:
# Reload the checkpoint written during training and score the test set.
model_with_w2v = keras.models.load_model('models/lstm_with_w2v.hdf5')
y_pred_proba = model_with_w2v.predict(padded_test)

# Pick the most probable class directly from the softmax output.
# Thresholding at 0.5 first would zero out every row in which no class
# exceeds 0.5, and argmax of an all-zero row is 0, so all such samples
# would silently be labelled as class 0.
y_pred_test = np.argmax(y_pred_proba, axis=1)

# One-hot view of the predictions, kept under its original name.
y_pred_one_hot_encoded = np.eye(y_pred_proba.shape[1], dtype="int32")[y_pred_test]

# Integer class code -> label name.
# NOTE(review): assumes these match the category codes assigned to the
# training labels — confirm against the label-encoding cell.
mappings = {0: 'England', 1: 'Ireland', 2: 'Scotland'}




In [42]:
# Submission ids are 1-based positions in the test set.
final_data = dict(id=test_data.index + 1, label=y_pred_test)

# Index by id, turn the integer class codes back into label names, and
# write the resulting Series to disk.
submission = pd.DataFrame(data=final_data).set_index("id")
submission = submission["label"].apply(lambda class_code: mappings[class_code])
submission.to_csv("submissions/submission_CNN.csv")
submission.head()


id
1    England
2    England
3    England
4    England
5    England
Name: label, dtype: object