# Task 3: Word2Vec + Averaging

## XGBoost and Logistic Regression

### Imports

In [1]:
from os import path
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns
import itertools

import spacy

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import (
    SelectKBest, VarianceThreshold, f_classif)
from sklearn.linear_model import LogisticRegression

import xgboost as xgb

from sklearn.metrics import f1_score, balanced_accuracy_score

from data_io import read_data
from utils import label_map

## Load Data

In [2]:
# Training split: read the texts with their labels, then translate each
# label through label_map.
texts_train, labels_train = read_data(mode='train')
y_train = [label_map[name] for name in labels_train]

In [3]:
# Validation split: read the texts with their labels, then translate each
# label through label_map.
texts_val, labels_val = read_data(mode='val')
y_val = [label_map[name] for name in labels_val]

In [4]:
# Test split: read the texts with their labels, then translate each
# label through label_map.
texts_test, labels_test = read_data(mode='test')
y_test = [label_map[name] for name in labels_test]

## Preprocessing

We define a function that takes care of the basic preprocessing steps: lowercasing, punctuation removal, stop-word removal, replacing purely numeric tokens with `#`, and (optionally) stemming.

In [5]:
from string import punctuation

# Shared resources read by normalize(): an English stop-word set, an English
# Snowball stemmer and a translation table that deletes punctuation characters.
stoplist = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
translator = str.maketrans('', '', punctuation)

# spaCy 'en' pipeline loaded with the parser, tagger and NER components disabled.
# NOTE(review): `nlp` is not referenced anywhere else in the visible notebook —
# confirm it is still needed.
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

def normalize(doc, stem=False):
    """
    Turn a raw document string into a cleaned list of tokens.

    Steps, in order: carriage returns and newlines become spaces, the text is
    lowercased, characters covered by the module-level ``translator`` table
    are deleted, the text is split on whitespace, tokens found in the
    module-level ``stoplist`` are dropped and tokens consisting only of
    digits are replaced by '#'.

    Parameters
    ----------
    doc : str
        The raw document.
    stem : bool, optional
        When true, every remaining token is passed through the module-level
        ``stemmer``. Defaults to False.

    Returns
    -------
    list of str
        The cleaned tokens.
    """
    flattened = doc.replace('\r', ' ').replace('\n', ' ')
    stripped = flattened.lower().translate(translator)

    tokens = []
    for word in stripped.split():
        if word in stoplist:
            continue
        # purely numeric tokens collapse onto one shared placeholder
        tokens.append('#' if word.isdigit() else word)

    if stem:
        tokens = [stemmer.stem(word) for word in tokens]
    return tokens

In [6]:
# Tokenise every split with normalize()'s default settings (no stemming).
normalized_text_train = list(map(normalize, texts_train))
normalized_text_val = list(map(normalize, texts_val))
normalized_text_test = list(map(normalize, texts_test))

# Word2Vec 

In [7]:
import os
import gensim.models
from gensim.models import KeyedVectors
from gensim import utils

## Optional: Hyperparameter Tuning Word2Vec

This code can be used to tune the word2vec model for optimal performance of the XGBoost and/or logistic regression model. The data is first created according to the w2v parameterization and then stored for later use.

WARNING: long runtime!

In [8]:
def embedding_path(embd_dir, kind, dim, epochs, min_words):
    """
    Build the path under which one embedded array is cached on disk.

    Parameters
    ----------
    embd_dir : str
        Directory that holds the cached arrays.
    kind : str
        Which array the file stores: 'Xtrain', 'ytrain', 'Xeval' or 'yeval'.
    dim, epochs, min_words : int
        Word2Vec settings the array was produced with.

    Returns
    -------
    str
        ``<embd_dir>/embed_<kind>_dim<dim>_epochs<epochs>_min_words<min_words>.npz``
    """
    filename = "embed_{}_dim{}_epochs{}_min_words{}.npz".format(
        kind, dim, epochs, min_words)
    return os.path.join(embd_dir, filename)


# Set to True to run the (slow) grid search over Word2Vec settings.
tune_w2v = False

if tune_w2v:

    # NOTE: get_embedding() is defined in a later cell of this notebook; that
    # cell has to be executed before tuning is switched on.
    # NOTE(review): keep the loop variable named `dim` — get_embedding() may
    # rely on the module-level `dim`; confirm before renaming.

    # define directory to store embedded data
    embd_dir = 'data/embeddings'
    # np.savez_compressed does not create missing directories
    os.makedirs(embd_dir, exist_ok=True)

    # define parameters to search over, as (epochs, dim, min_words) tuples
    gridsearch_params = list(itertools.product(
        [5, 10],          # epochs
        [50, 100, 200],   # dim
        [5, 25, 50],      # min_words
    ))

    # Stage 1: train one Word2Vec model per configuration and cache the
    # averaged embeddings of the training and validation splits.
    for epochs, dim, min_words in gridsearch_params:
        model = gensim.models.Word2Vec(
            sentences=normalized_text_train,  # list of tokenized sentences
            workers=8,                        # Number of threads to run in parallel
            iter=epochs,                      # Number of epochs
            size=dim,                         # Word vector dimensionality
            min_count=min_words,              # Minimum word count
        )

        # get embedding from trained w2v model
        Xtrain, ytrain = get_embedding(model, normalized_text_train, y_train)
        Xval, yval = get_embedding(model, normalized_text_val, y_val)

        # save data for later use; every array gets its own file so that the
        # validation features no longer overwrite the training labels
        cached_arrays = {
            'Xtrain': Xtrain,
            'ytrain': ytrain,
            'Xeval': Xval,
            'yeval': yval,
        }
        for kind, array in cached_arrays.items():
            np.savez_compressed(
                embedding_path(embd_dir, kind, dim, epochs, min_words), array)

    # XGBoost Parameters
    params = {
        'objective': 'multi:softmax',
        'max_depth': 5,
        'num_class': 5,
        'eval_metric': ['merror', 'mlogloss'],
        }

    # Stage 2: score both classifiers on every cached configuration.
    res = []
    for epochs, dim, min_words in gridsearch_params:
        print("epochs={}, dim={}, min_words={} ".format(
                                 epochs,
                                 dim,
                                 min_words))
        # Load data (np.savez_compressed stores a positional array as 'arr_0')
        Xtrain, ytrain, Xval, yval = (
            np.load(embedding_path(embd_dir, kind, dim, epochs, min_words))['arr_0']
            for kind in ('Xtrain', 'ytrain', 'Xeval', 'yeval'))

        # Fit Logistic Regression
        lr = LogisticRegression(max_iter=1000)
        lr.fit(Xtrain, ytrain)

        # Compute Score
        y_preds_val = lr.predict(Xval)
        f1_lr = f1_score(yval, y_preds_val, average='micro')
        print('LogReg f1 score micro : {}'.format(f1_lr))

        # XGBoost
        #  Data
        dtrain = xgb.DMatrix(Xtrain, ytrain)
        dval = xgb.DMatrix(Xval, yval)

        evallist = [(dtrain, 'train'), (dval, 'eval')]

        num_round = 50
        bst = xgb.train(
                params=params,
                dtrain=dtrain,
                num_boost_round=num_round,
                evals=evallist,
                early_stopping_rounds=10,
                verbose_eval=False)

        pred = bst.predict(dval)
        f1_xgb = f1_score(yval, pred, average='micro')
        print('XGBoost f1 score micro : {}'.format(f1_xgb))

        res.append([epochs, dim, min_words, f1_lr, f1_xgb])

    # save results inside the embedding directory
    np.savez_compressed(os.path.join(embd_dir, "res.npz"), res)

## Create Word2Vec Embedding

We ran hyperparameter tuning over the number of epochs, the dimensionality of the embedding and the minimum count of a word to be considered. Below are the parameters that gave the best results for the logistic regression and the XGBoost models considered further down in the script.

In [9]:
# Word2Vec settings used for the final embedding
epochs = 5        # number of epochs
dim = 300         # word vector dimensionality
min_words = 25    # minimum word count

model = gensim.models.Word2Vec(
    sentences=normalized_text_train,  # list of tokenized sentences
    workers=8,                        # number of threads to run in parallel
    iter=epochs,
    size=dim,
    min_count=min_words,
)

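We define a function that converts each document into a single vector by averaging the Word2Vec vectors of its words. Words that were not learned by the Word2Vec model (e.g. because their frequency was too low) are ignored; a document that contains only such words is represented by a zero vector, so no document is dropped and the labels stay aligned with the rows.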

In [18]:
def get_embedding(model, data, label):
    """
    Represent every document by the mean of its Word2Vec word vectors.

    Words missing from the model's vocabulary are ignored. A document with no
    known word at all is represented by a zero vector, so the output always
    has one row per input document and stays aligned with ``label``.

    Parameters
    ----------
    model : trained Word2Vec model
        Only its keyed vectors (``model.wv``) are used.
    data : iterable of token lists
        One list of tokens per document.
    label : any
        Labels belonging to ``data``; handed back unchanged.

    Returns
    -------
    X : np.ndarray, shape (number of documents, model.wv.vector_size)
        One averaged vector per document.
    y : same object as ``label``
    """
    word_vectors = model.wv
    # Take the width from the model itself rather than from a notebook global,
    # so the zero-vector fallback always matches the model that was passed in.
    vector_size = word_vectors.vector_size

    rows = []
    for tokens in data:
        # remove out-of-vocabulary words
        known = [word for word in tokens if word in word_vectors]
        if known:
            rows.append(np.mean(word_vectors[known], axis=0))
        else:
            rows.append(np.zeros(vector_size))

    # keep a 2-D result even when there are no documents at all
    X = np.array(rows) if rows else np.empty((0, vector_size))
    return X, label

In [19]:
# Embed each split with the Word2Vec model trained above.
Xtrain, ytrain = get_embedding(model=model, data=normalized_text_train, label=y_train)
Xval, yval = get_embedding(model=model, data=normalized_text_val, label=y_val)
Xtest, ytest = get_embedding(model=model, data=normalized_text_test, label=y_test)

  del sys.path[0]


## Logistic Regression

In [None]:
# Logistic regression on the averaged embeddings of the training split
lr = LogisticRegression(max_iter=1000)
lr.fit(X=Xtrain, y=ytrain)

Check performance measured by micro F1-score 

In [None]:
# Compute the micro-averaged F1 score of the logistic regression on the test split
y_preds = lr.predict(Xtest)
f1_lr = f1_score(ytest, y_preds, average='micro')
print('LogReg f1 score micro : {}'.format(f1_lr))

## XGBoost

In [None]:
# XGBoost Parameters
params = {
    'objective': 'multi:softmax',
    'max_depth': 8,
    'num_class': 5,
    'eval_metric': ['merror', 'mlogloss'],
    #'tree_method': 'gpu_hist'
    }

In [None]:
# Data: wrap the embedded splits in DMatrix containers
dtrain = xgb.DMatrix(Xtrain, ytrain)
dval = xgb.DMatrix(Xval, yval)
dtest = xgb.DMatrix(Xtest, ytest)

# Keep the validation set last: xgboost monitors the last entry of `evals`
# for early stopping.
evallist = [(dtrain, 'train'), (dval, 'eval')]

num_round = 100
bst = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_round,
        evals=evallist,
        early_stopping_rounds=10,
        verbose_eval=True)

# NOTE(review): with early stopping the returned booster may contain rounds
# trained after the best iteration; confirm whether predict() should be
# limited to bst.best_iteration for the installed xgboost version.
pred = bst.predict(dtest)
f1_xgb = f1_score(ytest, pred, average='micro')
print('XGBoost f1 score micro : {}'.format(f1_xgb))