# 1. Data split

In [1]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

In [2]:
sessions_train = json.load(open('data/symbolised_sessions_training_group27.json', 'r'))
labels_train = json.load(open('data/labels_training_group27.json', 'r'))
sessions_eval = json.load(open('data/symbolised_sessions_eval_group27.json', 'r'))
labels_eval = json.load(open('data/labels_eval_group27.json', 'r'))

In [3]:
def ngram_featurizer(session, n):
    
    """takes in a list and an integer defining the size of ngrams.
     Returns the ngrams of desired size in the input string"""
    
    session = ['#']*(n-1) + session + ['+']*(n-1)
    ngrams = [tuple(session[i:i+n]) for i in range(len(session)-n+1)]
    
    return ngrams
    

def encode_sessions(sessions, n, mapping=None):
    
    """
    Takes in a list of lists, an integer indicating the character ngrams' size,
    and a dictionary mapping ngrams to numerical indices. If no dictionary is passed,
    one is created inside the function.
    The function outputs a 2d NumPy array with as many rows as there are strings in 
    the input list, and the mapping from ngrams to indices, representing the columns 
    of the NumPy array.
    """
    
    if not mapping:
        all_ngrams = set()
        for session in sessions:
            all_ngrams = all_ngrams.union(set(ngram_featurizer(session, n)))
    
        mapping = {ngram: i for i, ngram in enumerate(all_ngrams)}
    
    X = np.zeros((len(sessions), len(mapping)))
    for i, session in enumerate(sessions):
        for ngram in ngram_featurizer(session, n):
            try:
                X[i, mapping[ngram]] += 1
            except KeyError:
                pass
    
    return X, mapping

In [4]:
print(ngram_featurizer(sessions_train[0], 4))
print(set(ngram_featurizer(sessions_train[0], 4)))

[('#', '#', '#', 1), ('#', '#', 1, 2), ('#', 1, 2, 3), (1, 2, 3, 1), (2, 3, 1, 1), (3, 1, 1, 1), (1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 1, 2), (1, 1, 2, 1), (1, 2, 1, 1), (2, 1, 1, 2), (1, 1, 2, 1), (1, 2, 1, 1), (2, 1, 1, 2), (1, 1, 2, 1), (1, 2, 1, '+'), (2, 1, '+', '+'), (1, '+', '+', '+')]
{(1, '+', '+', '+'), (3, 1, 1, 1), (1, 2, 1, '+'), (2, 1, '+', '+'), (2, 3, 1, 1), (1, 2, 3, 1), (1, 2, 1, 1), ('#', '#', 1, 2), (2, 1, 1, 2), (1, 1, 1, 2), ('#', '#', '#', 1), (1, 1, 2, 1), (1, 1, 1, 1), ('#', 1, 2, 3)}


In [5]:
Xtrain, mapping = encode_sessions(sessions_train, 4)
Xtest, _ = encode_sessions(sessions_eval, 4, mapping=mapping)
print(mapping, end=' ')

{('#', '#', 2, 2): 0, ('#', 1, 2, 2): 1, (4, 1, 2, 4): 2, (2, 3, 1, 4): 3, (1, 2, 1, 1): 4, ('#', '#', 3, 3): 5, ('#', 1, 1, 3): 6, (2, 1, 1, 1): 7, (4, 3, 3, '+'): 8, (1, 4, 3, 1): 9, (4, 4, 3, 4): 10, (1, 3, 1, 4): 11, (1, 1, 2, 3): 12, ('#', 1, 2, 4): 13, ('#', 3, 4, 1): 14, (3, 4, '+', '+'): 15, (1, 2, 1, 3): 16, (3, 1, 1, 2): 17, (1, 2, 4, '+'): 18, (4, 3, 3, 1): 19, (2, 1, 1, 3): 20, (1, 4, 3, 3): 21, (3, 3, 3, 2): 22, ('#', 2, 1, 2): 23, (2, 4, 2, 1): 24, ('#', 4, 1, 4): 25, (2, '+', '+', '+'): 26, (3, 4, 3, 2): 27, ('#', 3, 4, 3): 28, (2, 3, 1, '+'): 29, (2, 1, 4, 2): 30, (2, 3, 4, 4): 31, (3, 1, 2, 2): 32, (1, 2, 4, 1): 33, (3, 1, 1, 4): 34, (4, 3, 3, 3): 35, (2, 2, 2, 1): 36, (3, 3, 3, 4): 37, (4, 4, 1, '+'): 38, (1, 3, 1, '+'): 39, ('#', '#', 1, 1): 40, ('#', 2, 1, 4): 41, (4, 1, 2, 1): 42, (3, 4, 4, 2): 43, (2, 3, 1, 1): 44, (3, 4, 3, 4): 45, (1, 4, 1, '+'): 46, (2, 1, 4, 4): 47, (1, 2, 4, 3): 48, (3, 1, 2, 4): 49, (4, 4, 1, 1): 50, (1, 3, 1, 1): 51, ('#', '#', 2, 1): 52, (

In [6]:
print(Xtrain[0,:])
#the same as the matrix above

[0. 0. 0. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 1. 0. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 4. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 3.]


# 2. Train 4-gram NB

In [7]:
# fit NB classifier.
"""
For imbalanced dataset, it is recommended to use ComplementNB
"""
from sklearn.naive_bayes import ComplementNB

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

alpha_list = [0.00001, 0.0001, 0.001, 0.1, 1]
param_grid = {'alpha': alpha_list}
scorers = {
    'accuracy': metrics.make_scorer(accuracy_score),
    'precision': metrics.make_scorer(precision_score),
    'recall': metrics.make_scorer(recall_score),
    'f1_score': metrics.make_scorer(f1_score)
}
cnb_model=ComplementNB()
grid_search = GridSearchCV(cnb_model, param_grid=param_grid, cv=5, scoring=scorers, refit=False, return_train_score=True)
grid_search.fit(Xtrain, labels_train)

results = pd.DataFrame(grid_search.cv_results_)
pivot_table = results.pivot_table(index='param_alpha', 
                                     values=['mean_train_accuracy', 'mean_test_accuracy', 
                                             'mean_train_precision', 'mean_test_precision',
                                             'mean_train_recall', 'mean_test_recall',
                                             'mean_train_f1_score', 'mean_test_f1_score'])

print("Grid Search Results:")
print(pivot_table)

Grid Search Results:
             mean_test_accuracy  mean_test_f1_score  mean_test_precision  \
param_alpha                                                                
0.00001                0.702218            0.516012             0.405612   
0.00010                0.702057            0.515877             0.405444   
0.00100                0.701961            0.515821             0.405352   
0.10000                0.701400            0.515352             0.404774   
1.00000                0.700615            0.514646             0.403951   

             mean_test_recall  mean_train_accuracy  mean_train_f1_score  \
param_alpha                                                               
0.00001              0.709040             0.702458             0.516646   
0.00010              0.709040             0.702310             0.516521   
0.00100              0.709111             0.702194             0.516424   
0.10000              0.709111             0.701713             0.515989

In [9]:
"""
The GridSearch Results show that with alpha = 0.01 we have the best F1-score. But this is still quite bad. 
Now we train the best model on the train set
"""
final_cnb_model = ComplementNB(alpha=0.001,fit_prior=True)
final_cnb_model.fit(Xtrain, labels_train)
Xtest_cnb = final_cnb_model.predict(Xtest)
f1_final = metrics.f1_score(labels_eval, Xtest_cnb)
acc_final = metrics.accuracy_score(labels_eval, Xtest_cnb)


In [10]:
f1_final

0.5169767686323977

In [11]:
acc_final

0.7055053515349612

# 3. Try oversampling methods to deal with imbalanced data

In [12]:
"""
The data is highly imbalanced as the essence of the problem. 
We will try to use 2 methods: SMOTE and ADASYN to deal with imbalanced data and check the result
In this step, we use a experimental setting to avoid data leakage when combining SMOTE and cross-validation
Reference: https://kiwidamien.github.io/how-to-do-cross-validation-when-upsampling-data.html
"""
#ADASYN

from imblearn.pipeline import Pipeline, make_pipeline
from imblearn import datasets
from imblearn.over_sampling import SMOTE, ADASYN
imba_pipeline = make_pipeline(ADASYN(n_neighbors=5, random_state=2727), 
                                    ComplementNB())
param_grid = {
    'complementnb__alpha': alpha_list  
}

grid_imba = GridSearchCV(imba_pipeline, param_grid=param_grid, cv=5, scoring=scorers,
                        return_train_score=True,refit=False)
smote_grid = grid_imba.fit(Xtrain, labels_train)
smote_results = pd.DataFrame(smote_grid.cv_results_)
smote_pivot_table = smote_results.pivot_table(index='param_complementnb__alpha', 
                                     values=['mean_train_accuracy', 'mean_test_accuracy', 
                                             'mean_train_precision', 'mean_test_precision',
                                             'mean_train_recall', 'mean_test_recall',
                                             'mean_train_f1_score', 'mean_test_f1_score'])

print("Grid Search Results:")
print(smote_pivot_table)

Grid Search Results:
                           mean_test_accuracy  mean_test_f1_score  \
param_complementnb__alpha                                           
0.00001                              0.693934            0.535855   
0.00010                              0.693869            0.535803   
0.00100                              0.693821            0.535764   
0.10000                              0.693501            0.535504   
1.00000                              0.693277            0.535389   

                           mean_test_precision  mean_test_recall  \
param_complementnb__alpha                                          
0.00001                               0.405673          0.789134   
0.00010                               0.405613          0.789134   
0.00100                               0.405569          0.789134   
0.10000                               0.405271          0.789134   
1.00000                               0.405082          0.789349   

                  

In [13]:
adasyn_final_cnb_model = make_pipeline(ADASYN(n_neighbors=5, random_state=2727), 
                                    ComplementNB(alpha = 0.00001))
adasyn_final_cnb_model.fit(Xtrain, labels_train)
Xtest_cnb = adasyn_final_cnb_model.predict(Xtest)
f1_adasyn_final = metrics.f1_score(labels_eval, Xtest_cnb)
acc_adasyn_final = metrics.accuracy_score(labels_eval, Xtest_cnb)

In [14]:
#SMOTE

imba_pipeline = make_pipeline(SMOTE(random_state=2727), 
                                    ComplementNB())
param_grid = {
    'complementnb__alpha': alpha_list  
}

grid_imba = GridSearchCV(imba_pipeline, param_grid=param_grid, cv=5, scoring=scorers,
                        return_train_score=True,refit=False)
smote_grid = grid_imba.fit(Xtrain, labels_train)
smote_results = pd.DataFrame(smote_grid.cv_results_)
smote_pivot_table = smote_results.pivot_table(index='param_complementnb__alpha', 
                                     values=['mean_train_accuracy', 'mean_test_accuracy', 
                                             'mean_train_precision', 'mean_test_precision',
                                             'mean_train_recall', 'mean_test_recall',
                                             'mean_train_f1_score', 'mean_test_f1_score'])

print("Grid Search Results:")
print(smote_pivot_table)

Grid Search Results:
                           mean_test_accuracy  mean_test_f1_score  \
param_complementnb__alpha                                           
0.00001                              0.704333            0.520227   
0.00010                              0.704237            0.520146   
0.00100                              0.704092            0.519974   
0.10000                              0.703756            0.519715   
1.00000                              0.703387            0.519505   

                           mean_test_precision  mean_test_recall  \
param_complementnb__alpha                                          
0.00001                               0.408522          0.716054   
0.00010                               0.408422          0.716054   
0.00100                               0.408256          0.715911   
0.10000                               0.407914          0.715982   
1.00000                               0.407562          0.716269   

                  

In [15]:
smote_final_cnb_model = make_pipeline(SMOTE(random_state=2727), 
                                    ComplementNB(alpha = 0.00001))
smote_final_cnb_model.fit(Xtrain, labels_train)
Xtest_cnb = smote_final_cnb_model.predict(Xtest)
f1_smote_final = metrics.f1_score(labels_eval, Xtest_cnb)
acc_smote_final = metrics.accuracy_score(labels_eval, Xtest_cnb)

In [16]:
print("F1-score for model using ADASYN:", f1_adasyn_final)
print("F1-score for model using SMOTE:", f1_smote_final)

F1-score for model using ADASYN: 0.5374535497750831
F1-score for model using SMOTE: 0.5190245953331932


In [17]:
print("Accuracy for model using ADASYN:", acc_adasyn_final)
print("Accuracy for model using SMOTE:", acc_smote_final)

Accuracy for model using ADASYN: 0.6968531692623213
Accuracy for model using SMOTE: 0.7067230660770365


In [18]:
"""
Oversampling improved the model performance. 
ADASYN is better in terms of F1-score, while SMOTE is better in terms of accuracy.
However, considering the perfomance of F1-score is more important for the imbalanced dataset, the optimal model in this case
would be Complement Naive Bayes with ADASYN as the oversampling method.
"""


'\nOversampling improved the model performance. \nADASYN is better in terms of F1-score, while SMOTE is better in terms of accuracy.\nHowever, considering the perfomance of F1-score is more important for the imbalanced dataset, the optimal model in this case\nwould be Complement Naive Bayes with ADASYN as the oversampling method.\n'