# Naive Bayes model 

In [3]:
import numpy as np
import pandas as pd


## Load the data


In [27]:
## LOAD the data
df = pd.read_csv("data/filtered_data.csv")

## Fix the datatype of session column
def convert_session(x):
    return [int(i) for i in x.strip("[]").split(",")]

df.session = df.session.map(convert_session)

df.head()

Unnamed: 0,session_id_hash,product_action,purchase,len,session
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"['view', 'view', 'view', 'view', 'view', 'view...",0,15,"[1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1]"
1,0000913afa22ba9c31efb992bcf6388b0bbfe28056bef3...,"['view', 'detail', 'view', 'view', 'view', 'de...",0,120,"[1, 2, 1, 1, 1, 2, 1, 2, 1, 4, 1, 1, 1, 1, 2, ..."
2,00010d84aca1294479304044207fd268f63228844779c6...,"['view', 'view', 'view', 'view', 'view', 'view...",0,24,"[1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 4, 1, 1, 2, 1, ..."
3,0001368d732951035a7ef7ef42b345a5c50b7d66966749...,"['view', 'view', 'detail', 'add', 'view', 'vie...",0,13,"[1, 1, 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
4,0001c180fb742f96ff388ba8f67a568e6fa66aed30d0d2...,"['remove', 'view', 'detail', 'view', 'view', '...",1,23,"[3, 1, 2, 1, 1, 1, 3, 3, 3, 1, 1, 3, 1, 3, 1, ..."


## Split train and test

In [55]:
### get subset for training
## make the prediction for 5, 10 and 15, and store the f1 it in a dict
## First split the train/test data
from sklearn.model_selection import train_test_split
from collections import defaultdict

data = defaultdict(lambda: defaultdict())

for i in [5, 10, 15]:
    
    ## Subset from the original df
    temp = df[df.len >= i]
    temp["session"] = temp.session.map(lambda x: x[0:i])
    temp = temp[["session", "purchase"]]
    
    ## Split train and test
    X_train, X_test, y_train, y_test = train_test_split(temp["session"], temp["purchase"], test_size=0.2, random_state=123)
    data[f"{i}_clicks"]["X_train"] = X_train.to_numpy()
    data[f"{i}_clicks"]["X_test"] = X_test.to_numpy()
    data[f"{i}_clicks"]["y_train"] = y_train.to_numpy()
    data[f"{i}_clicks"]["y_test"] = y_test.to_numpy()
    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp["session"] = temp.session.map(lambda x: x[0:i])


In [None]:
## Define 

In [58]:
import numpy as np

def ngram_featurizer(session, n=3):
    
    """takes in a list and an integer defining the size of ngrams.
     Returns the ngrams of desired size in the input string"""
    
    session = ['#']*(n-1) + session + ['+']*(n-1)
    ngrams = [tuple(session[i:i+n]) for i in range(len(session)-n+1)]
    
    return ngrams
    

def encode_sessions(sessions,n=3, mapping=None):
    
    """
    Takes in a list of lists, an integer indicating the character ngrams' size,
    and a dictionary mapping ngrams to numerical indices. If no dictionary is passed,
    one is created inside the function.
    The function outputs a 2d NumPy array with as many rows as there are strings in 
    the input list, and the mapping from ngrams to indices, representing the columns 
    of the NumPy array.
    """
    
    if not mapping:
        all_ngrams = set()
        for session in sessions:
            all_ngrams = all_ngrams.union(set(ngram_featurizer(session, n)))
    
        mapping = {ngram: i for i, ngram in enumerate(all_ngrams)}
    
    X = np.zeros((len(sessions), len(mapping)))
    for i, session in enumerate(sessions):
        for ngram in ngram_featurizer(session, n):
            try:
                X[i, mapping[ngram]] += 1
            except KeyError:
                pass
    
    return X, mapping

In [69]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

score = {}
for i in [5,10,15]:
    sessions_train = data[f"{i}_clicks"]["X_train"]
    sessions_eval = data[f"{i}_clicks"]["X_test"]
    labels_train = data[f"{i}_clicks"]["y_train"]
    labels_eval = data[f"{i}_clicks"]["y_test"]

    Xtrain, mapping = encode_sessions(sessions_train)
    Xtest, _ = encode_sessions(sessions_eval, mapping=mapping)

    NB = MultinomialNB(alpha=1, fit_prior=True)
    NB.fit(Xtrain,labels_train)
    bayes_predictions = NB.predict(Xtest)
    
    f1 = metrics.f1_score(labels_eval, bayes_predictions)
    
    score[f"F1_{i}-clicks"] = f1

In [70]:
score

{'F1_5-clicks': 0.437307333859058,
 'F1_10-clicks': 0.41121939624435466,
 'F1_15-clicks': 0.4021269524759056}

## Save the score

In [71]:
import json

json.dump(score, open("data/naive_score.json", "w"))
