# Preprocessinng

In [64]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("training_data.csv")
df.head()

Unnamed: 0,session_id_hash,event_type,product_action,product_sku_hash,server_timestamp_epoch_ms,hashed_url
0,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,d5157f8bc52965390fa21ad5842a8502bc3eb8b0930f3f...,1550885210881,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...
1,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,61ef3869355b78e11011f39fc7ac8f8dfb209b3442a9d5...,1550885213307,4ed279f4f0deab6dfc80f4f7bf49d527fd894fa478a9ce...
2,20c458b802f6ea9374783bfc528b19421be977a6769785...,pageview,,,1550885213307,4ed279f4f0deab6dfc80f4f7bf49d527fd894fa478a9ce...
3,20c458b802f6ea9374783bfc528b19421be977a6769785...,event_product,detail,d5157f8bc52965390fa21ad5842a8502bc3eb8b0930f3f...,1550885215484,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...
4,20c458b802f6ea9374783bfc528b19421be977a6769785...,pageview,,,1550885215484,7e4527ac6a32deed4f4f06bb7c49b907b7ca371e59d57d...


## Pre-processing

pre-process the sessions (you can execute these tasks in the order you prefer, but make sure to specify which task is being solved in which block of code):
1. sessionise <font color='red'>(1pt)</font>
1. select sessions with at least one add-to-cart <font color='red'>(1pt)</font>
1. add class labels: treat purchase as the positive class <font color='red'>(1pt)</font>
1. cut purchase sessions to the last event before the first purchase <font color='red'>(1pt)</font>
1. remove sessions shorter than 5 and longer than 155 clicks <font color='red'>(1pt)</font>
1. symbolise actions <font color='red'>(1pt)</font>


In [3]:
# Sessionise:
# derive sessions from action by action dataset
# merge all the actions by session_id

df['product_action'] = df['product_action'].fillna('view')
df = df.groupby('session_id_hash')['product_action'].agg(list).reset_index()
df.head()

Unnamed: 0,session_id_hash,product_action
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi..."
1,000009f36a40de1d557afc083dbb3fc03eef2473337bad...,"[view, view]"
2,00000e812c3076d18245710a31b348d3f23314b7d0dc90...,[view]
3,00001355930ff05e66ab30bccff221c33eba90e1517397...,"[view, detail]"
4,0000162d1dad0beb867c191ab2c8c7c06086cc57d9ebe2...,"[view, view, view, view, detail, view, detail,..."


In [4]:
##  labelization
df['purchase'] = np.where(df.product_action.map(set(['purchase']).issubset), 1, 0)
df.head()

Unnamed: 0,session_id_hash,product_action,purchase
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi...",0
1,000009f36a40de1d557afc083dbb3fc03eef2473337bad...,"[view, view]",0
2,00000e812c3076d18245710a31b348d3f23314b7d0dc90...,[view],0
3,00001355930ff05e66ab30bccff221c33eba90e1517397...,"[view, detail]",0
4,0000162d1dad0beb867c191ab2c8c7c06086cc57d9ebe2...,"[view, view, view, view, detail, view, detail,...",0


In [5]:
### Cut actions before purchase if there is one
### keep the full sesssion if there is not
df['product_action'] = df['product_action'].map(lambda x: x[0:x.index('purchase')] if 'purchase' in x else x)

## select sessions with at least one add-to-cart
df = df[df.product_action.map(lambda x: "add" in x)]

### Filtered out outliers which > 155 or < 5.    

df["len"] = df["product_action"].map(len)

# removing very short and very long sessions
df.drop(df[df.len < 5].index, inplace=True)
df.drop(df[df.len > 155].index, inplace=True)

df.head()

Unnamed: 0,session_id_hash,product_action,purchase,len
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi...",0,18
37,0000913afa22ba9c31efb992bcf6388b0bbfe28056bef3...,"[view, view, view, detail, view, view, detail,...",0,139
64,00010d84aca1294479304044207fd268f63228844779c6...,"[view, view, view, detail, view, view, view, v...",0,41
84,0001368d732951035a7ef7ef42b345a5c50b7d66966749...,"[view, detail, add, view, view, detail, add, v...",0,16
119,0001c180fb742f96ff388ba8f67a568e6fa66aed30d0d2...,"[view, view, view, view, detail, add, remove, ...",1,29


In [6]:
## Sanity check: the output should be True
df[df.product_action.map(lambda x: "add" not in x)].shape[0]

0

In [7]:
## symbolise actions

from collections import Counter

sessions = df['product_action'].to_list()
labels = df['purchase'].to_list()

counts = Counter([item for session in sessions for item in session])
symbol2idx = {symbol: idx for idx, symbol in enumerate(sorted(counts, key=counts.get, reverse=True), 1)}

df["session"]=df["product_action"].map(lambda session: [symbol2idx[s] for s in session])

df.head()

Unnamed: 0,session_id_hash,product_action,purchase,len,session
0,00000114e1075962f022114fcfc17f2d874e694ac5d201...,"[view, detail, add, view, view, view, view, vi...",0,18,"[1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, ..."
37,0000913afa22ba9c31efb992bcf6388b0bbfe28056bef3...,"[view, view, view, detail, view, view, detail,...",0,139,"[1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, ..."
64,00010d84aca1294479304044207fd268f63228844779c6...,"[view, view, view, detail, view, view, view, v...",0,41,"[1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, ..."
84,0001368d732951035a7ef7ef42b345a5c50b7d66966749...,"[view, detail, add, view, view, detail, add, v...",0,16,"[1, 2, 3, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
119,0001c180fb742f96ff388ba8f67a568e6fa66aed30d0d2...,"[view, view, view, view, detail, add, remove, ...",1,29,"[1, 1, 1, 1, 2, 3, 4, 1, 2, 1, 1, 1, 4, 4, 4, ..."


In [8]:
## Check the symbolisation
symbol2idx

{'view': 1, 'detail': 2, 'add': 3, 'remove': 4}

## Oracle model

In [66]:
def oracle(X_train, y_train):
    '''
    This function return a oracle model based on the session and label.
    '''
    from collections import defaultdict

    d = defaultdict(lambda: defaultdict(int))
    for s, l in zip(X_train, y_train):
        d[str(tuple(s))][l] += 1
        
    model = {}
    for session in d.keys():
        n_buy, n_ws = 0, 0
        for label in d[session].keys():
            if label:
                n_buy += d[session][label]
            else:
                n_ws += d[session][label]
        model[session] = n_buy / (n_buy + n_ws)
        
    return model

def oracle_predict(model, X_test, y_test=None):
    '''
    1. Make prediction on X_test.
    2. For possibility >0.5 predict 1, else 0.
    3. reutrn prediction and F1 score
    '''
    from sklearn import metrics

    y_pred = []
    for x in X_test:
        pos = model.get(str(x), "#") ## For sessions that not in the model, we assign it to #, and then predict 0.
        y_pred.append(pos)
        
    ## count "#" first and substitute with 0
    missing_rate = y_pred.count("#")/len(y_pred)

    y_pred = [1 if y != "#" and y >= 0.5 else 0 for y in y_pred]
        
    if y_test is not None:
#         y_test = y_test.to_list()
        f1 = metrics.f1_score(y_test, y_pred)
        return(f1, missing_rate,y_pred)
    else:
        return y_pred

In [67]:
## Oracle model for 5 clicks
## Subset from the original df
from sklearn.model_selection import train_test_split

i = 5
data = df[["session", "purchase"]]
data.session = df.session.map(lambda x: x[:x.index(3)+1+i] if len(x[x.index(3)+1:])>=i else None)
data.session = df.session.map(tuple)
data = data.dropna()
    
model = oracle(data.session, data.purchase)
f1, missing_rate, prediction = oracle_predict(model, data.session, data.purchase)
f1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


0.8457940701701135

In [68]:
# oracle model for 10 clicks
i = 10
data = df[["session", "purchase"]]
data.session = df.session.map(lambda x: x[:x.index(3)+1+i] if len(x[x.index(3)+1:])>=i else None)
data.session = df.session.map(tuple)
data = data.dropna()
    
model = oracle(data.session, data.purchase)
f1, missing_rate, prediction = oracle_predict(model, data.session, data.purchase)
f1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


0.8457940701701135

In [69]:
# oracle model for 15 clicks
i = 15
data = df[["session", "purchase"]]
data.session = df.session.map(lambda x: x[:x.index(3)+1+i] if len(x[x.index(3)+1:])>=i else None)
data.session = df.session.map(tuple)
data = data.dropna()
    
model = oracle(data.session, data.purchase)
f1, missing_rate, prediction = oracle_predict(model, data.session, data.purchase)
f1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


0.8457940701701135

## Hold Out

1. Subset data for n-click (n=5, 10, 15) models.
1. Do train/test split in for each subset.

In [11]:
### get subset for training
## make the prediction for 5, 10 and 15, and store the f1 it in a dict
## First split the train/test data
from sklearn.model_selection import train_test_split
from collections import defaultdict

data = defaultdict(lambda: defaultdict())

for i in [5, 10, 15]:
    
    ## Subset from the original df
    temp = df[["session", "purchase"]]
    temp.session = temp.session.map(lambda x: x[:x.index(3)+1+i] if len(x[x.index(3)+1:])>=i else None)
    temp = temp.dropna()
    
    ## Split train and test
    X_train, X_test, y_train, y_test = train_test_split(temp["session"], temp["purchase"], test_size=0.2, random_state=123)
    data[f"{i}_clicks"]["X_train"] = X_train.to_numpy()
    data[f"{i}_clicks"]["X_test"] = X_test.to_numpy()
    data[f"{i}_clicks"]["y_train"] = y_train.to_numpy()
    data[f"{i}_clicks"]["y_test"] = y_test.to_numpy()

In [31]:
data_size = pd.DataFrame(data).applymap(len)
data_size

Unnamed: 0,5_clicks,10_clicks,15_clicks
X_train,86213,60709,44216
X_test,21554,15178,11054
y_train,86213,60709,44216
y_test,21554,15178,11054


## Naive Bayes Model

In [32]:
import numpy as np

def ngram_featurizer(session, n=3):
    
    """takes in a list and an integer defining the size of ngrams.
     Returns the ngrams of desired size in the input string"""
    
    session = ['#']*(n-1) + session + ['+']*(n-1)
    ngrams = [tuple(session[i:i+n]) for i in range(len(session)-n+1)]
    
    return ngrams
    

def encode_sessions(sessions,n=3, mapping=None):
    
    """
    Takes in a list of lists, an integer indicating the character ngrams' size,
    and a dictionary mapping ngrams to numerical indices. If no dictionary is passed,
    one is created inside the function.
    The function outputs a 2d NumPy array with as many rows as there are strings in 
    the input list, and the mapping from ngrams to indices, representing the columns 
    of the NumPy array.
    """
    
    if not mapping:
        all_ngrams = set()
        for session in sessions:
            all_ngrams = all_ngrams.union(set(ngram_featurizer(session, n)))
    
        mapping = {ngram: i for i, ngram in enumerate(all_ngrams)}
    
    X = np.zeros((len(sessions), len(mapping)))
    for i, session in enumerate(sessions):
        for ngram in ngram_featurizer(session, n):
            try:
                X[i, mapping[ngram]] += 1
            except KeyError:
                pass
    
    return X, mapping

In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

score = {}
for i in [5,10,15]:
    sessions_train = data[f"{i}_clicks"]["X_train"]
    sessions_eval = data[f"{i}_clicks"]["X_test"]
    labels_train = data[f"{i}_clicks"]["y_train"]
    labels_eval = data[f"{i}_clicks"]["y_test"]

    Xtrain, mapping = encode_sessions(sessions_train)
    Xtest, _ = encode_sessions(sessions_eval, mapping=mapping)

    NB = MultinomialNB(alpha=1, fit_prior=True)
    NB.fit(Xtrain,labels_train)
    bayes_predictions = NB.predict(Xtest)
    
    f1 = metrics.f1_score(labels_eval, bayes_predictions)
    
    score[f"F1_{i}-clicks"] = f1

In [34]:
score

{'F1_5-clicks': 0.34254807692307687,
 'F1_10-clicks': 0.3847316704459562,
 'F1_15-clicks': 0.3973732951675366}

## Markov Chain

In [79]:
# train Markov Chain
from collections import defaultdict

def get_ngram(session, i, n):
        
        """
        Takes in a list, an index and the ngram size n, and returns the history and current 
        token of the appropriate size: the current token is the one at the provided index, while the 
        history consists of the n-1 previous tokens. If the ngram size is 1, only the current token is returned.
        """
        
        if n == 1:
            return session[i]
        else:
            ngram = session[i-(n-1):i+1]
            history = tuple(ngram[:-1])
            target = ngram[-1]
            return (history, target)
        
                    
def update_counts(sessions, n):
        
    """
    Processes the input sessions given an ngram sizes and stores transition counts. 
    A dictionary of dictionary is created, where the first level key indicates the history, the second level 
    key indicates the current word, and the value indicates the history-current word co-occurrence count.
    If the ngram size is 1, then a simple dictionary mapping unigram to frequency count is created
    """
        
    counts = defaultdict(dict)
    for session in sessions:
        session = ['#']*(n-1) + session + ['+']
        for idx in range(n-1, len(session)):
            ngram = get_ngram(session, idx, n)
            if n == 1:
                counts[ngram] += 1
            else:
                # it's faster to try to do something and catch an exception than to use an if statement to 
                # check whether a condition is met beforehand. The if is checked everytime, the exception 
                # is only catched the first time, after that everything runs smoothly
                try:
                    counts[ngram[0]][ngram[1]] += 1
                except KeyError:
                    counts[ngram[0]][ngram[1]] = 1
        
    # first loop through the sentences in the corpus, than loop through each word in a sentence
    vocab = {event for session in sessions for event in session}
        
    return counts, vocab

In [78]:
def get_prior(labels):
    
    """
    Given a vector of binary labels (0 and 1), computes the probability of the positive class.
    """
    
    return np.mean(labels)

def get_posterior(prior, likelihood_one, likelihood_zero):
    
    """
    Given the prior of the positive class and the likelihood of an input under the language models trained
    on input from the positive and negative classes, returns the posterior probability of the positive class.
    """
    
    if 0 < likelihood_zero <= 1 and 0 < likelihood_one <= 1:
        return (prior * likelihood_one) / ((prior * likelihood_one) + ((1 - prior) * likelihood_zero))
    elif likelihood_zero <= 0 and likelihood_one <= 0:
        prior = np.log(prior) if 0 < prior <= 1 else prior
        normalize = np.logaddexp(prior + likelihood_one, 1 - prior + likelihood_zero)
        return np.exp(prior + likelihood_one - normalize)
    else:
        raise ValueError("One or both likelihood terms are higher than 1!")
    

def get_unigram_probability(ngram, counts, vocab_size, k=1):
        
    """
    When n==1, we get the unigram probability.
    Means that we get the probability of an event based on its prior event.
    In this function, we transfer the count into probability with smoothing constant of 1. 
    And do row normalization after.
    """
        
    tot = sum(list(counts.values())) + (vocab_size*k)
        
    try:
        ngram_count = counts[ngram] + k
    except KeyError:
        ngram_count = k
        
    return ngram_count/tot

def get_ngram_probability(history, target, counts, vocab_size, k=1):
        
    """
    When n>1, we get the ngram probability.
    Means that we get the probability of an event based on its prior n-1 event.
    In this function, we transfer the count into probability with smoothing constant of 1. 
    And do row normalization after.
    """
        
    try:
        ngram_tot = np.sum(list(counts[history].values())) + (vocab_size*k)
        try:
            transition_count = counts[history][target] + k
        except KeyError:
            transition_count = k
    except KeyError:
        transition_count = k
        ngram_tot = vocab_size*k
            
    return transition_count/ngram_tot 

def get_sequence_prob(session, counts, n, vocab_size, k=1):
        
    """
    Apply the Rule of Markov chain: 
    
    MULTIPLY the conditional probabily of each event in the sequence given its history with (n-1) length. 
    
    """
        
    probs = []
    for idx in range(n-1, len(session)):
        ngram = get_ngram(session, idx, n)
        if n == 1:
            probs.append(get_unigram_probability(ngram, counts, vocab_size))
        else:
            probs.append(get_ngram_probability(ngram[0], ngram[1], counts, vocab_size))

        return np.prod(probs)

In [77]:
def markov_train_predict(X_train, y_train, X_test, ngram = 3):
    '''
    1. take in the training data and split them into group of buy vs. non-buy;
    2. get the counts table for both group.
    3. Make prediction based on two count tables.
    '''
    
    ## Split the training data into two category: buy vs non_buy
    train_buy, train_ws = [], []
    for session, label in zip(X_train, y_train):
        if label == 1:
            train_buy.append(session)
        else:
            train_ws.append(session)
    
    ## calculate the count table
    buy_counts, buy_vocab = update_counts(train_buy, ngram)
    ws_counts, ws_vocab = update_counts(train_ws, ngram)
    
    ## get prior from traning data
    prior_buy = get_prior(y_train)
    
    ## Make prediction
    markov_predictions = []
    
    for session in X_test:
        likelihood_buy = get_sequence_prob(session, buy_counts, ngram, len(buy_vocab))
        likelihood_ws = get_sequence_prob(session, ws_counts, ngram, len(ws_vocab))

        markov_predictions.append(np.round(get_posterior(prior_buy, likelihood_buy, likelihood_ws)))
        
    return markov_predictions

### 5-clicks

In [72]:
i = '5_clicks'
X_train = data[i]['X_train']
y_train = data[i]['y_train']
X_test = data[i]['X_test']
y_test = data[i]['y_test']


In [80]:
for i in range(3,7):
    y_pred = markov_train_predict(X_train, y_train, X_test, ngram = i)

    # evaluate Markov Chain
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    print(f"For n={i}: acc={acc}, f1={f1}")

For n=3: acc=0.7528996937923356, f1=0.0
For n=4: acc=0.7528996937923356, f1=0.0
For n=5: acc=0.7528069035909808, f1=0.000375234521575985
For n=6: acc=0.7528532986916582, f1=0.00037530493525989863


### 10-clicks

In [81]:
i = '10_clicks'
X_train = data[i]['X_train']
y_train = data[i]['y_train']
X_test = data[i]['X_test']
y_test = data[i]['y_test']

In [83]:
for i in range(4,11):
    y_pred = markov_train_predict(X_train, y_train, X_test, ngram = i)

    # evaluate Markov Chain
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    print(f"For n={i}: acc={acc}, f1={f1}")

For n=4: acc=0.741929107919357, f1=0.0
For n=5: acc=0.7419949927526683, f1=0.0005104645227156713
For n=6: acc=0.741929107919357, f1=0.0
For n=7: acc=0.7414679140861774, f1=0.0
For n=8: acc=0.741929107919357, f1=0.0015294417537598777
For n=9: acc=0.7415996837528, f1=0.0030503304524656842
For n=10: acc=0.740677296086441, f1=0.005055611729019211


## 15-clicks

In [84]:
i = '10_clicks'
X_train = data[i]['X_train']
y_train = data[i]['y_train']
X_test = data[i]['X_test']
y_test = data[i]['y_test']

In [86]:
for i in range(4,12):
    y_pred = markov_train_predict(X_train, y_train, X_test, ngram = i)

    # evaluate Markov Chain
    acc = metrics.accuracy_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    print(f"For n={i}: acc={acc}, f1={f1}")

For n=4: acc=0.741929107919357, f1=0.0
For n=5: acc=0.7419949927526683, f1=0.0005104645227156713
For n=6: acc=0.741929107919357, f1=0.0
For n=7: acc=0.7414679140861774, f1=0.0
For n=8: acc=0.741929107919357, f1=0.0015294417537598777
For n=9: acc=0.7415996837528, f1=0.0030503304524656842
For n=10: acc=0.740677296086441, f1=0.005055611729019211
For n=11: acc=0.7402819870865727, f1=0.009547738693467336
