# Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
import numpy as np
from lightfm.cross_validation import random_train_test_split
import os
from scipy.sparse import csr_matrix, vstack

# Data Prep

## Load Data

In [2]:
data = pd.read_csv("./fnb_datav2.csv")

# Remove 
data = data.drop(columns = ["item_descrip", "tod"])
data.head(20)

# Add the following user features

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind
0,755,DISPLAY,17JAN2023,NONE,Screen1,ALL,segment3,B01,Semi Active
1,4521,DISPLAY,27FEB2023,NONE,Screen1,ALL,segment1,B07,Semi Active
2,4521,DISPLAY,18FEB2023,NONE,Screen1,ALL,segment1,B07,Semi Active
3,4521,DISPLAY,30JAN2023,NONE,Screen1,ALL,segment1,B07,Semi Active
4,4521,CLICK,05FEB2023,IBAB,Screen1,INSURE,segment1,B07,Semi Active
5,4521,CHECKOUT,05FEB2023,IBAB,Screen1,INSURE,segment1,B07,Semi Active
6,6145,DISPLAY,26FEB2023,NONE,Screen1,ALL,segment3,B01,Cold Start
7,6145,DISPLAY,27JAN2023,NONE,Screen1,ALL,segment3,B01,Cold Start
8,6145,DISPLAY,10FEB2023,NONE,Screen1,ALL,segment3,B01,Cold Start
9,6145,DISPLAY,10JAN2023,NONE,Screen1,ALL,segment3,B01,Cold Start


In [3]:
# Checking transactions for a specific ID:
id = 77196041
data[data["idcol"]==id]

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind
155291,77196041,DISPLAY,26FEB2023,NONE,Screen1,ALL,segment2,B01,Active
155292,77196041,DISPLAY,16FEB2023,NONE,Screen1,ALL,segment2,B01,Active
155293,77196041,DISPLAY,29MAR2023,NONE,Screen1,ALL,segment2,B01,Active
155294,77196041,DISPLAY,05JAN2023,NONE,Screen1,ALL,segment2,B01,Active
155295,77196041,CLICK,27MAR2023,FICQ,Screen2,INSURE,segment2,B01,Active
...,...,...,...,...,...,...,...,...,...
155353,77196041,CLICK,27MAR2023,CACU,Screen2,TRANSACT,segment2,B01,Active
155354,77196041,CHECKOUT,27MAR2023,CABC,Screen2,INVEST,segment2,B01,Active
155355,77196041,CLICK,27MAR2023,CABC,Screen2,INVEST,segment2,B01,Active
155356,77196041,CHECKOUT,27MAR2023,SEVP,Screen2,TRANSACT,segment2,B01,Active


In [4]:
data.nunique()

idcol          84375
interaction        3
int_date          88
item             104
page               2
item_type          7
segment            4
beh_segment       50
active_ind         3
dtype: int64

In [5]:
# Add the following user features
"""
- weekly interaction frequency, 
- most frequenctly item interacted with (other than ALL),

"""

# Give scores to the interactions, and then drop the interaction column:L
interaction_scores = {
    'CLICK': 1,
    'CHECKOUT': 2
}

interaction_scores = {
    'DISPLAY': 0,
    'CLICK': 1,
    'CHECKOUT': 2
}

data['interaction_scores'] = data['interaction'].map(interaction_scores).fillna(0).astype(int)

# Add ids for each unique item
data['item_id'] = pd.factorize(data['item'])[0]

data.drop(columns = ["interaction"])
data.head()

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,item_id
0,755,DISPLAY,17JAN2023,NONE,Screen1,ALL,segment3,B01,Semi Active,0,0
1,4521,DISPLAY,27FEB2023,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0
2,4521,DISPLAY,18FEB2023,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0
3,4521,DISPLAY,30JAN2023,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0
4,4521,CLICK,05FEB2023,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,1


### Add User Features:

#### Add the following user features
- weekly interaction frequency - on average, over the whole dataset, how many times does the user interact per week
- daily interaction frequency - similar to above
- monthly interaction frequency - how many times 
- most frequenctly item interacted with (other than ALL) over the whole 
- most frequently interacted with item type
- Ratio of checkout to click for each user-item combination (ask Lize)
- Potential other features to add from TOD:
    - average time between clicking item
    - average time between checking out the item


In [6]:
original_data =  data.copy()

target_idcol = 77196041
index = int(original_data[original_data['idcol'] == target_idcol].index[0])


In [7]:
# Adding daily, weekly and monthly interaction frequencies, over entire dataset
data = original_data.copy()
# Convert int_date to datetime
data['int_date'] = pd.to_datetime(data['int_date'], format='%d%b%Y')

# Calculate daily interaction frequency
daily_freq = data.groupby(['idcol', 'int_date']).size().groupby(level=0).mean().reset_index(name='avg_daily_freq')

# Calculate weekly interaction frequency
data['week'] = data['int_date'].dt.isocalendar().week
weekly_freq = data.groupby(['idcol', 'week']).size().groupby(level=0).mean().reset_index(name='avg_weekly_freq')

# Calculate monthly interaction frequency
data['month'] = data['int_date'].dt.to_period('M')
monthly_freq = data.groupby(['idcol', 'month']).size().groupby(level=0).mean().reset_index(name='avg_monthly_freq')

# Merge frequencies back into the original DataFrame
data = data.merge(daily_freq, on='idcol')
data = data.merge(weekly_freq, on='idcol')
data = data.merge(monthly_freq, on='idcol')

data[data["idcol"] == target_idcol].head(20)




# # Assuming original_data is your original DataFrame
# # Make a copy of the original data
# data = original_data.copy()

# # Convert int_date to datetime
# data['int_date'] = pd.to_datetime(data['int_date'], format='%d%b%Y')

# # Filter the data to include only CLICK or CHECKOUT interactions
# filtered_data = data[data['interaction'].isin(['CLICK', 'CHECKOUT'])]

# # Calculate daily interaction frequency
# daily_freq = filtered_data.groupby(['idcol', 'int_date']).size().groupby(level=0).mean().reset_index(name='avg_daily_freq')

# # Calculate weekly interaction frequency
# filtered_data['week'] = filtered_data['int_date'].dt.isocalendar().week
# weekly_freq = filtered_data.groupby(['idcol', 'week']).size().groupby(level=0).mean().reset_index(name='avg_weekly_freq')

# # Calculate monthly interaction frequency
# filtered_data['month'] = filtered_data['int_date'].dt.to_period('M')
# monthly_freq = filtered_data.groupby(['idcol', 'month']).size().groupby(level=0).mean().reset_index(name='avg_monthly_freq')

# # Merge frequencies back into the original DataFrame
# data = data.merge(daily_freq, on='idcol', how='left')
# data = data.merge(weekly_freq, on='idcol', how='left')
# data = data.merge(monthly_freq, on='idcol', how='left')
# data = data.fillna(0)

# Example: Display data for a specific idcol
# target_idcol = 755  # Replace with the actual idcol you want to inspect
# data[data["idcol"] == target_idcol].head(20)
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,week,month,avg_daily_freq,avg_weekly_freq,avg_monthly_freq
0,755,DISPLAY,2023-01-17,NONE,Screen1,ALL,segment3,B01,Semi Active,0,0,3,2023-01,1.0,1.0,1.0
1,4521,DISPLAY,2023-02-27,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,9,2023-02,1.25,1.666667,2.5
2,4521,DISPLAY,2023-02-18,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,7,2023-02,1.25,1.666667,2.5
3,4521,DISPLAY,2023-01-30,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,5,2023-01,1.25,1.666667,2.5
4,4521,CLICK,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,1,5,2023-02,1.25,1.666667,2.5
5,4521,CHECKOUT,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,2,1,5,2023-02,1.25,1.666667,2.5
6,6145,DISPLAY,2023-02-26,NONE,Screen1,ALL,segment3,B01,Cold Start,0,0,8,2023-02,1.0,1.0,2.0
7,6145,DISPLAY,2023-01-27,NONE,Screen1,ALL,segment3,B01,Cold Start,0,0,4,2023-01,1.0,1.0,2.0
8,6145,DISPLAY,2023-02-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,0,6,2023-02,1.0,1.0,2.0
9,6145,DISPLAY,2023-01-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,0,2,2023-01,1.0,1.0,2.0


In [8]:
# Adding most clicked and most bought items and item types for each user:
clicks = data[data['interaction'] == 'CLICK']
most_clicked = clicks.groupby(['idcol', 'item']).size().reset_index(name='click_count')
most_clicked = most_clicked.loc[most_clicked.groupby('idcol')['click_count'].idxmax()][['idcol', 'item']]
most_clicked.rename(columns={'item': 'most_clicked_item'}, inplace=True)

# Determine the most bought item for each user
checkouts = data[data['interaction'] == 'CHECKOUT']
most_bought = checkouts.groupby(['idcol', 'item']).size().reset_index(name='checkout_count')
most_bought = most_bought.loc[most_bought.groupby('idcol')['checkout_count'].idxmax()][['idcol', 'item']]
most_bought.rename(columns={'item': 'most_bought_item'}, inplace=True)

# Merge the most clicked and most bought items back into the original DataFrame
data = data.merge(most_clicked, on='idcol', how='left')
data = data.merge(most_bought, on='idcol', how='left')

data.head()

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,week,month,avg_daily_freq,avg_weekly_freq,avg_monthly_freq,most_clicked_item,most_bought_item
0,755,DISPLAY,2023-01-17,NONE,Screen1,ALL,segment3,B01,Semi Active,0,0,3,2023-01,1.0,1.0,1.0,,
1,4521,DISPLAY,2023-02-27,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,9,2023-02,1.25,1.666667,2.5,IBAB,IBAB
2,4521,DISPLAY,2023-02-18,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,7,2023-02,1.25,1.666667,2.5,IBAB,IBAB
3,4521,DISPLAY,2023-01-30,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,5,2023-01,1.25,1.666667,2.5,IBAB,IBAB
4,4521,CLICK,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,1,5,2023-02,1.25,1.666667,2.5,IBAB,IBAB


In [9]:
# Adding most clicked and most bought items and item types for each user:
clicks = data[data['interaction'] == 'CLICK']
most_clicked = clicks.groupby(['idcol', 'item_type']).size().reset_index(name='click_count')
most_clicked = most_clicked.loc[most_clicked.groupby('idcol')['click_count'].idxmax()][['idcol', 'item_type']]
most_clicked.rename(columns={'item_type': 'most_clicked_item_type'}, inplace=True)

# Determine the most bought item for each user
checkouts = data[data['interaction'] == 'CHECKOUT']
most_bought = checkouts.groupby(['idcol', 'item_type']).size().reset_index(name='checkout_count')
most_bought = most_bought.loc[most_bought.groupby('idcol')['checkout_count'].idxmax()][['idcol', 'item_type']]
most_bought.rename(columns={'item_type': 'most_bought_item_type'}, inplace=True)

# Merge the most clicked and most bought items back into the original DataFrame
data = data.merge(most_clicked, on='idcol', how='left')
data = data.merge(most_bought, on='idcol', how='left')

data.head()

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,week,month,avg_daily_freq,avg_weekly_freq,avg_monthly_freq,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type
0,755,DISPLAY,2023-01-17,NONE,Screen1,ALL,segment3,B01,Semi Active,0,0,3,2023-01,1.0,1.0,1.0,,,,
1,4521,DISPLAY,2023-02-27,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,9,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE
2,4521,DISPLAY,2023-02-18,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,7,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE
3,4521,DISPLAY,2023-01-30,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,5,2023-01,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE
4,4521,CLICK,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,1,5,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE


In [10]:
# # For each unique user/item combination, what is the ratio of CHECKOUT to CLICK?# First, filter the DataFrame to include only CHECKOUT and CLICK interactions
# # NOTE: This isn't valid because an item doesn't need to be clicked to check it out - that is weird. I can intelligently fill NaN values with 1 if there was a 
# # checkout but no click and 0 otherwise, but ask Lizes opinion first

# checkout_click_df = data[data['interaction'].isin(['CHECKOUT', 'CLICK'])]

# # Group by unique idcol-item combinations and count the occurrences of each interaction type
# interaction_counts = checkout_click_df.groupby(['idcol', 'item', 'interaction']).size().unstack(fill_value=0)

# # Calculate the ratio of CHECKOUT to CLICK interactions
# interaction_counts['checkout_click_ratio'] = interaction_counts['CHECKOUT'] / interaction_counts['CLICK']


# # # Merge the ratio back to the original DataFrame based on idcol and item
# # data = data.merge(interaction_counts.reset_index()[['idcol', 'item', 'checkout_click_ratio']], on=['idcol', 'item'], how='left')

# # # Display the DataFrame with the new column
# # data.head(20)

In [11]:
# # On which page did the user checkout on the item most frequently? (NOT USING)
# # Filter the DataFrame to include only CLICK interactions
# click_df = data[data['interaction'] == 'CHECKOUT']

# # Group by user-item-screen combination and count the occurrences
# click_counts = click_df.groupby(['idcol', 'item', 'page'])['interaction'].count().reset_index()

# # Find the screen with the maximum count for each user-item combination
# max_click_screen = click_counts.groupby(['idcol', 'item']).apply(lambda x: x.loc[x['interaction'].idxmax()]).reset_index(drop=True)

# # Merge the result back to the original DataFrame based on user-item combination
# data = data.merge(max_click_screen[['idcol', 'item', 'page']], on=['idcol', 'item'], how='left')
# data.rename(columns={'page': 'most_checked_out_screen'}, inplace=True)

# # Display the DataFrame with the new column
# data.head(20)

In [12]:
# I will fill NaN values with the mode of the data, for people of the same segment and BEH segment

def fill_nan_with_mode_for_segment_and_beh_segment(data):
    # Get unique combinations of segment and beh_segment
    unique_combinations = data[['segment', 'beh_segment']].drop_duplicates()

    for index, row in unique_combinations.iterrows():
        segment = row['segment']
        beh_segment = row['beh_segment']
        
        # Filter the DataFrame based on the current segment and beh_segment
        filtered_data = data[(data['segment'] == segment) & (data['beh_segment'] == beh_segment)]
        
        # Calculate mode for each column within the segment and beh_segment group
        mode_values = filtered_data.mode().iloc[0]
        
        # Replace NaN values in the original DataFrame with mode values for the current segment and beh_segment
        data.loc[(data['segment'] == segment) & (data['beh_segment'] == beh_segment)] = \
            filtered_data.fillna(mode_values)
    
    return data


test = fill_nan_with_mode_for_segment_and_beh_segment(data)

test.head()

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,week,month,avg_daily_freq,avg_weekly_freq,avg_monthly_freq,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type
0,755,DISPLAY,2023-01-17,NONE,Screen1,ALL,segment3,B01,Semi Active,0,0,3,2023-01,1.0,1.0,1.0,CTLN,CTLN,LEND,LEND
1,4521,DISPLAY,2023-02-27,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,9,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE
2,4521,DISPLAY,2023-02-18,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,7,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE
3,4521,DISPLAY,2023-01-30,NONE,Screen1,ALL,segment1,B07,Semi Active,0,0,5,2023-01,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE
4,4521,CLICK,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,1,5,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE


### Add Item Features
- Most bought by segment
- Most bought by beh segment
- Most clicked by segment
- Most clicked by beh_segment
- For this item, what is the ratio of checkouts to clicks over the entire dataset?
- WHich screen was this item accessed from the most?

In [13]:
original_data = data.copy()

In [14]:
data = original_data.copy()
# data.head()

# Which segment clicked this item the most?
# Step 1: Filter to only include "CLICK" interactions
click_data = data[data['interaction'] == 'CLICK']
# Step 2: Group by item and segment, count the number of "CLICK" interactions
click_counts = click_data.groupby(['item', 'segment']).size().reset_index(name='click_count')

# Step 3: Determine the segment with the maximum "CLICK" interactions for each item
most_clicked_by_seg = click_counts.loc[click_counts.groupby('item')['click_count'].idxmax()]

# Step 4: Create a dictionary to map items to the segment with the most "CLICK" interactions
item_to_segment = most_clicked_by_seg.set_index('item')['segment'].to_dict()

# Step 5: Map this information back to the original dataframe
data['most_clicked_by_seg'] = data['item'].map(item_to_segment)

data.head(20)


Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,...,week,month,avg_daily_freq,avg_weekly_freq,avg_monthly_freq,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type,most_clicked_by_seg
0,755,DISPLAY,2023-01-17,NONE,Screen1,ALL,segment3,B01,Semi Active,0,...,3,2023-01,1.0,1.0,1.0,CTLN,CTLN,LEND,LEND,
1,4521,DISPLAY,2023-02-27,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,9,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,
2,4521,DISPLAY,2023-02-18,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,7,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,
3,4521,DISPLAY,2023-01-30,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,5,2023-01,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,
4,4521,CLICK,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,...,5,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,segment1
5,4521,CHECKOUT,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,2,...,5,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,segment1
6,6145,DISPLAY,2023-02-26,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,8,2023-02,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,
7,6145,DISPLAY,2023-01-27,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,4,2023-01,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,
8,6145,DISPLAY,2023-02-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,6,2023-02,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,
9,6145,DISPLAY,2023-01-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,2,2023-01,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,


In [15]:
# Which beh_segment clicked this item the most?
# Step 1: Filter to only include "CLICK" interactions
click_data = data[data['interaction'] == 'CLICK']
# Step 2: Group by item and segment, count the number of "CLICK" interactions
click_counts = click_data.groupby(['item', 'beh_segment']).size().reset_index(name='click_count')

# Step 3: Determine the segment with the maximum "CLICK" interactions for each item
most_clicked_by_seg = click_counts.loc[click_counts.groupby('item')['click_count'].idxmax()]

# Step 4: Create a dictionary to map items to the segment with the most "CLICK" interactions
item_to_segment = most_clicked_by_seg.set_index('item')['beh_segment'].to_dict()

# Step 5: Map this information back to the original dataframe
data['most_clicked_by_beh_seg'] = data['item'].map(item_to_segment)

data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,...,month,avg_daily_freq,avg_weekly_freq,avg_monthly_freq,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type,most_clicked_by_seg,most_clicked_by_beh_seg
0,755,DISPLAY,2023-01-17,NONE,Screen1,ALL,segment3,B01,Semi Active,0,...,2023-01,1.0,1.0,1.0,CTLN,CTLN,LEND,LEND,,
1,4521,DISPLAY,2023-02-27,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,,
2,4521,DISPLAY,2023-02-18,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,,
3,4521,DISPLAY,2023-01-30,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,2023-01,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,,
4,4521,CLICK,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,...,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,segment1,B01
5,4521,CHECKOUT,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,2,...,2023-02,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,segment1,B01
6,6145,DISPLAY,2023-02-26,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,2023-02,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,,
7,6145,DISPLAY,2023-01-27,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,2023-01,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,,
8,6145,DISPLAY,2023-02-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,2023-02,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,,
9,6145,DISPLAY,2023-01-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,2023-01,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,,


In [16]:
# Which segment bought this item the most?
# Step 1: Filter to only include "CLICK" interactions
click_data = data[data['interaction'] == 'CHECKOUT']
# Step 2: Group by item and segment, count the number of "CLICK" interactions
click_counts = click_data.groupby(['item', 'segment']).size().reset_index(name='click_count')

# Step 3: Determine the segment with the maximum "CLICK" interactions for each item
most_clicked_by_seg = click_counts.loc[click_counts.groupby('item')['click_count'].idxmax()]

# Step 4: Create a dictionary to map items to the segment with the most "CLICK" interactions
item_to_segment = most_clicked_by_seg.set_index('item')['segment'].to_dict()

# Step 5: Map this information back to the original dataframe
data['most_bought_by_seg'] = data['item'].map(item_to_segment)

data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,...,avg_daily_freq,avg_weekly_freq,avg_monthly_freq,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type,most_clicked_by_seg,most_clicked_by_beh_seg,most_bought_by_seg
0,755,DISPLAY,2023-01-17,NONE,Screen1,ALL,segment3,B01,Semi Active,0,...,1.0,1.0,1.0,CTLN,CTLN,LEND,LEND,,,
1,4521,DISPLAY,2023-02-27,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,,,
2,4521,DISPLAY,2023-02-18,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,,,
3,4521,DISPLAY,2023-01-30,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,,,
4,4521,CLICK,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,...,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,segment1,B01,segment1
5,4521,CHECKOUT,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,2,...,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,segment1,B01,segment1
6,6145,DISPLAY,2023-02-26,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,,,
7,6145,DISPLAY,2023-01-27,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,,,
8,6145,DISPLAY,2023-02-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,,,
9,6145,DISPLAY,2023-01-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND,,,


In [17]:
# Which beh_segment bought this item the most?
# Step 1: Filter to only include "CLICK" interactions
click_data = data[data['interaction'] == 'CHECKOUT']
# Step 2: Group by item and segment, count the number of "CLICK" interactions
click_counts = click_data.groupby(['item', 'beh_segment']).size().reset_index(name='click_count')

# Step 3: Determine the segment with the maximum "CLICK" interactions for each item
most_clicked_by_seg = click_counts.loc[click_counts.groupby('item')['click_count'].idxmax()]

# Step 4: Create a dictionary to map items to the segment with the most "CLICK" interactions
item_to_segment = most_clicked_by_seg.set_index('item')['beh_segment'].to_dict()

# Step 5: Map this information back to the original dataframe
data['most_bought_by_beh_seg'] = data['item'].map(item_to_segment)

data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,...,avg_weekly_freq,avg_monthly_freq,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type,most_clicked_by_seg,most_clicked_by_beh_seg,most_bought_by_seg,most_bought_by_beh_seg
0,755,DISPLAY,2023-01-17,NONE,Screen1,ALL,segment3,B01,Semi Active,0,...,1.0,1.0,CTLN,CTLN,LEND,LEND,,,,
1,4521,DISPLAY,2023-02-27,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,,,,
2,4521,DISPLAY,2023-02-18,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,,,,
3,4521,DISPLAY,2023-01-30,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,,,,
4,4521,CLICK,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,...,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,segment1,B01,segment1,B01
5,4521,CHECKOUT,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,2,...,1.666667,2.5,IBAB,IBAB,INSURE,INSURE,segment1,B01,segment1,B01
6,6145,DISPLAY,2023-02-26,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,1.0,2.0,CTLN,CTLN,LEND,LEND,,,,
7,6145,DISPLAY,2023-01-27,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,1.0,2.0,CTLN,CTLN,LEND,LEND,,,,
8,6145,DISPLAY,2023-02-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,1.0,2.0,CTLN,CTLN,LEND,LEND,,,,
9,6145,DISPLAY,2023-01-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,1.0,2.0,CTLN,CTLN,LEND,LEND,,,,


In [18]:
# Checout to click ratio for this item across the entire dataset: (I assume each item was checkout and clicked at least once)

# Filter the DataFrame to include only CHECKOUT and CLICK interactions
checkout_click_df = data[data['interaction'].isin(['CHECKOUT', 'CLICK'])]

# Group by item and count the occurrences of each interaction type
interaction_counts = checkout_click_df.groupby('item')['interaction'].value_counts().unstack(fill_value=0)

# Calculate the ratio of CHECKOUT to CLICK interactions for each item
interaction_counts['item_checkout_click_ratio'] = interaction_counts['CHECKOUT'] / interaction_counts['CLICK']

# Merge the ratio back to the original DataFrame based on item
data = data.merge(interaction_counts['item_checkout_click_ratio'].reset_index(), on='item', how='left')



# Display the DataFrame with the new column
data.head(20)

# Check NANS
# Assuming your DataFrame is named df
# Check for NaN values in each column
nan_columns = data.columns[data.isna().any()].tolist()

# Print the columns with NaN values
print("Columns with NaN values:", nan_columns)

Columns with NaN values: ['most_clicked_item', 'most_bought_item', 'most_clicked_item_type', 'most_bought_item_type', 'most_clicked_by_seg', 'most_clicked_by_beh_seg', 'most_bought_by_seg', 'most_bought_by_beh_seg', 'item_checkout_click_ratio']


In [19]:
# # WHich screen was this item accessed forom the most frequently (clicked or checked out)

# # Filter the DataFrame to include only CLICK and CHECKOUT interactions
# click_checkout_df = data[data['interaction'].isin(['CLICK', 'CHECKOUT'])]

# # Group by item and screen and count the occurrences of each combination
# screen_counts = click_checkout_df.groupby(['item', 'page'])['interaction'].count().reset_index()

# # Find the screen with the highest count for each item
# max_screen = screen_counts.loc[screen_counts.groupby('item')['interaction'].idxmax()]
# max_screen.rename(columns={'page': 'most_frequent_screen'}, inplace=True)

# # # Merge the screen information back to the original DataFrame based on item
# data = data.merge(max_screen[['item', 'most_frequent_screen']], on='item', how='left')

# # # Rename the column to indicate the most frequent screen
# # data.rename(columns={'page': 'most_frequent_screen'}, inplace=True)

# # # Display the DataFrame with the new column
# data.head()

# Prep Features:

In [20]:
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,page,item_type,segment,beh_segment,active_ind,interaction_scores,...,avg_monthly_freq,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type,most_clicked_by_seg,most_clicked_by_beh_seg,most_bought_by_seg,most_bought_by_beh_seg,item_checkout_click_ratio
0,755,DISPLAY,2023-01-17,NONE,Screen1,ALL,segment3,B01,Semi Active,0,...,1.0,CTLN,CTLN,LEND,LEND,,,,,
1,4521,DISPLAY,2023-02-27,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,2.5,IBAB,IBAB,INSURE,INSURE,,,,,
2,4521,DISPLAY,2023-02-18,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,2.5,IBAB,IBAB,INSURE,INSURE,,,,,
3,4521,DISPLAY,2023-01-30,NONE,Screen1,ALL,segment1,B07,Semi Active,0,...,2.5,IBAB,IBAB,INSURE,INSURE,,,,,
4,4521,CLICK,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,1,...,2.5,IBAB,IBAB,INSURE,INSURE,segment1,B01,segment1,B01,0.949247
5,4521,CHECKOUT,2023-02-05,IBAB,Screen1,INSURE,segment1,B07,Semi Active,2,...,2.5,IBAB,IBAB,INSURE,INSURE,segment1,B01,segment1,B01,0.949247
6,6145,DISPLAY,2023-02-26,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,2.0,CTLN,CTLN,LEND,LEND,,,,,
7,6145,DISPLAY,2023-01-27,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,2.0,CTLN,CTLN,LEND,LEND,,,,,
8,6145,DISPLAY,2023-02-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,2.0,CTLN,CTLN,LEND,LEND,,,,,
9,6145,DISPLAY,2023-01-10,NONE,Screen1,ALL,segment3,B01,Cold Start,0,...,2.0,CTLN,CTLN,LEND,LEND,,,,,


In [21]:
# Define the user columns, item columns and interaction columns:
u_cols = ["idcol", "segment", "beh_segment", "active_ind", "avg_daily_freq", "avg_weekly_freq", "avg_monthly_freq", "most_clicked_item", "most_bought_item",
          "most_clicked_item_type", "most_bought_item_type"] # TODO Include the checlout-click ratio if useful
# u_cols = ["idcol", "segment", "beh_segment", "active_ind", "avg_daily_freq", "avg_weekly_freq", "avg_monthly_freq"]
item_cols = ["item_id", "item", "item_type", "most_bought_by_beh_seg", "most_bought_by_seg", "most_clicked_by_beh_seg",
             "most_clicked_by_seg", "item_checkout_click_ratio"]
interact_cols = ["idcol", "item_id", "interaction_scores"] # I include idcol and item for now, to basically say "this user did this item with this score at this date"

# Experiment 4:
# u_cols = ["idcol", "segment", "beh_segment", "active_ind"]
# item_cols = ["item_id", "item", "item_type"]

# I want the interact_cols data in the following format:
# 

user, item, rating = data[u_cols].copy(), data[item_cols].copy(), data[interact_cols].copy()

# Accumulate rating data so that, for each unique user-item combination, there is a single row, otherwise the train-test split 
# have shared interactions. Can do an intelligent split, and use LightFM's built in weighting method,
# but the weights matrix generated with that method is the exact same, even if I do the below:
# This also allows us to incorporate the number of times that a user has interacted with an item as interaction weighting, implicitly
rating = rating.groupby(['idcol', 'item_id'], as_index=False)['interaction_scores'].sum()


# Drop duplicates, because I only need the unique items' and users' features:
item = item.drop_duplicates()
item = item.reset_index(drop=True)

user = user.drop_duplicates()
user = user.reset_index(drop=True)

print(rating.shape)
print(item.shape)
print(user.shape)

rating.head(20)

# drop the first row, we don't want to include ALL



(169646, 3)
(104, 8)
(84375, 11)


Unnamed: 0,idcol,item_id,interaction_scores
0,755,0,0
1,4521,0,0
2,4521,1,3
3,6145,0,0
4,7125,0,0
5,8469,0,0
6,13768,0,0
7,14454,0,0
8,14454,2,3
9,15000,0,0


In [22]:
# Normalise the interaction scores:
# Group by idcol and calculate the sum of interaction scores for each user
sum_interaction_scores = rating.groupby('idcol')['interaction_scores'].sum()

# Merge the sum of interaction scores back to the original DataFrame based on idcol
rating = rating.merge(sum_interaction_scores.reset_index(), on='idcol', suffixes=('', '_sum'))

# Calculate normalized interaction scores by dividing each interaction score by the sum
rating['normalized_interaction_score'] = rating['interaction_scores'] / rating['interaction_scores_sum']

# Replace NaN values with 0 in the normalized interaction scores column
rating['normalized_interaction_score'].fillna(0, inplace=True)

# Drop the temporary sum column
rating.drop(columns=['interaction_scores_sum'], inplace=True)

# Drop the unnormalised column
rating.drop(columns=['interaction_scores'], inplace=True)

# rename
rating.rename(columns={'normalized_interaction_score': 'interaction_scores'}, inplace=True)

# Display the DataFrame with the normalized interaction scores
rating.head(20)



Unnamed: 0,idcol,item_id,interaction_scores
0,755,0,0.0
1,4521,0,0.0
2,4521,1,1.0
3,6145,0,0.0
4,7125,0,0.0
5,8469,0,0.0
6,13768,0,0.0
7,14454,0,0.0
8,14454,2,1.0
9,15000,0,0.0


In [23]:
# # Normalise the user averages:
# # Function to normalize a column
# def normalize_column(df, column_name):
#     min_value = df[column_name].min()
#     max_value = df[column_name].max()
#     df[column_name] = (df[column_name] - min_value) / (max_value - min_value)
#     return df

# # Normalize the numerical columns
# numerical_columns = ['avg_daily_freq', 'avg_weekly_freq', 'avg_monthly_freq']

# for column in numerical_columns:
#     user = normalize_column(user, column)


## User Features Data preparation

In [24]:
user.head()


Unnamed: 0,idcol,segment,beh_segment,active_ind,avg_daily_freq,avg_weekly_freq,avg_monthly_freq,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type
0,755,segment3,B01,Semi Active,1.0,1.0,1.0,CTLN,CTLN,LEND,LEND
1,4521,segment1,B07,Semi Active,1.25,1.666667,2.5,IBAB,IBAB,INSURE,INSURE
2,6145,segment3,B01,Cold Start,1.0,1.0,2.0,CTLN,CTLN,LEND,LEND
3,7125,segment3,B01,Cold Start,1.0,1.0,1.0,CTLN,CTLN,LEND,LEND
4,8469,segment1,B01,Semi Active,1.0,1.0,1.5,CTLN,CTLN,LEND,LEND


In [25]:
user = pd.get_dummies(user,dtype = int, prefix="", prefix_sep="")
user_features_col = user.drop(columns =['idcol']).columns.values
user_feat = user.drop(columns =['idcol']).to_dict(orient='records')

user = user.sort_values(by='idcol', ascending=True)
# print(user_feat)
# user.shape
# user.shape


## Item Features Data prep:

In [26]:
# item.head()

item_features = pd.get_dummies(item, dtype = int, prefix="", prefix_sep="")
# item_features["idcol"] = data["idcol"]
item_features_col = item_features.drop(columns=['item_id']).columns.values


# Need some for of identification for the item features
# item_features["idcol"] = data["idcol"]


item_features.fillna(value = 0, inplace=True)
# item_features.shape
# print(item_feat[0])
# item.head()
# print(item_features.iloc[0,:])
# print(item_features_col)
item_features.head()

nan_columns = item_features.columns[item_features.isna().any()].tolist()

item_feat = item_features.drop(columns =['item_id']).to_dict(orient='records')


## Fit into LightFM Dataset

In [27]:
dataset = Dataset()
dataset.fit(users=[x for x in user['idcol']], items=[x for x in item['item_id']], item_features=item_features_col, user_features=user_features_col)

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))


Num users: 84375, num_items 104.


## Build Item Features to be fitted into model

In [28]:
# 
item_features = dataset.build_item_features((x,y) for x,y in zip(item_features['item_id'],item_feat))
# item_features = dataset.build_item_features((x,item_features_col) for x in item_features['item_id'])
# for (x,y) in zip(item_features['item_id'],item_feat):
#     print(x)
#     print(y)
#     break

In [29]:
print(item_features.shape)

(104, 223)


## Build User Features to be fit into model

In [30]:
user_features = dataset.build_user_features((x,y) for x,y in zip(user['idcol'],user_feat))
print(user_features.shape)

(84375, 84544)


## Build interactions (user — item) and its respective weights (in this case our custom weights - 0, 1, 2)

In [31]:
# from sklearn.model_selection import train_test_split


# # We split the data into train and test by taking 20% of interactions for each user and moving that to the test set, i.e the training set will contain 80% of the items
# # that the user interacted with

# # Custom train-test split: Split the data into train and test before building interactions:
# train_interactions = pd.DataFrame()
# test_interaction = pd.DataFrame()

# for user_id, group in rating.groupby('idcol'):
#     if len(group) == 1:
#         train_interactions = pd.concat([train_interactions, group])
#     else:

#         train_group, test_group = train_test_split(group, test_size=0.2, train_size=0.8, random_state=42)
#         train_interactions = pd.concat([train_interactions, train_group])
#         test_interaction = pd.concat([test_interaction, test_group])





In [32]:
# print(train_interactions.shape)
# print(test_interaction.shape)

In [33]:
(interactions, weights) = dataset.build_interactions((x, y) for x,y in zip(rating['idcol'], rating['item_id']))

# With custom train/test splitting:
# train, train_w = dataset.build_interactions((x, y) for x,y in zip(train_interactions['idcol'], train_interactions['item_id']))

# test, test_w = dataset.build_interactions((x, y) for x,y in zip(test_interaction['idcol'], test_interaction['item_id']))


# print(weights.todense())
# print(interactions.shape)

# Model Training

## Train Test Split

In [34]:
# ORIGINAL:
train, test = random_train_test_split(interactions,test_percentage=0.2, random_state=42)
train_w, test_w = random_train_test_split(weights, test_percentage=0.2, random_state=42)

# Make a custom train-test split that uses either a_ the last 20% of interactions by date, or a random 20% of interactions for the test split
# This ensures that there are no cold start users in the testing set. We will do cold-start testing in a different manner

# SPlit the data from original data and then do all the data processing steps for each step separately. This ensures that the data doesn't bleed over into the test set.


## Model

In [35]:
# Best hyperparameters:  {'no_components': 45, 'learning_rate': 0.09949391010649568, 'k': 19.29548285586018, 'n': 10.515335810044794}
# Other best: {'no_components': 50, 'learning_rate': 0.08062443053534539, 'k': 9.583359248210815, 'n': 5.4809279704140055}.
no_components = 50
loss = 'warp'
epoch = 30
num_thread = 8
learning_rate = 0.08062443053534539
n = 5.4809279704140055
k = 9.583359248210815
model = LightFM(no_components= no_components, loss=loss, random_state = 42, learning_rate=learning_rate, n = n, k= k)
# model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)

# Experiment 1,2,3:
# model.fit(train, epochs=epoch,num_threads = num_thread)

# Experiment 4, 5, 6:
# model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread)

# Experiemt 7
model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)


<lightfm.lightfm.LightFM at 0x7609a905a920>

## Model Evaluation

In [36]:
k=5

train_precision = precision_at_k(model, train, k=k,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
test_precision = precision_at_k(model, test,train_interactions=train, k=k,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()

train_recall = recall_at_k(model, train, k=k,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
test_recall = recall_at_k(model, test,train_interactions=train, k=k,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()

train_auc = auc_score(model, train,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
test_auc = auc_score(model, test, train_interactions=train,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()

# No features:
# train_precision = precision_at_k(model, train, k=k, num_threads=num_thread).mean()
# test_precision = precision_at_k(model, test,train_interactions=train, k=k, num_threads=num_thread).mean()

# train_recall = recall_at_k(model, train, k=k, num_threads=num_thread).mean()
# test_recall = recall_at_k(model, test,train_interactions=train, k=k, num_threads=num_thread).mean()

# train_auc = auc_score(model, train, num_threads=num_thread).mean()
# test_auc = auc_score(model, test, train_interactions=train, num_threads=num_thread).mean()

print('Precision: train %.4f' % (train_precision))
print('Precision: test %.4f' % (test_precision))

print('Recall: train %.4f' % (train_recall))
print('Recall: test %.4f' % (test_recall))

print('AUC: train %.4f' % (train_auc))
print('AUC: test %.4f' % (test_auc))


# best sofar = 0.18758663535118103

# Experimental Results:
# Custom train test split:
# Precision: train 0.2288
# Precision: test 0.1589
# Recall: train 0.8799
# Recall: test 0.7329
# AUC: train 0.9784
# AUC: test 0.9383

# LightFM train test split:
# Precision: train 0.2769
# Precision: test 0.1867
# Recall: train 0.8681
# Recall: test 0.8029
# AUC: train 0.9774
# AUC: test 0.9553

In [None]:
print(test_precision)

# Automated Hyperparameter Optimisation

In [None]:
import optuna
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import auc_score

# Fetch the dataset

def objective(trial):

    # Best hyperparameters:  {'no_components': 45, 'learning_rate': 0.09949391010649568, 'k': 19.29548285586018, 'n': 10.515335810044794}
    #  Other best: {'no_components': 50, 'learning_rate': 0.08062443053534539, 'k': 9.583359248210815, 'n': 5.4809279704140055}.
    # Define the hyperparameters to be tuned
    no_components = trial.suggest_int('no_components', 10, 50)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
    # item_alpha = trial.suggest_loguniform('item_alpha', 1e-6, 1e-1)
    # user_alpha = trial.suggest_loguniform('user_alpha', 1e-6, 1e-1)
    k = trial.suggest_loguniform('k', 5, 25)
    n = trial.suggest_loguniform('n', 5, 25)
    
    # Create the LightFM model
    model = LightFM(
        loss='warp',
        no_components=no_components,
        learning_rate=learning_rate,
        # item_alpha=item_alpha,
        # user_alpha=user_alpha,
        k=k,
        n=n
    )
    model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)
    
    # Evaluate the model
    pak = precision_at_k(model, test,train_interactions=train, k=5,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
    
    return pak

# Run the optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=400)

print('Best hyperparameters: ', study.best_params)
np.save("BestParams.npy", study.best_params)
print('Best precision@k=5: ', study.best_value)


# Predictions

In [None]:
target_idcol = 77196041


map = dataset._user_id_mapping
index = map[77196041]

scores = model.predict(index, np.arange(104), user_features=user_features, )
# print(user.iloc[index,:])
top_items = item.iloc[np.argsort(-scores)]
# top_items.head()
# # print(item.shape)
# # print(top_items)
known_positives = item.iloc[interactions.tocsr()[index].indices]

top_items[0:10]
# [['item_id', 'item']]

In [None]:
# print(user["idcol"])

known_positives_rating = rating[(rating['idcol']==user['idcol'][index])][['item_id','interaction_scores']].merge(item[['item_id','item']], on = 'item_id')
# print(rating[(rating['idcol']==user['idcol'][index])])
# print(known_positives_rating)

# tor = rating.sort_values(by='idcol', ascending=True)
# tor.head(20)
# print(user['idcol'][3])
known_positives_rating[known_positives_rating['item_id'].isin(top_items['item_id'][0:10])]

In [None]:
known_positives_rating.sort_values(by=['interaction_scores'], ascending = False)



## Similar Item Calculation using cosine similarity

In [None]:
def similar_items(item_id, model, N=10, norm = True):
    item_bias ,item_representations = model.get_item_representations(features=item_features)

    # Cosine similarity
    scores = item_representations.dot(item_representations[item_id, :])
    item_norms = np.linalg.norm(item_representations, axis=1)

    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]/ item_norms[item_id] ), key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] ), key=lambda x: -x[1])
    return similar


similar_item_list = similar_items(2, model)
similar_idx = [x[0] for x in similar_item_list ]
item.iloc[similar_idx]# Can also add the other

## Similar User Calculation

In [None]:
def similar_users(user_id, model, N=10, norm = True):
    user_bias ,user_representations = model.get_user_representations(features= user_features)

    # Cosine similarity
    scores = user_representations.dot(user_representations[user_id, :])
    item_norms = np.linalg.norm(user_representations, axis=1)
    
    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] / item_norms[user_id]), 
                    key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]), 
                    key=lambda x: -x[1])
    return similar
    
similar_item_list = similar_users(index,model, N = 400)
print(similar_item_list)
similar_idx = [x[0] for x in similar_item_list]
filtered_data = user.loc[similar_idx, :]
filtered_data.head(20)



In [None]:
# print(u_cols)


# i = 0
# lists = []
# for index, row in filtered_data.iterrows():
#     print(row)
#     break
#     userlst = []
#     pos_idxs = row[row == 1].index.tolist()
#     userlst.append(filtered_data.iloc[i,0])
#     userlst += pos_idxs
    
#     i+=1
#     lists.append(userlst)


# new_df = pd.DataFrame(data = lists, columns = u_cols)
# new_df.head(-1)



## Cold Start Problem

In [None]:
# import random

# from scipy import sparse

# def format_newuser_input(user_feature_map, user_feature_list):
#   num_features = len(user_feature_list)
#   normalised_val = 1.0 
#   target_indices = []
#   for feature in user_feature_list:
#     try:
#         target_indices.append(user_feature_map[feature])
#     except KeyError:
#         print("new user feature encountered '{}'".format(feature))
#         pass

#   new_user_features = np.zeros(len(user_feature_map.keys()))
#   for i in target_indices:
#     new_user_features[i] = normalised_val
#   new_user_features = sparse.csr_matrix(new_user_features)
#   return(new_user_features)

# user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
# user_feature_list = ["segment4", "B01", "Cold Start"]

# new_user_features = format_newuser_input(user_feature_map, u_cols)
# scores = model.predict(0, np.arange(104), user_features=new_user_features)

# top_items = item.iloc[np.argsort(-scores)]

# top_items.head()

new_user = pd.DataFrame(np.zeros(len(user_features_col))).T
new_user.columns = user_features_col
# print(new_user)

new_user_id = 86000
new_user['segment4'] = 1
new_user['B50'] = 1
new_user['Cold Start'] = 1

new_user = csr_matrix(new_user)
scores = model.predict(user_ids = 0,item_ids = np.arange(interactions.shape[1]), user_features=new_user)
top_items_new_user = item.iloc[np.argsort(-scores)]
top_items_new_user[0:10]