# Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score, reciprocal_rank
import numpy as np
from lightfm.cross_validation import random_train_test_split
import os
from scipy.sparse import csr_matrix, vstack
from sklearn.preprocessing import MinMaxScaler

# Data Prep

## Load Data

In [2]:
data = pd.read_csv("./fnb_datav2.csv")

# Remove 
data = data.drop(columns = ["item_descrip", "tod", "page"])
data.head(20)

# TEST:
# Drop rows where "item" column contains "NONE"
# data = data[data["active_ind"] != "Cold Start"]
data = data[data["item"] != "NONE"]

data.head(20)
# data.shape


Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind
4,4521,CLICK,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active
5,4521,CHECKOUT,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active
16,14454,CLICK,08FEB2023,CAFM,TRANSACT,segment2,B01,Active
17,14454,CHECKOUT,08FEB2023,CAFM,TRANSACT,segment2,B01,Active
18,15000,CLICK,31JAN2023,CARF,LEND,segment3,B01,Cold Start
19,15000,CHECKOUT,31JAN2023,CARF,LEND,segment3,B01,Cold Start
26,22924,CLICK,26FEB2023,FIWL,INVEST,segment2,B01,Active
27,22924,CHECKOUT,26FEB2023,FIWL,INVEST,segment2,B01,Active
37,23484,CHECKOUT,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start
38,23484,CLICK,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start


In [3]:
# # I remve a random Active user to test the cold start approach later:
# # Step 1: Filter users with active_ind equal to 'Active'
# active_users = data[data['active_ind'] == 'Active']

# # Check if there are any active users
# if not active_users.empty:
#     # Step 2: Randomly select one of these users
#     selected_user = active_users.sample(n=1)

#     # Get the user id of the selected user
#     selected_user_id = selected_user['idcol'].values[0]

#     # Step 3: Move all entries of this selected user to a new dataframe
#     selected_user_df = data[data['idcol'] == selected_user_id]

#     # Step 4: Remove this user's entries from the original dataframe
#     data = data[data['idcol'] != selected_user_id]

# selected_user_df.head()

In [4]:
# Checking transactions for a specific ID:
id = 77196041
data[data["idcol"]==id]

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind
155295,77196041,CLICK,27MAR2023,FICQ,INSURE,segment2,B01,Active
155296,77196041,CHECKOUT,27MAR2023,FIWL,INVEST,segment2,B01,Active
155297,77196041,CHECKOUT,27MAR2023,FILS,INSURE,segment2,B01,Active
155298,77196041,CLICK,27MAR2023,FILS,INSURE,segment2,B01,Active
155299,77196041,CLICK,27MAR2023,FIHC,INSURE,segment2,B01,Active
...,...,...,...,...,...,...,...,...
155353,77196041,CLICK,27MAR2023,CACU,TRANSACT,segment2,B01,Active
155354,77196041,CHECKOUT,27MAR2023,CABC,INVEST,segment2,B01,Active
155355,77196041,CLICK,27MAR2023,CABC,INVEST,segment2,B01,Active
155356,77196041,CHECKOUT,27MAR2023,SEVP,TRANSACT,segment2,B01,Active


In [5]:
data.nunique()

idcol          42606
interaction        2
int_date          88
item             103
item_type          6
segment            4
beh_segment       48
active_ind         3
dtype: int64

In [6]:
# Add the following user features
"""
- weekly interaction frequency, 
- most frequenctly item interacted with (other than ALL),

"""

# Give scores to the interactions, and then drop the interaction column:L
# interaction_scores = {
#     'CLICK': 1,
#     'CHECKOUT': 2
# }

interaction_scores = {
    'DISPLAY': 0,
    'CLICK': 1,
    'CHECKOUT': 2
}

# Map interaction scores, fill missing values with 0
data['interaction_scores'] = data['interaction'].map(interaction_scores).fillna(0)

# Add ids for each unique item
data['item_id'] = pd.factorize(data['item'])[0]

# data.drop(columns = ["interaction"])
data.head(40)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id
4,4521,CLICK,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active,1,0
5,4521,CHECKOUT,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active,2,0
16,14454,CLICK,08FEB2023,CAFM,TRANSACT,segment2,B01,Active,1,1
17,14454,CHECKOUT,08FEB2023,CAFM,TRANSACT,segment2,B01,Active,2,1
18,15000,CLICK,31JAN2023,CARF,LEND,segment3,B01,Cold Start,1,2
19,15000,CHECKOUT,31JAN2023,CARF,LEND,segment3,B01,Cold Start,2,2
26,22924,CLICK,26FEB2023,FIWL,INVEST,segment2,B01,Active,1,3
27,22924,CHECKOUT,26FEB2023,FIWL,INVEST,segment2,B01,Active,2,3
37,23484,CHECKOUT,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start,2,4
38,23484,CLICK,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start,1,4


### Add User Features:

#### Add the following user features
- weekly interaction frequency - on average, over the whole dataset, how many times does the user interact per week
- daily interaction frequency - similar to above
- monthly interaction frequency - how many times 
- most frequenctly item interacted with (other than ALL) over the whole 
- most frequently interacted with item type
- Ratio of checkout to click for each user-item combination (ask Lize)
- Add an activity score, which is a metric that says how often, over the entire dataset, is the user active
    - Done by dividing the number of unique active days by the number of days in the dataset
- Potential other features to add from TOD:
    - average time between clicking item
    - average time between checking out the item


In [7]:
original_data =  data.copy()

target_idcol = 77196041
index = int(original_data[original_data['idcol'] == target_idcol].index[0])


In [8]:
# Add user daily activity score:
# On days that they are active, how many interactions do they make?
# Convert int_date to datetime
data['int_date'] = pd.to_datetime(data['int_date'], format='%d%b%Y')

# Group by idcol to calculate total interactions and unique active days
user_interactions = data.groupby('idcol').size().reset_index(name='total_interactions')
active_days = data.groupby('idcol')['int_date'].nunique().reset_index(name='unique_active_days')

# Merge the results to calculate daily_activity_score
user_activity = pd.merge(user_interactions, active_days, on='idcol')
user_activity['daily_activity_score'] = user_activity['total_interactions'] / user_activity['unique_active_days']

# Merge the daily_activity_score back to the original DataFrame
data = pd.merge(data, user_activity[['idcol', 'daily_activity_score']], on='idcol', how='left')

# Put the activity score into bins:
# Create bins with equal frequency
data['activity_score_bin'] = pd.qcut(data['daily_activity_score'], q=4, labels=['Few', 'Some', 'Many', 'SUPER baie'])

data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_score_bin
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,Few
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,Few
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,Few
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,Few
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,Few


In [9]:
# Add user activity frequency:
# Count the number of days that the user was active, and divide it by the number of days in the dataset:

# Convert int_date to datetime
data['int_date'] = pd.to_datetime(data['int_date'])

# Calculate the total number of unique days in the dataset
total_days = data["int_date"].nunique()

# Calculate the number of unique days each user had interactions
user_unique_days = data.groupby('idcol')['int_date'].nunique()

# Calculate the activity rate
activity_rate = user_unique_days / total_days

# Merge the activity_rate back into the original dataframe
data = data.merge(activity_rate.rename('activity_rate'), on='idcol')


# Put activity rate into buns
data['activity_rate_bin'] = pd.qcut(data['activity_rate'], q=4, labels=['Low', 'Medium', 'High', 'VeryHigh'])
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_score_bin,activity_rate,activity_rate_bin
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,Few,0.011364,Low
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,Few,0.011364,Low
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,Few,0.011364,Low
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,Few,0.011364,Low
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,Few,0.011364,Low
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,2.0,Few,0.011364,Low
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,2.0,Few,0.011364,Low
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,2.0,Few,0.011364,Low
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,2.0,Few,0.011364,Low
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,2.0,Few,0.011364,Low


In [10]:
# Add which day of the week they were most frequently active on:
# Convert 'int_date' to datetime
data['int_date'] = pd.to_datetime(data['int_date'])

# Extract day of the week
data['day_of_week'] = data['int_date'].dt.day_name()

# Count interactions per user per day of the week
interaction_counts = data.groupby(['idcol', 'day_of_week']).size().reset_index(name='count')

# Find the most frequent day of the week for each user
most_frequent_day = interaction_counts.loc[interaction_counts.groupby('idcol')['count'].idxmax()]

# Merge this information back to the original dataframe
data = data.merge(most_frequent_day[['idcol', 'day_of_week']], on='idcol', suffixes=('', '_most_frequent'))

# Rename the column for clarity
data.rename(columns={'day_of_week_most_frequent': 'most_frequent_day'}, inplace=True)

# Display the resulting dataframe
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_score_bin,activity_rate,activity_rate_bin,day_of_week,most_frequent_day
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,Few,0.011364,Low,Sunday,Sunday
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,Few,0.011364,Low,Sunday,Sunday
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,Few,0.011364,Low,Wednesday,Wednesday
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,Few,0.011364,Low,Wednesday,Wednesday
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,Few,0.011364,Low,Tuesday,Tuesday
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,2.0,Few,0.011364,Low,Tuesday,Tuesday
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,2.0,Few,0.011364,Low,Sunday,Sunday
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,2.0,Few,0.011364,Low,Sunday,Sunday
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,2.0,Few,0.011364,Low,Wednesday,Wednesday
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,2.0,Few,0.011364,Low,Wednesday,Wednesday


In [11]:
user_interactions = data['idcol'].value_counts().reset_index()
user_interactions.columns = ['idcol', 'user_interaction_count']

# Calculate the total number of interactions in the dataset
total_interactions = data.shape[0]

# Merge the user interaction counts back into the original dataframe
data = data.merge(user_interactions, on='idcol')

# Bin the number of interactions:
data['num_interactions'] = pd.qcut(data['user_interaction_count'], q=4, labels=['Little', 'A bit more', 'Quite a few', 'This person has problems'])
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_score_bin,activity_rate,activity_rate_bin,day_of_week,most_frequent_day,user_interaction_count,num_interactions
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,Few,0.011364,Low,Tuesday,Tuesday,2,Little
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,2.0,Few,0.011364,Low,Tuesday,Tuesday,2,Little
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little


In [12]:


# # Adding most clicked and most bought items and item types for each user:
# clicks = data[data['interaction'] != 'DISPLAY']
# most_clicked = clicks.groupby(['idcol', 'item']).size().reset_index(name='click_count')
# most_clicked = most_clicked.loc[most_clicked.groupby('idcol')['click_count'].idxmax()][['idcol', 'item']]
# most_clicked.rename(columns={'item': 'most_interacted_item'}, inplace=True)

# # Merge the most clicked and most bought items back into the original DataFrame
# data = data.merge(most_clicked, on='idcol', how='left')
# # data = data.merge(most_bought, on='idcol', how='left')

# data.head()

# Filter out 'DISPLAY' interactions
clicks = data[data['interaction'] != 'DISPLAY']

# Group by idcol and item, and count the number of interactions
interaction_counts = clicks.groupby(['idcol', 'item']).size().reset_index(name='interaction_count')

# Sort interaction counts within each user group
interaction_counts = interaction_counts.sort_values(by=['idcol', 'interaction_count'], ascending=[True, False])

# Identify the most and second most interacted items
most_interacted = interaction_counts.groupby('idcol').nth(0).reset_index()[['idcol', 'item']]
most_interacted.rename(columns={'item': 'most_interacted_item'}, inplace=True)

# second_most_interacted = interaction_counts.groupby('idcol').nth(1).reset_index()[['idcol', 'item']]
# second_most_interacted.rename(columns={'item': 'second_most_interacted_item'}, inplace=True)

# Merge the most and second most interacted items back into the original DataFrame
data = data.merge(most_interacted, on='idcol', how='left')
# data = data.merge(second_most_interacted, on='idcol', how='left')

# Display the result
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_score_bin,activity_rate,activity_rate_bin,day_of_week,most_frequent_day,user_interaction_count,num_interactions,most_interacted_item
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little,IBAB
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little,IBAB
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,Few,0.011364,Low,Tuesday,Tuesday,2,Little,CARF
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,2.0,Few,0.011364,Low,Tuesday,Tuesday,2,Little,CARF
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little,FIWL
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little,FIWL
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little,CUSS
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little,CUSS


In [13]:
# Adding most interacted item type
clicks = data[data['interaction'] != "DISPLAY"]
most_clicked = clicks.groupby(['idcol', 'item_type']).size().reset_index(name='click_count')
most_clicked = most_clicked.loc[most_clicked.groupby('idcol')['click_count'].idxmax()][['idcol', 'item_type']]
most_clicked.rename(columns={'item_type': 'most_interacted'}, inplace=True)

# # Determine the most bought item for each user
# checkouts = data[data['interaction'] == 'CHECKOUT']
# most_bought = checkouts.groupby(['idcol', 'item_type']).size().reset_index(name='checkout_count')
# most_bought = most_bought.loc[most_bought.groupby('idcol')['checkout_count'].idxmax()][['idcol', 'item_type']]
# most_bought.rename(columns={'item_type': 'most_bought_item_type'}, inplace=True)

# Merge the most clicked and most bought items back into the original DataFrame
data = data.merge(most_clicked, on='idcol', how='left')
# data = data.merge(most_bought, on='idcol', how='left')

data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_score_bin,activity_rate,activity_rate_bin,day_of_week,most_frequent_day,user_interaction_count,num_interactions,most_interacted_item,most_interacted
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little,IBAB,INSURE
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little,IBAB,INSURE
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM,TRANSACT
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM,TRANSACT
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,Few,0.011364,Low,Tuesday,Tuesday,2,Little,CARF,LEND


### Add Item Features
- Most bought by segment
- Most bought by beh segment
- Most clicked by segment
- Most clicked by beh_segment
- For this item, what is the ratio of checkouts to clicks over the entire dataset?
- WHich screen was this item accessed from the most?

In [14]:
original_data = data.copy()
data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_score_bin,activity_rate,activity_rate_bin,day_of_week,most_frequent_day,user_interaction_count,num_interactions,most_interacted_item,most_interacted
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little,IBAB,INSURE
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,Few,0.011364,Low,Sunday,Sunday,2,Little,IBAB,INSURE
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM,TRANSACT
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,Few,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM,TRANSACT
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,Few,0.011364,Low,Tuesday,Tuesday,2,Little,CARF,LEND


In [15]:
# Add number of unique users per item and normalise it per column:
# Calculate the number of unique users per item
unique_users_per_item = data.groupby('item')['idcol'].nunique().reset_index()
unique_users_per_item.columns = ['item', 'unique_user_count']

# Merge this information back into the original dataframe
data = data.merge(unique_users_per_item, on='item', how='left')

# Print skewness to determine bins:
# skewness_pd = pd.Series(data["unique_user_count"]).skew()
# print(skewness_pd)

# Apply log transformation to reduce skewness:
# data['log_unique_user_count'] = data['unique_user_count'].apply(np.log)

# skewness_pd = pd.Series(data["log_unique_user_count"]).skew()
# print(skewness_pd)
data['item_popularity'] = pd.qcut(data['unique_user_count'], q=6, labels=["Unknown", "Somewhat known", "known", "mildly popular", "popular", "Taylor Swift"])
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,...,activity_rate,activity_rate_bin,day_of_week,most_frequent_day,user_interaction_count,num_interactions,most_interacted_item,most_interacted,unique_user_count,item_popularity
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,...,0.011364,Low,Sunday,Sunday,2,Little,IBAB,INSURE,3229,popular
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,...,0.011364,Low,Sunday,Sunday,2,Little,IBAB,INSURE,3229,popular
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,...,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM,TRANSACT,1114,Somewhat known
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,...,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM,TRANSACT,1114,Somewhat known
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,...,0.011364,Low,Tuesday,Tuesday,2,Little,CARF,LEND,762,Unknown
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,...,0.011364,Low,Tuesday,Tuesday,2,Little,CARF,LEND,762,Unknown
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,...,0.011364,Low,Sunday,Sunday,2,Little,FIWL,INVEST,2786,popular
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,...,0.011364,Low,Sunday,Sunday,2,Little,FIWL,INVEST,2786,popular
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,...,0.011364,Low,Wednesday,Wednesday,2,Little,CUSS,INVEST,833,Somewhat known
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,...,0.011364,Low,Wednesday,Wednesday,2,Little,CUSS,INVEST,833,Somewhat known


# Prep Features:

In [16]:
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,...,activity_rate,activity_rate_bin,day_of_week,most_frequent_day,user_interaction_count,num_interactions,most_interacted_item,most_interacted,unique_user_count,item_popularity
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,...,0.011364,Low,Sunday,Sunday,2,Little,IBAB,INSURE,3229,popular
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,...,0.011364,Low,Sunday,Sunday,2,Little,IBAB,INSURE,3229,popular
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,...,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM,TRANSACT,1114,Somewhat known
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,...,0.011364,Low,Wednesday,Wednesday,2,Little,CAFM,TRANSACT,1114,Somewhat known
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,...,0.011364,Low,Tuesday,Tuesday,2,Little,CARF,LEND,762,Unknown
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,...,0.011364,Low,Tuesday,Tuesday,2,Little,CARF,LEND,762,Unknown
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,...,0.011364,Low,Sunday,Sunday,2,Little,FIWL,INVEST,2786,popular
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,...,0.011364,Low,Sunday,Sunday,2,Little,FIWL,INVEST,2786,popular
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,...,0.011364,Low,Wednesday,Wednesday,2,Little,CUSS,INVEST,833,Somewhat known
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,...,0.011364,Low,Wednesday,Wednesday,2,Little,CUSS,INVEST,833,Somewhat known


In [17]:
# Define the user columns, item columns and interaction columns:
u_cols = ["idcol", "segment", "beh_segment", "active_ind", "most_interacted", "most_interacted_item", "activity_score_bin", "activity_rate_bin", "num_interactions",
          "most_frequent_day"] 
# u_cols = ["idcol", "segment", "beh_segment", "activity_score_bin", "activity_rate_bin", "num_interactions",
#           "most_frequent_day"] 
# u_cols = ["idcol", "segment", "beh_segment", "active_ind", "avg_daily_freq", "avg_weekly_freq", "avg_monthly_freq"]
# item_cols = ["item_id", "item", "item_type", "most_bought_by_beh_seg", "most_bought_by_seg", "most_clicked_by_beh_seg",
#              "most_clicked_by_seg"]

item_cols = ["item_id", "item_type", "item_popularity"]
interact_cols = ["idcol", "item_id", "interaction_scores"] # I include idcol and item for now, to basically say "this user did this item with this score at this date"


test_item_cols = ["item_id", "item_type", "item","item_popularity"]
# Experiment 4:
# u_cols = ["idcol", "segment", "beh_segment", "active_ind"]
# item_cols = ["item_id", "item", "item_type"]

# I want the interact_cols data in the following format:
# 

user, item, rating = data[u_cols].copy(), data[item_cols].copy(), data[interact_cols].copy()
test_items = data[test_item_cols].copy()
# Accumulate rating data so that, for each unique user-item combination, there is a single row, otherwise the train-test split 
# have shared interactions. Can do an intelligent split, and use LightFM's built in weighting method,
# but the weights matrix generated with that method is the exact same, even if I do the below:
# This also allows us to incorporate the number of times that a user has interacted with an item as interaction weighting, implicitly
rating = rating.groupby(['idcol', 'item_id'], as_index=False)['interaction_scores'].sum()


# Drop duplicates, because I only need the unique items' and users' features:
item = item.drop_duplicates()
# item = item.drop(columns=["item"], inplace=True)
item = item.reset_index(drop=True)

test_items = test_items.drop_duplicates()
test_items = test_items.reset_index(drop=True)

user = user.drop_duplicates()
user = user.reset_index(drop=True)

print(rating.shape)
print(item.shape)
print(user.shape)



# Clamp the interaction_scores to a maximum of 5
# rating['interaction_scores'] = rating['interaction_scores'].clip(upper=5)
display(user.head(20))

(93712, 3)
(103, 3)
(42606, 7)


Unnamed: 0,idcol,segment,beh_segment,activity_score_bin,activity_rate_bin,num_interactions,most_frequent_day
0,4521,segment1,B07,Few,Low,Little,Sunday
1,14454,segment2,B01,Few,Low,Little,Wednesday
2,15000,segment3,B01,Few,Low,Little,Tuesday
3,22924,segment2,B01,Few,Low,Little,Sunday
4,23484,segment2,B01,Few,Low,Little,Wednesday
5,24982,segment1,B08,Few,Low,Little,Tuesday
6,25577,segment3,B01,Few,Low,Little,Monday
7,27824,segment1,B08,Few,Low,Little,Tuesday
8,28951,segment1,B07,Many,VeryHigh,This person has problems,Monday
9,29630,segment4,B01,Few,Low,Little,Tuesday


In [41]:
# Normalise the interaction scores:
# Group by idcol and calculate the sum of interaction scores for each user
sum_interaction_scores = rating.groupby('idcol')['interaction_scores'].sum()

# Merge the sum of interaction scores back to the original DataFrame based on idcol
rating = rating.merge(sum_interaction_scores.reset_index(), on='idcol', suffixes=('', '_sum'))

# Calculate normalized interaction scores by dividing each interaction score by the sum
rating['normalized_interaction_score'] = rating['interaction_scores'] / rating['interaction_scores_sum']

# Replace NaN values with 0 in the normalized interaction scores column
rating['normalized_interaction_score'].fillna(0, inplace=True)

# Drop the temporary sum column
rating.drop(columns=['interaction_scores_sum'], inplace=True)

# Drop the unnormalised column
rating.drop(columns=['interaction_scores'], inplace=True)

# rename
rating.rename(columns={'normalized_interaction_score': 'interaction_scores'}, inplace=True)

# Display the DataFrame with the normalized interaction scores
rating.head(20)


Unnamed: 0,idcol,item_id,interaction_scores
0,4521,0,1.0
1,14454,1,1.0
2,15000,2,1.0
3,22924,3,1.0
4,23484,4,1.0
5,24982,5,1.0
6,25577,6,1.0
7,27824,7,1.0
8,28951,7,0.333333
9,28951,8,0.111111


In [48]:
# # Normalise the user averages:
# # Function to normalize a column
# def normalize_column(df, column_name):
#     min_value = df[column_name].min()
#     max_value = df[column_name].max()
#     df[column_name] = (df[column_name] - min_value) / (max_value - min_value)
#     return df

# # Normalize the numerical columns
# numerical_columns = ['avg_daily_freq', 'avg_weekly_freq', 'avg_monthly_freq']

# for column in numerical_columns:
#     user = normalize_column(user, column)
# pre_mol = rating[["idcol", "item_id"]]
# Create the pivot table
# pivot_table = pre_mol.pivot_table(index='idcol', columns='item_id', aggfunc=len, fill_value=0)
# Create pivot table with interaction_scores as values and idcol as index and item_id as columns
pivot_table = rating.pivot_table(index='idcol', columns='item_id', values='interaction_scores', fill_value=0).round(2)


slaasd = pivot_table.iloc[:12,:10]

slaasd.head(-1)

item_id,0,1,2,3,4,5,6,7,8,9
idcol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4521,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14454,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22924,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
23484,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
24982,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25577,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
27824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
28951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.33,0.11,0.07
29630,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# # Merge the two DataFrames on 'idcol' and 'item_id'
# merged_df = pd.merge(slaasd, rating, on=['idcol', 'item_id'], how='left')

# # Multiply the values in the interaction matrix by the corresponding interaction scores
# for col in slaasd.columns[1:]:
#     merged_df[col] *= merged_df['interaction_scores']

# # Drop the 'interaction_scores' column
# merged_df.drop('interaction_scores', axis=1, inplace=True)

# # Pivot the DataFrame back to its original shape if needed
# result_df = merged_df.pivot_table(index='idcol', columns='item_id', fill_value=0)

# result_df.head(10)

KeyError: 'item_id'

## User Features Data preparation

In [20]:
user.head()


Unnamed: 0,idcol,segment,beh_segment,activity_score_bin,activity_rate_bin,num_interactions,most_frequent_day
0,4521,segment1,B07,Few,Low,Little,Sunday
1,14454,segment2,B01,Few,Low,Little,Wednesday
2,15000,segment3,B01,Few,Low,Little,Tuesday
3,22924,segment2,B01,Few,Low,Little,Sunday
4,23484,segment2,B01,Few,Low,Little,Wednesday


In [21]:
# # Normalise the user feaetures column wise:

# # Initialize the MinMaxScaler
# scaler = MinMaxScaler()

# # Select the columns to normalize
# columns_to_normalize = ['daily_activity_score', 'activity_rate']

# # Apply the scaler to the selected columns
# user[columns_to_normalize] = scaler.fit_transform(user[columns_to_normalize])

# # Display the result
# user.head(20)

In [22]:
user_train = pd.get_dummies(user,dtype = int, prefix="", prefix_sep="")
user_features_col = user_train.drop(columns =['idcol']).columns.values
user_feat = user_train.drop(columns =['idcol']).to_dict(orient='records')

user_train = user_train.sort_values(by='idcol', ascending=True)
# print(user_feat)
# user.shape
# user.shape
user_train.head(20)
# print(user_features_col)


Unnamed: 0,idcol,segment1,segment2,segment3,segment4,B01,B02,B03,B04,B05,...,A bit more,Quite a few,This person has problems,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,4521,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,14454,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,15000,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,22924,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,23484,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,24982,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,25577,0,0,1,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,27824,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
8,28951,1,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
9,29630,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [23]:
# gog = data.drop_duplicates()

# # # Put a value of 0.5 at the second most interacted item:

# # Create a dictionary mapping idcol to second_most_interacted_item
# second_most_interacted_dict = gog.set_index('idcol')['second_most_interacted_item'].dropna().to_dict()

# # Update user_train DataFrame using the dictionary
# for idcol, itm in second_most_interacted_dict.items():
#     user_train.loc[user_train['idcol'] == idcol, itm] = 0.5

# # Display the result
# # print(user_train)

# user_train.head()

## Item Features Data prep:

In [24]:
nan_counts = item.isna().sum()
nan_counts

item_id            0
item_type          0
item_popularity    0
dtype: int64

In [25]:
# # Normalise
# # Initialize the MinMaxScaler
# scaler = MinMaxScaler()

# # Select the columns to normalize
# columns_to_normalize = ['unique_users_segment1', 'unique_users_segment2', 'unique_users_segment3', 'unique_users_segment4']

# # Apply the scaler to the selected columns
# item[columns_to_normalize] = scaler.fit_transform(item[columns_to_normalize])

# # Display the result
# item.head(20)

In [26]:
# item.head()

item_features_df = pd.get_dummies(item, dtype = int, prefix="", prefix_sep="")
# item_features["idcol"] = data["idcol"]
item_features_col = item_features_df.drop(columns=['item_id']).columns.values


# Need some for of identification for the item features
# item_features["idcol"] = data["idcol"]


item_features_df.fillna(value = 0, inplace=True)
# item_features.shape
# print(item_feat[0])
# item.head()
# print(item_features.iloc[0,:])
# print(item_features_col)
item_features_df.head()

nan_columns = item_features_df.columns[item_features_df.isna().any()].tolist()

item_feat = item_features_df.drop(columns =['item_id']).to_dict(orient='records')

item_features_df.head()

# print()

Unnamed: 0,item_id,CONNECT,INSURE,INVEST,LEND,LIFESTYLE,TRANSACT,Unknown,Somewhat known,known,mildly popular,popular,Taylor Swift
0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,1,0,1,0,0,0,0
2,2,0,0,0,1,0,0,1,0,0,0,0,0
3,3,0,0,1,0,0,0,0,0,0,0,1,0
4,4,0,0,1,0,0,0,0,1,0,0,0,0


## Fit into LightFM Dataset

In [27]:
dataset = Dataset()
users=[x for x in user_train['idcol']]

items=[x for x in item['item_id']]


# dataset.fit(users=users, items=items, user_features=user_features_col)
dataset.fit(users=users, items=items, user_features=user_features_col, item_features=item_features_col)

# num_users, num_items = dataset.interactions_shape()
# print('Num users: {}, num_items {}.'.format(num_users, num_items))


## Build Item Features to be fitted into model

In [28]:
print(item_features_col)
# item_features = dataset.build_item_features(((x,y) for x,y in zip(item_features_df['item_id'],item_feat)), normalize=False)
item_features = dataset.build_item_features(((x,item_features_col) for x in item_features_df['item_id']), normalize=False)


['CONNECT' 'INSURE' 'INVEST' 'LEND' 'LIFESTYLE' 'TRANSACT' 'Unknown'
 'Somewhat known' 'known' 'mildly popular' 'popular' 'Taylor Swift']


In [29]:
print(item_features.shape)
# user_train.info()
# user.head()

(103, 115)


## Build User Features to be fit into model

In [30]:
print(user_feat[0])
user_features = dataset.build_user_features(((x,y) for x,y in zip(user_train['idcol'],user_feat)), normalize=False)

# user_features = dataset.build_user_features(((x,user_features_col) for x in user_train['idcol']))

# user_features_col

{'segment1': 1, 'segment2': 0, 'segment3': 0, 'segment4': 0, 'B01': 0, 'B02': 0, 'B03': 0, 'B04': 0, 'B05': 0, 'B06': 0, 'B07': 1, 'B08': 0, 'B09': 0, 'B10': 0, 'B11': 0, 'B12': 0, 'B13': 0, 'B14': 0, 'B15': 0, 'B16': 0, 'B17': 0, 'B18': 0, 'B19': 0, 'B20': 0, 'B21': 0, 'B22': 0, 'B23': 0, 'B24': 0, 'B25': 0, 'B26': 0, 'B27': 0, 'B28': 0, 'B29': 0, 'B30': 0, 'B31': 0, 'B32': 0, 'B33': 0, 'B34': 0, 'B35': 0, 'B36': 0, 'B37': 0, 'B38': 0, 'B39': 0, 'B40': 0, 'B41': 0, 'B42': 0, 'B44': 0, 'B46': 0, 'B47': 0, 'B48': 0, 'B49': 0, 'B50': 0, 'Few': 1, 'Some': 0, 'Many': 0, 'SUPER baie': 0, 'Low': 1, 'Medium': 0, 'High': 0, 'VeryHigh': 0, 'Little': 1, 'A bit more': 0, 'Quite a few': 0, 'This person has problems': 0, 'Friday': 0, 'Monday': 0, 'Saturday': 0, 'Sunday': 1, 'Thursday': 0, 'Tuesday': 0, 'Wednesday': 0}


In [31]:
print(user_features.shape)

(42606, 42677)


## Build interactions (user — item) and its respective weights (in this case our custom weights - 0, 1, 2)

In [32]:
# from sklearn.model_selection import train_test_split


# # We split the data into train and test by taking 20% of interactions for each user and moving that to the test set, i.e the training set will contain 80% of the items
# # that the user interacted with

# # Custom train-test split: Split the data into train and test before building interactions:
# train_interactions = pd.DataFrame()
# test_interaction = pd.DataFrame()

# for user_id, group in rating.groupby('idcol'):
#     if len(group) == 1:
#         train_interactions = pd.concat([train_interactions, group])
#     else:
#         train_group, test_group = train_test_split(group, test_size=0.2, train_size=0.8, random_state=42)
#         train_interactions = pd.concat([train_interactions, train_group])
#         test_interaction = pd.concat([test_interaction, test_group])


# (train, train_w) = dataset.build_interactions((x, y, w) for x,y,w in zip(train_interactions['idcol'], train_interactions['item_id'], train_interactions["interaction_scores"]))
# (test, test_w) = dataset.build_interactions((x, y, w) for x,y,w in zip(test_interaction['idcol'], test_interaction['item_id'], test_interaction["interaction_scores"]))


In [33]:
# print(train_interactions.shape)
# print(test_interaction.shape)

rating.head(10)

Unnamed: 0,idcol,item_id,interaction_scores
0,4521,0,1.0
1,14454,1,1.0
2,15000,2,1.0
3,22924,3,1.0
4,23484,4,1.0
5,24982,5,1.0
6,25577,6,1.0
7,27824,7,1.0
8,28951,7,0.333333
9,28951,8,0.111111


In [34]:
(interactions, weights) = dataset.build_interactions((x, y, w) for x,y,w in zip(rating['idcol'], rating['item_id'], rating["interaction_scores"]))

print(interactions.shape)


(42606, 103)


# Model Training

## Train Test Split

In [35]:
# ORIGINAL:
train, test = random_train_test_split(interactions,test_percentage=0.2, random_state=40)
train_w, test_w = random_train_test_split(weights, test_percentage=0.2, random_state=40)


print(train.shape)
print(test.shape)
# Make a custom train-test split that uses either a_ the last 20% of interactions by date, or a random 20% of interactions for the test split
# This ensures that there are no cold start users in the testing set. We will do cold-start testing in a different manner

# SPlit the data from original data and then do all the data processing steps for each step separately. This ensures that the data doesn't bleed over into the test set.


(42606, 103)
(42606, 103)


## Model

In [36]:
# Best hyperparameters:  {'no_components': 45, 'learning_rate': 0.09949391010649568, 'k': 19.29548285586018, 'n': 10.515335810044794}
# Other best: {'no_components': 50, 'learning_rate': 0.08062443053534539, 'k': 9.583359248210815, 'n': 5.4809279704140055}.
no_components = 45
loss = 'warp'
epoch = 30
num_thread = 8
learning_rate = 0.05
max_sampled = 10
# n = 5.4809279704140055
# k = 9.583359248210815
model = LightFM(no_components= no_components, loss=loss, random_state = 42, learning_rate=learning_rate, max_sampled=max_sampled)
# model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)

# Experiemt 7
# Training without user features
model.fit(train, user_features= user_features, item_features=item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)

# Pure CF
# model.fit(train, epochs=epoch,num_threads = num_thread)


<lightfm.lightfm.LightFM at 0x70a6dc02ece0>

## Model Evaluation

In [37]:
k=5

train_precision = precision_at_k(model, train, k=k, user_features=user_features,item_features=item_features,num_threads=num_thread).mean()
test_precision = precision_at_k(model, test,train_interactions=train,item_features=item_features, k=k, user_features=user_features,num_threads=num_thread).mean()

train_recall = recall_at_k(model, train, k=k, user_features=user_features,item_features=item_features, num_threads=num_thread).mean()
test_recall = recall_at_k(model, test,train_interactions=train, k=k, user_features=user_features,item_features=item_features, num_threads=num_thread).mean()

train_auc = auc_score(model, train, user_features=user_features,item_features=item_features, num_threads=num_thread).mean()
test_auc = auc_score(model, test, train_interactions=train,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()


test_rr = reciprocal_rank(model, test, train_interactions=train, item_features=item_features,user_features=user_features).mean()
train_rr = reciprocal_rank(model, train,item_features=item_features, user_features=user_features).mean()
# No features:
# train_precision = precision_at_k(model, train, k=k, num_threads=num_thread).mean()
# test_precision = precision_at_k(model, test,train_interactions=train, k=k, num_threads=num_thread).mean()

# train_recall = recall_at_k(model, train, k=k, num_threads=num_thread).mean()
# test_recall = recall_at_k(model, test,train_interactions=train, k=k, num_threads=num_thread).mean()

# train_auc = auc_score(model, train, num_threads=num_thread).mean()
# test_auc = auc_score(model, test, train_interactions=train, num_threads=num_thread).mean()

# test_rr = reciprocal_rank(model, test, train_interactions=train).mean()
# train_rr = reciprocal_rank(model, train).mean()

print('Precision: train %.4f' % (train_precision))
print('Precision: test %.4f' % (test_precision))

print('Recall: train %.4f' % (train_recall))
print('Recall: test %.4f' % (test_recall))

print('AUC: train %.4f' % (train_auc))
print('AUC: test %.4f' % (test_auc))

print('RR: train %.4f' % (train_rr))
print('RR: test %.4f' % (test_rr))

# Precision: train 0.3662
# Precision: test 0.1531
# Recall: train 0.9776
# Recall: test 0.6530
# AUC: train 0.9975
# AUC: test 0.9103
# RR: train 0.9693
# RR: test 0.6635

Precision: train 0.3606
Precision: test 0.0488
Recall: train 0.9714
Recall: test 0.1871
AUC: train 0.9970
AUC: test 0.7163
RR: train 0.9964
RR: test 0.1624


In [None]:
print(test_precision)

# Automated Hyperparameter Optimisation

In [None]:
# import optuna
# from lightfm import LightFM
# from lightfm.datasets import fetch_movielens
# from lightfm.evaluation import auc_score

# # Fetch the dataset

# def objective(trial):

#     # Best hyperparameters:  {'no_components': 45, 'learning_rate': 0.09949391010649568, 'k': 19.29548285586018, 'n': 10.515335810044794}
#     #  Other best: {'no_components': 50, 'learning_rate': 0.08062443053534539, 'k': 9.583359248210815, 'n': 5.4809279704140055}.
#     # Define the hyperparameters to be tuned
#     no_components = trial.suggest_int('no_components', 10, 50)
#     learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
#     # item_alpha = trial.suggest_loguniform('item_alpha', 1e-6, 1e-1)
#     # user_alpha = trial.suggest_loguniform('user_alpha', 1e-6, 1e-1)
#     k = trial.suggest_loguniform('k', 5, 25)
#     n = trial.suggest_loguniform('n', 5, 25)
    
#     # Create the LightFM model
#     model = LightFM(
#         loss='warp',
#         no_components=no_components,
#         learning_rate=learning_rate,
#         # item_alpha=item_alpha,
#         # user_alpha=user_alpha,
#         k=k,
#         n=n
#     )
#     model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)
    
#     # Evaluate the model
#     pak = precision_at_k(model, test,train_interactions=train, k=5,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
    
#     return pak

# # Run the optimization
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=400)

# print('Best hyperparameters: ', study.best_params)
# np.save("BestParams.npy", study.best_params)
# print('Best precision@k=5: ', study.best_value)


# Predictions

In [None]:
target_idcol = 77196041
selected_user = data[data["active_ind"]=="Active"].sample(n=1)
target_idcol = selected_user["idcol"].iloc[0]
# target_idcol = 155531648
print(target_idcol)
# Rank the items based on interactions
# Assign scores to interaction types
# interaction_scores = {'DISPLAY': 0, 'CLICK': 1, 'CHECKOUT': 2}

predict_user = data[data["idcol"] == target_idcol]

predict_user.head()

# Map interaction types to scores
predict_user['interaction_score'] = predict_user['interaction'].map(interaction_scores)

# Rank items based on scores
predict_user['item_rank'] = predict_user.groupby('idcol')['interaction_score'].rank(method='max', ascending=False)

# Sort dataframe by item rank
predict_user = predict_user.sort_values(by='item_rank')

predict_user.head(-1)

true_items = predict_user['item'].tolist()
true_items = list(set(true_items))


predict_user[u_cols].head(10)
# Sit langs mekaar:
# model recommendation, user se eie actual interactions, all items popularity over entire dataset, all item popularity for user segment, all item popularity for beh_segment

In [None]:



map = dataset._user_id_mapping
index = map[target_idcol]
print(index)

scores = model.predict(index, np.arange(103), user_features=user_features, item_features=item_features)

# scores = model.predict(index, np.arange(103))
# scores = model.predict(index, np.arange(103))
# print(scores)
# print(user.iloc[index,:])
top_items = test_items.iloc[np.argsort(-scores)]
# print(scores)
# top_items.head()
# # print(item.shape)
# # print(top_items)
# known_positives = item.iloc[interactions.tocsr()[index].indices]

# top_items[0:10]
recommended_list = top_items['item'].tolist()
recommended_list = list(set(recommended_list))

recommended_list_types = top_items['item_type'].tolist()
recommended_list_types = list(set(recommended_list_types))

print(true_items)
print(recommended_list_types)

# def precision_at_k(recommended_list, actual_list, k=10):
#     # Get the intersection of the recommended list and the actual list up to k
#     intersection = set(recommended_list[:k]) & set(actual_list)
    
#     # Calculate precision@k
#     precision = len(intersection) / k
    
#     return precision

# # Calculate precision@k=10
# precision = precision_at_k(recommended_list, true_items, k=5)
# print("Precision@k=10:", precision)
data.head()
print(recommended_list.index("CTLN"))

In [None]:
# Rank items over the entire dataset
unique_user_counts = data.groupby('item')['idcol'].nunique().reset_index(name='unique_user_count')

# Sort the items based on the count of unique users in descending order
unique_user_counts = unique_user_counts.sort_values(by='unique_user_count', ascending=False)

# Display the result
ranked_items = unique_user_counts["item"].tolist()
print(ranked_items)

In [None]:
# # Rank items over the active user's segment

segment_of_interest = predict_user["segment"].iloc[0]
segment_df = data[data['segment'] == segment_of_interest]

unique_user_counts = segment_df.groupby('item')['idcol'].nunique().reset_index(name='unique_user_count')

# Sort the items based on the count of unique users in descending order
unique_user_counts = unique_user_counts.sort_values(by='unique_user_count', ascending=False)

# Display the result
ranked_items_bysegment = unique_user_counts["item"].tolist()
print(ranked_items_bysegment)


In [None]:
# # Rank items over the active user's beh_segment


segment_of_interest = predict_user["beh_segment"].iloc[0]
segment_df = data[data['beh_segment'] == segment_of_interest]

unique_user_counts = segment_df.groupby('item')['idcol'].nunique().reset_index(name='unique_user_count')

# Sort the items based on the count of unique users in descending order
unique_user_counts = unique_user_counts.sort_values(by='unique_user_count', ascending=False)

# Display the result
ranked_items_bybehsegment = unique_user_counts["item"].tolist()
print(ranked_items_bybehsegment)

In [None]:
# Show the true items for the 2 next most similar users
# First, lets find the two most similar users:
def similar_users(user_id, model, N=10, norm = True):
    user_bias ,user_representations = model.get_user_representations(features= user_features)
    # user_bias ,user_representations = model.get_user_representations()

    # Cosine similarity
    scores = user_representations.dot(user_representations[user_id, :])
    item_norms = np.linalg.norm(user_representations, axis=1)
    
    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] / item_norms[user_id]), 
                    key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]), 
                    key=lambda x: -x[1])
    return similar
    
# map = dataset._user_id_mapping
# index = map[77196041]
similar_item_list = similar_users(index,model, N = 3, norm=True)
display(similar_item_list)
similar_idx = [x[0] for x in similar_item_list]
filtered_data = user.loc[similar_idx, :]
filtered_data.head(20)

two_users = filtered_data["idcol"].tolist()[1:]
print(two_users)

next_two_users_items = []
for id in two_users:
    predict_user = data[data["idcol"] == id]
    true = predict_user['item'].tolist()
    true = list(set(true))
    next_two_users_items.append(true)


print(next_two_users_items[1])

# Function to look up item_type by item_id
def get_item_type(item):
    # Use loc to find the row where item_id matches and get the item_type
    item_type = test_items.loc[test_items['item'] == item, 'item_type']
    # Return the item_type if found, otherwise return None
    return item_type.iloc[0] if not item_type.empty else None

itm_tps = []
for x in recommended_list:
    itm_tps.append(get_item_type(x))

true_item_types = []
for x in true_items:
    true_item_types.append(get_item_type(x))

ranked_tps = []
for x in ranked_items:
    ranked_tps.append(get_item_type(x))

rank_seg_tps = []
for x in ranked_items_bysegment:
    rank_seg_tps.append(get_item_type(x))

rank_bseg_tps = []
for x in ranked_items_bybehsegment:
    rank_bseg_tps.append(get_item_type(x))

In [None]:
# Combine all those into a single dataframe so I can see them side by side
num_item = len(true_items)
max_length = 30
# print(num_item)


recdata = {
    'True Items': true_items + [None] * (max_length - num_item),
    'typs': true_item_types + [None] * (max_length - len(true_item_types)),
    'User2' : next_two_users_items[0] + [None] * (max_length - len(next_two_users_items[0])),
    'User3' : next_two_users_items[1] + [None] * (max_length - len(next_two_users_items[1])),
    'Recommended Items': recommended_list[:max_length],
    "Rec itm types": itm_tps[:max_length],
    'Most Popular Tot': ranked_items[:max_length],
    "pop types": ranked_tps[:max_length],
    'Most Popular Seg': ranked_items_bysegment[:max_length],
    "popseg types": rank_seg_tps[:max_length],
    'Most Popular BSeg': ranked_items_bybehsegment[:max_length],
    "popbseg types": rank_bseg_tps[:max_length],
}

# Create DataFrame from dictionary
df = pd.DataFrame(recdata)

def listwise_precision_at_k(recommended_list, actual_list, k=20):
    # Get the intersection of the recommended list and the actual list up to k
    intersection = set(recommended_list[:k]) & set(actual_list)
    
    # Calculate precision@k
    precision = len(intersection) / k
    
    return precision

print(listwise_precision_at_k(recommended_list, true_items))

df.head(-1)


# True items for next 2 most similar users
# Recommend items' types


## Similar Item Calculation using cosine similarity

In [None]:
def similar_items(item_id, model, N=10, norm = True):
    item_bias ,item_representations = model.get_item_representations(features=item_features)

    # Cosine similarity
    scores = item_representations.dot(item_representations[item_id, :])
    item_norms = np.linalg.norm(item_representations, axis=1)

    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]/ item_norms[item_id] ), key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] ), key=lambda x: -x[1])
    return similar


value_to_find = 'EBSH'
value_to_compare = "SEVP"
index = test_items['item'].eq(value_to_find).idxmax()
print(index)
similar_item_list = similar_items(index, model, N=103)

simscores = [x[1] for x in similar_item_list]


similar_idx = [x[0] for x in similar_item_list ]
siitms = test_items.iloc[similar_idx]# Can also add the other

siitms["scores"] = simscores

siitms.head(104)


scores_column_name = "scores"  # Replace this with the actual column name

# Filter the DataFrame based on the condition value_to_compare
filtered_items = siitms[siitms["item"] == value_to_compare]

# Check if any items match the condition
if not filtered_items.empty:
    # Extract the score from the first matching item
    compare_score = filtered_items.iloc[0][scores_column_name]
    print("Comparison score:", compare_score)
else:
    print("No items matching the condition:", value_to_compare)

# print(compare_score)

In [None]:
def similar_items(item_id, model, N=10, norm = True):
    item_bias ,item_representations = model.get_item_representations(features=item_features)

    # Cosine similarity
    scores = item_representations.dot(item_representations[item_id, :])
    item_norms = np.linalg.norm(item_representations, axis=1)

    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]/ item_norms[item_id] ), key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] ), key=lambda x: -x[1])
    return similar


value_to_find = 'EBSH'
index = test_items['item'].eq(value_to_find).idxmax()
print(index)
similar_item_list = similar_items(index, model, N=10)

simscores = [x[1] for x in similar_item_list]


similar_idx = [x[0] for x in similar_item_list ]
siitms = test_items.iloc[similar_idx]# Can also add the other

siitms["scores"] = simscores

siitms.head(104)


In [None]:
nan_counts = siitms.isna().sum()
nan_counts

## Similar User Calculation

In [None]:
def similar_users(user_id, model, N=10, norm = True):
    user_bias ,user_representations = model.get_user_representations(features= user_features)

    # Cosine similarity
    scores = user_representations.dot(user_representations[user_id, :])
    item_norms = np.linalg.norm(user_representations, axis=1)
    
    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] / item_norms[user_id]), 
                    key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]), 
                    key=lambda x: -x[1])
    return similar
    
# map = dataset._user_id_mapping
# index = map[77196041]
similar_item_list = similar_users(index,model, N = 3)
print(similar_item_list)
similar_idx = [x[0] for x in similar_item_list]
filtered_data = user.loc[similar_idx, :]
filtered_data.head(20)



In [None]:
# print(u_cols)


# i = 0
# lists = []
# for index, row in filtered_data.iterrows():
#     print(row)
#     break
#     userlst = []
#     pos_idxs = row[row == 1].index.tolist()
#     userlst.append(filtered_data.iloc[i,0])
#     userlst += pos_idxs
    
#     i+=1
#     lists.append(userlst)


# new_df = pd.DataFrame(data = lists, columns = u_cols)
# new_df.head(-1)



## Cold Start Problem

In [None]:
# import random

# from scipy import sparse

# def format_newuser_input(user_feature_map, user_feature_list):
#   num_features = len(user_feature_list)
#   normalised_val = 1.0 
#   target_indices = []
#   for feature in user_feature_list:
#     try:
#         target_indices.append(user_feature_map[feature])
#     except KeyError:
#         print("new user feature encountered '{}'".format(feature))
#         pass

#   new_user_features = np.zeros(len(user_feature_map.keys()))
#   for i in target_indices:
#     new_user_features[i] = normalised_val
#   new_user_features = sparse.csr_matrix(new_user_features)
#   return(new_user_features)

# user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
# user_feature_list = ["segment4", "B01", "Cold Start"]

# new_user_features = format_newuser_input(user_feature_map, u_cols)
# scores = model.predict(0, np.arange(104), user_features=new_user_features)

# top_items = item.iloc[np.argsort(-scores)]

# top_items.head()

# new_user = pd.DataFrame(np.zeros(len(user_features_col))).T
# new_user.columns = user_features_col
# # print(new_user)

# new_user.head()


# new_user_id = 86000
# new_user['segment4'] = 1
# new_user['B50'] = 1
# new_user['Cold Start'] = 1

# new_user = csr_matrix(new_user)
# scores = model.predict(user_ids = 0,item_ids = np.arange(interactions.shape[1]), user_features=new_user)
# top_items_new_user = item.iloc[np.argsort(-scores)]
# top_items_new_user[0:10]

In [None]:
# Use our isolated user to check our algorithm:
# We predict items for this user, then check the precision at k=5.

selected_user_df.head()
idcol = selected_user_df.iloc[0]["idcol"]
segment = selected_user_df.iloc[0]["segment"]
beh_segment = selected_user_df.iloc[0]["beh_segment"]
active_ind = selected_user_df.iloc[0]["active_ind"]

column_names = u_cols
# Populate the new DataFrame with relevant information from the original DataFrame
new_user_data = {
    'idcol': [idcol],
    'segment': [segment],
    'beh_segment': [beh_segment],
    'active_ind': [active_ind],
    'most_clicked_item': [np.nan],
    'most_bought_item': [np.nan],
    'most_clicked_item_type': [np.nan],
    'most_bought_item_type': [np.nan],
    'daily_activity_score': [np.nan],
    'activity_rate': [np.nan]
}

print(new_user_data)

new_user = pd.DataFrame(new_user_data)

# Function to fill NaN values in the new row
def fill_na_with_mode_or_mean(data, cold_start_user):
    filled_row = cold_start_user.copy()
    
    for column in data.columns:
        if cold_start_user[column].isna().any():
            if data[column].dtype == 'object':  # Categorical data
                mode_value = data[column].mode()[0]
                filled_row[column].fillna(mode_value, inplace=True)
            else:  # Numerical data
                mean_value = data[column].mean()
                filled_row[column].fillna(mean_value, inplace=True)
    
    return filled_row

new_user_completed = fill_na_with_mode_or_mean(user, new_user)
# new_user_sparse = pd.get_dummies(new_user_completed,dtype = int, prefix="", prefix_sep="")
# # new_user_sparse.head()
selected_user_df.head(-1)

In [None]:
new_user_wide = pd.DataFrame(np.zeros(len(user_features_col))).T
new_user_wide.columns = user_features_col
# print(new_user)

# Populate the DataFrame with the provided values
new_user_wide.at[0, 'daily_activity_score'] = new_user_completed.iloc[0]['daily_activity_score']
new_user_wide.at[0, 'activity_rate'] = new_user_completed.iloc[0]['activity_rate']
new_user_wide.at[0, new_user_completed.iloc[0]['segment']] = 1.0  # Assuming 1.0 indicates this segment is active
new_user_wide.at[0, new_user_completed.iloc[0]['beh_segment']] = 1.0  # Assuming 1.0 indicates this segment is active
new_user_wide.at[0, new_user_completed.iloc[0]['active_ind']] = 1.0  # Assuming 1.0 indicates this segment is active
new_user_wide.at[0, new_user_completed.iloc[0]['most_clicked_item']] = 1.0
new_user_wide.at[0, new_user_completed.iloc[0]['most_bought_item']] = 1.0
new_user_wide.at[0, new_user_completed.iloc[0]['most_clicked_item_type']] = 1.0
new_user_wide.at[0, new_user_completed.iloc[0]['most_bought_item_type']] = 1.0
# new_user_wide.at[0, 'INSURE1'] = 1.0  # Populate the first occurrence of 'INSURE' with 1.0
new_user_wide.head()

In [None]:
new_user_csr = csr_matrix(new_user_wide)
# new_user = csr_matrix(new_user)
print(new_user_csr.shape)
scores = model.predict(user_ids = 0,item_ids = np.arange(interactions.shape[1]), user_features=new_user_csr)
top_items_new_user = item.iloc[np.argsort(-scores)]
top_items_new_user[0:20]

In [None]:
selected_user_df.head()

# Rank the items based on interactions
# Assign scores to interaction types
interaction_scores = {'DISPLAY': 0, 'CLICK': 1, 'CHECKOUT': 2}

# Map interaction types to scores
selected_user_df['interaction_score'] = selected_user_df['interaction'].map(interaction_scores)

# Rank items based on scores
selected_user_df['item_rank'] = selected_user_df.groupby('idcol')['interaction_score'].rank(method='min', ascending=False)

# Sort dataframe by item rank
selected_user_df = selected_user_df.sort_values(by='item_rank')

selected_user_df.head(-1)

true_items = selected_user_df['item'].tolist()
true_items = list(set(true_items))

print(true_items)

recommended_list = top_items_new_user['item'].tolist()
recommended_list = list(set(recommended_list))

print(recommended_list)

In [None]:
# def precision_at_k(recommended_list, actual_list, k=10):
#     # Get the intersection of the recommended list and the actual list up to k
#     intersection = set(recommended_list[:k]) & set(actual_list)
    
#     # Calculate precision@k
#     precision = len(intersection) / k
    
#     return precision

# # Calculate precision@k=10
# precision = precision_at_k(recommended_list, true_items, k=10)
# print("Precision@k=10:", precision)