# Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score, reciprocal_rank
import numpy as np
from lightfm.cross_validation import random_train_test_split
import os
from scipy.sparse import csr_matrix, vstack

# Data Prep

## Load Data

In [2]:
data = pd.read_csv("./fnb_datav2.csv")

# Remove 
data = data.drop(columns = ["item_descrip", "tod", "page"])
data.head(20)

# TEST:
# Drop rows where "item" column contains "NONE"
# data = data[data["active_ind"] != "Cold Start"]
data = data[data["item"] != "NONE"]

data.head(20)
# data.shape


Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind
4,4521,CLICK,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active
5,4521,CHECKOUT,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active
16,14454,CLICK,08FEB2023,CAFM,TRANSACT,segment2,B01,Active
17,14454,CHECKOUT,08FEB2023,CAFM,TRANSACT,segment2,B01,Active
18,15000,CLICK,31JAN2023,CARF,LEND,segment3,B01,Cold Start
19,15000,CHECKOUT,31JAN2023,CARF,LEND,segment3,B01,Cold Start
26,22924,CLICK,26FEB2023,FIWL,INVEST,segment2,B01,Active
27,22924,CHECKOUT,26FEB2023,FIWL,INVEST,segment2,B01,Active
37,23484,CHECKOUT,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start
38,23484,CLICK,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start


In [3]:
# I remve a random Active user to test the cold start approach later:
# Step 1: Filter users with active_ind equal to 'Active'
active_users = data[data['active_ind'] == 'Active']

# Check if there are any active users
if not active_users.empty:
    # Step 2: Randomly select one of these users
    selected_user = active_users.sample(n=1)

    # Get the user id of the selected user
    selected_user_id = selected_user['idcol'].values[0]

    # Step 3: Move all entries of this selected user to a new dataframe
    selected_user_df = data[data['idcol'] == selected_user_id]

    # Step 4: Remove this user's entries from the original dataframe
    data = data[data['idcol'] != selected_user_id]

selected_user_df.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind
322388,395007836,CLICK,08JAN2023,CBCC,CONNECT,segment1,B44,Active
322389,395007836,CLICK,22FEB2023,FILS,INSURE,segment1,B44,Active
322390,395007836,CLICK,27JAN2023,FILS,INSURE,segment1,B44,Active
322391,395007836,CLICK,01JAN2023,FILS,INSURE,segment1,B44,Active
322392,395007836,CHECKOUT,01JAN2023,FILS,INSURE,segment1,B44,Active


In [4]:
# Checking transactions for a specific ID:
id = 77196041
data[data["idcol"]==id]

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind
155295,77196041,CLICK,27MAR2023,FICQ,INSURE,segment2,B01,Active
155296,77196041,CHECKOUT,27MAR2023,FIWL,INVEST,segment2,B01,Active
155297,77196041,CHECKOUT,27MAR2023,FILS,INSURE,segment2,B01,Active
155298,77196041,CLICK,27MAR2023,FILS,INSURE,segment2,B01,Active
155299,77196041,CLICK,27MAR2023,FIHC,INSURE,segment2,B01,Active
...,...,...,...,...,...,...,...,...
155353,77196041,CLICK,27MAR2023,CACU,TRANSACT,segment2,B01,Active
155354,77196041,CHECKOUT,27MAR2023,CABC,INVEST,segment2,B01,Active
155355,77196041,CLICK,27MAR2023,CABC,INVEST,segment2,B01,Active
155356,77196041,CHECKOUT,27MAR2023,SEVP,TRANSACT,segment2,B01,Active


In [5]:
data.nunique()

idcol          42605
interaction        2
int_date          88
item             103
item_type          6
segment            4
beh_segment       48
active_ind         3
dtype: int64

In [6]:
# Add the following user features
"""
- weekly interaction frequency, 
- most frequenctly item interacted with (other than ALL),

"""

# Give scores to the interactions, and then drop the interaction column:L
# interaction_scores = {
#     'CLICK': 1,
#     'CHECKOUT': 2
# }

interaction_scores = {
    # 'DISPLAY': 0,
    'CLICK': 5,
    'CHECKOUT': 10
}

# Map interaction scores, fill missing values with 0
data['interaction_scores'] = data['interaction'].map(interaction_scores).fillna(0)

# Add ids for each unique item
data['item_id'] = pd.factorize(data['item'])[0]

data.drop(columns = ["interaction"])
data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id
4,4521,CLICK,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active,5,0
5,4521,CHECKOUT,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active,10,0
16,14454,CLICK,08FEB2023,CAFM,TRANSACT,segment2,B01,Active,5,1
17,14454,CHECKOUT,08FEB2023,CAFM,TRANSACT,segment2,B01,Active,10,1
18,15000,CLICK,31JAN2023,CARF,LEND,segment3,B01,Cold Start,5,2


### Add User Features:

#### Add the following user features
- weekly interaction frequency - on average, over the whole dataset, how many times does the user interact per week
- daily interaction frequency - similar to above
- monthly interaction frequency - how many times 
- most frequenctly item interacted with (other than ALL) over the whole 
- most frequently interacted with item type
- Ratio of checkout to click for each user-item combination (ask Lize)
- Add an activity score, which is a metric that says how often, over the entire dataset, is the user active
    - Done by dividing the number of unique active days by the number of days in the dataset
- Potential other features to add from TOD:
    - average time between clicking item
    - average time between checking out the item


In [7]:
original_data =  data.copy()

target_idcol = 77196041
index = int(original_data[original_data['idcol'] == target_idcol].index[0])


In [8]:
# Add user daily activity score:
# On days that they are active, how many interactions do they make?
# Convert int_date to datetime
data['int_date'] = pd.to_datetime(data['int_date'], format='%d%b%Y')

# Group by idcol to calculate total interactions and unique active days
user_interactions = data.groupby('idcol').size().reset_index(name='total_interactions')
active_days = data.groupby('idcol')['int_date'].nunique().reset_index(name='unique_active_days')

# Merge the results to calculate daily_activity_score
user_activity = pd.merge(user_interactions, active_days, on='idcol')
user_activity['daily_activity_score'] = user_activity['total_interactions'] / user_activity['unique_active_days']

# Merge the daily_activity_score back to the original DataFrame
data = pd.merge(data, user_activity[['idcol', 'daily_activity_score']], on='idcol', how='left')

data.head(40)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,5,0,2.0
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,10,0,2.0
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,5,1,2.0
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,10,1,2.0
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,5,2,2.0
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,10,2,2.0
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,5,3,2.0
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,10,3,2.0
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,10,4,2.0
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,5,4,2.0


In [9]:
# Add user activity frequency:
# Count the number of days that the user was active, and divide it by the number of days in the dataset:

# Convert int_date to datetime
data['int_date'] = pd.to_datetime(data['int_date'])

# Calculate the total number of unique days in the dataset
total_days = data["int_date"].nunique()

# Calculate the number of unique days each user had interactions
user_unique_days = data.groupby('idcol')['int_date'].nunique()

# Calculate the activity rate
activity_rate = user_unique_days / total_days

# Merge the activity_rate back into the original dataframe
data = data.merge(activity_rate.rename('activity_rate'), on='idcol')

data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,5,0,2.0,0.011364
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,10,0,2.0,0.011364
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,5,1,2.0,0.011364
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,10,1,2.0,0.011364
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,5,2,2.0,0.011364
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,10,2,2.0,0.011364
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,5,3,2.0,0.011364
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,10,3,2.0,0.011364
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,10,4,2.0,0.011364
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,5,4,2.0,0.011364


In [10]:
# # Adding daily, weekly and monthly interaction frequencies, over entire dataset
# # The interactions can be click, checkout or display. This is more a value for how often the user opens the app,
# # rather than how often they actually view or buy an item

# # Daily 

# data = original_data.copy()
# # Convert int_date to datetime
# data['int_date'] = pd.to_datetime(data['int_date'], format='%d%b%Y')

# # Calculate daily interaction frequency
# daily_freq = data.groupby(['idcol', 'int_date']).size().groupby(level=0).mean().reset_index(name='avg_daily_freq')

# # Calculate weekly interaction frequency
# data['week'] = data['int_date'].dt.isocalendar().week
# weekly_freq = data.groupby(['idcol', 'week']).size().groupby(level=0).mean().reset_index(name='avg_weekly_freq')

# # # Calculate monthly interaction frequency
# # data['month'] = data['int_date'].dt.to_period('M')
# # monthly_freq = data.groupby(['idcol', 'month']).size().groupby(level=0).mean().reset_index(name='avg_monthly_freq')

# # Merge frequencies back into the original DataFrame
# data = data.merge(daily_freq, on='idcol')
# data = data.merge(weekly_freq, on='idcol')
# data = data.merge(monthly_freq, on='idcol')

# data.head(20)

In [11]:
# # Add the activity score:
# # Convert 'int_date' to datetime format
# data['int_date'] = pd.to_datetime(data['int_date'], format='%d%b%Y')

# # Calculate the total number of unique days in the dataset
# total_unique_days = data['int_date'].nunique()

# data = data.sort_values(by='int_date', ascending=False)


# # # Calculate the number of unique days each user was active
# user_unique_days = df.groupby('idcol')['int_date'].nunique().reset_index(name='active_days')

# # Calculate the activity score for each user
# user_unique_days['activity_score'] = user_unique_days['active_days'] / total_unique_days

# # Merge the activity score back into the original dataframe
# df = df.merge(user_unique_days[['idcol', 'activity_score']], on='idcol')

# print("DataFrame with activity score:")
# print(df)

In [12]:
# Adding most clicked and most bought items and item types for each user:
clicks = data[data['interaction'] == 'CLICK']
most_clicked = clicks.groupby(['idcol', 'item']).size().reset_index(name='click_count')
most_clicked = most_clicked.loc[most_clicked.groupby('idcol')['click_count'].idxmax()][['idcol', 'item']]
most_clicked.rename(columns={'item': 'most_clicked_item'}, inplace=True)

# Determine the most bought item for each user
checkouts = data[data['interaction'] == 'CHECKOUT']
most_bought = checkouts.groupby(['idcol', 'item']).size().reset_index(name='checkout_count')
most_bought = most_bought.loc[most_bought.groupby('idcol')['checkout_count'].idxmax()][['idcol', 'item']]
most_bought.rename(columns={'item': 'most_bought_item'}, inplace=True)

# Merge the most clicked and most bought items back into the original DataFrame
data = data.merge(most_clicked, on='idcol', how='left')
data = data.merge(most_bought, on='idcol', how='left')

data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate,most_clicked_item,most_bought_item
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,5,0,2.0,0.011364,IBAB,IBAB
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,10,0,2.0,0.011364,IBAB,IBAB
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,5,1,2.0,0.011364,CAFM,CAFM
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,10,1,2.0,0.011364,CAFM,CAFM
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,5,2,2.0,0.011364,CARF,CARF


In [13]:
# Adding most clicked and most bought items and item types for each user:
clicks = data[data['interaction'] == 'CLICK']
most_clicked = clicks.groupby(['idcol', 'item_type']).size().reset_index(name='click_count')
most_clicked = most_clicked.loc[most_clicked.groupby('idcol')['click_count'].idxmax()][['idcol', 'item_type']]
most_clicked.rename(columns={'item_type': 'most_clicked_item_type'}, inplace=True)

# Determine the most bought item for each user
checkouts = data[data['interaction'] == 'CHECKOUT']
most_bought = checkouts.groupby(['idcol', 'item_type']).size().reset_index(name='checkout_count')
most_bought = most_bought.loc[most_bought.groupby('idcol')['checkout_count'].idxmax()][['idcol', 'item_type']]
most_bought.rename(columns={'item_type': 'most_bought_item_type'}, inplace=True)

# Merge the most clicked and most bought items back into the original DataFrame
data = data.merge(most_clicked, on='idcol', how='left')
data = data.merge(most_bought, on='idcol', how='left')

data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,5,0,2.0,0.011364,IBAB,IBAB,INSURE,INSURE
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,10,0,2.0,0.011364,IBAB,IBAB,INSURE,INSURE
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,5,1,2.0,0.011364,CAFM,CAFM,TRANSACT,TRANSACT
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,10,1,2.0,0.011364,CAFM,CAFM,TRANSACT,TRANSACT
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,5,2,2.0,0.011364,CARF,CARF,LEND,LEND


In [14]:
# # For each unique user/item combination, what is the ratio of CHECKOUT to CLICK?# First, filter the DataFrame to include only CHECKOUT and CLICK interactions
# # NOTE: This isn't valid because an item doesn't need to be clicked to check it out - that is weird. I can intelligently fill NaN values with 1 if there was a 
# # checkout but no click and 0 otherwise, but ask Lizes opinion first

# checkout_click_df = data[data['interaction'].isin(['CHECKOUT', 'CLICK'])]

# # Group by unique idcol-item combinations and count the occurrences of each interaction type
# interaction_counts = checkout_click_df.groupby(['idcol', 'item', 'interaction']).size().unstack(fill_value=0)

# # Calculate the ratio of CHECKOUT to CLICK interactions
# interaction_counts['checkout_click_ratio'] = interaction_counts['CHECKOUT'] / interaction_counts['CLICK']


# # # Merge the ratio back to the original DataFrame based on idcol and item
# # data = data.merge(interaction_counts.reset_index()[['idcol', 'item', 'checkout_click_ratio']], on=['idcol', 'item'], how='left')

# # # Display the DataFrame with the new column
# # data.head(20)

In [15]:
# # On which page did the user checkout on the item most frequently? (NOT USING)
# # Filter the DataFrame to include only CLICK interactions
# click_df = data[data['interaction'] == 'CHECKOUT']

# # Group by user-item-screen combination and count the occurrences
# click_counts = click_df.groupby(['idcol', 'item', 'page'])['interaction'].count().reset_index()

# # Find the screen with the maximum count for each user-item combination
# max_click_screen = click_counts.groupby(['idcol', 'item']).apply(lambda x: x.loc[x['interaction'].idxmax()]).reset_index(drop=True)

# # Merge the result back to the original DataFrame based on user-item combination
# data = data.merge(max_click_screen[['idcol', 'item', 'page']], on=['idcol', 'item'], how='left')
# data.rename(columns={'page': 'most_checked_out_screen'}, inplace=True)

# # Display the DataFrame with the new column
# data.head(20)

In [16]:
# I will fill NaN values with the mode of the data, for people of the same segment and BEH segment, for people that have not 
# clicked or bought any items

def fill_nan_with_mode_for_segment_and_beh_segment(data):
    # Get unique combinations of segment and beh_segment
    unique_combinations = data[['segment', 'beh_segment']].drop_duplicates()

    for index, row in unique_combinations.iterrows():
        segment = row['segment']
        beh_segment = row['beh_segment']
        
        # Filter the DataFrame based on the current segment and beh_segment
        filtered_data = data[(data['segment'] == segment) & (data['beh_segment'] == beh_segment)]
        
        # Calculate mode for each column within the segment and beh_segment group
        mode_values = filtered_data.mode().iloc[0]
        
        # Replace NaN values in the original DataFrame with mode values for the current segment and beh_segment
        data.loc[(data['segment'] == segment) & (data['beh_segment'] == beh_segment)] = \
            filtered_data.fillna(mode_values)
    
    return data, mode_values


test, mode_values = fill_nan_with_mode_for_segment_and_beh_segment(data)

test.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,5,0,2.0,0.011364,IBAB,IBAB,INSURE,INSURE
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,10,0,2.0,0.011364,IBAB,IBAB,INSURE,INSURE
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,5,1,2.0,0.011364,CAFM,CAFM,TRANSACT,TRANSACT
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,10,1,2.0,0.011364,CAFM,CAFM,TRANSACT,TRANSACT
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,5,2,2.0,0.011364,CARF,CARF,LEND,LEND


### Add Item Features
- Most bought by segment
- Most bought by beh segment
- Most clicked by segment
- Most clicked by beh_segment
- For this item, what is the ratio of checkouts to clicks over the entire dataset?
- WHich screen was this item accessed from the most?

In [17]:
original_data = data.copy()
data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,5,0,2.0,0.011364,IBAB,IBAB,INSURE,INSURE
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,10,0,2.0,0.011364,IBAB,IBAB,INSURE,INSURE
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,5,1,2.0,0.011364,CAFM,CAFM,TRANSACT,TRANSACT
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,10,1,2.0,0.011364,CAFM,CAFM,TRANSACT,TRANSACT
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,5,2,2.0,0.011364,CARF,CARF,LEND,LEND


In [18]:
# data = original_data.copy()
# # data.head()

# # Which segment clicked this item the most?
# # Step 1: Filter to only include "CLICK" interactions
# click_data = data[data['interaction'] == 'CLICK']
# # Step 2: Group by item and segment, count the number of "CLICK" interactions
# click_counts = click_data.groupby(['item', 'segment']).size().reset_index(name='click_count')

# # Step 3: Determine the segment with the maximum "CLICK" interactions for each item
# most_clicked_by_seg = click_counts.loc[click_counts.groupby('item')['click_count'].idxmax()]

# # Step 4: Create a dictionary to map items to the segment with the most "CLICK" interactions
# item_to_segment = most_clicked_by_seg.set_index('item')['segment'].to_dict()

# # Step 5: Map this information back to the original dataframe
# data['most_clicked_by_seg'] = data['item'].map(item_to_segment)

# data.head(20)


In [19]:
# # Which beh_segment clicked this item the most?
# # Step 1: Filter to only include "CLICK" interactions
# click_data = data[data['interaction'] == 'CLICK']
# # Step 2: Group by item and segment, count the number of "CLICK" interactions
# click_counts = click_data.groupby(['item', 'beh_segment']).size().reset_index(name='click_count')

# # Step 3: Determine the segment with the maximum "CLICK" interactions for each item
# most_clicked_by_seg = click_counts.loc[click_counts.groupby('item')['click_count'].idxmax()]

# # Step 4: Create a dictionary to map items to the segment with the most "CLICK" interactions
# item_to_segment = most_clicked_by_seg.set_index('item')['beh_segment'].to_dict()

# # Step 5: Map this information back to the original dataframe
# data['most_clicked_by_beh_seg'] = data['item'].map(item_to_segment)

# data.head(20)

In [20]:
# # Which segment bought this item the most?
# # Step 1: Filter to only include "CLICK" interactions
# click_data = data[data['interaction'] == 'CHECKOUT']
# # Step 2: Group by item and segment, count the number of "CLICK" interactions
# click_counts = click_data.groupby(['item', 'segment']).size().reset_index(name='click_count')

# # Step 3: Determine the segment with the maximum "CLICK" interactions for each item
# most_clicked_by_seg = click_counts.loc[click_counts.groupby('item')['click_count'].idxmax()]

# # Step 4: Create a dictionary to map items to the segment with the most "CLICK" interactions
# item_to_segment = most_clicked_by_seg.set_index('item')['segment'].to_dict()

# # Step 5: Map this information back to the original dataframe
# data['most_bought_by_seg'] = data['item'].map(item_to_segment)

# data.head(20)

In [21]:
# # Which beh_segment bought this item the most?
# # # Step 1: Filter to only include "CLICK" interactions
# # click_data = data[data['interaction'] == 'CHECKOUT']
# # # Step 2: Group by item and segment, count the number of "CLICK" interactions
# # click_counts = click_data.groupby(['item', 'beh_segment']).size().reset_index(name='click_count')

# # # Step 3: Determine the segment with the maximum "CLICK" interactions for each item
# # most_clicked_by_seg = click_counts.loc[click_counts.groupby('item')['click_count'].idxmax()]

# # # Step 4: Create a dictionary to map items to the segment with the most "CLICK" interactions
# # item_to_segment = most_clicked_by_seg.set_index('item')['beh_segment'].to_dict()

# # # Step 5: Map this information back to the original dataframe
# # data['most_bought_by_beh_seg'] = data['item'].map(item_to_segment)

# # data.head(20)

# # Step 1: Filter to only include "CHECKOUT" interactions
# checkout_data = data[data['interaction'] == 'CHECKOUT']

# # Step 2: Group by item and beh_segment, count the number of "CHECKOUT" interactions
# checkout_counts = checkout_data.groupby(['item', 'beh_segment']).size().reset_index(name='click_count')

# # Step 3: Calculate the total number of users in each beh_segment
# segment_user_counts = data.groupby('beh_segment')['idcol'].nunique()

# # Step 4: Merge checkout counts with segment_user_counts to normalize the counts
# checkout_counts = checkout_counts.merge(segment_user_counts, on='beh_segment')

# # Step 5: Normalize the counts by dividing by the total number of users in each segment
# checkout_counts['normalized_click_count'] = checkout_counts['click_count'] / checkout_counts['idcol']

# # Step 6: Determine the segment with the maximum normalized "CHECKOUT" interactions for each item
# most_clicked_by_seg = checkout_counts.loc[checkout_counts.groupby('item')['normalized_click_count'].idxmax()]

# # Step 7: Create a dictionary to map items to the segment with the most normalized "CHECKOUT" interactions
# item_to_segment = most_clicked_by_seg.set_index('item')['beh_segment'].to_dict()

# # Step 8: Map this information back to the original dataframe
# data['most_bought_by_beh_seg'] = data['item'].map(item_to_segment)

# data.head(20)


In [22]:
# # Checout to click ratio for this item across the entire dataset: (I assume each item was checkout and clicked at least once)

# # Filter the DataFrame to include only CHECKOUT and CLICK interactions
# checkout_click_df = data[data['interaction'].isin(['CHECKOUT', 'CLICK'])]

# # Group by item and count the occurrences of each interaction type
# interaction_counts = checkout_click_df.groupby('item')['interaction'].value_counts().unstack(fill_value=0)

# # Calculate the ratio of CHECKOUT to CLICK interactions for each item
# interaction_counts['item_checkout_click_ratio'] = interaction_counts['CHECKOUT'] / interaction_counts['CLICK']

# # Merge the ratio back to the original DataFrame based on item
# data = data.merge(interaction_counts['item_checkout_click_ratio'].reset_index(), on='item', how='left')



# # Display the DataFrame with the new column
# data.head(20)

# # Check NANS
# # Assuming your DataFrame is named df
# # Check for NaN values in each column
# nan_columns = data.columns[data.isna().any()].tolist()

# # Print the columns with NaN values
# print("Columns with NaN values:", nan_columns)

In [23]:
# # WHich screen was this item accessed forom the most frequently (clicked or checked out)

# # Filter the DataFrame to include only CLICK and CHECKOUT interactions
# click_checkout_df = data[data['interaction'].isin(['CLICK', 'CHECKOUT'])]

# # Group by item and screen and count the occurrences of each combination
# screen_counts = click_checkout_df.groupby(['item', 'page'])['interaction'].count().reset_index()

# # Find the screen with the highest count for each item
# max_screen = screen_counts.loc[screen_counts.groupby('item')['interaction'].idxmax()]
# max_screen.rename(columns={'page': 'most_frequent_screen'}, inplace=True)

# # # Merge the screen information back to the original DataFrame based on item
# data = data.merge(max_screen[['item', 'most_frequent_screen']], on='item', how='left')

# # # Rename the column to indicate the most frequent screen
# # data.rename(columns={'page': 'most_frequent_screen'}, inplace=True)

# # # Display the DataFrame with the new column
# data.head()

# Prep Features:

In [24]:
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,5,0,2.0,0.011364,IBAB,IBAB,INSURE,INSURE
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,10,0,2.0,0.011364,IBAB,IBAB,INSURE,INSURE
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,5,1,2.0,0.011364,CAFM,CAFM,TRANSACT,TRANSACT
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,10,1,2.0,0.011364,CAFM,CAFM,TRANSACT,TRANSACT
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,5,2,2.0,0.011364,CARF,CARF,LEND,LEND
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,10,2,2.0,0.011364,CARF,CARF,LEND,LEND
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,5,3,2.0,0.011364,FIWL,FIWL,INVEST,INVEST
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,10,3,2.0,0.011364,FIWL,FIWL,INVEST,INVEST
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,10,4,2.0,0.011364,CUSS,CUSS,INVEST,INVEST
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,5,4,2.0,0.011364,CUSS,CUSS,INVEST,INVEST


In [50]:
# Define the user columns, item columns and interaction columns:
u_cols = ["idcol", "segment", "beh_segment", "active_ind", "most_clicked_item", "most_bought_item",
          "most_clicked_item_type", "most_bought_item_type", "daily_activity_score", "activity_rate"] # TODO Include the checlout-click ratio if useful
# u_cols = ["idcol", "segment", "beh_segment", "active_ind", "avg_daily_freq", "avg_weekly_freq", "avg_monthly_freq"]
# item_cols = ["item_id", "item", "item_type", "most_bought_by_beh_seg", "most_bought_by_seg", "most_clicked_by_beh_seg",
#              "most_clicked_by_seg"]

item_cols = ["item_id", "item_type"]
interact_cols = ["idcol", "item_id", "interaction_scores"] # I include idcol and item for now, to basically say "this user did this item with this score at this date"


test_item_cols = ["item_id", "item_type", "item"]
# Experiment 4:
# u_cols = ["idcol", "segment", "beh_segment", "active_ind"]
# item_cols = ["item_id", "item", "item_type"]

# I want the interact_cols data in the following format:
# 

user, item, rating = data[u_cols].copy(), data[item_cols].copy(), data[interact_cols].copy()
test_items = data[test_item_cols].copy()
# Accumulate rating data so that, for each unique user-item combination, there is a single row, otherwise the train-test split 
# have shared interactions. Can do an intelligent split, and use LightFM's built in weighting method,
# but the weights matrix generated with that method is the exact same, even if I do the below:
# This also allows us to incorporate the number of times that a user has interacted with an item as interaction weighting, implicitly
rating = rating.groupby(['idcol', 'item_id'], as_index=False)['interaction_scores'].sum()


# Drop duplicates, because I only need the unique items' and users' features:
item = item.drop_duplicates()
# item = item.drop(columns=["item"], inplace=True)
item = item.reset_index(drop=True)

user = user.drop_duplicates()
user = user.reset_index(drop=True)

print(rating.shape)
print(item.shape)
print(user.shape)

item.head(20)

# drop the first row, we don't want to include ALL



(93707, 3)
(103, 2)
(42605, 10)


Unnamed: 0,item_id,item_type
0,0,INSURE
1,1,TRANSACT
2,2,LEND
3,3,INVEST
4,4,INVEST
5,5,LIFESTYLE
6,6,INSURE
7,7,LEND
8,8,LEND
9,9,CONNECT


In [26]:
user.head(20)

Unnamed: 0,idcol,segment,beh_segment,active_ind,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type,daily_activity_score,activity_rate
0,4521,segment1,B07,Semi Active,IBAB,IBAB,INSURE,INSURE,2.0,0.011364
1,14454,segment2,B01,Active,CAFM,CAFM,TRANSACT,TRANSACT,2.0,0.011364
2,15000,segment3,B01,Cold Start,CARF,CARF,LEND,LEND,2.0,0.011364
3,22924,segment2,B01,Active,FIWL,FIWL,INVEST,INVEST,2.0,0.011364
4,23484,segment2,B01,Cold Start,CUSS,CUSS,INVEST,INVEST,2.0,0.011364
5,24982,segment1,B08,Cold Start,EBSH,EBSH,LIFESTYLE,LIFESTYLE,2.0,0.011364
6,25577,segment3,B01,Semi Active,FILS,FILS,INSURE,INSURE,2.0,0.011364
7,27824,segment1,B08,Active,CTLN,CTLN,LEND,LEND,2.0,0.011364
8,28951,segment1,B07,Semi Active,CTLN,CTLN,LEND,LEND,2.375,0.090909
9,29630,segment4,B01,Semi Active,CCCS,CCCS,INSURE,TRANSACT,1.5,0.022727


In [27]:
item.head(20)

Unnamed: 0,item_id,item_type
0,0,INSURE
1,1,TRANSACT
2,2,LEND
3,3,INVEST
4,4,INVEST
5,5,LIFESTYLE
6,6,INSURE
7,7,LEND
8,8,LEND
9,9,CONNECT


In [28]:
rating.head(20)

Unnamed: 0,idcol,item_id,interaction_scores
0,4521,0,15
1,14454,1,15
2,15000,2,15
3,22924,3,15
4,23484,4,15
5,24982,5,15
6,25577,6,15
7,27824,7,15
8,28951,7,45
9,28951,8,15


In [29]:
# Normalise the interaction scores:
# Group by idcol and calculate the sum of interaction scores for each user
sum_interaction_scores = rating.groupby('idcol')['interaction_scores'].sum()

# Merge the sum of interaction scores back to the original DataFrame based on idcol
rating = rating.merge(sum_interaction_scores.reset_index(), on='idcol', suffixes=('', '_sum'))

# Calculate normalized interaction scores by dividing each interaction score by the sum
rating['normalized_interaction_score'] = rating['interaction_scores'] / rating['interaction_scores_sum']

# Replace NaN values with 0 in the normalized interaction scores column
rating['normalized_interaction_score'].fillna(0, inplace=True)

# Drop the temporary sum column
rating.drop(columns=['interaction_scores_sum'], inplace=True)

# Drop the unnormalised column
rating.drop(columns=['interaction_scores'], inplace=True)

# rename
rating.rename(columns={'normalized_interaction_score': 'interaction_scores'}, inplace=True)

# Display the DataFrame with the normalized interaction scores
rating.head(20)



Unnamed: 0,idcol,item_id,interaction_scores
0,4521,0,1.0
1,14454,1,1.0
2,15000,2,1.0
3,22924,3,1.0
4,23484,4,1.0
5,24982,5,1.0
6,25577,6,1.0
7,27824,7,1.0
8,28951,7,0.333333
9,28951,8,0.111111


In [30]:
# # Normalise the user averages:
# # Function to normalize a column
# def normalize_column(df, column_name):
#     min_value = df[column_name].min()
#     max_value = df[column_name].max()
#     df[column_name] = (df[column_name] - min_value) / (max_value - min_value)
#     return df

# # Normalize the numerical columns
# numerical_columns = ['avg_daily_freq', 'avg_weekly_freq', 'avg_monthly_freq']

# for column in numerical_columns:
#     user = normalize_column(user, column)


## User Features Data preparation

In [31]:
user.head()


Unnamed: 0,idcol,segment,beh_segment,active_ind,most_clicked_item,most_bought_item,most_clicked_item_type,most_bought_item_type,daily_activity_score,activity_rate
0,4521,segment1,B07,Semi Active,IBAB,IBAB,INSURE,INSURE,2.0,0.011364
1,14454,segment2,B01,Active,CAFM,CAFM,TRANSACT,TRANSACT,2.0,0.011364
2,15000,segment3,B01,Cold Start,CARF,CARF,LEND,LEND,2.0,0.011364
3,22924,segment2,B01,Active,FIWL,FIWL,INVEST,INVEST,2.0,0.011364
4,23484,segment2,B01,Cold Start,CUSS,CUSS,INVEST,INVEST,2.0,0.011364


In [32]:
user_train = pd.get_dummies(user,dtype = int, prefix="", prefix_sep="")
user_features_col = user_train.drop(columns =['idcol']).columns.values
user_feat = user_train.drop(columns =['idcol']).to_dict(orient='records')

user_train = user_train.sort_values(by='idcol', ascending=True)
# print(user_feat)
# user.shape
# user.shape
user_train.head(20)


Unnamed: 0,idcol,daily_activity_score,activity_rate,segment1,segment2,segment3,segment4,B01,B02,B03,...,INVEST,LEND,LIFESTYLE,TRANSACT,CONNECT,INSURE,INVEST.1,LEND.1,LIFESTYLE.1,TRANSACT.1
0,4521,2.0,0.011364,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,14454,2.0,0.011364,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,1
2,15000,2.0,0.011364,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
3,22924,2.0,0.011364,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
4,23484,2.0,0.011364,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
5,24982,2.0,0.011364,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
6,25577,2.0,0.011364,0,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7,27824,2.0,0.011364,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
8,28951,2.375,0.090909,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
9,29630,1.5,0.022727,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1


## Item Features Data prep:

In [33]:
# item.head()

item_features = pd.get_dummies(item, dtype = int, prefix="", prefix_sep="")
# item_features["idcol"] = data["idcol"]
item_features_col = item_features.drop(columns=['item_id']).columns.values


# Need some for of identification for the item features
# item_features["idcol"] = data["idcol"]


item_features.fillna(value = 0, inplace=True)
# item_features.shape
# print(item_feat[0])
# item.head()
# print(item_features.iloc[0,:])
# print(item_features_col)
item_features.head()

nan_columns = item_features.columns[item_features.isna().any()].tolist()

item_feat = item_features.drop(columns =['item_id']).to_dict(orient='records')

item_features.head()

Unnamed: 0,item_id,CONNECT,INSURE,INVEST,LEND,LIFESTYLE,TRANSACT
0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,1
2,2,0,0,0,1,0,0
3,3,0,0,1,0,0,0
4,4,0,0,1,0,0,0


## Fit into LightFM Dataset

In [34]:
dataset = Dataset()
dataset.fit(users=[x for x in user_train['idcol']], items=[x for x in item['item_id']], item_features=item_features_col, user_features=user_features_col)

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))


Num users: 42605, num_items 103.


## Build Item Features to be fitted into model

In [35]:
# 
item_features = dataset.build_item_features((x,y) for x,y in zip(item_features['item_id'],item_feat))
# item_features = dataset.build_item_features((x,item_features_col) for x in item_features['item_id'])
# for (x,y) in zip(item_features['item_id'],item_feat):
#     print(x)
#     print(y)
#     break

In [36]:
print(item_features.shape)
user_train.info()
# user.head()

(103, 109)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42605 entries, 0 to 42604
Columns: 275 entries, idcol to TRANSACT
dtypes: float64(2), int64(273)
memory usage: 89.4 MB


## Build User Features to be fit into model

In [37]:
# print(user_feat[0])
user_features = dataset.build_user_features((x,y) for x,y in zip(user_train['idcol'],user_feat))


In [38]:
print(user_features.shape)

(42605, 42771)


## Build interactions (user — item) and its respective weights (in this case our custom weights - 0, 1, 2)

In [39]:
# from sklearn.model_selection import train_test_split


# # We split the data into train and test by taking 20% of interactions for each user and moving that to the test set, i.e the training set will contain 80% of the items
# # that the user interacted with

# # Custom train-test split: Split the data into train and test before building interactions:
# train_interactions = pd.DataFrame()
# test_interaction = pd.DataFrame()

# for user_id, group in rating.groupby('idcol'):
#     if len(group) == 1:
#         train_interactions = pd.concat([train_interactions, group])
#     else:

#         train_group, test_group = train_test_split(group, test_size=0.2, train_size=0.8, random_state=42)
#         train_interactions = pd.concat([train_interactions, train_group])
#         test_interaction = pd.concat([test_interaction, test_group])





In [40]:
# print(train_interactions.shape)
# print(test_interaction.shape)

In [41]:
(interactions, weights) = dataset.build_interactions((x, y) for x,y in zip(rating['idcol'], rating['item_id']))

# With custom train/test splitting:
# train, train_w = dataset.build_interactions((x, y) for x,y in zip(train_interactions['idcol'], train_interactions['item_id']))

# test, test_w = dataset.build_interactions((x, y) for x,y in zip(test_interaction['idcol'], test_interaction['item_id']))


# print(weights.todense())
print(interactions.shape)

(42605, 103)


# Model Training

## Train Test Split

In [42]:
# ORIGINAL:
train, test = random_train_test_split(interactions,test_percentage=0.2, random_state=42)
train_w, test_w = random_train_test_split(weights, test_percentage=0.2, random_state=42)

# Make a custom train-test split that uses either a_ the last 20% of interactions by date, or a random 20% of interactions for the test split
# This ensures that there are no cold start users in the testing set. We will do cold-start testing in a different manner

# SPlit the data from original data and then do all the data processing steps for each step separately. This ensures that the data doesn't bleed over into the test set.


## Model

In [43]:
# Best hyperparameters:  {'no_components': 45, 'learning_rate': 0.09949391010649568, 'k': 19.29548285586018, 'n': 10.515335810044794}
# Other best: {'no_components': 50, 'learning_rate': 0.08062443053534539, 'k': 9.583359248210815, 'n': 5.4809279704140055}.
no_components = 50
loss = 'warp'
epoch = 30
num_thread = 8
learning_rate = 0.08062443053534539
n = 5.4809279704140055
k = 9.583359248210815
model = LightFM(no_components= no_components, loss=loss, random_state = 42, learning_rate=learning_rate, n = n, k= k)
# model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)

# Experiment 1,2,3:
# model.fit(train, epochs=epoch,num_threads = num_thread)

# Experiment 4, 5, 6:
# model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread)

# Experiemt 7
model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)


<lightfm.lightfm.LightFM at 0x72d0593aea70>

## Model Evaluation

In [44]:
k=5

train_precision = precision_at_k(model, train, k=k,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
test_precision = precision_at_k(model, test,train_interactions=train, k=k,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()

train_recall = recall_at_k(model, train, k=k,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
test_recall = recall_at_k(model, test,train_interactions=train, k=k,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()

train_auc = auc_score(model, train,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
test_auc = auc_score(model, test, train_interactions=train,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()


test_rr = reciprocal_rank(model, test, train_interactions=train, user_features=user_features, item_features=item_features).mean()
train_rr = reciprocal_rank(model, train, user_features=user_features, item_features=item_features).mean()
# No features:
# train_precision = precision_at_k(model, train, k=k, num_threads=num_thread).mean()
# test_precision = precision_at_k(model, test,train_interactions=train, k=k, num_threads=num_thread).mean()

# train_recall = recall_at_k(model, train, k=k, num_threads=num_thread).mean()
# test_recall = recall_at_k(model, test,train_interactions=train, k=k, num_threads=num_thread).mean()

# train_auc = auc_score(model, train, num_threads=num_thread).mean()
# test_auc = auc_score(model, test, train_interactions=train, num_threads=num_thread).mean()

print('Precision: train %.4f' % (train_precision))
print('Precision: test %.4f' % (test_precision))

print('Recall: train %.4f' % (train_recall))
print('Recall: test %.4f' % (test_recall))

print('AUC: train %.4f' % (train_auc))
print('AUC: test %.4f' % (test_auc))

print('RR: train %.4f' % (train_rr))
print('RR: test %.4f' % (test_rr))

# best sofar = 0.18758663535118103
# Precision: train 0.2957
# Precision: test 0.1881
# Recall: train 0.9053
# Recall: test 0.8069
# AUC: train 0.9869
# AUC: test 0.9553

Precision: train 0.3088
Precision: test 0.1409
Recall: train 0.8825
Recall: test 0.5922
AUC: train 0.9862
AUC: test 0.9015
RR: train 0.8896
RR: test 0.5550


In [None]:
print(test_precision)

# Automated Hyperparameter Optimisation

In [None]:
# import optuna
# from lightfm import LightFM
# from lightfm.datasets import fetch_movielens
# from lightfm.evaluation import auc_score

# # Fetch the dataset

# def objective(trial):

#     # Best hyperparameters:  {'no_components': 45, 'learning_rate': 0.09949391010649568, 'k': 19.29548285586018, 'n': 10.515335810044794}
#     #  Other best: {'no_components': 50, 'learning_rate': 0.08062443053534539, 'k': 9.583359248210815, 'n': 5.4809279704140055}.
#     # Define the hyperparameters to be tuned
#     no_components = trial.suggest_int('no_components', 10, 50)
#     learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
#     # item_alpha = trial.suggest_loguniform('item_alpha', 1e-6, 1e-1)
#     # user_alpha = trial.suggest_loguniform('user_alpha', 1e-6, 1e-1)
#     k = trial.suggest_loguniform('k', 5, 25)
#     n = trial.suggest_loguniform('n', 5, 25)
    
#     # Create the LightFM model
#     model = LightFM(
#         loss='warp',
#         no_components=no_components,
#         learning_rate=learning_rate,
#         # item_alpha=item_alpha,
#         # user_alpha=user_alpha,
#         k=k,
#         n=n
#     )
#     model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)
    
#     # Evaluate the model
#     pak = precision_at_k(model, test,train_interactions=train, k=5,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
    
#     return pak

# # Run the optimization
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=400)

# print('Best hyperparameters: ', study.best_params)
# np.save("BestParams.npy", study.best_params)
# print('Best precision@k=5: ', study.best_value)


# Predictions

In [57]:
target_idcol = 77196041
selected_user = data[data["active_ind"]=="Active"].sample(n=1)
target_idcol = selected_user["idcol"].iloc[0]
print(target_idcol)
# Rank the items based on interactions
# Assign scores to interaction types
# interaction_scores = {'DISPLAY': 0, 'CLICK': 1, 'CHECKOUT': 2}

predict_user = data[data["idcol"] == target_idcol]

predict_user.head()

# Map interaction types to scores
predict_user['interaction_score'] = predict_user['interaction'].map(interaction_scores)

# Rank items based on scores
predict_user['item_rank'] = predict_user.groupby('idcol')['interaction_score'].rank(method='min', ascending=False)

# Sort dataframe by item rank
predict_user = predict_user.sort_values(by='item_rank')

predict_user.head(-1)

true_items = predict_user['item'].tolist()
true_items = list(set(true_items))


# Sit langs mekaar:
# model recommendation, user se eie actual interactions, all items popularity over entire dataset, all item popularity for user segment, all item popularity for beh_segment

232787113


In [58]:



map = dataset._user_id_mapping
index = map[target_idcol]

scores = model.predict(index, np.arange(103), user_features=user_features, item_features = item_features)
# print(user.iloc[index,:])
top_items = test_items.iloc[np.argsort(-scores)]
# print(scores)
# top_items.head()
# # print(item.shape)
# # print(top_items)
# known_positives = item.iloc[interactions.tocsr()[index].indices]

# top_items[0:10]
recommended_list = top_items['item'].tolist()
recommended_list = list(set(recommended_list))

print(true_items)
print(recommended_list)

def precision_at_k(recommended_list, actual_list, k=10):
    # Get the intersection of the recommended list and the actual list up to k
    intersection = set(recommended_list[:k]) & set(actual_list)
    
    # Calculate precision@k
    precision = len(intersection) / k
    
    return precision

# Calculate precision@k=10
precision = precision_at_k(recommended_list, true_items, k=5)
print("Precision@k=10:", precision)


['NACS', 'FIHC', 'MMMC', 'EBTV', 'CTLN', 'EBSB', 'IBAA']
['CBDS', 'CUPL', 'CCCU', 'MMMC', 'IBAM', 'CBTULS', 'CUSZ', 'CCCS', 'FILS', 'EBQF', 'MMSM', 'NAFW', 'EBSH', 'IBAC', 'FIWL', 'CCLI', 'FIHC', 'CUSB', 'CBEL', 'CTLN', 'CASD', 'EBEM', 'IBAA', 'CARF', 'CAFM', 'CBPB', 'CSPL', 'XCFL', 'CUSS', 'IBAB']
Precision@k=10: 0.2


In [59]:
# Rank items over the entire dataset
# Group by item and sum the interaction scores
item_scores = data.groupby('item')['interaction_scores'].sum()

# Sort items by cumulative interaction scores in descending order
sorted_items = item_scores.sort_values(ascending=False)

# Return list of unique items sorted by their cumulative interaction scores
ranked_items = sorted_items.index.tolist()

print(ranked_items)

['CTLN', 'IBAA', 'CUPL', 'IBAB', 'CACU', 'FIWL', 'FILS', 'CUPX', 'FIHC', 'IBAC', 'CBPA', 'EBSH', 'CASD', 'NATR', 'MMMC', 'NAFW', 'FICQ', 'CSPL', 'EBEM', 'CBLT', 'CBPB', 'NACS', 'IPRA', 'IBAM', 'XCFL', 'EBWP', 'CBVC', 'CBEL', 'IPTF', 'CCLI', 'MMSM', 'IBIC', 'EBQF', 'NASD', 'CABC', 'CAFM', 'EBKA', 'EBET', 'CCNC', 'CUHS', 'CCAI', 'CUSS', 'CARF', 'EBSB', 'CBTULS', 'CCCU', 'CAFI', 'EBTV', 'FLIS', 'CARE', 'CBCC', 'EVCU', 'IBDP', 'CUSZ', 'HLGG', 'CASV', 'IBGC', 'IBPP', 'CCCS', 'EBSP', 'IPSG', 'FIFS', 'EVGW', 'ISBCU', 'CUSI', 'CAFS', 'EBIB', 'GASS', 'SEVP', 'CAFU', 'EBGA', 'CUSB', 'CCAN', 'CBTUD', 'CALI', 'HLGH', 'CBTMT', 'CBDS', 'EBUD', 'CAPO', 'EBXM', 'CAFB', 'EVAP', 'IBDL', 'EBBF', 'EBSL', 'EBPD', 'DOAA', 'KYCA', 'IPFD', 'FHIS', 'HLGE', 'CANL', 'GAFC', 'EBGM', 'EBQB', 'DOSW', 'WHCR', 'FIWR', 'IPST', 'IPFN', 'IPMX', 'IPSD']


In [60]:
# Rank items over the active user's segment
segment_of_interest = predict_user["segment"].iloc[0]
segment_df = data[data['segment'] == segment_of_interest]
item_scores = segment_df.groupby('item')['interaction_scores'].sum()

# Sort items by cumulative interaction scores in descending order
sorted_items = item_scores.sort_values(ascending=False)

# Return list of unique items sorted by their cumulative interaction scores
ranked_items_bysegment = sorted_items.index.tolist()

print(ranked_items_bysegment)

['CTLN', 'IBAB', 'IBAA', 'CUPX', 'CBPA', 'IBAC', 'FIWL', 'CACU', 'FIHC', 'FILS', 'EBSH', 'CBLT', 'CUPL', 'CBPB', 'MMMC', 'CBEL', 'NACS', 'IBAM', 'MMSM', 'IBIC', 'CBVC', 'CABC', 'XCFL', 'CSPL', 'CASD', 'CBTULS', 'NATR', 'EBET', 'CCNC', 'NAFW', 'CAFM', 'EVCU', 'CBCC', 'HLGG', 'NASD', 'CCLI', 'EBEM', 'EBSP', 'CASV', 'EBSB', 'EBTV', 'FICQ', 'IPSG', 'CBTUD', 'FIFS', 'CCAN', 'IBDP', 'FLIS', 'CAPO', 'CAFI', 'EVGW', 'GASS', 'ISBCU', 'CCAI', 'EBGA', 'IBPP', 'CBDS', 'SEVP', 'EBUD', 'EBIB', 'IBGC', 'EVAP', 'CCCS', 'KYCA', 'CARF', 'IPRA', 'EBXM', 'IPFD', 'CCCU', 'CUSZ', 'IPTF', 'CUHS', 'CALI', 'DOAA', 'CUSS', 'CAFU', 'EBBF', 'CAFS', 'EBQB', 'DOSW', 'CANL', 'IPST', 'CAFB', 'IBDL', 'CARE', 'GAFC', 'EBGM', 'CBTMT', 'IPMX', 'IPFN', 'EBWP', 'FIWR', 'CUSI', 'HLGE', 'EBKA', 'EBQF']


In [61]:
# Rank items over the active user's beh_segment
beh_segment_of_interest = predict_user["beh_segment"].iloc[0]
segment_df = data[data['beh_segment'] == beh_segment_of_interest]
item_scores = segment_df.groupby('item')['interaction_scores'].sum()

# Sort items by cumulative interaction scores in descending order
sorted_items = item_scores.sort_values(ascending=False)

# Return list of unique items sorted by their cumulative interaction scores
ranked_items_bybehsegment = sorted_items.index.tolist()

print(ranked_items_bybehsegment)

['CTLN', 'CUPL', 'IBAA', 'CACU', 'CASD', 'EBEM', 'IBAB', 'EBWP', 'FICQ', 'FILS', 'NATR', 'EBQF', 'NAFW', 'CCLI', 'FIWL', 'EBSH', 'CSPL', 'EBKA', 'CUPX', 'FIHC', 'CAFM', 'IPRA', 'IPTF', 'CCAI', 'XCFL', 'IBAC', 'NASD', 'EBSB', 'MMMC', 'CARF', 'NACS', 'CCCU', 'EBET', 'CBPA', 'CCNC', 'CBVC', 'CBPB', 'CUHS', 'CARE', 'CAFI', 'CABC', 'CBLT', 'IBAM', 'CUSS', 'EBTV', 'MMSM', 'FLIS', 'CBEL', 'IBIC', 'IBGC', 'IBPP', 'IBDP', 'CCCS', 'CAFS', 'CAFU', 'CUSZ', 'CBTULS', 'HLGG', 'CUSI', 'EVGW', 'EBIB', 'CASV', 'CALI', 'HLGH', 'FIFS', 'CUSB', 'EBGA', 'CBCC', 'ISBCU', 'IPSG', 'SEVP', 'GASS', 'EBUD', 'EBSP', 'EVCU', 'CBTMT', 'CAFB', 'EBXM', 'CCAN', 'IBDL', 'EBSL', 'CBDS', 'EBPD', 'EBBF', 'EVAP', 'DOAA', 'CBTUD', 'CAPO', 'FHIS', 'KYCA', 'HLGE', 'IPFD', 'EBGM', 'GAFC', 'WHCR', 'CANL', 'EBQB', 'FIWR', 'DOSW', 'IPMX', 'IPFN', 'IPST', 'IPSD']


In [62]:
# Combine all those into a single dataframe so I can see them side by side
num_item = len(true_items)
max_length = 30
print(num_item)
recdata = {
    'True Items': true_items + [None] * (max_length - num_item),
    'Recommended Items': recommended_list[:max_length],
    'Most Popular Tot': ranked_items[:max_length],
    'Most Popular Seg': ranked_items_bysegment[:max_length],
    'Most Popular BSeg': ranked_items_bybehsegment[:max_length],
}

# Create DataFrame from dictionary
df = pd.DataFrame(recdata)

df.head(-1)




7


Unnamed: 0,True Items,Recommended Items,Most Popular Tot,Most Popular Seg,Most Popular BSeg
0,NACS,CBDS,CTLN,CTLN,CTLN
1,FIHC,CUPL,IBAA,IBAB,CUPL
2,MMMC,CCCU,CUPL,IBAA,IBAA
3,EBTV,MMMC,IBAB,CUPX,CACU
4,CTLN,IBAM,CACU,CBPA,CASD
5,EBSB,CBTULS,FIWL,IBAC,EBEM
6,IBAA,CUSZ,FILS,FIWL,IBAB
7,,CCCS,CUPX,CACU,EBWP
8,,FILS,FIHC,FIHC,FICQ
9,,EBQF,IBAC,FILS,FILS


## Similar Item Calculation using cosine similarity

In [56]:
def similar_items(item_id, model, N=10, norm = True):
    item_bias ,item_representations = model.get_item_representations(features=item_features)

    # Cosine similarity
    scores = item_representations.dot(item_representations[item_id, :])
    item_norms = np.linalg.norm(item_representations, axis=1)

    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]/ item_norms[item_id] ), key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] ), key=lambda x: -x[1])
    return similar


similar_item_list = similar_items(2, model)
similar_idx = [x[0] for x in similar_item_list ]
item.iloc[similar_idx]# Can also add the other

Unnamed: 0,item_id,item_type
2,2,LEND
12,12,LEND
59,59,LEND
90,90,LEND
44,44,LEND
70,70,LEND
27,27,LEND
67,67,LEND
31,31,LEND
35,35,LEND


## Similar User Calculation

In [None]:
def similar_users(user_id, model, N=10, norm = True):
    user_bias ,user_representations = model.get_user_representations(features= user_features)

    # Cosine similarity
    scores = user_representations.dot(user_representations[user_id, :])
    item_norms = np.linalg.norm(user_representations, axis=1)
    
    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] / item_norms[user_id]), 
                    key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]), 
                    key=lambda x: -x[1])
    return similar
    
map = dataset._user_id_mapping
index = map[77196041]
similar_item_list = similar_users(index,model, N = 400)
print(similar_item_list)
similar_idx = [x[0] for x in similar_item_list]
filtered_data = user.loc[similar_idx, :]
filtered_data.head(20)



In [None]:
# print(u_cols)


# i = 0
# lists = []
# for index, row in filtered_data.iterrows():
#     print(row)
#     break
#     userlst = []
#     pos_idxs = row[row == 1].index.tolist()
#     userlst.append(filtered_data.iloc[i,0])
#     userlst += pos_idxs
    
#     i+=1
#     lists.append(userlst)


# new_df = pd.DataFrame(data = lists, columns = u_cols)
# new_df.head(-1)



## Cold Start Problem

In [None]:
# import random

# from scipy import sparse

# def format_newuser_input(user_feature_map, user_feature_list):
#   num_features = len(user_feature_list)
#   normalised_val = 1.0 
#   target_indices = []
#   for feature in user_feature_list:
#     try:
#         target_indices.append(user_feature_map[feature])
#     except KeyError:
#         print("new user feature encountered '{}'".format(feature))
#         pass

#   new_user_features = np.zeros(len(user_feature_map.keys()))
#   for i in target_indices:
#     new_user_features[i] = normalised_val
#   new_user_features = sparse.csr_matrix(new_user_features)
#   return(new_user_features)

# user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
# user_feature_list = ["segment4", "B01", "Cold Start"]

# new_user_features = format_newuser_input(user_feature_map, u_cols)
# scores = model.predict(0, np.arange(104), user_features=new_user_features)

# top_items = item.iloc[np.argsort(-scores)]

# top_items.head()

# new_user = pd.DataFrame(np.zeros(len(user_features_col))).T
# new_user.columns = user_features_col
# # print(new_user)

# new_user.head()


# new_user_id = 86000
# new_user['segment4'] = 1
# new_user['B50'] = 1
# new_user['Cold Start'] = 1

# new_user = csr_matrix(new_user)
# scores = model.predict(user_ids = 0,item_ids = np.arange(interactions.shape[1]), user_features=new_user)
# top_items_new_user = item.iloc[np.argsort(-scores)]
# top_items_new_user[0:10]

In [None]:
# Use our isolated user to check our algorithm:
# We predict items for this user, then check the precision at k=5.

selected_user_df.head()
idcol = selected_user_df.iloc[0]["idcol"]
segment = selected_user_df.iloc[0]["segment"]
beh_segment = selected_user_df.iloc[0]["beh_segment"]
active_ind = selected_user_df.iloc[0]["active_ind"]

column_names = u_cols
# Populate the new DataFrame with relevant information from the original DataFrame
new_user_data = {
    'idcol': [idcol],
    'segment': [segment],
    'beh_segment': [beh_segment],
    'active_ind': [active_ind],
    'most_clicked_item': [np.nan],
    'most_bought_item': [np.nan],
    'most_clicked_item_type': [np.nan],
    'most_bought_item_type': [np.nan],
    'daily_activity_score': [np.nan],
    'activity_rate': [np.nan]
}

print(new_user_data)

new_user = pd.DataFrame(new_user_data)

# Function to fill NaN values in the new row
def fill_na_with_mode_or_mean(data, cold_start_user):
    filled_row = cold_start_user.copy()
    
    for column in data.columns:
        if cold_start_user[column].isna().any():
            if data[column].dtype == 'object':  # Categorical data
                mode_value = data[column].mode()[0]
                filled_row[column].fillna(mode_value, inplace=True)
            else:  # Numerical data
                mean_value = data[column].mean()
                filled_row[column].fillna(mean_value, inplace=True)
    
    return filled_row

new_user_completed = fill_na_with_mode_or_mean(user, new_user)
# new_user_sparse = pd.get_dummies(new_user_completed,dtype = int, prefix="", prefix_sep="")
# # new_user_sparse.head()
selected_user_df.head(-1)

In [None]:
new_user_wide = pd.DataFrame(np.zeros(len(user_features_col))).T
new_user_wide.columns = user_features_col
# print(new_user)

# Populate the DataFrame with the provided values
new_user_wide.at[0, 'daily_activity_score'] = new_user_completed.iloc[0]['daily_activity_score']
new_user_wide.at[0, 'activity_rate'] = new_user_completed.iloc[0]['activity_rate']
new_user_wide.at[0, new_user_completed.iloc[0]['segment']] = 1.0  # Assuming 1.0 indicates this segment is active
new_user_wide.at[0, new_user_completed.iloc[0]['beh_segment']] = 1.0  # Assuming 1.0 indicates this segment is active
new_user_wide.at[0, new_user_completed.iloc[0]['active_ind']] = 1.0  # Assuming 1.0 indicates this segment is active
new_user_wide.at[0, new_user_completed.iloc[0]['most_clicked_item']] = 1.0
new_user_wide.at[0, new_user_completed.iloc[0]['most_bought_item']] = 1.0
new_user_wide.at[0, new_user_completed.iloc[0]['most_clicked_item_type']] = 1.0
new_user_wide.at[0, new_user_completed.iloc[0]['most_bought_item_type']] = 1.0
# new_user_wide.at[0, 'INSURE1'] = 1.0  # Populate the first occurrence of 'INSURE' with 1.0
new_user_wide.head()

In [None]:
new_user_csr = csr_matrix(new_user_wide)
# new_user = csr_matrix(new_user)
print(new_user_csr.shape)
scores = model.predict(user_ids = 0,item_ids = np.arange(interactions.shape[1]), user_features=new_user_csr)
top_items_new_user = item.iloc[np.argsort(-scores)]
top_items_new_user[0:20]

In [None]:
selected_user_df.head()

# Rank the items based on interactions
# Assign scores to interaction types
interaction_scores = {'DISPLAY': 0, 'CLICK': 1, 'CHECKOUT': 2}

# Map interaction types to scores
selected_user_df['interaction_score'] = selected_user_df['interaction'].map(interaction_scores)

# Rank items based on scores
selected_user_df['item_rank'] = selected_user_df.groupby('idcol')['interaction_score'].rank(method='min', ascending=False)

# Sort dataframe by item rank
selected_user_df = selected_user_df.sort_values(by='item_rank')

selected_user_df.head(-1)

true_items = selected_user_df['item'].tolist()
true_items = list(set(true_items))

print(true_items)

recommended_list = top_items_new_user['item'].tolist()
recommended_list = list(set(recommended_list))

print(recommended_list)

In [None]:
def precision_at_k(recommended_list, actual_list, k=10):
    # Get the intersection of the recommended list and the actual list up to k
    intersection = set(recommended_list[:k]) & set(actual_list)
    
    # Calculate precision@k
    precision = len(intersection) / k
    
    return precision

# Calculate precision@k=10
precision = precision_at_k(recommended_list, true_items, k=10)
print("Precision@k=10:", precision)