# Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score, reciprocal_rank
import numpy as np
from lightfm.cross_validation import random_train_test_split
import os
from scipy.sparse import csr_matrix, vstack

# Data Prep

## Load Data

In [2]:
data = pd.read_csv("./fnb_datav2.csv")

# Remove 
data = data.drop(columns = ["item_descrip", "tod", "page"])
data.head(20)

# TEST:
# Drop rows where "item" column contains "NONE"
# data = data[data["active_ind"] != "Cold Start"]
data = data[data["item"] != "NONE"]

data.head(20)
# data.shape


Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind
4,4521,CLICK,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active
5,4521,CHECKOUT,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active
16,14454,CLICK,08FEB2023,CAFM,TRANSACT,segment2,B01,Active
17,14454,CHECKOUT,08FEB2023,CAFM,TRANSACT,segment2,B01,Active
18,15000,CLICK,31JAN2023,CARF,LEND,segment3,B01,Cold Start
19,15000,CHECKOUT,31JAN2023,CARF,LEND,segment3,B01,Cold Start
26,22924,CLICK,26FEB2023,FIWL,INVEST,segment2,B01,Active
27,22924,CHECKOUT,26FEB2023,FIWL,INVEST,segment2,B01,Active
37,23484,CHECKOUT,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start
38,23484,CLICK,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start


In [3]:
# # I remve a random Active user to test the cold start approach later:
# # Step 1: Filter users with active_ind equal to 'Active'
# active_users = data[data['active_ind'] == 'Active']

# # Check if there are any active users
# if not active_users.empty:
#     # Step 2: Randomly select one of these users
#     selected_user = active_users.sample(n=1)

#     # Get the user id of the selected user
#     selected_user_id = selected_user['idcol'].values[0]

#     # Step 3: Move all entries of this selected user to a new dataframe
#     selected_user_df = data[data['idcol'] == selected_user_id]

#     # Step 4: Remove this user's entries from the original dataframe
#     data = data[data['idcol'] != selected_user_id]

# selected_user_df.head()

In [4]:
# Checking transactions for a specific ID:
id = 77196041
data[data["idcol"]==id]

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind
155295,77196041,CLICK,27MAR2023,FICQ,INSURE,segment2,B01,Active
155296,77196041,CHECKOUT,27MAR2023,FIWL,INVEST,segment2,B01,Active
155297,77196041,CHECKOUT,27MAR2023,FILS,INSURE,segment2,B01,Active
155298,77196041,CLICK,27MAR2023,FILS,INSURE,segment2,B01,Active
155299,77196041,CLICK,27MAR2023,FIHC,INSURE,segment2,B01,Active
...,...,...,...,...,...,...,...,...
155353,77196041,CLICK,27MAR2023,CACU,TRANSACT,segment2,B01,Active
155354,77196041,CHECKOUT,27MAR2023,CABC,INVEST,segment2,B01,Active
155355,77196041,CLICK,27MAR2023,CABC,INVEST,segment2,B01,Active
155356,77196041,CHECKOUT,27MAR2023,SEVP,TRANSACT,segment2,B01,Active


In [5]:
data.nunique()

idcol          42606
interaction        2
int_date          88
item             103
item_type          6
segment            4
beh_segment       48
active_ind         3
dtype: int64

In [6]:
# Add the following user features
"""
- weekly interaction frequency, 
- most frequenctly item interacted with (other than ALL),

"""

# Give scores to the interactions, and then drop the interaction column:L
# interaction_scores = {
#     'CLICK': 1,
#     'CHECKOUT': 2
# }

interaction_scores = {
    'DISPLAY': 0,
    'CLICK': 1,
    'CHECKOUT': 2
}

# Map interaction scores, fill missing values with 0
data['interaction_scores'] = data['interaction'].map(interaction_scores).fillna(0)

# Add ids for each unique item
data['item_id'] = pd.factorize(data['item'])[0]

# data.drop(columns = ["interaction"])
data.head(40)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id
4,4521,CLICK,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active,1,0
5,4521,CHECKOUT,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active,2,0
16,14454,CLICK,08FEB2023,CAFM,TRANSACT,segment2,B01,Active,1,1
17,14454,CHECKOUT,08FEB2023,CAFM,TRANSACT,segment2,B01,Active,2,1
18,15000,CLICK,31JAN2023,CARF,LEND,segment3,B01,Cold Start,1,2
19,15000,CHECKOUT,31JAN2023,CARF,LEND,segment3,B01,Cold Start,2,2
26,22924,CLICK,26FEB2023,FIWL,INVEST,segment2,B01,Active,1,3
27,22924,CHECKOUT,26FEB2023,FIWL,INVEST,segment2,B01,Active,2,3
37,23484,CHECKOUT,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start,2,4
38,23484,CLICK,22FEB2023,CUSS,INVEST,segment2,B01,Cold Start,1,4


### Add User Features:

#### Add the following user features
- weekly interaction frequency - on average, over the whole dataset, how many times does the user interact per week
- daily interaction frequency - similar to above
- monthly interaction frequency - how many times 
- most frequenctly item interacted with (other than ALL) over the whole 
- most frequently interacted with item type
- Ratio of checkout to click for each user-item combination (ask Lize)
- Add an activity score, which is a metric that says how often, over the entire dataset, is the user active
    - Done by dividing the number of unique active days by the number of days in the dataset
- Potential other features to add from TOD:
    - average time between clicking item
    - average time between checking out the item


In [7]:
original_data =  data.copy()

target_idcol = 77196041
index = int(original_data[original_data['idcol'] == target_idcol].index[0])


In [8]:
# Add user daily activity score:
# On days that they are active, how many interactions do they make?
# Convert int_date to datetime
data['int_date'] = pd.to_datetime(data['int_date'], format='%d%b%Y')

# Group by idcol to calculate total interactions and unique active days
user_interactions = data.groupby('idcol').size().reset_index(name='total_interactions')
active_days = data.groupby('idcol')['int_date'].nunique().reset_index(name='unique_active_days')

# Merge the results to calculate daily_activity_score
user_activity = pd.merge(user_interactions, active_days, on='idcol')
user_activity['daily_activity_score'] = user_activity['total_interactions'] / user_activity['unique_active_days']

# Merge the daily_activity_score back to the original DataFrame
data = pd.merge(data, user_activity[['idcol', 'daily_activity_score']], on='idcol', how='left')

data.head(40)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,2.0
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,2.0
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,2.0
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,2.0
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,2.0


In [9]:
# Add user activity frequency:
# Count the number of days that the user was active, and divide it by the number of days in the dataset:

# Convert int_date to datetime
data['int_date'] = pd.to_datetime(data['int_date'])

# Calculate the total number of unique days in the dataset
total_days = data["int_date"].nunique()

# Calculate the number of unique days each user had interactions
user_unique_days = data.groupby('idcol')['int_date'].nunique()

# Calculate the activity rate
activity_rate = user_unique_days / total_days

# Merge the activity_rate back into the original dataframe
data = data.merge(activity_rate.rename('activity_rate'), on='idcol')

data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,0.011364
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,0.011364
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,0.011364
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,0.011364
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,0.011364
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,2.0,0.011364
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,2.0,0.011364
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,2.0,0.011364
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,2.0,0.011364
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,2.0,0.011364


In [10]:
# # Adding daily, weekly and monthly interaction frequencies, over entire dataset
# # The interactions can be click, checkout or display. This is more a value for how often the user opens the app,
# # rather than how often they actually view or buy an item

# # Daily 

# data = original_data.copy()
# # Convert int_date to datetime
# data['int_date'] = pd.to_datetime(data['int_date'], format='%d%b%Y')

# # Calculate daily interaction frequency
# daily_freq = data.groupby(['idcol', 'int_date']).size().groupby(level=0).mean().reset_index(name='avg_daily_freq')

# # Calculate weekly interaction frequency
# data['week'] = data['int_date'].dt.isocalendar().week
# weekly_freq = data.groupby(['idcol', 'week']).size().groupby(level=0).mean().reset_index(name='avg_weekly_freq')

# # # Calculate monthly interaction frequency
# # data['month'] = data['int_date'].dt.to_period('M')
# # monthly_freq = data.groupby(['idcol', 'month']).size().groupby(level=0).mean().reset_index(name='avg_monthly_freq')

# # Merge frequencies back into the original DataFrame
# data = data.merge(daily_freq, on='idcol')
# data = data.merge(weekly_freq, on='idcol')
# data = data.merge(monthly_freq, on='idcol')

# data.head(20)

In [11]:
# # Add the activity score:
# # Convert 'int_date' to datetime format
# data['int_date'] = pd.to_datetime(data['int_date'], format='%d%b%Y')

# # Calculate the total number of unique days in the dataset
# total_unique_days = data['int_date'].nunique()

# data = data.sort_values(by='int_date', ascending=False)


# # # Calculate the number of unique days each user was active
# user_unique_days = df.groupby('idcol')['int_date'].nunique().reset_index(name='active_days')

# # Calculate the activity score for each user
# user_unique_days['activity_score'] = user_unique_days['active_days'] / total_unique_days

# # Merge the activity score back into the original dataframe
# df = df.merge(user_unique_days[['idcol', 'activity_score']], on='idcol')

# print("DataFrame with activity score:")
# print(df)

In [12]:


# # Adding most clicked and most bought items and item types for each user:
# clicks = data[data['interaction'] != 'DISPLAY']
# most_clicked = clicks.groupby(['idcol', 'item']).size().reset_index(name='click_count')
# most_clicked = most_clicked.loc[most_clicked.groupby('idcol')['click_count'].idxmax()][['idcol', 'item']]
# most_clicked.rename(columns={'item': 'most_interacted_item'}, inplace=True)

# # Merge the most clicked and most bought items back into the original DataFrame
# data = data.merge(most_clicked, on='idcol', how='left')
# # data = data.merge(most_bought, on='idcol', how='left')

# data.head()

# Filter out 'DISPLAY' interactions
clicks = data[data['interaction'] != 'DISPLAY']

# Group by idcol and item, and count the number of interactions
interaction_counts = clicks.groupby(['idcol', 'item']).size().reset_index(name='interaction_count')

# Sort interaction counts within each user group
interaction_counts = interaction_counts.sort_values(by=['idcol', 'interaction_count'], ascending=[True, False])

# Identify the most and second most interacted items
most_interacted = interaction_counts.groupby('idcol').nth(0).reset_index()[['idcol', 'item']]
most_interacted.rename(columns={'item': 'most_interacted_item'}, inplace=True)

second_most_interacted = interaction_counts.groupby('idcol').nth(1).reset_index()[['idcol', 'item']]
second_most_interacted.rename(columns={'item': 'second_most_interacted_item'}, inplace=True)

# Merge the most and second most interacted items back into the original DataFrame
data = data.merge(most_interacted, on='idcol', how='left')
data = data.merge(second_most_interacted, on='idcol', how='left')

# Display the result
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate,most_interacted_item,second_most_interacted_item
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,0.011364,IBAB,
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,0.011364,IBAB,
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,0.011364,CAFM,
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,0.011364,CAFM,
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,0.011364,CARF,
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,2.0,0.011364,CARF,
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,2.0,0.011364,FIWL,
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,2.0,0.011364,FIWL,
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,2.0,0.011364,CUSS,
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,2.0,0.011364,CUSS,


In [13]:
# Adding most clicked and most bought items and item types for each user:
clicks = data[data['interaction'] != "DISPLAY"]
most_clicked = clicks.groupby(['idcol', 'item_type']).size().reset_index(name='click_count')
most_clicked = most_clicked.loc[most_clicked.groupby('idcol')['click_count'].idxmax()][['idcol', 'item_type']]
most_clicked.rename(columns={'item_type': 'most_interacted'}, inplace=True)

# # Determine the most bought item for each user
# checkouts = data[data['interaction'] == 'CHECKOUT']
# most_bought = checkouts.groupby(['idcol', 'item_type']).size().reset_index(name='checkout_count')
# most_bought = most_bought.loc[most_bought.groupby('idcol')['checkout_count'].idxmax()][['idcol', 'item_type']]
# most_bought.rename(columns={'item_type': 'most_bought_item_type'}, inplace=True)

# Merge the most clicked and most bought items back into the original DataFrame
data = data.merge(most_clicked, on='idcol', how='left')
# data = data.merge(most_bought, on='idcol', how='left')

data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate,most_interacted_item,second_most_interacted_item,most_interacted
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,0.011364,IBAB,,INSURE
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,0.011364,IBAB,,INSURE
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,0.011364,CAFM,,TRANSACT
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,0.011364,CAFM,,TRANSACT
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,0.011364,CARF,,LEND


In [14]:
# # For each unique user/item combination, what is the ratio of CHECKOUT to CLICK?# First, filter the DataFrame to include only CHECKOUT and CLICK interactions
# # NOTE: This isn't valid because an item doesn't need to be clicked to check it out - that is weird. I can intelligently fill NaN values with 1 if there was a 
# # checkout but no click and 0 otherwise, but ask Lizes opinion first

# checkout_click_df = data[data['interaction'].isin(['CHECKOUT', 'CLICK'])]

# # Group by unique idcol-item combinations and count the occurrences of each interaction type
# interaction_counts = checkout_click_df.groupby(['idcol', 'item', 'interaction']).size().unstack(fill_value=0)

# # Calculate the ratio of CHECKOUT to CLICK interactions
# interaction_counts['checkout_click_ratio'] = interaction_counts['CHECKOUT'] / interaction_counts['CLICK']


# # # Merge the ratio back to the original DataFrame based on idcol and item
# # data = data.merge(interaction_counts.reset_index()[['idcol', 'item', 'checkout_click_ratio']], on=['idcol', 'item'], how='left')

# # # Display the DataFrame with the new column
# # data.head(20)

In [15]:
# # On which page did the user checkout on the item most frequently? (NOT USING)
# # Filter the DataFrame to include only CLICK interactions
# click_df = data[data['interaction'] == 'CHECKOUT']

# # Group by user-item-screen combination and count the occurrences
# click_counts = click_df.groupby(['idcol', 'item', 'page'])['interaction'].count().reset_index()

# # Find the screen with the maximum count for each user-item combination
# max_click_screen = click_counts.groupby(['idcol', 'item']).apply(lambda x: x.loc[x['interaction'].idxmax()]).reset_index(drop=True)

# # Merge the result back to the original DataFrame based on user-item combination
# data = data.merge(max_click_screen[['idcol', 'item', 'page']], on=['idcol', 'item'], how='left')
# data.rename(columns={'page': 'most_checked_out_screen'}, inplace=True)

# # Display the DataFrame with the new column
# data.head(20)

In [16]:
# # I will fill NaN values with the mode of the data, for people of the same segment and BEH segment, for people that have not 
# # clicked or bought any items

# def fill_nan_with_mode_for_segment_and_beh_segment(data):
#     # Get unique combinations of segment and beh_segment
#     unique_combinations = data[['segment', 'beh_segment']].drop_duplicates()

#     for index, row in unique_combinations.iterrows():
#         segment = row['segment']
#         beh_segment = row['beh_segment']
        
#         # Filter the DataFrame based on the current segment and beh_segment
#         filtered_data = data[(data['segment'] == segment) & (data['beh_segment'] == beh_segment)]
        
#         # Calculate mode for each column within the segment and beh_segment group
#         mode_values = filtered_data.mode().iloc[0]
        
#         # Replace NaN values in the original DataFrame with mode values for the current segment and beh_segment
#         data.loc[(data['segment'] == segment) & (data['beh_segment'] == beh_segment)] = \
#             filtered_data.fillna(mode_values)
    
#     return data, mode_values


# test, mode_values = fill_nan_with_mode_for_segment_and_beh_segment(data)

# test.head()

### Add Item Features
- Most bought by segment
- Most bought by beh segment
- Most clicked by segment
- Most clicked by beh_segment
- For this item, what is the ratio of checkouts to clicks over the entire dataset?
- WHich screen was this item accessed from the most?

In [17]:
original_data = data.copy()
data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,daily_activity_score,activity_rate,most_interacted_item,second_most_interacted_item,most_interacted
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,2.0,0.011364,IBAB,,INSURE
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,2.0,0.011364,IBAB,,INSURE
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,2.0,0.011364,CAFM,,TRANSACT
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,2.0,0.011364,CAFM,,TRANSACT
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,2.0,0.011364,CARF,,LEND


In [18]:

sorted_segs = sorted(data['segment'].unique())
# Calculate the number of unique users in each segment
unique_users_per_segment = data.groupby('segment')['idcol'].nunique()

# Initialize a dictionary to store the normalized number of unique users for each item within each segment
unique_users_per_item_per_segment = {}

# Iterate over sorted segments
for segment in sorted_segs:
    # Filter the data to include only rows where 'segment' is equal to the current segment and remove any 'NONE' items
    item_seg_data = data[(data['segment'] == segment) & (data['item'] != 'NONE')]
    
    # Calculate the number of unique users for each item
    unique_users_per_item = item_seg_data.groupby('item')['idcol'].nunique().sort_values(ascending=False)
    
    # Normalize the number of unique users for each item by the total number of unique users in the segment
    normalized_unique_users_per_item = unique_users_per_item / unique_users_per_segment[segment]
    
    # Store the normalized number of unique users for each item within the segment
    unique_users_per_item_per_segment[segment] = normalized_unique_users_per_item
    
    # Create a new feature column to store the normalized frequencies for each item within the segment
    data[f"unique_users_{segment}"] = data['item'].map(unique_users_per_item_per_segment[segment])

# Specify columns to fill NaN values
columns_to_fill = ["unique_users_segment1", "unique_users_segment2", "unique_users_segment3", "unique_users_segment4"]

# Fill NaN values in specified columns with a specific value
data[columns_to_fill] = data[columns_to_fill].fillna(0)  # Fill NaN values with 0
# Display the DataFrame

In [19]:
# data = data.drop(columns = ["item_descrip"])
sorted_segs = sorted(data['segment'].unique())
for seg in sorted_segs:

    item_seg_data = data[(data['segment'] == seg)& (data['item'] != 'NONE')]
    #display(item_seg_data['item'].unique)
        # Calculate the frequency of each 'item_descrip' for the segment
    item_counts = item_seg_data['item'].value_counts()
        # Calculate the total count of all 'item_descrip' entries for the segment
    total_item_count = item_seg_data['item'].count()
        # Normalize the frequency of each 'item_descrip' by the total number of all 'item_descrip' entries for the segment
    normalized_item_counts = item_counts.div(total_item_count)


    # Create a new feature column to store the normalized frequencies for each item
    data[seg] = data['item'].map(normalized_item_counts)

# display(data[data['item'] != 'NONE'].head(30))
# Specify columns to fill NaN values
columns_to_fill = ["segment1", "segment2", "segment3", "segment4"]

# Fill NaN values in specified columns with a specific value
data[columns_to_fill] = data[columns_to_fill].fillna(0)  # Fill NaN values with 0

data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,...,second_most_interacted_item,most_interacted,unique_users_segment1,unique_users_segment2,unique_users_segment3,unique_users_segment4,segment1,segment2,segment3,segment4
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,...,,INSURE,0.106952,0.087759,0.034595,0.018314,0.042873,0.037911,0.012892,0.00777
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,...,,INSURE,0.106952,0.087759,0.034595,0.018314,0.042873,0.037911,0.012892,0.00777
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,...,,TRANSACT,0.022209,0.024003,0.032394,0.03178,0.008641,0.010084,0.013896,0.017649
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,...,,TRANSACT,0.022209,0.024003,0.032394,0.03178,0.008641,0.010084,0.013896,0.017649
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,...,,LEND,0.00251,0.008363,0.043662,0.038513,0.00084,0.003926,0.019792,0.023477


In [20]:
# sorted_segs = sorted(data['segment'].unique())
# # Calculate the number of unique users in each segment
# unique_users_per_segment = data.groupby('segment')['idcol'].nunique()

# # Initialize a dictionary to store the normalized number of unique users for each item within each segment
# unique_users_per_item_per_segment = {}

# # Iterate over sorted segments
# for segment in sorted_segs:
#     # Filter the data to include only rows where 'segment' is equal to the current segment and remove any 'NONE' items
#     item_seg_data = data[(data['segment'] == segment) & (data['item'] != 'NONE')]
    
#     # Calculate the number of unique users for each item
#     unique_users_per_item = item_seg_data.groupby('item')['idcol'].nunique().sort_values(ascending=False)
    
#     # Normalize the number of unique users for each item by the total number of unique users in the segment
#     normalized_unique_users_per_item = unique_users_per_item / unique_users_per_segment[segment]
    
#     # Store the normalized number of unique users for each item within the segment
#     unique_users_per_item_per_segment[segment] = normalized_unique_users_per_item
    
#     # Create a new feature column to store the normalized frequencies for each item within the segment
#     data[f"unique_users_{segment}"] = data['item'].map(unique_users_per_item_per_segment[segment])

# # Display the DataFrame

# Prep Features:

In [21]:
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id,...,second_most_interacted_item,most_interacted,unique_users_segment1,unique_users_segment2,unique_users_segment3,unique_users_segment4,segment1,segment2,segment3,segment4
0,4521,CLICK,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,1,0,...,,INSURE,0.106952,0.087759,0.034595,0.018314,0.042873,0.037911,0.012892,0.00777
1,4521,CHECKOUT,2023-02-05,IBAB,INSURE,segment1,B07,Semi Active,2,0,...,,INSURE,0.106952,0.087759,0.034595,0.018314,0.042873,0.037911,0.012892,0.00777
2,14454,CLICK,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,1,1,...,,TRANSACT,0.022209,0.024003,0.032394,0.03178,0.008641,0.010084,0.013896,0.017649
3,14454,CHECKOUT,2023-02-08,CAFM,TRANSACT,segment2,B01,Active,2,1,...,,TRANSACT,0.022209,0.024003,0.032394,0.03178,0.008641,0.010084,0.013896,0.017649
4,15000,CLICK,2023-01-31,CARF,LEND,segment3,B01,Cold Start,1,2,...,,LEND,0.00251,0.008363,0.043662,0.038513,0.00084,0.003926,0.019792,0.023477
5,15000,CHECKOUT,2023-01-31,CARF,LEND,segment3,B01,Cold Start,2,2,...,,LEND,0.00251,0.008363,0.043662,0.038513,0.00084,0.003926,0.019792,0.023477
6,22924,CLICK,2023-02-26,FIWL,INVEST,segment2,B01,Active,1,3,...,,INVEST,0.088072,0.066688,0.035915,0.040399,0.032953,0.027162,0.013589,0.017039
7,22924,CHECKOUT,2023-02-26,FIWL,INVEST,segment2,B01,Active,2,3,...,,INVEST,0.088072,0.066688,0.035915,0.040399,0.032953,0.027162,0.013589,0.017039
8,23484,CHECKOUT,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,2,4,...,,INVEST,0.000709,0.051483,0.020951,0.029087,0.00029,0.020361,0.00792,0.011433
9,23484,CLICK,2023-02-22,CUSS,INVEST,segment2,B01,Cold Start,1,4,...,,INVEST,0.000709,0.051483,0.020951,0.029087,0.00029,0.020361,0.00792,0.011433


In [119]:
# Define the user columns, item columns and interaction columns:
u_cols = ["idcol", "segment", "beh_segment", "active_ind", "most_interacted", "most_interacted_item", "daily_activity_score", "activity_rate"] # TODO Include the checlout-click ratio if useful
# u_cols = ["idcol", "segment", "beh_segment", "active_ind", "avg_daily_freq", "avg_weekly_freq", "avg_monthly_freq"]
# item_cols = ["item_id", "item", "item_type", "most_bought_by_beh_seg", "most_bought_by_seg", "most_clicked_by_beh_seg",
#              "most_clicked_by_seg"]

item_cols = ["item_id", "item_type", "unique_users_segment1", "unique_users_segment2", "unique_users_segment3", "unique_users_segment4"]
interact_cols = ["idcol", "item_id", "interaction_scores"] # I include idcol and item for now, to basically say "this user did this item with this score at this date"


test_item_cols = ["item_id", "item_type", "item"]
# Experiment 4:
# u_cols = ["idcol", "segment", "beh_segment", "active_ind"]
# item_cols = ["item_id", "item", "item_type"]

# I want the interact_cols data in the following format:
# 

user, item, rating = data[u_cols].copy(), data[item_cols].copy(), data[interact_cols].copy()
test_items = data[test_item_cols].copy()
# Accumulate rating data so that, for each unique user-item combination, there is a single row, otherwise the train-test split 
# have shared interactions. Can do an intelligent split, and use LightFM's built in weighting method,
# but the weights matrix generated with that method is the exact same, even if I do the below:
# This also allows us to incorporate the number of times that a user has interacted with an item as interaction weighting, implicitly
rating = rating.groupby(['idcol', 'item_id'], as_index=False)['interaction_scores'].sum()


# Drop duplicates, because I only need the unique items' and users' features:
item = item.drop_duplicates()
# item = item.drop(columns=["item"], inplace=True)
item = item.reset_index(drop=True)

test_items = test_items.drop_duplicates()
test_items = test_items.reset_index(drop=True)

user = user.drop_duplicates()
user = user.reset_index(drop=True)

print(rating.shape)
print(item.shape)
print(user.shape)



# Clamp the interaction_scores to a maximum of 5
# rating['interaction_scores'] = rating['interaction_scores'].clip(upper=5)
display(item.head(20))

(93712, 3)
(103, 6)
(42606, 8)


Unnamed: 0,item_id,item_type,unique_users_segment1,unique_users_segment2,unique_users_segment3,unique_users_segment4
0,0,INSURE,0.106952,0.087759,0.034595,0.018314
1,1,TRANSACT,0.022209,0.024003,0.032394,0.03178
2,2,LEND,0.00251,0.008363,0.043662,0.038513
3,3,INVEST,0.088072,0.066688,0.035915,0.040399
4,4,INVEST,0.000709,0.051483,0.020951,0.029087
5,5,LIFESTYLE,0.068973,0.061366,0.012588,0.011312
6,6,INSURE,0.072247,0.06028,0.045863,0.03582
7,7,LEND,0.164466,0.115564,0.121039,0.030434
8,8,LEND,0.012223,0.009123,0.047095,0.04767
9,9,CONNECT,0.068646,0.037906,0.024296,0.000539


In [120]:
user.head(20)

Unnamed: 0,idcol,segment,beh_segment,active_ind,most_interacted,most_interacted_item,daily_activity_score,activity_rate
0,4521,segment1,B07,Semi Active,INSURE,IBAB,2.0,0.011364
1,14454,segment2,B01,Active,TRANSACT,CAFM,2.0,0.011364
2,15000,segment3,B01,Cold Start,LEND,CARF,2.0,0.011364
3,22924,segment2,B01,Active,INVEST,FIWL,2.0,0.011364
4,23484,segment2,B01,Cold Start,INVEST,CUSS,2.0,0.011364
5,24982,segment1,B08,Cold Start,LIFESTYLE,EBSH,2.0,0.011364
6,25577,segment3,B01,Semi Active,INSURE,FILS,2.0,0.011364
7,27824,segment1,B08,Active,LEND,CTLN,2.0,0.011364
8,28951,segment1,B07,Semi Active,LEND,CTLN,2.375,0.090909
9,29630,segment4,B01,Semi Active,TRANSACT,CCCS,1.5,0.022727


In [121]:
item.head(20)

Unnamed: 0,item_id,item_type,unique_users_segment1,unique_users_segment2,unique_users_segment3,unique_users_segment4
0,0,INSURE,0.106952,0.087759,0.034595,0.018314
1,1,TRANSACT,0.022209,0.024003,0.032394,0.03178
2,2,LEND,0.00251,0.008363,0.043662,0.038513
3,3,INVEST,0.088072,0.066688,0.035915,0.040399
4,4,INVEST,0.000709,0.051483,0.020951,0.029087
5,5,LIFESTYLE,0.068973,0.061366,0.012588,0.011312
6,6,INSURE,0.072247,0.06028,0.045863,0.03582
7,7,LEND,0.164466,0.115564,0.121039,0.030434
8,8,LEND,0.012223,0.009123,0.047095,0.04767
9,9,CONNECT,0.068646,0.037906,0.024296,0.000539


In [122]:
rating.head(20)

Unnamed: 0,idcol,item_id,interaction_scores
0,4521,0,3
1,14454,1,3
2,15000,2,3
3,22924,3,3
4,23484,4,3
5,24982,5,3
6,25577,6,3
7,27824,7,3
8,28951,7,9
9,28951,8,3


In [123]:
# Normalise the interaction scores:
# Group by idcol and calculate the sum of interaction scores for each user
sum_interaction_scores = rating.groupby('idcol')['interaction_scores'].sum()

# Merge the sum of interaction scores back to the original DataFrame based on idcol
rating = rating.merge(sum_interaction_scores.reset_index(), on='idcol', suffixes=('', '_sum'))

# Calculate normalized interaction scores by dividing each interaction score by the sum
rating['normalized_interaction_score'] = rating['interaction_scores'] / rating['interaction_scores_sum']

# Replace NaN values with 0 in the normalized interaction scores column
rating['normalized_interaction_score'].fillna(0, inplace=True)

# Drop the temporary sum column
rating.drop(columns=['interaction_scores_sum'], inplace=True)

# Drop the unnormalised column
rating.drop(columns=['interaction_scores'], inplace=True)

# rename
rating.rename(columns={'normalized_interaction_score': 'interaction_scores'}, inplace=True)

# Display the DataFrame with the normalized interaction scores
rating.head(20)


Unnamed: 0,idcol,item_id,interaction_scores
0,4521,0,1.0
1,14454,1,1.0
2,15000,2,1.0
3,22924,3,1.0
4,23484,4,1.0
5,24982,5,1.0
6,25577,6,1.0
7,27824,7,1.0
8,28951,7,0.333333
9,28951,8,0.111111


In [124]:
# # Normalise the user averages:
# # Function to normalize a column
# def normalize_column(df, column_name):
#     min_value = df[column_name].min()
#     max_value = df[column_name].max()
#     df[column_name] = (df[column_name] - min_value) / (max_value - min_value)
#     return df

# # Normalize the numerical columns
# numerical_columns = ['avg_daily_freq', 'avg_weekly_freq', 'avg_monthly_freq']

# for column in numerical_columns:
#     user = normalize_column(user, column)


## User Features Data preparation

In [125]:
user.head()


Unnamed: 0,idcol,segment,beh_segment,active_ind,most_interacted,most_interacted_item,daily_activity_score,activity_rate
0,4521,segment1,B07,Semi Active,INSURE,IBAB,2.0,0.011364
1,14454,segment2,B01,Active,TRANSACT,CAFM,2.0,0.011364
2,15000,segment3,B01,Cold Start,LEND,CARF,2.0,0.011364
3,22924,segment2,B01,Active,INVEST,FIWL,2.0,0.011364
4,23484,segment2,B01,Cold Start,INVEST,CUSS,2.0,0.011364


In [126]:
# Normalise the user feaetures column wise:
from sklearn.preprocessing import MinMaxScaler
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Select the columns to normalize
columns_to_normalize = ['daily_activity_score', 'activity_rate']

# Apply the scaler to the selected columns
user[columns_to_normalize] = scaler.fit_transform(user[columns_to_normalize])

# Display the result
user.head(20)

Unnamed: 0,idcol,segment,beh_segment,active_ind,most_interacted,most_interacted_item,daily_activity_score,activity_rate
0,4521,segment1,B07,Semi Active,INSURE,IBAB,0.015625,0.0
1,14454,segment2,B01,Active,TRANSACT,CAFM,0.015625,0.0
2,15000,segment3,B01,Cold Start,LEND,CARF,0.015625,0.0
3,22924,segment2,B01,Active,INVEST,FIWL,0.015625,0.0
4,23484,segment2,B01,Cold Start,INVEST,CUSS,0.015625,0.0
5,24982,segment1,B08,Cold Start,LIFESTYLE,EBSH,0.015625,0.0
6,25577,segment3,B01,Semi Active,INSURE,FILS,0.015625,0.0
7,27824,segment1,B08,Active,LEND,CTLN,0.015625,0.0
8,28951,segment1,B07,Semi Active,LEND,CTLN,0.021484,0.145833
9,29630,segment4,B01,Semi Active,TRANSACT,CCCS,0.007812,0.020833


In [127]:
user_train = pd.get_dummies(user,dtype = int, prefix="", prefix_sep="")
user_features_col = user_train.drop(columns =['idcol']).columns.values
user_feat = user_train.drop(columns =['idcol']).to_dict(orient='records')

user_train = user_train.sort_values(by='idcol', ascending=True)
# print(user_feat)
# user.shape
# user.shape
user_train.head(20)
# print(user_features_col)


Unnamed: 0,idcol,daily_activity_score,activity_rate,segment1,segment2,segment3,segment4,B01,B02,B03,...,KYCA,MMMC,MMSM,NACS,NAFW,NASD,NATR,SEVP,WHCR,XCFL
0,4521,0.015625,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,14454,0.015625,0.0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,15000,0.015625,0.0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22924,0.015625,0.0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,23484,0.015625,0.0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,24982,0.015625,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,25577,0.015625,0.0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,27824,0.015625,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,28951,0.021484,0.145833,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,29630,0.007812,0.020833,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
gog = data.drop_duplicates()

# # Put a value of 0.5 at the second most interacted item:

# Create a dictionary mapping idcol to second_most_interacted_item
second_most_interacted_dict = gog.set_index('idcol')['second_most_interacted_item'].dropna().to_dict()

# Update user_train DataFrame using the dictionary
for idcol, itm in second_most_interacted_dict.items():
    user_train.loc[user_train['idcol'] == idcol, itm] = 0.5

# Display the result
# print(user_train)

user_train.head()

Unnamed: 0,idcol,daily_activity_score,activity_rate,segment1,segment2,segment3,segment4,B01,B02,B03,...,KYCA,MMMC,MMSM,NACS,NAFW,NASD,NATR,SEVP,WHCR,XCFL
0,4521,0.015625,0.0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14454,0.015625,0.0,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15000,0.015625,0.0,0,0,1,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,22924,0.015625,0.0,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,23484,0.015625,0.0,0,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Item Features Data prep:

In [129]:
# item.head()

item_features = pd.get_dummies(item, dtype = int, prefix="", prefix_sep="")
# item_features["idcol"] = data["idcol"]
item_features_col = item_features.drop(columns=['item_id']).columns.values


# Need some for of identification for the item features
# item_features["idcol"] = data["idcol"]


item_features.fillna(value = 0, inplace=True)
# item_features.shape
# print(item_feat[0])
# item.head()
# print(item_features.iloc[0,:])
# print(item_features_col)
item_features.head()

nan_columns = item_features.columns[item_features.isna().any()].tolist()

item_feat = item_features.drop(columns =['item_id']).to_dict(orient='records')

item_features.head()

# print()

Unnamed: 0,item_id,unique_users_segment1,unique_users_segment2,unique_users_segment3,unique_users_segment4,CONNECT,INSURE,INVEST,LEND,LIFESTYLE,TRANSACT
0,0,0.106952,0.087759,0.034595,0.018314,0,1,0,0,0,0
1,1,0.022209,0.024003,0.032394,0.03178,0,0,0,0,0,1
2,2,0.00251,0.008363,0.043662,0.038513,0,0,0,1,0,0
3,3,0.088072,0.066688,0.035915,0.040399,0,0,1,0,0,0
4,4,0.000709,0.051483,0.020951,0.029087,0,0,1,0,0,0


## Fit into LightFM Dataset

In [130]:
dataset = Dataset()
users=[x for x in user_train['idcol']]

items=[x for x in item['item_id']]


dataset.fit(users=users, items=items, user_features=user_features_col)

# num_users, num_items = dataset.interactions_shape()
# print('Num users: {}, num_items {}.'.format(num_users, num_items))


## Build Item Features to be fitted into model

In [131]:
# 
# item_features = dataset.build_item_features(((x,y) for x,y in zip(item_features['item_id'],item_feat)), normalize=False)
# item_features = dataset.build_item_features((x,item_features_col) for x in item_features['item_id'])
# for (x,y) in zip(item_features['item_id'],item_feat):
#     print(x)
#     print(y)
#     break

In [132]:
print(item_features.shape)
# user_train.info()
# user.head()

(103, 11)


## Build User Features to be fit into model

In [133]:
print(user_feat[0])
user_features = dataset.build_user_features(((x,y) for x,y in zip(user_train['idcol'],user_feat)), normalize=False)

# user_features = dataset.build_user_features(((x,user_features_col) for x in user_train['idcol']))

# user_features_col

{'daily_activity_score': 0.015625, 'activity_rate': 0.0, 'segment1': 1, 'segment2': 0, 'segment3': 0, 'segment4': 0, 'B01': 0, 'B02': 0, 'B03': 0, 'B04': 0, 'B05': 0, 'B06': 0, 'B07': 1, 'B08': 0, 'B09': 0, 'B10': 0, 'B11': 0, 'B12': 0, 'B13': 0, 'B14': 0, 'B15': 0, 'B16': 0, 'B17': 0, 'B18': 0, 'B19': 0, 'B20': 0, 'B21': 0, 'B22': 0, 'B23': 0, 'B24': 0, 'B25': 0, 'B26': 0, 'B27': 0, 'B28': 0, 'B29': 0, 'B30': 0, 'B31': 0, 'B32': 0, 'B33': 0, 'B34': 0, 'B35': 0, 'B36': 0, 'B37': 0, 'B38': 0, 'B39': 0, 'B40': 0, 'B41': 0, 'B42': 0, 'B44': 0, 'B46': 0, 'B47': 0, 'B48': 0, 'B49': 0, 'B50': 0, 'Active': 0, 'Cold Start': 0, 'Semi Active': 1, 'CONNECT': 0, 'INSURE': 1, 'INVEST': 0, 'LEND': 0, 'LIFESTYLE': 0, 'TRANSACT': 0, 'CABC': 0, 'CACU': 0, 'CAFB': 0, 'CAFI': 0, 'CAFM': 0, 'CAFS': 0, 'CAFU': 0, 'CALI': 0, 'CANL': 0, 'CAPO': 0, 'CARE': 0, 'CARF': 0, 'CASD': 0, 'CASV': 0, 'CBCC': 0, 'CBDS': 0, 'CBEL': 0, 'CBLT': 0, 'CBPA': 0, 'CBPB': 0, 'CBTMT': 0, 'CBTUD': 0, 'CBTULS': 0, 'CBVC': 0, 'CCAI

In [134]:
print(user_features.shape)

(42606, 42772)


## Build interactions (user — item) and its respective weights (in this case our custom weights - 0, 1, 2)

In [135]:
# from sklearn.model_selection import train_test_split


# # We split the data into train and test by taking 20% of interactions for each user and moving that to the test set, i.e the training set will contain 80% of the items
# # that the user interacted with

# # Custom train-test split: Split the data into train and test before building interactions:
# train_interactions = pd.DataFrame()
# test_interaction = pd.DataFrame()

# for user_id, group in rating.groupby('idcol'):
#     if len(group) == 1:
#         train_interactions = pd.concat([train_interactions, group])
#     else:

#         train_group, test_group = train_test_split(group, test_size=0.2, train_size=0.8, random_state=42)
#         train_interactions = pd.concat([train_interactions, train_group])
#         test_interaction = pd.concat([test_interaction, test_group])





In [136]:
# print(train_interactions.shape)
# print(test_interaction.shape)

rating.head(10)

Unnamed: 0,idcol,item_id,interaction_scores
0,4521,0,1.0
1,14454,1,1.0
2,15000,2,1.0
3,22924,3,1.0
4,23484,4,1.0
5,24982,5,1.0
6,25577,6,1.0
7,27824,7,1.0
8,28951,7,0.333333
9,28951,8,0.111111


In [137]:
rating = rating.sort_values(by="idcol")

(interactions, weights) = dataset.build_interactions((x, y, w) for x,y,w in zip(rating['idcol'], rating['item_id'], rating["interaction_scores"]))
# (interactions, weights) = dataset.build_interactions((x, y) for x,y in zip(rating['idcol'], rating['item_id']))



# With custom train/test splitting:
# train, train_w = dataset.build_interactions((x, y) for x,y in zip(train_interactions['idcol'], train_interactions['item_id']))

# test, test_w = dataset.build_interactions((x, y) for x,y in zip(test_interaction['idcol'], test_interaction['item_id']))


# print(weights.todense())
rating.head(10)

# print(weights.todense()[:,0].sum())

Unnamed: 0,idcol,item_id,interaction_scores
0,4521,0,1.0
1,14454,1,1.0
2,15000,2,1.0
3,22924,3,1.0
4,23484,4,1.0
5,24982,5,1.0
6,25577,6,1.0
7,27824,7,1.0
15,28951,14,0.074074
14,28951,13,0.074074


# Model Training

## Train Test Split

In [138]:
# ORIGINAL:
train, test = random_train_test_split(interactions,test_percentage=0.2, random_state=42)
train_w, test_w = random_train_test_split(weights, test_percentage=0.2, random_state=42)


print(train.shape)
print(test.shape)
# Make a custom train-test split that uses either a_ the last 20% of interactions by date, or a random 20% of interactions for the test split
# This ensures that there are no cold start users in the testing set. We will do cold-start testing in a different manner

# SPlit the data from original data and then do all the data processing steps for each step separately. This ensures that the data doesn't bleed over into the test set.


(42606, 103)
(42606, 103)


## Model

In [139]:
# Best hyperparameters:  {'no_components': 45, 'learning_rate': 0.09949391010649568, 'k': 19.29548285586018, 'n': 10.515335810044794}
# Other best: {'no_components': 50, 'learning_rate': 0.08062443053534539, 'k': 9.583359248210815, 'n': 5.4809279704140055}.
no_components = 45
loss = 'warp'
epoch = 20
num_thread = 8
learning_rate = 0.05
max_sampled = 10
# n = 5.4809279704140055
# k = 9.583359248210815
model = LightFM(no_components= no_components, loss=loss, random_state = 42, learning_rate=learning_rate, max_sampled=max_sampled)
# model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)

# Experiemt 7
# Training without user features
model.fit(train,  user_features= user_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)

# Pure CF
# model.fit(train, epochs=epoch,num_threads = num_thread)


<lightfm.lightfm.LightFM at 0x75524f6ac040>

## Model Evaluation

In [140]:
k=5

train_precision = precision_at_k(model, train, k=k, user_features=user_features, num_threads=num_thread).mean()
test_precision = precision_at_k(model, test,train_interactions=train, k=k, user_features=user_features, num_threads=num_thread).mean()

train_recall = recall_at_k(model, train, k=k, user_features=user_features, num_threads=num_thread).mean()
test_recall = recall_at_k(model, test,train_interactions=train, k=k, user_features=user_features, num_threads=num_thread).mean()

train_auc = auc_score(model, train, user_features=user_features, num_threads=num_thread).mean()
test_auc = auc_score(model, test, train_interactions=train, user_features=user_features, num_threads=num_thread).mean()


test_rr = reciprocal_rank(model, test, train_interactions=train, user_features=user_features).mean()
train_rr = reciprocal_rank(model, train, user_features=user_features).mean()
# No features:
# train_precision = precision_at_k(model, train, k=k, num_threads=num_thread).mean()
# test_precision = precision_at_k(model, test,train_interactions=train, k=k, num_threads=num_thread).mean()

# train_recall = recall_at_k(model, train, k=k, num_threads=num_thread).mean()
# test_recall = recall_at_k(model, test,train_interactions=train, k=k, num_threads=num_thread).mean()

# train_auc = auc_score(model, train, num_threads=num_thread).mean()
# test_auc = auc_score(model, test, train_interactions=train, num_threads=num_thread).mean()

# test_rr = reciprocal_rank(model, test, train_interactions=train).mean()
# train_rr = reciprocal_rank(model, train).mean()

print('Precision: train %.4f' % (train_precision))
print('Precision: test %.4f' % (test_precision))

print('Recall: train %.4f' % (train_recall))
print('Recall: test %.4f' % (test_recall))

print('AUC: train %.4f' % (train_auc))
print('AUC: test %.4f' % (test_auc))

print('RR: train %.4f' % (train_rr))
print('RR: test %.4f' % (test_rr))

# best sofar = 0.18758663535118103
# Precision: train 0.2957
# Precision: test 0.1881
# Recall: train 0.9053
# Recall: test 0.8069
# AUC: train 0.9869
# AUC: test 0.9553

Precision: train 0.3664
Precision: test 0.1529
Recall: train 0.9779
Recall: test 0.6516
AUC: train 0.9975
AUC: test 0.9101
RR: train 0.9695
RR: test 0.6628


In [None]:
print(test_precision)

# Automated Hyperparameter Optimisation

In [None]:
# import optuna
# from lightfm import LightFM
# from lightfm.datasets import fetch_movielens
# from lightfm.evaluation import auc_score

# # Fetch the dataset

# def objective(trial):

#     # Best hyperparameters:  {'no_components': 45, 'learning_rate': 0.09949391010649568, 'k': 19.29548285586018, 'n': 10.515335810044794}
#     #  Other best: {'no_components': 50, 'learning_rate': 0.08062443053534539, 'k': 9.583359248210815, 'n': 5.4809279704140055}.
#     # Define the hyperparameters to be tuned
#     no_components = trial.suggest_int('no_components', 10, 50)
#     learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-1)
#     # item_alpha = trial.suggest_loguniform('item_alpha', 1e-6, 1e-1)
#     # user_alpha = trial.suggest_loguniform('user_alpha', 1e-6, 1e-1)
#     k = trial.suggest_loguniform('k', 5, 25)
#     n = trial.suggest_loguniform('n', 5, 25)
    
#     # Create the LightFM model
#     model = LightFM(
#         loss='warp',
#         no_components=no_components,
#         learning_rate=learning_rate,
#         # item_alpha=item_alpha,
#         # user_alpha=user_alpha,
#         k=k,
#         n=n
#     )
#     model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)
    
#     # Evaluate the model
#     pak = precision_at_k(model, test,train_interactions=train, k=5,item_features=item_features, user_features=user_features, num_threads=num_thread).mean()
    
#     return pak

# # Run the optimization
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=400)

# print('Best hyperparameters: ', study.best_params)
# np.save("BestParams.npy", study.best_params)
# print('Best precision@k=5: ', study.best_value)


# Predictions

In [87]:
target_idcol = 77196041
selected_user = data[data["active_ind"]=="Active"].sample(n=1)
target_idcol = selected_user["idcol"].iloc[0]
# target_idcol = 155531648
print(target_idcol)
# Rank the items based on interactions
# Assign scores to interaction types
# interaction_scores = {'DISPLAY': 0, 'CLICK': 1, 'CHECKOUT': 2}

predict_user = data[data["idcol"] == target_idcol]

predict_user.head()

# Map interaction types to scores
predict_user['interaction_score'] = predict_user['interaction'].map(interaction_scores)

# Rank items based on scores
predict_user['item_rank'] = predict_user.groupby('idcol')['interaction_score'].rank(method='max', ascending=False)

# Sort dataframe by item rank
predict_user = predict_user.sort_values(by='item_rank')

predict_user.head(-1)

true_items = predict_user['item'].tolist()
true_items = list(set(true_items))


predict_user[u_cols].head(10)
# Sit langs mekaar:
# model recommendation, user se eie actual interactions, all items popularity over entire dataset, all item popularity for user segment, all item popularity for beh_segment

131166853


Unnamed: 0,idcol,segment,beh_segment,active_ind,most_interacted,most_interacted_item,daily_activity_score,activity_rate
118953,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091
118952,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091
118945,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091
118935,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091
118933,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091
118940,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091
118932,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091
118929,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091
118926,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091
118941,131166853,segment2,B18,Active,INSURE,EBEM,2.142857,0.159091


In [88]:



map = dataset._user_id_mapping
index = map[target_idcol]
print(index)

scores = model.predict(index, np.arange(103), user_features=user_features)

# scores = model.predict(index, np.arange(103))
# scores = model.predict(index, np.arange(103))
# print(scores)
# print(user.iloc[index,:])
top_items = test_items.iloc[np.argsort(-scores)]
# print(scores)
# top_items.head()
# # print(item.shape)
# # print(top_items)
# known_positives = item.iloc[interactions.tocsr()[index].indices]

# top_items[0:10]
recommended_list = top_items['item'].tolist()
recommended_list = list(set(recommended_list))

recommended_list_types = top_items['item_type'].tolist()
recommended_list_types = list(set(recommended_list_types))

print(true_items)
print(recommended_list_types)

# def precision_at_k(recommended_list, actual_list, k=10):
#     # Get the intersection of the recommended list and the actual list up to k
#     intersection = set(recommended_list[:k]) & set(actual_list)
    
#     # Calculate precision@k
#     precision = len(intersection) / k
    
#     return precision

# # Calculate precision@k=10
# precision = precision_at_k(recommended_list, true_items, k=5)
# print("Precision@k=10:", precision)
data.head()
print(recommended_list.index("CTLN"))

22727
['CBPA', 'CABC', 'NAFW', 'IBAC', 'FILS', 'IBAA', 'IPRA', 'IBAB', 'EBEM', 'MMMC', 'NATR']
['INSURE', 'LEND', 'INVEST', 'TRANSACT', 'CONNECT', 'LIFESTYLE']
33


In [89]:
# # Rank items over the entire dataset
# # Group by item and sum the interaction scores
# item_scores = data.groupby('item')['interaction_scores'].sum()

# # Sort items by cumulative interaction scores in descending order
# sorted_items = item_scores.sort_values(ascending=False)

# # Return list of unique items sorted by their cumulative interaction scores
# ranked_items = sorted_items.index.tolist()

# print(ranked_items)

# Rank items over the entire dataset based on the number of unique users

# Group by item and count the number of unique users
unique_user_counts = data.groupby('item')['idcol'].nunique().reset_index(name='unique_user_count')

# Sort the items based on the count of unique users in descending order
unique_user_counts = unique_user_counts.sort_values(by='unique_user_count', ascending=False)

# Display the result
ranked_items = unique_user_counts["item"].tolist()
print(ranked_items)

['CTLN', 'IBAA', 'IBAB', 'FIHC', 'FIWL', 'CUPL', 'FILS', 'IBAC', 'CACU', 'CBPA', 'EBSH', 'CBLT', 'CBPB', 'NATR', 'CBEL', 'FICQ', 'CBVC', 'MMMC', 'NAFW', 'EBEM', 'IPRA', 'XCFL', 'NACS', 'CUPX', 'IBAM', 'MMSM', 'EBWP', 'IPTF', 'CASD', 'EBQF', 'CABC', 'IBIC', 'NASD', 'CUHS', 'CAFM', 'CSPL', 'EBET', 'CCLI', 'EBKA', 'CUSS', 'CBTULS', 'CCNC', 'CARF', 'CCAI', 'EBSB', 'EBTV', 'CBCC', 'EVCU', 'FLIS', 'CUSZ', 'CCCU', 'IBDP', 'CAFI', 'IBPP', 'IBGC', 'CASV', 'HLGG', 'EBSP', 'CARE', 'IPSG', 'EVGW', 'GASS', 'FIFS', 'EBIB', 'CCCS', 'ISBCU', 'CUSI', 'EBGA', 'CBTUD', 'SEVP', 'CUSB', 'CAFS', 'CAFU', 'CBTMT', 'CAPO', 'CBDS', 'EBUD', 'CCAN', 'HLGH', 'CALI', 'CAFB', 'EBXM', 'EVAP', 'IBDL', 'EBPD', 'EBBF', 'EBSL', 'DOAA', 'IPFD', 'KYCA', 'FHIS', 'HLGE', 'EBGM', 'GAFC', 'CANL', 'EBQB', 'WHCR', 'DOSW', 'FIWR', 'IPFN', 'IPST', 'IPMX', 'IPSD']


In [90]:
# # Rank items over the active user's segment
# segment_of_interest = predict_user["segment"].iloc[0]
# segment_df = data[data['segment'] == segment_of_interest]
# item_scores = segment_df.groupby('item')['interaction_scores'].sum()

# # Sort items by cumulative interaction scores in descending order
# sorted_items = item_scores.sort_values(ascending=False)

# # Return list of unique items sorted by their cumulative interaction scores
# ranked_items_bysegment = sorted_items.index.tolist()

# print(ranked_items_bysegment)

segment_of_interest = predict_user["segment"].iloc[0]
segment_df = data[data['segment'] == segment_of_interest]

unique_user_counts = segment_df.groupby('item')['idcol'].nunique().reset_index(name='unique_user_count')

# Sort the items based on the count of unique users in descending order
unique_user_counts = unique_user_counts.sort_values(by='unique_user_count', ascending=False)

# Display the result
ranked_items_bysegment = unique_user_counts["item"].tolist()
print(ranked_items_bysegment)


['CTLN', 'IBAA', 'IPRA', 'IBAB', 'FIHC', 'IPTF', 'FIWL', 'CUPL', 'CACU', 'CUHS', 'EBSH', 'FILS', 'CUSS', 'IBAC', 'NATR', 'CUSZ', 'CBPA', 'NAFW', 'XCFL', 'CUPX', 'CBPB', 'MMMC', 'CBVC', 'NACS', 'IBAM', 'NASD', 'FICQ', 'IBIC', 'CBEL', 'CBLT', 'CABC', 'CSPL', 'MMSM', 'CASD', 'CAFM', 'EBET', 'CCNC', 'EBEM', 'EBTV', 'CBCC', 'FLIS', 'CBTULS', 'HLGG', 'EBSB', 'EBSP', 'EVCU', 'CCCU', 'CBTMT', 'IPSG', 'IBPP', 'CASV', 'ISBCU', 'CCCS', 'CCLI', 'CARF', 'FIFS', 'IBGC', 'IBDP', 'EBIB', 'EBQF', 'EVGW', 'CBDS', 'EBGA', 'CAFS', 'CCAI', 'SEVP', 'GASS', 'EBUD', 'EBXM', 'CAFI', 'CAFU', 'CBTUD', 'CALI', 'CARE', 'CAPO', 'EVAP', 'EBWP', 'CCAN', 'IPFD', 'CAFB', 'DOAA', 'KYCA', 'IBDL', 'EBBF', 'EBQB', 'CANL', 'HLGE', 'EBPD', 'FHIS', 'FIWR', 'DOSW', 'GAFC', 'EBGM', 'IPFN', 'EBKA', 'IPST']


In [91]:
# # Rank items over the active user's beh_segment
# beh_segment_of_interest = predict_user["beh_segment"].iloc[0]
# segment_df = data[data['beh_segment'] == beh_segment_of_interest]
# item_scores = segment_df.groupby('item')['interaction_scores'].sum()

# # Sort items by cumulative interaction scores in descending order
# sorted_items = item_scores.sort_values(ascending=False)

# # Return list of unique items sorted by their cumulative interaction scores
# ranked_items_bybehsegment = sorted_items.index.tolist()

# print(ranked_items_bybehsegment)

segment_of_interest = predict_user["beh_segment"].iloc[0]
segment_df = data[data['beh_segment'] == segment_of_interest]

unique_user_counts = segment_df.groupby('item')['idcol'].nunique().reset_index(name='unique_user_count')

# Sort the items based on the count of unique users in descending order
unique_user_counts = unique_user_counts.sort_values(by='unique_user_count', ascending=False)

# Display the result
ranked_items_bybehsegment = unique_user_counts["item"].tolist()
print(ranked_items_bybehsegment)

['CTLN', 'IBAA', 'IBAB', 'FIHC', 'NATR', 'IBAC', 'XCFL', 'FILS', 'FIWL', 'NAFW', 'CBPA', 'CACU', 'CUPL', 'CBEL', 'CBLT', 'FICQ', 'CBPB', 'IPRA', 'NASD', 'EBSH', 'MMMC', 'CBVC', 'IPTF', 'MMSM', 'NACS', 'CUPX', 'IBAM', 'CABC', 'IBIC', 'CUHS', 'EBEM', 'CSPL', 'CUSS', 'CASD', 'EBET', 'EBQF', 'CBTULS', 'EVCU', 'CBCC', 'CAFM', 'EBWP', 'CCNC', 'CUSZ', 'EBTV', 'FLIS', 'CCLI', 'EBSP', 'CASV', 'IPSG', 'HLGG', 'EVGW', 'IBDP', 'CARF', 'ISBCU', 'CBTUD', 'FIFS', 'CBTMT', 'GASS', 'CCAI', 'IBPP', 'IBGC', 'CUSI', 'EBIB', 'SEVP', 'CBDS', 'CAPO', 'CAFI', 'CUSB', 'EBGA', 'CCCS', 'EBKA', 'EBSB', 'CCCU', 'CARE', 'CAFS', 'CCAN', 'IBDL', 'EVAP', 'EBUD', 'EBXM', 'EBBF', 'CAFB', 'CALI', 'IPFD', 'CAFU', 'KYCA', 'HLGH', 'GAFC', 'EBGM', 'FHIS', 'DOSW', 'EBPD', 'WHCR', 'DOAA', 'CANL', 'FIWR', 'EBSL', 'IPFN', 'HLGE', 'IPST', 'EBQB', 'IPMX']


In [94]:
# Show the true items for the 2 next most similar users
# First, lets find the two most similar users:
def similar_users(user_id, model, N=10, norm = True):
    user_bias ,user_representations = model.get_user_representations(features= user_features)
    # user_bias ,user_representations = model.get_user_representations()

    # Cosine similarity
    scores = user_representations.dot(user_representations[user_id, :])
    item_norms = np.linalg.norm(user_representations, axis=1)
    
    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] / item_norms[user_id]), 
                    key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]), 
                    key=lambda x: -x[1])
    return similar
    
# map = dataset._user_id_mapping
# index = map[77196041]
similar_item_list = similar_users(index,model, N = 3, norm=True)
display(similar_item_list)
similar_idx = [x[0] for x in similar_item_list]
filtered_data = user.loc[similar_idx, :]
filtered_data.head(20)

two_users = filtered_data["idcol"].tolist()[1:]
print(two_users)

next_two_users_items = []
for id in two_users:
    predict_user = data[data["idcol"] == id]
    true = predict_user['item'].tolist()
    true = list(set(true))
    next_two_users_items.append(true)


print(next_two_users_items[1])

# Function to look up item_type by item_id
def get_item_type(item):
    # Use loc to find the row where item_id matches and get the item_type
    item_type = test_items.loc[test_items['item'] == item, 'item_type']
    # Return the item_type if found, otherwise return None
    return item_type.iloc[0] if not item_type.empty else None

itm_tps = []
for x in recommended_list:
    itm_tps.append(get_item_type(x))

true_item_types = []
for x in true_items:
    true_item_types.append(get_item_type(x))

ranked_tps = []
for x in ranked_items:
    ranked_tps.append(get_item_type(x))

rank_seg_tps = []
for x in ranked_items_bysegment:
    rank_seg_tps.append(get_item_type(x))

rank_bseg_tps = []
for x in ranked_items_bybehsegment:
    rank_bseg_tps.append(get_item_type(x))

[(22727, 3.6198), (25863, 2.7305136), (32996, 2.678877)]

[182718087, 355213039]
['IBAM', 'CABC', 'IBAA', 'EBEM', 'CTLN', 'MMMC', 'NATR']


In [95]:
# Combine all those into a single dataframe so I can see them side by side
num_item = len(true_items)
max_length = 30
# print(num_item)


recdata = {
    'True Items': true_items + [None] * (max_length - num_item),
    'typs': true_item_types + [None] * (max_length - len(true_item_types)),
    'User2' : next_two_users_items[0] + [None] * (max_length - len(next_two_users_items[0])),
    'User3' : next_two_users_items[1] + [None] * (max_length - len(next_two_users_items[1])),
    'Recommended Items': recommended_list[:max_length],
    "Rec itm types": itm_tps[:max_length],
    'Most Popular Tot': ranked_items[:max_length],
    "pop types": ranked_tps[:max_length],
    'Most Popular Seg': ranked_items_bysegment[:max_length],
    "popseg types": rank_seg_tps[:max_length],
    'Most Popular BSeg': ranked_items_bybehsegment[:max_length],
    "popbseg types": rank_bseg_tps[:max_length],
}

# Create DataFrame from dictionary
df = pd.DataFrame(recdata)

def listwise_precision_at_k(recommended_list, actual_list, k=10):
    # Get the intersection of the recommended list and the actual list up to k
    intersection = set(recommended_list[:k]) & set(actual_list)
    
    # Calculate precision@k
    precision = len(intersection) / k
    
    return precision

print(listwise_precision_at_k(recommended_list, true_items))

df.head(-1)


# True items for next 2 most similar users
# Recommend items' types

0.0


Unnamed: 0,True Items,typs,User2,User3,Recommended Items,Rec itm types,Most Popular Tot,pop types,Most Popular Seg,popseg types,Most Popular BSeg,popbseg types
0,CBPA,CONNECT,CBPA,IBAM,FHIS,INSURE,CTLN,LEND,CTLN,LEND,CTLN,LEND
1,CABC,INVEST,CBPB,CABC,CBVC,CONNECT,IBAA,INSURE,IBAA,INSURE,IBAA,INSURE
2,NAFW,INSURE,CBVC,IBAA,CUSS,INVEST,IBAB,INSURE,IPRA,INVEST,IBAB,INSURE
3,IBAC,INSURE,CUSZ,EBEM,FIHC,INSURE,FIHC,INSURE,IBAB,INSURE,FIHC,INSURE
4,FILS,INSURE,IPRA,CTLN,CAFS,TRANSACT,FIWL,INVEST,FIHC,INSURE,NATR,INSURE
5,IBAA,INSURE,CBEL,MMMC,EBIB,LIFESTYLE,CUPL,LEND,IPTF,INVEST,IBAC,INSURE
6,IPRA,INVEST,CUSS,NATR,CUSB,INVEST,FILS,INSURE,FIWL,INVEST,XCFL,LIFESTYLE
7,IBAB,INSURE,CBLT,,EBWP,LIFESTYLE,IBAC,INSURE,CUPL,LEND,FILS,INSURE
8,EBEM,LIFESTYLE,FIHC,,MMSM,LIFESTYLE,CACU,TRANSACT,CACU,TRANSACT,FIWL,INVEST
9,MMMC,LIFESTYLE,IBAA,,CASV,INVEST,CBPA,CONNECT,CUHS,INVEST,NAFW,INSURE


## Similar Item Calculation using cosine similarity

In [141]:
def similar_items(item_id, model, N=10, norm = True):
    item_bias ,item_representations = model.get_item_representations(features = item_features)

    # Cosine similarity
    scores = item_representations.dot(item_representations[item_id, :])
    item_norms = np.linalg.norm(item_representations, axis=1)

    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]/ item_norms[item_id] ), key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] ), key=lambda x: -x[1])
    return similar


value_to_find = 'CBVC'
value_to_compare = "CBPA"
index = test_items['item'].eq(value_to_find).idxmax()
print(index)
similar_item_list = similar_items(index, model, N=103)

simscores = [x[1] for x in similar_item_list]


similar_idx = [x[0] for x in similar_item_list ]
siitms = test_items.iloc[similar_idx]# Can also add the other

siitms["scores"] = simscores

siitms.head(104)


scores_column_name = "scores"  # Replace this with the actual column name

# Filter the DataFrame based on the condition value_to_compare
filtered_items = siitms[siitms["item"] == value_to_compare]

# Check if any items match the condition
if not filtered_items.empty:
    # Extract the score from the first matching item
    compare_score = filtered_items.iloc[0][scores_column_name]
    print("Comparison score:", compare_score)
else:
    print("No items matching the condition:", value_to_compare)

# print(compare_score)

71


ValueError: dimension mismatch

In [None]:
nan_counts = test_items.isna().sum()
rows_with_nan = test_items[test_items.isna().any(axis=1)]
print(nan_counts)
display(rows_with_nan)

## Similar User Calculation

In [None]:
def similar_users(user_id, model, N=10, norm = True):
    user_bias ,user_representations = model.get_user_representations(features= user_features)

    # Cosine similarity
    scores = user_representations.dot(user_representations[user_id, :])
    item_norms = np.linalg.norm(user_representations, axis=1)
    
    if norm == True:
        scores /= item_norms
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best] / item_norms[user_id]), 
                    key=lambda x: -x[1])
    else:
        best = np.argpartition(scores, -N)[-N:]
        similar = sorted(zip(best, scores[best]), 
                    key=lambda x: -x[1])
    return similar
    
# map = dataset._user_id_mapping
# index = map[77196041]
similar_item_list = similar_users(index,model, N = 3)
print(similar_item_list)
similar_idx = [x[0] for x in similar_item_list]
filtered_data = user.loc[similar_idx, :]
filtered_data.head(20)



In [None]:
# print(u_cols)


# i = 0
# lists = []
# for index, row in filtered_data.iterrows():
#     print(row)
#     break
#     userlst = []
#     pos_idxs = row[row == 1].index.tolist()
#     userlst.append(filtered_data.iloc[i,0])
#     userlst += pos_idxs
    
#     i+=1
#     lists.append(userlst)


# new_df = pd.DataFrame(data = lists, columns = u_cols)
# new_df.head(-1)



## Cold Start Problem

In [None]:
# import random

# from scipy import sparse

# def format_newuser_input(user_feature_map, user_feature_list):
#   num_features = len(user_feature_list)
#   normalised_val = 1.0 
#   target_indices = []
#   for feature in user_feature_list:
#     try:
#         target_indices.append(user_feature_map[feature])
#     except KeyError:
#         print("new user feature encountered '{}'".format(feature))
#         pass

#   new_user_features = np.zeros(len(user_feature_map.keys()))
#   for i in target_indices:
#     new_user_features[i] = normalised_val
#   new_user_features = sparse.csr_matrix(new_user_features)
#   return(new_user_features)

# user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()
# user_feature_list = ["segment4", "B01", "Cold Start"]

# new_user_features = format_newuser_input(user_feature_map, u_cols)
# scores = model.predict(0, np.arange(104), user_features=new_user_features)

# top_items = item.iloc[np.argsort(-scores)]

# top_items.head()

# new_user = pd.DataFrame(np.zeros(len(user_features_col))).T
# new_user.columns = user_features_col
# # print(new_user)

# new_user.head()


# new_user_id = 86000
# new_user['segment4'] = 1
# new_user['B50'] = 1
# new_user['Cold Start'] = 1

# new_user = csr_matrix(new_user)
# scores = model.predict(user_ids = 0,item_ids = np.arange(interactions.shape[1]), user_features=new_user)
# top_items_new_user = item.iloc[np.argsort(-scores)]
# top_items_new_user[0:10]

In [None]:
# Use our isolated user to check our algorithm:
# We predict items for this user, then check the precision at k=5.

selected_user_df.head()
idcol = selected_user_df.iloc[0]["idcol"]
segment = selected_user_df.iloc[0]["segment"]
beh_segment = selected_user_df.iloc[0]["beh_segment"]
active_ind = selected_user_df.iloc[0]["active_ind"]

column_names = u_cols
# Populate the new DataFrame with relevant information from the original DataFrame
new_user_data = {
    'idcol': [idcol],
    'segment': [segment],
    'beh_segment': [beh_segment],
    'active_ind': [active_ind],
    'most_clicked_item': [np.nan],
    'most_bought_item': [np.nan],
    'most_clicked_item_type': [np.nan],
    'most_bought_item_type': [np.nan],
    'daily_activity_score': [np.nan],
    'activity_rate': [np.nan]
}

print(new_user_data)

new_user = pd.DataFrame(new_user_data)

# Function to fill NaN values in the new row
def fill_na_with_mode_or_mean(data, cold_start_user):
    filled_row = cold_start_user.copy()
    
    for column in data.columns:
        if cold_start_user[column].isna().any():
            if data[column].dtype == 'object':  # Categorical data
                mode_value = data[column].mode()[0]
                filled_row[column].fillna(mode_value, inplace=True)
            else:  # Numerical data
                mean_value = data[column].mean()
                filled_row[column].fillna(mean_value, inplace=True)
    
    return filled_row

new_user_completed = fill_na_with_mode_or_mean(user, new_user)
# new_user_sparse = pd.get_dummies(new_user_completed,dtype = int, prefix="", prefix_sep="")
# # new_user_sparse.head()
selected_user_df.head(-1)

In [None]:
new_user_wide = pd.DataFrame(np.zeros(len(user_features_col))).T
new_user_wide.columns = user_features_col
# print(new_user)

# Populate the DataFrame with the provided values
new_user_wide.at[0, 'daily_activity_score'] = new_user_completed.iloc[0]['daily_activity_score']
new_user_wide.at[0, 'activity_rate'] = new_user_completed.iloc[0]['activity_rate']
new_user_wide.at[0, new_user_completed.iloc[0]['segment']] = 1.0  # Assuming 1.0 indicates this segment is active
new_user_wide.at[0, new_user_completed.iloc[0]['beh_segment']] = 1.0  # Assuming 1.0 indicates this segment is active
new_user_wide.at[0, new_user_completed.iloc[0]['active_ind']] = 1.0  # Assuming 1.0 indicates this segment is active
new_user_wide.at[0, new_user_completed.iloc[0]['most_clicked_item']] = 1.0
new_user_wide.at[0, new_user_completed.iloc[0]['most_bought_item']] = 1.0
new_user_wide.at[0, new_user_completed.iloc[0]['most_clicked_item_type']] = 1.0
new_user_wide.at[0, new_user_completed.iloc[0]['most_bought_item_type']] = 1.0
# new_user_wide.at[0, 'INSURE1'] = 1.0  # Populate the first occurrence of 'INSURE' with 1.0
new_user_wide.head()

In [None]:
new_user_csr = csr_matrix(new_user_wide)
# new_user = csr_matrix(new_user)
print(new_user_csr.shape)
scores = model.predict(user_ids = 0,item_ids = np.arange(interactions.shape[1]), user_features=new_user_csr)
top_items_new_user = item.iloc[np.argsort(-scores)]
top_items_new_user[0:20]

In [None]:
selected_user_df.head()

# Rank the items based on interactions
# Assign scores to interaction types
interaction_scores = {'DISPLAY': 0, 'CLICK': 1, 'CHECKOUT': 2}

# Map interaction types to scores
selected_user_df['interaction_score'] = selected_user_df['interaction'].map(interaction_scores)

# Rank items based on scores
selected_user_df['item_rank'] = selected_user_df.groupby('idcol')['interaction_score'].rank(method='min', ascending=False)

# Sort dataframe by item rank
selected_user_df = selected_user_df.sort_values(by='item_rank')

selected_user_df.head(-1)

true_items = selected_user_df['item'].tolist()
true_items = list(set(true_items))

print(true_items)

recommended_list = top_items_new_user['item'].tolist()
recommended_list = list(set(recommended_list))

print(recommended_list)

In [None]:
# def precision_at_k(recommended_list, actual_list, k=10):
#     # Get the intersection of the recommended list and the actual list up to k
#     intersection = set(recommended_list[:k]) & set(actual_list)
    
#     # Calculate precision@k
#     precision = len(intersection) / k
    
#     return precision

# # Calculate precision@k=10
# precision = precision_at_k(recommended_list, true_items, k=10)
# print("Precision@k=10:", precision)