# Imports

In [4]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, auc_score
import numpy as np
from lightfm.cross_validation import random_train_test_split
import os
from scipy.sparse import csr_matrix

# Data Prep

## Load Data

In [5]:
data = pd.read_csv("./fnb_datav2.csv")

# Remove 
data = data.drop(columns = ["page", "tod", "item_descrip"])
data.head(20)

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind
0,755,DISPLAY,17JAN2023,NONE,ALL,segment3,B01,Semi Active
1,4521,DISPLAY,27FEB2023,NONE,ALL,segment1,B07,Semi Active
2,4521,DISPLAY,18FEB2023,NONE,ALL,segment1,B07,Semi Active
3,4521,DISPLAY,30JAN2023,NONE,ALL,segment1,B07,Semi Active
4,4521,CLICK,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active
5,4521,CHECKOUT,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active
6,6145,DISPLAY,26FEB2023,NONE,ALL,segment3,B01,Cold Start
7,6145,DISPLAY,27JAN2023,NONE,ALL,segment3,B01,Cold Start
8,6145,DISPLAY,10FEB2023,NONE,ALL,segment3,B01,Cold Start
9,6145,DISPLAY,10JAN2023,NONE,ALL,segment3,B01,Cold Start


In [140]:
# Checking transactions for a specific ID:
id = 77196041
data[data["idcol"]==id]

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id
155291,77196041,DISPLAY,26FEB2023,NONE,ALL,segment2,B01,Active,0,1
155292,77196041,DISPLAY,16FEB2023,NONE,ALL,segment2,B01,Active,0,1
155293,77196041,DISPLAY,29MAR2023,NONE,ALL,segment2,B01,Active,0,1
155294,77196041,DISPLAY,05JAN2023,NONE,ALL,segment2,B01,Active,0,1
155295,77196041,CLICK,27MAR2023,FICQ,INSURE,segment2,B01,Active,1,41
...,...,...,...,...,...,...,...,...,...,...
155353,77196041,CLICK,27MAR2023,CACU,TRANSACT,segment2,B01,Active,1,51
155354,77196041,CHECKOUT,27MAR2023,CABC,INVEST,segment2,B01,Active,2,62
155355,77196041,CLICK,27MAR2023,CABC,INVEST,segment2,B01,Active,1,62
155356,77196041,CHECKOUT,27MAR2023,SEVP,TRANSACT,segment2,B01,Active,2,76


In [6]:
data.nunique()

idcol          84375
interaction        3
int_date          88
item             104
item_type          7
segment            4
beh_segment       50
active_ind         3
dtype: int64

## Add Features:

In [7]:
# Give scores to the interactions, and then drop the interaction column:L
interaction_scores = {
    'CLICK': 1,
    'CHECKOUT': 2
}

data['interaction_scores'] = data['interaction'].map(interaction_scores).fillna(0).astype(int)

# Add ids for each unique item
data['item_id'] = pd.factorize(data['item'])[0] + 1

data.drop(columns = ["interaction"])
data.head()

Unnamed: 0,idcol,interaction,int_date,item,item_type,segment,beh_segment,active_ind,interaction_scores,item_id
0,755,DISPLAY,17JAN2023,NONE,ALL,segment3,B01,Semi Active,0,1
1,4521,DISPLAY,27FEB2023,NONE,ALL,segment1,B07,Semi Active,0,1
2,4521,DISPLAY,18FEB2023,NONE,ALL,segment1,B07,Semi Active,0,1
3,4521,DISPLAY,30JAN2023,NONE,ALL,segment1,B07,Semi Active,0,1
4,4521,CLICK,05FEB2023,IBAB,INSURE,segment1,B07,Semi Active,1,2


In [8]:
# pivot_table = data.pivot_table(index='idcol', columns='item', aggfunc='size', fill_value=0)

# #  Convert the pivot table back to long format for LightFM
# user_item_interactions = pivot_table.stack().reset_index()
# user_item_interactions.columns = ['idcol', 'item', 'interaction_count']

# # Factorize item to get unique item IDs
# user_item_interactions['item_id'] = pd.factorize(user_item_interactions['item'])[0] + 1


In [111]:
# Define the user columns, item columns and interaction columns:
u_cols = ["idcol", "segment", "beh_segment", "active_ind"]
item_cols = ["item_id", "item", "item_type"] 
interact_cols = ["idcol", "item_id", "interaction_scores"] # I include idcol and item for now, to basically say "this user did this item with this score at this date"

# I want the interact_cols data in the following format:
# 

user, item, rating = data[u_cols].copy(), data[item_cols].copy(), data[interact_cols].copy()

# Accumulate rating data so that, for each unique user-item combination, there is a single row
rating = rating.groupby(['idcol', 'item_id'], as_index=False)['interaction_scores'].sum()


# Drop duplicates, because I only need the unique items' and users' data:
item = item.drop_duplicates()
item = item.reset_index(drop=True)

user = user.drop_duplicates()
user = user.reset_index(drop=True)

print(rating.shape)
print(item.shape)
print(user.shape)


(169646, 3)
(104, 3)
(84375, 4)


## User Features Data preparation

In [128]:
user = pd.get_dummies(user,dtype = int, prefix="", prefix_sep="")
user_features_col = user.drop(columns =['idcol']).columns.values
user_feat = user.drop(columns =['idcol']).to_dict(orient='records')

user = user.sort_values(by='idcol', ascending=True)
user.head()
# user.shape
# print(user.iloc[0,:])


Unnamed: 0,idcol,segment1,segment2,segment3,segment4,B01,B02,B03,B04,B05,...,B44,B45,B46,B47,B48,B49,B50,Active,Cold Start,Semi Active
0,755,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,4521,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,6145,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,7125,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,8469,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Item Features Data prep:

In [113]:
# item.head()

item_features = pd.get_dummies(item, dtype = int, prefix="", prefix_sep="")
# item_features["idcol"] = data["idcol"]
item_features_col = item_features.drop(columns=['item_id']).columns.values
item_feat = item_features.drop(columns =['item_id']).to_dict(orient='records')

# Need some for of identification for the item features
# item_features["idcol"] = data["idcol"]
item_features.head()
# item_features.shape
# print(item_feat[0])
# item.head()
# print(item_features.iloc[0,:])

Unnamed: 0,item_id,CABC,CACU,CAFB,CAFI,CAFM,CAFS,CAFU,CALI,CANL,...,SEVP,WHCR,XCFL,ALL,CONNECT,INSURE,INVEST,LEND,LIFESTYLE,TRANSACT
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


## Fit into LightFM Dataset

In [114]:
dataset = Dataset()
dataset.fit(users=[x for x in user['idcol']], items=[x for x in item['item_id']], item_features=item_features_col, user_features=user_features_col)

num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))


Num users: 84375, num_items 104.


## Build Item Features to be fitted into model

In [115]:
# 
item_features = dataset.build_item_features((x,y) for x,y in zip(item_features['item_id'],item_feat))

In [116]:
print(item_features)

  (0, 0)	0.3333333432674408
  (0, 104)	0.0
  (0, 105)	0.0
  (0, 106)	0.0
  (0, 107)	0.0
  (0, 108)	0.0
  (0, 109)	0.0
  (0, 110)	0.0
  (0, 111)	0.0
  (0, 112)	0.0
  (0, 113)	0.0
  (0, 114)	0.0
  (0, 115)	0.0
  (0, 116)	0.0
  (0, 117)	0.0
  (0, 118)	0.0
  (0, 119)	0.0
  (0, 120)	0.0
  (0, 121)	0.0
  (0, 122)	0.0
  (0, 123)	0.0
  (0, 124)	0.0
  (0, 125)	0.0
  (0, 126)	0.0
  (0, 127)	0.0
  :	:
  (103, 190)	0.0
  (103, 191)	0.0
  (103, 192)	0.0
  (103, 193)	0.0
  (103, 194)	0.3333333432674408
  (103, 195)	0.0
  (103, 196)	0.0
  (103, 197)	0.0
  (103, 198)	0.0
  (103, 199)	0.0
  (103, 200)	0.0
  (103, 201)	0.0
  (103, 202)	0.0
  (103, 203)	0.0
  (103, 204)	0.0
  (103, 205)	0.0
  (103, 206)	0.0
  (103, 207)	0.0
  (103, 208)	0.0
  (103, 209)	0.0
  (103, 210)	0.0
  (103, 211)	0.3333333432674408
  (103, 212)	0.0
  (103, 213)	0.0
  (103, 214)	0.0


## Build User Features to be fit into model

In [117]:
user_features = dataset.build_user_features((x,y) for x,y in zip(user['idcol'],user_feat))

## Build interactions (user — item) and its respective weights (in this case each user’s movie rating score)

In [118]:
(interactions, weights) = dataset.build_interactions((x, y) for x,y in zip(rating['idcol'], rating['item_id']))

# Model Training

## Train Test Split

In [119]:
train, test = random_train_test_split(interactions,test_percentage=0.1, random_state=42)
train_w, test_w = random_train_test_split(weights, test_percentage=0.1, random_state=42)

## Model

In [120]:
n_components = 30
loss = 'warp'
epoch = 30
num_thread = 4
model = LightFM(no_components= n_components, loss=loss, random_state = 42)
model.fit(train,  user_features= user_features, item_features= item_features, epochs=epoch,num_threads = num_thread, sample_weight = train_w)

<lightfm.lightfm.LightFM at 0x7e62612bd990>

## Model Evaluation

In [121]:
train_precision = precision_at_k(model, train, k=10,item_features=item_features, user_features=user_features).mean()
test_precision = precision_at_k(model, test,train_interactions=train, k=10,item_features=item_features, user_features=user_features).mean()

train_recall = recall_at_k(model, train, k=10,item_features=item_features, user_features=user_features).mean()
test_recall = recall_at_k(model, test,train_interactions=train, k=10,item_features=item_features, user_features=user_features).mean()

train_auc = auc_score(model, train,item_features=item_features, user_features=user_features).mean()
test_auc = auc_score(model, test, train_interactions=train,item_features=item_features, user_features=user_features).mean()

print('Precision: train %.2f' % (train_precision))
print('Precision: test %.2f' % (test_precision))

print('Recall: train %.2f' % (train_recall))
print('Recall: test %.2f' % (test_recall))

print('AUC: train %.2f' % (train_auc))
print('AUC: test %.2f' % (test_auc))

Precision: train 0.16
Precision: test 0.08
Recall: train 0.92
Recall: test 0.72
AUC: train 0.98
AUC: test 0.91


# Predictions

In [166]:
target_idcol = 77196041
index = int(user[user['idcol'] == target_idcol].index[0])

scores = model.predict(index, np.arange(104))
# print(scores)
print(user.iloc[index,:])
top_items = item.iloc[np.argsort(-scores)]
top_items.head()
# print(item.shape)
# print(top_items)
known_positives = item.iloc[interactions.tocsr()[index].indices]

top_items[0:10][['item_id', 'item']]

idcol          77196041
segment1              0
segment2              1
segment3              0
segment4              0
B01                   1
B02                   0
B03                   0
B04                   0
B05                   0
B06                   0
B07                   0
B08                   0
B09                   0
B10                   0
B11                   0
B12                   0
B13                   0
B14                   0
B15                   0
B16                   0
B17                   0
B18                   0
B19                   0
B20                   0
B21                   0
B22                   0
B23                   0
B24                   0
B25                   0
B26                   0
B27                   0
B28                   0
B29                   0
B30                   0
B31                   0
B32                   0
B33                   0
B34                   0
B35                   0
B36                   0
B37             

Unnamed: 0,item_id,item
41,42,IPRA
34,35,IPTF
57,58,CUHS
4,5,FIWL
0,1,NONE
5,6,CUSS
29,30,XCFL
15,16,MMSM
80,81,EVCU
25,26,MMMC


In [167]:
# print(user["idcol"])

known_positives_rating = rating[(rating['idcol']==user['idcol'][3])][['item_id','interaction_scores']].merge(item[['item_id','item']], on = 'item_id')
print(rating[(rating['idcol']==user['idcol'][index])])
# print(known_positives_rating)

# tor = rating.sort_values(by='idcol', ascending=True)
# tor.head(20)
# print(user['idcol'][3])
# known_positives_rating[known_positives_rating['item_id'].isin(top_items['item_id'][0:10])]

Unnamed: 0,item_id,interaction_scores,item
0,1,0,NONE
