In [24]:

import polars as pl
import torch
import pickle
import sys
import os
import numpy as np

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from utils.modelling.rnn import recommend_next_action, BERT4Rec

# Locations of the serialized models, relative to the notebook's working directory.
# The event model file is read with torch.load and fed to load_state_dict;
# the other two files are read with pickle.load.
EVENT_MODEL_PATH = './models/bert4rec_model.pth'
RETENTION_MODEL_PATH = './models/xgb_classifier.pkl'  # used through predict_proba (retention classifier)
TIME_USAGE_MODEL_PATH = './models/xgb_regressor.pkl'  # used through predict (time-usage regressor)

## Prediction Framework

In this notebook, we showcase our optimization framework for suggesting the next event that yields the highest success rate of retaining users.

### Import Dataset

In [17]:
# Build a five-row sample frame and attach hand-written event sequences
file_path = os.path.expanduser('~/Desktop/data/preprocessed_data.parquet')
df = (
    pl.scan_parquet(file_path)   # lazy scan: only the requested rows are materialised
    .limit(5)
    .collect()
    # One toy sequence per sampled row (five rows -> five sequences)
    .with_columns(pl.Series('user_sequence', [[1,2], [3,4,5], [6], [7,9], [100, 200]]))
    # Identifier columns are removed so only model features (and the sequence) remain
    .drop('session_id', 'user_id_first')
)
df

device_family_linux_max,device_family_mac os x_max,device_family_windows_max,region_grouped_international_max,region_grouped_midwest_max,region_grouped_northeast_max,region_grouped_south_max,region_grouped_west_max,event_category_account & policy management_max,event_category_action center & workflow_max,event_category_dashboard & ui interactions_max,event_category_other/system events_max,event_category_session & navigation_max,event_category_submission & forms_max,returned_within_28_days_max,uw_max,admin_max,manager_max,broker_max,google_max,microsoft_max,session_seconds_mean,client_event_hour_mean,client_upload_hour_mean,event_hour_mean,server_received_hour_mean,server_upload_hour_mean,time_to_server_mean,server_to_process_mean,processing_time_mean,slug_encoded_mean,user_sequence
u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,i32,i8,i8,i8,i8,i32,i32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,list[i64]
0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,937.0,20.0,19.851852,20.0,19.851852,19.851852,1076.148148,0.0,3.592593,0.044815,"[1, 2]"
0,1,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0.0,14.0,14.5,14.0,14.5,14.5,1598.75,0.0,1.5,0.045,"[3, 4, 5]"
0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,16.0,20.0,20.0,20.0,20.0,20.0,2.0,0.0,0.6,0.0,[6]
0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0.0,16.0,16.0,16.0,16.0,16.0,1.0,0.0,0.0,0.0,"[7, 9]"
0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,6.0,16.0,17.0,16.0,17.0,17.0,14778000.0,0.0,10.0,0.0,"[100, 200]"


### Initialize Event Predictor

In [98]:
# Initialize event predictor class
class EventPredictor:
    """Combine three fitted models into a single score per input row.

    For every row, the score is the product of:
      * the probability returned by ``recommend_next_action`` for the event model,
      * the retention classifier's probability for class index 1, and
      * the time-usage regressor's prediction.
    """

    def __init__(self, event_model, retention_model, time_usage_model, device):
        """Store the already-loaded models and the device used for the event model.

        Args:
            event_model: Model passed to ``recommend_next_action``.
            retention_model: Classifier exposing ``predict_proba``.
            time_usage_model: Regressor exposing ``predict``.
            device: Device forwarded to ``recommend_next_action``.
        """
        # Keep references to the model objects (these are models, not file paths)
        self.event_model = event_model
        self.retention_model = retention_model
        self.time_usage_model = time_usage_model
        self.device = device


    def predict(self, df : pl.DataFrame):
        """Score every row of ``df``.

        Args:
            df: Frame holding the tabular features plus a ``user_sequence``
                list column. It must also contain ``returned_within_28_days_max``
                and ``session_seconds_mean``; each is dropped before calling the
                model that uses it as its excluded column.

        Returns:
            list: One score per row of ``df``, in row order.
        """
        scores = []

        for row in df.iter_rows(named = True):
            # Only the probability enters the score; the recommended action is
            # currently discarded.
            # NOTE(review): 10 is presumably the model's maximum sequence length
            # (the notebook sets max_seq_len = 10) -- confirm against the
            # signature of recommend_next_action.
            _predicted_action, prob = recommend_next_action(
                self.event_model, row['user_sequence'], 10, self.device)

            # Wrap the dict in a list so polars builds exactly ONE row.
            # Passing the dict directly broadcasts the scalar features to the
            # length of the 'user_sequence' list, so the models would score
            # len(user_sequence) identical rows (and zero rows for an empty
            # sequence, which makes max() below raise).
            features = pl.DataFrame([row]).drop('user_sequence')

            # Positive-class (column index 1) probability from the classifier
            retention_y_pred_proba = self.retention_model.predict_proba(
                features.drop('returned_within_28_days_max'))[:, 1]
            time_usage_y_pred = self.time_usage_model.predict(
                features.drop('session_seconds_mean'))

            # max() collapses the array-valued product to a single score
            scores.append(max(prob * retention_y_pred_proba * time_usage_y_pred))

        return scores


### Get Sample Prediction

In [99]:
# Hyperparameters, grouped by role

# -- Architecture: these five values are passed to the BERT4Rec constructor --
num_items = 741
hidden_size = 128
num_layers = 2
num_heads = 2
max_seq_len = 10  # raise this if longer sequences are needed

# -- Optimisation settings --
# NOTE(review): not referenced by the inference cells shown in this notebook;
# presumably kept from training -- confirm before removing.
num_epochs = 50
batch_size = 32
learning_rate = 0.0001

In [100]:
# Load event model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
event_model = BERT4Rec(num_items, hidden_size, num_layers, num_heads, max_seq_len).to(device)
# NOTE(security): weights_only=False lets torch.load unpickle arbitrary objects.
# Only load checkpoints from a trusted source; if the file is a plain state
# dict, consider switching to weights_only=True.
# The tensors are read onto the CPU; load_state_dict then copies them into the
# parameters that already live on `device`.
param_dict = torch.load(os.path.expanduser(EVENT_MODEL_PATH), 
                        weights_only = False, map_location=torch.device('cpu'))
event_model.load_state_dict(param_dict)
# Put the network in inference mode so train-only layers such as dropout are
# disabled and repeated runs give the same predictions.
event_model.eval()

# NOTE(security): pickle.load executes code embedded in the file -- only use
# model files produced by this project.
with open(os.path.expanduser(RETENTION_MODEL_PATH), 'rb') as file:
    retention_model = pickle.load(file)

with open(os.path.expanduser(TIME_USAGE_MODEL_PATH), 'rb') as file:
    time_usage_model = pickle.load(file)


In [101]:
# Wire the three loaded models into a predictor and score the sample frame
event_predictor = EventPredictor(
    event_model=event_model,
    retention_model=retention_model,
    time_usage_model=time_usage_model,
    device=device,
)
scores = event_predictor.predict(df=df)  # one score per row of df
scores

[np.float32(4.407903),
 np.float32(4.8511276),
 np.float32(8.221094),
 np.float32(0.02035117),
 np.float32(8.060284)]