In [1]:
import pandas as pd
import numpy as np
import dateutil
import random
import time
import warnings
warnings.filterwarnings('ignore')

import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_curve, roc_auc_score, auc


from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import h2o
from h2o.automl import H2OAutoML

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

import json
import csv




In [2]:
start_time = time.time()

## Goal:
Fit a recommender system to predict whether or not a user will purchase an item.

In [3]:
data_raw = pd.read_csv('events.csv')
data_raw

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-09-24 11:57:06 UTC,view,1996170,2144415922528452715,electronics.telephone,,31.90,1515915625519388267,LJuJVLEjPT
1,2020-09-24 11:57:26 UTC,view,139905,2144415926932472027,computers.components.cooler,zalman,17.16,1515915625519380411,tdicluNnRY
2,2020-09-24 11:57:27 UTC,view,215454,2144415927158964449,,,9.81,1515915625513238515,4TMArHtXQy
3,2020-09-24 11:57:33 UTC,view,635807,2144415923107266682,computers.peripherals.printer,pantum,113.81,1515915625519014356,aGFYrNgC08
4,2020-09-24 11:57:36 UTC,view,3658723,2144415921169498184,,cameronsino,15.87,1515915625510743344,aa4mmk0kwQ
...,...,...,...,...,...,...,...,...,...
885124,2021-02-28 23:55:01 UTC,view,953226,2144415927553229037,,,219.94,1515915625611023730,FRLqIttxKU
885125,2021-02-28 23:58:05 UTC,view,1715907,2144415927049912542,electronics.video.tv,starwind,80.03,1515915625611024014,g6WqPf50Ma
885126,2021-02-28 23:58:09 UTC,view,4170534,2144415939364389423,electronics.clocks,amazfit,64.92,1515915625611024020,xNIJBqZdkd
885127,2021-02-28 23:58:14 UTC,view,888273,2144415921932861531,electronics.telephone,,10.16,1515915625611024030,9pCbKMIcSx


## Exploratory Data Analysis

In [4]:
data_cleaned = data_raw.copy()

### Data types

In [5]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 885129 entries, 0 to 885128
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     885129 non-null  object 
 1   event_type     885129 non-null  object 
 2   product_id     885129 non-null  int64  
 3   category_id    885129 non-null  int64  
 4   category_code  648910 non-null  object 
 5   brand          672765 non-null  object 
 6   price          885129 non-null  float64
 7   user_id        885129 non-null  int64  
 8   user_session   884964 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 60.8+ MB


We can see that event_time should be a timestamp but is set to an object, lets go ahead and cast it.

In [6]:
data_cleaned['event_time'] = pd.to_datetime(data_cleaned['event_time'])
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 885129 entries, 0 to 885128
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype              
---  ------         --------------   -----              
 0   event_time     885129 non-null  datetime64[ns, UTC]
 1   event_type     885129 non-null  object             
 2   product_id     885129 non-null  int64              
 3   category_id    885129 non-null  int64              
 4   category_code  648910 non-null  object             
 5   brand          672765 non-null  object             
 6   price          885129 non-null  float64            
 7   user_id        885129 non-null  int64              
 8   user_session   884964 non-null  object             
dtypes: datetime64[ns, UTC](1), float64(1), int64(3), object(4)
memory usage: 60.8+ MB


### Duplicate entries

In [7]:
data_cleaned.shape

(885129, 9)

In [8]:
data_cleaned.drop_duplicates(inplace=False).shape

(884474, 9)

The shape of the dataset before and after dropping duplicates is not the same therefore duplicates must exist.

In [9]:
data_cleaned[data_cleaned.duplicated()]

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
512,2020-09-24 13:51:07+00:00,view,387956,2144415922427789416,computers.components.videocards,asus,104.21,1515915625519429853,PZu2caZ5EN
975,2020-09-24 15:48:55+00:00,view,874667,2144415922738167921,computers.components.cdrw,asus,23.48,1515915625519457150,8wvs0vbHtv
4828,2020-09-25 13:15:09+00:00,view,453469,2144415924222951574,auto.accessories.parktronic,,69.84,1515915625519725870,9ofICyh8Eo
6144,2020-09-25 19:31:42+00:00,view,1010933,2144415921932861531,electronics.telephone,,10.79,1515915625492539666,dfyeBb7YCt
8706,2020-09-26 15:32:01+00:00,view,1248094,2144415966652530999,,,13.49,1515915625520020227,r4o2Ukczfc
...,...,...,...,...,...,...,...,...,...
876900,2021-02-27 11:17:09+00:00,view,525416,2144415940119364164,,merrylock,309.30,1515915625610353957,1t3jWWFsGR
877116,2021-02-27 12:02:26+00:00,view,3804563,2144415926966026460,computers.components.cpu,intel,97.35,1515915625551553663,R97Nw0H5Be
878751,2021-02-27 17:56:05+00:00,view,1571204,2144415924491387038,computers.components.motherboard,asus,146.40,1515915625610505518,EUqy2lyCvY
879545,2021-02-27 20:54:20+00:00,view,1027953,2144415923837075596,electronics.audio.acoustic,jbl,332.87,1515915625529755153,3mD3HIQ017


These are all the duplicate rows so let's just go ahead and drop them.

In [10]:
data_cleaned.drop_duplicates(inplace=True)
data_cleaned

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-09-24 11:57:06+00:00,view,1996170,2144415922528452715,electronics.telephone,,31.90,1515915625519388267,LJuJVLEjPT
1,2020-09-24 11:57:26+00:00,view,139905,2144415926932472027,computers.components.cooler,zalman,17.16,1515915625519380411,tdicluNnRY
2,2020-09-24 11:57:27+00:00,view,215454,2144415927158964449,,,9.81,1515915625513238515,4TMArHtXQy
3,2020-09-24 11:57:33+00:00,view,635807,2144415923107266682,computers.peripherals.printer,pantum,113.81,1515915625519014356,aGFYrNgC08
4,2020-09-24 11:57:36+00:00,view,3658723,2144415921169498184,,cameronsino,15.87,1515915625510743344,aa4mmk0kwQ
...,...,...,...,...,...,...,...,...,...
885124,2021-02-28 23:55:01+00:00,view,953226,2144415927553229037,,,219.94,1515915625611023730,FRLqIttxKU
885125,2021-02-28 23:58:05+00:00,view,1715907,2144415927049912542,electronics.video.tv,starwind,80.03,1515915625611024014,g6WqPf50Ma
885126,2021-02-28 23:58:09+00:00,view,4170534,2144415939364389423,electronics.clocks,amazfit,64.92,1515915625611024020,xNIJBqZdkd
885127,2021-02-28 23:58:14+00:00,view,888273,2144415921932861531,electronics.telephone,,10.16,1515915625611024030,9pCbKMIcSx


### Unique values

In [11]:
data_cleaned.nunique()

event_time       845041
event_type            3
product_id        53453
category_id         718
category_code       107
brand               999
price             12422
user_id          407283
user_session     490398
dtype: int64

Since we know that the size of our data consists of 885129 rows and we can see that no column has that number of unique values, we can infer that no column contains only distinct values.

### Nulls/ Missing values 

In [12]:
data_cleaned.isna().sum()

event_time            0
event_type            0
product_id            0
category_id           0
category_code    236047
brand            212232
price                 0
user_id               0
user_session        162
dtype: int64

Since we are potentially going to utilize category_code and brand in our models(we can ignore the nulls in user_session), let's go ahead and impute the data using the following aproaches:
- **category_code**: replace nulls with 'unknown'
- **brand**: replace nulls with 'unknown'

In [13]:
data_cleaned['category_code'] = data_cleaned['category_code'].fillna('unknown')
data_cleaned['brand'] = data_cleaned['brand'].fillna('unknown')
data_cleaned.isna().sum()

event_time         0
event_type         0
product_id         0
category_id        0
category_code      0
brand              0
price              0
user_id            0
user_session     162
dtype: int64

### Outlier values

Price is our only numerical column in our dataset so let's investigate if there any outliers.

In [14]:
data_cleaned['price'].describe()

count    884474.000000
mean        146.331556
std         296.864505
min           0.220000
25%          26.460000
50%          65.710000
75%         190.540000
max       64771.060000
Name: price, dtype: float64

Since our da
taset is so large it would be inefficient to plot all values, therefore let's only analyze values in the top 75% percentile.


In [15]:
price_75percentile = np.percentile(data_cleaned['price'], [75])[0]
x_data = data_cleaned[data_cleaned['price'] >  price_75percentile]['price']

fig = go.Figure(data = [go.Box(x=x_data)])
fig.update_layout(
    title='Price 75th percentile distribution',
)
fig.show()

In [16]:
data_cleaned[data_cleaned['price'] >  price_75percentile].sort_values(by='price', ascending=False)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
418716,2020-12-09 18:31:21+00:00,view,4170916,2144415922402623591,computers.peripherals.monitor,lg,64771.06,1515915625556475919,3cyaesEFAP
86316,2020-10-14 06:26:41+00:00,view,4170916,2144415922402623591,computers.peripherals.monitor,lg,64771.06,1515915625525406075,M91MkaVScv
627562,2021-01-17 16:43:17+00:00,view,4170916,2144415922402623591,computers.peripherals.monitor,lg,64771.06,1515915625593603786,nmfeoOLmrb
51272,2020-10-06 06:54:04+00:00,view,4170916,2144415922402623591,computers.peripherals.monitor,lg,64771.06,1515915625522796597,pfaAniPp5L
866584,2021-02-25 14:03:50+00:00,view,4078837,2144415927049912542,electronics.video.tv,samsung,42590.13,1515915625609502967,RvBeY5oySp
...,...,...,...,...,...,...,...,...,...
358209,2020-11-26 22:08:27+00:00,view,1012953,2144415923107266682,computers.peripherals.printer,hp,190.63,1515915625533518591,55AlQvZwO6
676558,2021-01-25 09:18:10+00:00,view,1080151,2144415923535085701,unknown,crucial,190.62,1515915625556193402,SXafhErAOZ
676553,2021-01-25 09:17:48+00:00,view,1080151,2144415923535085701,unknown,crucial,190.62,1515915625556193402,SXafhErAOZ
797041,2021-02-13 13:26:06+00:00,view,1080151,2144415923535085701,unknown,crucial,190.62,1515915625605054171,rxhZQguQn7


While it does seem like there are some extreme values in our dataset, it is hard to infer whether or not these values are actually outliers. We can see these values appear multiple times across different rows in our dataset so it is hard to believe that it was a due to an accident. That being said, we will be making the decision to leave them in our analysis.

### Temporal Consistency

In [17]:
data_cleaned['event_time'].min(), data_cleaned['event_time'].max() 

(Timestamp('2020-09-24 11:57:06+0000', tz='UTC'),
 Timestamp('2021-02-28 23:59:09+0000', tz='UTC'))

It looks like all of our time values fall within our expected reasonable range.

## Visualizing the data

In [18]:
data = data_cleaned.copy()

In [19]:
grouped = data.groupby('event_type').count().iloc[:,0].sort_values(ascending=False)[::-1]
events = list(grouped.index)
counts = list(grouped.values)
text = [str(value)+'%' for value in (np.array(counts)*100 / sum(counts)).round(2)]

fig = go.Figure(data = [go.Bar(y=events, x=counts, orientation='h', text=text)])
fig.update_layout(
    title='Data Label Distribution',
    xaxis=dict(title='Count'),
    yaxis=dict(title='Label'),
)
fig.show()

Since we are trying to observe any underlying patterns in purchase history, we are going to focus on the items that were purchased and put into their cart vs the items that were only viewed.

In [20]:
# items that were never purchased or put into a cart
purchased_or_cart_pids = data[(data['event_type'] == 'cart') | (data['event_type'] == 'purchase')]['product_id'].unique()
viewed_only_pids = data[~data['product_id'].isin(purchased_or_cart_pids)]['product_id'].unique()
print('Number of products purchased or carted:',len(purchased_or_cart_pids))
print('Number of products viewed only:', len(viewed_only_pids))

Number of products purchased or carted: 9837
Number of products viewed only: 43616


### First, let us see if brand effects the likelihood of a product being purchased

In [21]:
brands_purchased_or_cart = data[data['product_id'].isin(purchased_or_cart_pids)].groupby('brand').count().iloc[:,0]
brands_viewed_only = data[data['product_id'].isin(viewed_only_pids)].groupby('brand').count().iloc[:,0]

df1 = pd.DataFrame(brands_viewed_only).rename(columns={'event_time':'num_viewed_only'})
df2 = pd.DataFrame(brands_purchased_or_cart).rename(columns={'event_time':'num_purchased_cart'})

by_brand = df1.join(df2, how='outer').fillna(0)
by_brand['total_products'] = by_brand.sum(axis=1)
by_brand['percent_purchased_cart'] = round(by_brand['num_purchased_cart']*100 / by_brand['total_products'], 2)

plot_data = by_brand.sort_values(by='total_products', ascending=False)['percent_purchased_cart'][:30]
text = by_brand.sort_values(by='total_products', ascending=False)['total_products'][:30]
fig = go.Figure()
fig.add_trace(go.Bar(y=plot_data.index[::-1], x=plot_data.values[::-1], orientation='h', text=text[::-1]))

fig.add_shape(
    dict(
        type='line',
        x0=50,
        x1=50,
        y0=0,
        y1=30,
        line=dict(color='red', width=2, dash='dash')
    )
)

fig.update_layout(
    title='Brands with the largest number of products(top 30), percentage of purchase/cart',
    xaxis=dict(title='Percentage (total products in bar)'),
    yaxis=dict(title='Brand'),
    height=1000,
)

fig.show()

Any brand that is behind the red-dashed line means that more than 50% of their products were only ever viewed and never purchased/put into a cart. This could be valuable information to consider later when we begin to develop features for our model.

### Price

In [22]:
prices_purchased_or_cart = data[data['product_id'].isin(purchased_or_cart_pids)].groupby('price').count().iloc[:,0]
prices_viewed_only = data[data['product_id'].isin(viewed_only_pids)].groupby('price').count().iloc[:,0]

fig = make_subplots(rows=2, cols=1, subplot_titles=['Prices of purchased/cart products', 'Prices of viewed only products'])

fig.add_trace(go.Box(x=prices_purchased_or_cart, boxpoints='all', jitter=0.3, pointpos=-1.8,), row=1, col=1)

fig.add_trace(go.Box(x=prices_viewed_only, boxpoints='all', jitter=0.3, pointpos=-1.8,), row=2, col=1)

fig.update_layout(
    xaxis=dict(title='Count'),
    height=800
)
fig.update_layout(xaxis_range=[-50, 1200])

fig.show()


Although it is hard to tell from the plots themselves, the median price of purchased/cart is 41 while the median for viewed only is 8. While it is hard to derive any useful information from this since the two groups are different signficantly in size, it could be worth considering when we train our models later.

### Category

In [23]:
category_purchased_or_cart = data[data['product_id'].isin(purchased_or_cart_pids)].groupby('category_code').count().iloc[:,0]
category_viewed_only = data[data['product_id'].isin(viewed_only_pids)].groupby('category_code').count().iloc[:,0]

df1 = pd.DataFrame(category_purchased_or_cart).rename(columns={'event_time':'purchased_cart'})
df2 = pd.DataFrame(category_viewed_only).rename(columns={'event_time':'viewed_only'})
product_categories = df1.join(df2, how='outer')
product_categories['total'] = product_categories.sum(axis=1)
product_categories['percent_purchased_cart'] = round(product_categories['purchased_cart'] * 100 / product_categories['total'], 2)

plot_data = product_categories.sort_values(by='total', ascending=False)['percent_purchased_cart'][:30]
text = product_categories.sort_values(by='total', ascending=False)['total'][:30]

fig = go.Figure()
fig.add_trace(go.Bar(y=plot_data.index[::-1], x=plot_data.values[::-1], orientation='h', text=text[::-1]))

fig.add_shape(
    dict(
        type='line',
        x0=50,
        x1=50,
        y0=0,
        y1=30,
        line=dict(color='red', width=2, dash='dash')
    )
)

fig.update_layout(
    title='Product categories with the largest number of products(top 30), percentage of purchase/cart',
    xaxis=dict(title='Percentage (total products in bar)'),
    yaxis=dict(title='Category'),
    height=1000,
)

fig.show()

It looks like some categories are purchased/cart significantly less compared to other categories. This is also useful information going forward.

### Popularity

In [24]:
#category_purchased_or_cart = data[data['product_id'].isin(purchased_or_cart_pids)].groupby('category_code').count().iloc[:,0]
#category_viewed_only = data[data['product_id'].isin(viewed_only_pids)].groupby('category_code').count().iloc[:,0]

product_interactions = pd.DataFrame(data.groupby('product_id').count()['event_type'])\
                        .rename(columns={'event_type':'interactions'})\
                        .reset_index()
product_interactions['was_purchased_cart'] = product_interactions['product_id'].isin(purchased_or_cart_pids)

hist1_data = product_interactions[product_interactions['was_purchased_cart'] == True]['interactions']
hist2_data = product_interactions[product_interactions['was_purchased_cart'] == False]['interactions']

fig = make_subplots(rows=2, cols=1, subplot_titles=['Number of interactions distribution of purchased/cart', 'Number of interactions distribution of viewed only'])

fig.add_trace(go.Box(x=hist1_data, boxpoints='all', jitter=0.3, pointpos=-1.8,), row=1, col=1)
fig.add_trace(go.Box(x=hist2_data, boxpoints='all', jitter=0.3, pointpos=-1.8,), row=2, col=1)

# Customize the layout (optional)
fig.update_layout(
    xaxis=dict(title='Count'),
    height=800,
    showlegend=False,
   
)
fig.update_layout(xaxis_range=[0, 1200])
# Show the chart
fig.show()

Although it may be a little hard to decipher the plots, we can compare the x-axis of purchased/cart vs viewed only and see that products that were purchased/cart had much more interactions. Products that were purchased/cart had a median number of interactions equal to 18 while viewed only had 2. These numbers will be specifically helpful later when we engineer features for our model.

## Model Testing

In order to create models, we need to first develop a dataset that contains negative and positive entries. It should include products that a user has purchased/cart and products that a user has **not** purchased/cart before.

In [25]:
all_purchase_cart = data[(data['event_type'] == 'purchase') | (data['event_type'] == 'cart')].sample(frac=1).reset_index(drop=True)

# only extract useful columns
all_purchase_cart = all_purchase_cart[['user_id', 'product_id', 'category_code', 'brand', 'price']]
all_purchase_cart['category_code'] = all_purchase_cart['category_code'].str.replace('.', '_')
all_purchase_cart['checkout'] = np.ones(len(all_purchase_cart))

# generate train, valid, test datasets for positive interactions
train_size, valid_size, test_size = 0.65, 0.25, 0.10
size = len(all_purchase_cart)
train_data_positive = all_purchase_cart.iloc[:int(size*train_size),:]
valid_data_positive = all_purchase_cart.iloc[int(size*train_size):int(size*train_size + size*valid_size),:]
test_data_positive = all_purchase_cart.iloc[int(size*train_size + size*valid_size):,:]
print('train shape(positive interactions only):',train_data_positive.shape)
print('valid shape(positive interactions only):', valid_data_positive.shape)
print('test shape(positive interactions only):', test_data_positive.shape)

train shape(positive interactions only): (59393, 6)
valid shape(positive interactions only): (22844, 6)
test shape(positive interactions only): (9138, 6)


In [26]:
def generate_x_negative(data):
    negative_interactions = {}
    products = list(data['product_id'].unique())

    for index, row in data.iterrows():
        user = row['user_id']
        user_products = set(data[data['user_id'] == user]['product_id'])

        random_product = random.sample(products, 1)[0]
        while random_product in user_products:
            random_product = random.sample(products, 1)[0]
        random_product_info = data[data['product_id'] == random_product]

        negative_interactions['user_id'] = negative_interactions.get('user_id', []) + [user]
        negative_interactions['product_id'] = negative_interactions.get('product_id', []) + \
                                            [random_product]
        negative_interactions['category_code'] = negative_interactions.get('category_code', []) + \
                                            [random_product_info['category_code'].iloc[0].replace('.', '_')]
        negative_interactions['brand'] = negative_interactions.get('brand', []) + \
                                            [random_product_info['brand'].iloc[0]]
        negative_interactions['price'] = negative_interactions.get('price', []) + \
                                            [random_product_info['price'].iloc[0]]
        negative_interactions['checkout'] = negative_interactions.get('checkout', []) + [0]

    return pd.DataFrame(negative_interactions)

In [27]:
train_data_negative = generate_x_negative(train_data_positive)
valid_data_negative = generate_x_negative(valid_data_positive)
test_data_negative = generate_x_negative(test_data_positive)

In [28]:
train_data = pd.concat([train_data_positive, train_data_negative]).reset_index(drop=True)
valid_data = pd.concat([valid_data_positive, valid_data_negative]).reset_index(drop=True)
test_data = pd.concat([test_data_positive, test_data_negative]).reset_index(drop=True)

In [29]:
print('total train shape:',train_data.shape)
print('total valid shape:', valid_data.shape)
print('total test shape:', test_data.shape)

total train shape: (118786, 6)
total valid shape: (45688, 6)
total test shape: (18276, 6)


## Feature Engineering

### Feature: is_popular_brand

In [30]:
brand_popularity = train_data_positive.groupby('brand').count().iloc[:,0]
popular_threshold = np.percentile(brand_popularity, 50)
popular_brands = set(brand_popularity[brand_popularity > popular_threshold].index)

def is_popular_brand(X):
    X['is_popular_brand'] = X['brand'].isin(popular_brands)
    return X

### Feature: is_popular_product

In [31]:
product_popularity = train_data_positive.groupby('product_id').count().iloc[:,0]
popular_threshold = np.percentile(product_popularity, 50)
popular_products = set(product_popularity[product_popularity > popular_threshold].index)

def is_popular_product(X):
    X['is_popular_product'] = X['product_id'].isin(popular_products)
    return X

### Feature: category_code one-hot encoded

### Feature: brand one-hot encoded

### Feature: Jaccard similarity

In [32]:
users_per_product = {}
products_per_user = {}

for user_i, product_i in zip(train_data['user_id'], train_data['product_id']):
    users_per_product[product_i] = users_per_product.get(product_i, []) + [user_i]
    products_per_user[user_i] = products_per_user.get(user_i, []) + [product_i]

In [33]:
# use jaccard similarity for user and product pair
# idea: for all the users who purchased that product,
def jaccard(s1, s2):
    if len(s1) == 0 or len(s2) == 0:
        return 0
    
    numerator = len(set(s1) & set(s2))
    denominator = len(set(s1) | set(s2))
    similarity = numerator / denominator
    return similarity

In [34]:
# get the jaccard similarity score of the user that is the most similar that purchased that product
def similarity_score(X):
    def get_highest_user_similarity(row):
        user_i = row['user_id']
        product_i = row['product_id']
        users_per_product_i = users_per_product.get(product_i, [])
        if len(users_per_product_i) == 0:
            return 0
        
        highest_similarity_score = max(list(map(lambda user_j: jaccard(products_per_user.get(user_i, []), products_per_user.get(user_j, [])) \
                                                if user_i != user_j else 0, users_per_product_i)))
        return highest_similarity_score
    X['similarity_score'] = X.apply(get_highest_user_similarity, axis=1)
    return X

### Sklearn's pipeline

To improve the flow of the data transformation, we will utilize sklearn's Pipeline

In [35]:
#'user_id', 'product_id', 'category_code', 'brand', 'price'
preprocessor = ColumnTransformer(
    transformers=[
        ('price standardized', StandardScaler(), ['price']),
        ('category code one hot', OneHotEncoder(handle_unknown='ignore'), ['category_code']),
        ('brand one hot encoder', OneHotEncoder(handle_unknown='ignore'), ['brand'])
    ])


pipeline = Pipeline([
    ('is popular product', FunctionTransformer(is_popular_product)),
    ('is popular brand', FunctionTransformer(is_popular_brand)),
    ('similarity score of purchase/cart customer', FunctionTransformer(similarity_score)),
    ('preprocessor', preprocessor),
])

In [36]:
temp_train_data = train_data.sample(frac=0.5).reset_index(drop=True)
X_train = temp_train_data.drop('checkout', axis=1)
y_train = temp_train_data['checkout']

In [37]:
pipeline.fit(X_train)

### Baseline model: Logistic Regression model

Since we are trying to perform a binary prediction here, whether or not product was purchased/cart, a logistic regression model should give us a baseline prediction benchmark just fine. To create this model, we will engineer the following features to create our datasets for predictions.
- binary value if the product is considered popular or not (above threshold)
- normalized value of price
- one hot encoding of brand
- one hot encoding of product_category


In [38]:
x_train_transformed = pipeline.transform(X_train)
x_train_transformed

<59393x648 sparse matrix of type '<class 'numpy.float64'>'
	with 178179 stored elements in Compressed Sparse Row format>

Fit our Logisitic Regression model

In [39]:
lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(x_train_transformed, y_train)

Evaluate its performance on the training set

In [40]:
y_train_pred = lr_model.predict(x_train_transformed)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

print('Accuracy score:', train_accuracy)
print('F1 score:', train_f1)

Accuracy score: 0.6686646574512148
F1 score: 0.6264497636719121


Evaluate performance on the validation set

In [41]:
temp_valid_data = valid_data.sample(frac=0.5).reset_index(drop=True)

X_valid = temp_valid_data.drop('checkout', axis=1)
y_valid = temp_valid_data['checkout']

x_valid_transformed = pipeline.transform(X_valid)

In [42]:
y_valid_pred = lr_model.predict(x_valid_transformed)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_f1 = f1_score(y_valid, y_valid_pred)

print('Accuracy score:', valid_accuracy)
print('F1 score:', valid_f1)

Accuracy score: 0.6445018385571704
F1 score: 0.6072827506165677


#### Perform Cross Validation grid-search to tune hyperparameters

In [43]:
lr_model = LogisticRegression()

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'] 
}

grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='accuracy')

grid_search.fit(x_train_transformed, y_train)

In [44]:
lr_model_optimized = grid_search.best_estimator_

In [45]:
y_train_pred = lr_model_optimized.predict(x_train_transformed)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

print('Accuracy score:', train_accuracy)
print('F1 score:', train_f1)

Accuracy score: 0.6675702523866449
F1 score: 0.6223267913845213


In [46]:
y_valid_pred = lr_model_optimized.predict(x_valid_transformed)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_f1 = f1_score(y_valid, y_valid_pred)

print('Accuracy score:', valid_accuracy)
print('F1 score:', valid_f1)

Accuracy score: 0.6454211171423568
F1 score: 0.6046852122986822


#### Evaluate Performance on the test set

In [47]:
test_valid_data = test_data.sample(frac=0.5).reset_index(drop=True)

X_test = test_valid_data.drop('checkout', axis=1)
y_test = test_valid_data['checkout']

x_test_transformed = pipeline.transform(X_test)

In [48]:
y_test_pred = lr_model_optimized.predict(x_test_transformed)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print('Accuracy score:', test_accuracy)
print('F1 score:', test_f1)

Accuracy score: 0.6200481505799956
F1 score: 0.5888204642349597


In [49]:
y_probs = lr_model_optimized.predict_proba(x_test_transformed)[:, 1]

fpr_lr, tpr_lr, thresholds = roc_curve(y_test, y_probs)

In [50]:
# Calculate the AUC-ROC score
roc_auc_lr = auc(fpr_lr, tpr_lr)
print("AUC-ROC Score:", roc_auc_lr)

AUC-ROC Score: 0.660105374593895


In [51]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr_lr, y=tpr_lr, mode='lines', name=f'ROC Curve (AUC={roc_auc_lr:.2f})'))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='x=y', line={'dash': 'dash'}))

fig.update_layout(
    title='ROC Curve',
    xaxis_title='False Positive Rate (FPR)',
    yaxis_title='True Positive Rate (TPR)',
    width=1000,  
    height=600,  
    legend=dict(
        x=0.02, 
        y=0.98,  
        traceorder='normal',
        font=dict(size=10),  
    ),
)

fig.show()

## Finding a better model using h2o

In [52]:
h2o.init(port=54321)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 21.0.1+12-LTS-29, mixed mode, sharing)
  Starting server from C:\Users\prsal\Desktop\ecommerce_recommender_system\.venv\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\prsal\AppData\Local\Temp\tmpyxcg1_2d
  JVM stdout: C:\Users\prsal\AppData\Local\Temp\tmpyxcg1_2d\h2o_prsal_started_from_python.out
  JVM stderr: C:\Users\prsal\AppData\Local\Temp\tmpyxcg1_2d\h2o_prsal_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,12 days
H2O_cluster_name:,H2O_from_python_prsal_asrup8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.964 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,16 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.3
H2O_cluster_version_age:,12 days
H2O_cluster_name:,H2O_from_python_prsal_asrup8
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.488 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


First we need to transform all of our datasets into h2o frames

In [53]:
train_h2o = h2o.H2OFrame(np.hstack((x_train_transformed.toarray(), np.array(y_train).reshape(-1,1))))
train_h2o[train_h2o.columns[-1]] = train_h2o[train_h2o.columns[-1]].asfactor()

valid_h2o = h2o.H2OFrame(np.hstack((x_valid_transformed.toarray(), np.array(y_valid).reshape(-1,1))))
valid_h2o[valid_h2o.columns[-1]] = valid_h2o[valid_h2o.columns[-1]].asfactor()

test_h2o = h2o.H2OFrame(np.hstack((x_test_transformed.toarray(), np.array(y_test).reshape(-1,1))))
test_h2o[test_h2o.columns[-1]] = test_h2o[test_h2o.columns[-1]].asfactor()

x = list(train_h2o.columns[:-1])
y = train_h2o.columns[-1]

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [54]:
algorithms = ['GLM',
             'XGBoost',
             'DRF',
             'StackedEnsemble']

aml = H2OAutoML(max_models=12,
               include_algos=algorithms)

In [55]:
aml.train(x=x, y=y, training_frame=train_h2o)
aml_model_optimized = aml.leader
aml.leaderboard

AutoML progress: |█
21:05:11.106: AutoML: XGBoost is not available; skipping it.

██████████████████████████████████████████████████████████████| (done) 100%


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_BestOfFamily_1_AutoML_1_20240101_210511,0.720625,0.608818,0.737913,0.382059,0.459383,0.211033
GLM_1_AutoML_1_20240101_210511,0.716579,0.613339,0.725752,0.403663,0.460665,0.212212
DRF_1_AutoML_1_20240101_210511,0.707298,0.624016,0.730081,0.422983,0.466636,0.217749
XRT_1_AutoML_1_20240101_210511,0.620938,0.68692,0.659202,0.5,0.496881,0.246891


In [56]:
y_train_pred = aml_model_optimized.predict(train_h2o).as_data_frame()['predict'].values
y_train_accuracy = accuracy_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

print('Accuracy score:', y_train_accuracy)
print('F1 score:', train_f1)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%



converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above).



Accuracy score: 0.6224302527233849
F1 score: 0.69117950836604


In [57]:
y_valid_pred = aml_model_optimized.predict(valid_h2o).as_data_frame()['predict'].values
y_valid_accuracy = accuracy_score(y_valid, y_valid_pred)
valid_f1 = f1_score(y_valid, y_valid_pred)

print('Accuracy score:', y_valid_accuracy)
print('F1 score:', valid_f1)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%



converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above).



Accuracy score: 0.5967869024689196
F1 score: 0.6755662005565144


In [58]:
y_test_pred_df = aml_model_optimized.predict(test_h2o).as_data_frame()
y_test_pred = y_test_pred_df['predict'].values
y_test_accuracy = accuracy_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)

print('Accuracy score:', y_test_accuracy)
print('F1 score:', test_f1)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%



converting H2O frame to pandas dataframe using single-thread.  For faster conversion using multi-thread, install datatable (for Python 3.9 or lower), or polars and pyarrow (for Python 3.10 or above).



Accuracy score: 0.5762748960385204
F1 score: 0.6659188955996549


In [59]:
positive_probs = y_test_pred_df['p1'].values
fpr_aml, tpr_aml, thresholds = roc_curve(y_test, positive_probs)
roc_auc_aml = auc(fpr_aml, tpr_aml)

In [60]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr_aml, y=tpr_aml, mode='lines', name=f'ROC Curve (AUC={roc_auc_aml:.2f})'))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='x=y', line={'dash': 'dash'}))

fig.update_layout(
    title='ROC Curve',
    xaxis_title='False Positive Rate (FPR)',
    yaxis_title='True Positive Rate (TPR)',
    width=1000,  
    height=600,  
    legend=dict(
        x=0.02, 
        y=0.98,  
        traceorder='normal',
        font=dict(size=10),  
    ),
)

fig.show()

## Basic Tensorflow Collaborative Filtering Model

In [61]:
# first we must create product and user id mappers because our neural network can only handle numerical inputs
all_users = data['user_id'].unique()
all_products = data['product_id'].unique()

user_mapper = {user:mapped_id for mapped_id, user in enumerate(all_users)}
product_mapper = {product:mapped_id for mapped_id, product in enumerate(all_products)}

# store the total number of users and products, we will need this when defining the number of input dimensions for our embedding
num_users = len(all_users)
num_products = len(all_products)

In [62]:
# create our new train, valid, and test datasets
train_user_ids = np.array(X_train['user_id'].apply(lambda uid: user_mapper[uid]))
train_product_ids = np.array(X_train['product_id'].apply(lambda pid: product_mapper[pid]))
train_labels = np.array(y_train)

valid_user_ids = np.array(X_valid['user_id'].apply(lambda uid: user_mapper[uid]))
valid_product_ids = np.array(X_valid['product_id'].apply(lambda pid: product_mapper[pid]))
valid_labels = np.array(y_valid)

test_user_ids = np.array(X_test['user_id'].apply(lambda uid: user_mapper[uid]))
test_product_ids = np.array(X_test['product_id'].apply(lambda pid: product_mapper[pid]))
test_labels = np.array(y_test)


In [63]:
num_embedding_dimensions = 50

# create user and movie input layers
user_input = Input(shape=(1,), name='user_input')
product_input = Input(shape=(1,), name='product_input')

# create user and move embedding layers
user_embedding = Embedding(input_dim=num_users, output_dim=num_embedding_dimensions, input_length=1)(user_input)
product_embedding = Embedding(input_dim=num_products, output_dim=num_embedding_dimensions, input_length=1)(product_input)

user_flat = Flatten()(user_embedding)
product_flat = Flatten()(product_embedding)
concatenated = Concatenate()([user_flat, product_flat])

# add dense layers with dropout regularization
dense1 = Dense(128, activation='relu')(concatenated)
dropout1 = Dropout(0.2)(dense1)
dense2 = Dense(64, activation='relu')(dense1)
dropout2 = Dropout(0.2)(dense2)

# add our output layer
output = Dense(1, activation='sigmoid')(dropout2)




In [64]:
# create our model
model = Model(inputs=[user_input, product_input], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

model.fit([train_user_ids, train_product_ids], train_labels, epochs=5, batch_size=64,
         validation_data=([valid_user_ids, valid_product_ids], valid_labels))

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1b88a5abe20>

In [65]:
# Make predictions on the test set
train_preds = model.predict([train_user_ids, train_product_ids])

binary_train_predictions = (train_preds > 0.5).astype(int)

train_accuracy = accuracy_score(train_labels, binary_train_predictions)
train_f1 = f1_score(train_labels, binary_train_predictions)

print('Accuracy score:', train_accuracy)
print('F1 score:', train_f1)

Accuracy score: 0.9981310928897345
F1 score: 0.9981318477876702


In [66]:
# Make predictions on the test set
valid_preds = model.predict([valid_user_ids, valid_product_ids])

binary_valid_predictions = (valid_preds > 0.5).astype(int)

valid_accuracy = accuracy_score(valid_labels, binary_valid_predictions)
valid_f1 = f1_score(valid_labels, binary_valid_predictions)

print('Accuracy score:', valid_accuracy)
print('F1 score:', valid_f1)

Accuracy score: 0.7120907021537384
F1 score: 0.7336492123273803


In [67]:
# Make predictions on the test set
test_preds = model.predict([test_user_ids, test_product_ids])

binary_test_predictions = (test_preds > 0.5).astype(int)

test_accuracy = accuracy_score(test_labels, binary_test_predictions)
test_f1 = f1_score(test_labels, binary_test_predictions)

print('Accuracy score:', test_accuracy)
print('F1 score:', test_f1)

Accuracy score: 0.6957758809367477
F1 score: 0.722610257433646


In [68]:
fpr_tf, tpr_tf, thresholds = roc_curve(test_labels, test_preds)
roc_auc_tf = auc(fpr_tf, tpr_tf)

In [69]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr_tf, y=tpr_tf, mode='lines', name=f'ROC Curve (AUC={roc_auc_tf:.2f})'))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='x=y', line={'dash': 'dash'}))

fig.update_layout(
    title='ROC Curve',
    xaxis_title='False Positive Rate (FPR)',
    yaxis_title='True Positive Rate (TPR)',
    width=1000,  
    height=600,  
    legend=dict(
        x=0.02, 
        y=0.98,  
        traceorder='normal',
        font=dict(size=10),  
    ),
)

fig.show()

## ------------

In [70]:
fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr_lr, y=tpr_lr, mode='lines', name=f'Linear Regression ROC Curve (AUC={roc_auc_lr:.2f})'))
fig.add_trace(go.Scatter(x=fpr_aml, y=tpr_aml, mode='lines', name=f'AutoML(GLM) ROC Curve (AUC={roc_auc_aml:.2f})'))
fig.add_trace(go.Scatter(x=fpr_tf, y=tpr_tf, mode='lines', name=f'Tensorflow ROC Curve (AUC={roc_auc_tf:.2f})'))

fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='x=y', line={'dash': 'dash'}))

fig.update_layout(
    title='ROC-AUC Curve',
    xaxis_title='False Positive Rate (FPR)',
    yaxis_title='True Positive Rate (TPR)',
    width=1000,  
    height=600,  
    legend=dict(
        x=0.02, 
        y=0.98,     
        traceorder='normal',
        font=dict(size=10),  
    ),
)

fig.show()

In [71]:
model.save('saved_model')


INFO:tensorflow:Assets written to: saved_model\assets


INFO:tensorflow:Assets written to: saved_model\assets


In [72]:
# save the user mapper to json
user_mapper_file_name = 'user_id_mapper.json'

user_mapper = {str(uid):map_value for uid,map_value in user_mapper.items()}

with open(user_mapper_file_name, 'w') as json_file:
    json.dump(user_mapper, json_file)

In [73]:
# save the product mapper to json
product_mapper_file_name = 'product_id_mapper.json'

product_mapper = {str(pid):map_value for pid,map_value in product_mapper.items()}

with open(product_mapper_file_name, 'w') as json_file:
    json.dump(product_mapper, json_file)

In [74]:
model = tf.keras.models.load_model('saved_model')








In [97]:
user_product_interactions_file_name = 'user_product_interactions.json'
user_product_interactions = {}
positive_interactions = data[(data['event_type']== 'purchase') |
                             (data['event_type'] == 'cart')]

for i,row in positive_interactions.iterrows():
    user_i = str(row['user_id'])
    product_i = str(row['product_id'])
    user_product_interactions[(user_i, product_i)] = 1

user_product_interactions = {str(key):value for key,value in user_product_interactions.items()}

with open(user_product_interactions_file_name, 'w') as json_file:
    json.dump(user_product_interactions, json_file)