In [143]:
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import LabelEncoder

# Exploratory Data Analysis

In [144]:
ls -l data/

total 142568
-rw-rw-rw-@ 1 pavlemedvidovic  staff     56067 Oct  6  2024 item.csv
-rw-rw-rw-@ 1 pavlemedvidovic  staff  19547042 Oct  6  2024 promotion.csv
-rw-rw-rw-@ 1 pavlemedvidovic  staff  53376614 Oct  6  2024 sales.csv
-rw-rw-rw-@ 1 pavlemedvidovic  staff      4177 Oct  6  2024 supermarkets.csv


In [145]:
items = pd.read_csv('data/item.csv')
items.head()

Unnamed: 0,code,descrption,type,brand,size
0,3000005040,AUNT JEM ORIGINAL PANCAKE MIX,Type 1,Aunt Jemima,2 LB
1,3000005070,A/JEM COMPLETE PANCAKE MI,Type 1,Aunt Jemima,32 OZ
2,3000005300,AJ BUTTERMILK PANCAKE MIX,Type 1,Aunt Jemima,32 OZ
3,3000005350,A J BTRMLK COMP PNCK MIX,Type 1,Aunt Jemima,1 LB
4,1600015760,BC PANCAKE MIX BUTTERMILK,Type 1,Bisquick,6.75 OZ


In [146]:
# Count distinct values in each column
items.shape

(927, 5)

In [147]:
promo = pd.read_csv('data/promotion.csv')
promo.head()

Unnamed: 0,code,supermarkets,week,feature,display,province
0,2700042240,285,91,Not on Feature,Mid-Aisle End Cap,2
1,2700042292,285,92,Interior Page Feature,Not on Display,2
2,2700042274,285,92,Interior Page Feature,Not on Display,2
3,2700042273,285,92,Interior Page Feature,Not on Display,2
4,2700042254,285,92,Interior Page Feature,Not on Display,2


In [148]:
promo.shape

(351372, 6)

In [149]:
sales = pd.read_csv('data/sales.csv')
sales.head()

Unnamed: 0,code,amount,units,time,province,week,customerId,supermarket,basket,day,voucher
0,7680850106,0.8,1,1100,2,1,125434,244,1,1,0
1,3620000470,3.59,1,1100,2,1,125434,244,1,1,0
2,1800028064,2.25,1,1137,2,1,108320,244,2,1,0
3,9999985067,0.85,1,1148,2,1,162016,244,3,1,0
4,9999985131,2.19,1,1323,2,1,89437,244,4,1,0


In [150]:
sales.shape

(1048575, 11)

In [151]:
supermarkets = pd.read_csv('data/supermarkets.csv')
supermarkets.head()

Unnamed: 0,supermarket_No,postal-code
0,199,30319
1,200,30134
2,201,30066
3,202,31093
4,203,30542


In [152]:
supermarkets.shape

(387, 2)

# Data Cleaning
Data is not in a great format for our purposes
1. ~~Encode categorical variables (brands, types, features, etc.) to not be text~~
2. ~~Are joins needed?~~ I can make any needed joins at the end I think
3. ~~Remove null values~~

In [153]:
items.dropna()
promo.dropna()
sales.dropna()
supermarkets.dropna()

items.shape, promo.shape, sales.shape, supermarkets.shape

((927, 5), (351372, 6), (1048575, 11), (387, 2))

In [154]:
# Encode items categorical variables (type and brand) using label encoding
le = LabelEncoder()
items['type'] = le.fit_transform(items['type'])
items['brand'] = le.fit_transform(items['brand'])
items.head()

Unnamed: 0,code,descrption,type,brand,size
0,3000005040,AUNT JEM ORIGINAL PANCAKE MIX,0,8,2 LB
1,3000005070,A/JEM COMPLETE PANCAKE MI,0,8,32 OZ
2,3000005300,AJ BUTTERMILK PANCAKE MIX,0,8,32 OZ
3,3000005350,A J BTRMLK COMP PNCK MIX,0,8,1 LB
4,1600015760,BC PANCAKE MIX BUTTERMILK,0,15,6.75 OZ


In [155]:
# Enconde promo categorical variables (type and channel) using label encoding
le = LabelEncoder()
promo['feature'] = le.fit_transform(promo['feature'])
promo['display'] = le.fit_transform(promo['display'])
promo.head()

Unnamed: 0,code,supermarkets,week,feature,display,province
0,2700042240,285,91,4,3,2
1,2700042292,285,92,2,4,2
2,2700042274,285,92,2,4,2
3,2700042273,285,92,2,4,2
4,2700042254,285,92,2,4,2


# Item-Item Collaborative Filtering
Generate a sparse matrix where each row is a user and each column represents an item. The value in each column will be the number of times each user bought each item. This is normally done with ratings, but I will use number of times purchased as a rating proxy.

## Split Data for Train and Test

In [156]:
# Split sales data into train and test sets based on date
sorted_sales = sales.sort_values(by='week')

# Split data 80/20 train, test based on week number
split_index = int(len(sorted_sales['week'].unique()) * 0.8)
train_weeks = sorted_sales['week'].unique()[:split_index]
test_weeks = sorted_sales['week'].unique()[split_index:]

# Create train and test datasets
train_data = sorted_sales[sorted_sales['week'].isin(train_weeks)]
test_data = sorted_sales[sorted_sales['week'].isin(test_weeks)]

In [157]:
train_data.head()

Unnamed: 0,code,amount,units,time,province,week,customerId,supermarket,basket,day,voucher
0,7680850106,0.8,1,1100,2,1,125434,244,1,1,0
351429,7680851433,0.8,1,1949,1,1,202867,170,225560,3,0
351430,7680850294,0.8,1,1949,1,1,202867,170,225560,3,0
351431,7130000075,0.99,1,2022,1,1,296714,170,225561,3,0
351432,9999985070,0.89,1,2102,1,1,2264,170,225562,3,0


In [158]:
test_data.head()

Unnamed: 0,code,amount,units,time,province,week,customerId,supermarket,basket,day,voucher
966143,4144900110,2.79,1,1502,2,24,7329,258,613341,167,0
966167,7680850294,0.99,1,1447,2,24,7471,241,613355,167,0
966168,3620000444,2.19,1,1447,2,24,7471,241,613355,167,0
966174,9999985132,0.79,1,1528,2,24,26075,241,613358,167,0
966173,1510000007,0.99,1,1522,2,24,19511,241,613357,167,0


In [159]:
# Get train data week range
train_data['week'].min(), train_data['week'].max()

(np.int64(1), np.int64(23))

In [160]:
# Get test data week range
test_data['week'].min(), test_data['week'].max()

(np.int64(24), np.int64(28))

In [161]:
# Create collaborative filtering matrix
# Pivot sales data to create a user-item matrix
# Rows are customerId, columns are item codes, values are number of units purchased
train_matrix = train_data.pivot_table(index='customerId', columns='code', values='units', aggfunc='sum', fill_value=0)
train_matrix.head()

code,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [162]:
# Create collaborative filtering matrix for test data
test_matrix = test_data.pivot_table(index='customerId', columns='code', values='units', aggfunc='sum', fill_value=0)
test_matrix.head()

code,111112360,566300023,566300028,566300035,601011292,601011293,601011294,601011295,601011296,601011297,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [163]:
# Get user item matrix entries where 111112360 is greater than 0 (just to sanity check that the matrix is correct)
test_matrix[test_matrix[111112360] > 0]

code,111112360,566300023,566300028,566300035,601011292,601011293,601011294,601011295,601011296,601011297,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
175058,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
215056,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
215652,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
364919,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [164]:
train_matrix.shape, test_matrix.shape

((218326, 755), (107500, 685))

# Generate item-item matrix using cosine similarity

In [165]:
# Find similar items in the training set using cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
train_similarity = cosine_similarity(train_matrix.T)
train_similarity_df = pd.DataFrame(train_similarity, index=train_matrix.columns, columns=train_matrix.columns)
train_similarity_df.head()

code,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300023,0.0,1.0,0.032505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002207,0.0,0.0,0.0,0.006011,0.008763,0.0,0.0
566300028,0.0,0.032505,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000393,...,0.001259,0.0,0.001906,0.0,0.0,0.0,0.002884,0.006727,0.0,0.0
566300029,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300035,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
# Get most commonly bought items together with item 111112360 in the training set
train_111112360_similar = train_similarity_df[111112360].sort_values(ascending=False)
train_111112360_similar.head(10)

code
111112360     1.000000
6971911131    0.147442
5100012013    0.054215
9999985006    0.016835
5100001588    0.013857
9999985027    0.010366
5100012911    0.010348
7130000008    0.009225
2700042273    0.006487
5100002794    0.006249
Name: 111112360, dtype: float64

In [169]:
# Find similar items in the test set using cosine similarity
test_similarity = cosine_similarity(test_matrix.T)
test_similarity_df = pd.DataFrame(test_similarity, index=test_matrix.columns, columns=test_matrix.columns)
test_similarity_df.head()

code,111112360,566300023,566300028,566300035,601011292,601011293,601011294,601011295,601011296,601011297,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300023,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300028,0.0,0.0,1.0,0.0,0.010143,0.0,0.0,0.0,0.0,0.0,...,0.006241,0.0,0.0,0.0,0.0,0.0,0.0,0.01308,0.0,0.0
566300035,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
601011292,0.0,0.0,0.010143,0.0,1.0,0.039056,0.016434,0.051033,0.073609,0.072993,...,0.007478,0.007173,0.004933,0.024985,0.0,0.003334,0.001508,0.004644,0.006917,0.0


In [None]:
def item_based_cf(item_code, user):
    # Get similar items from training set
    # Check if item is in training similarity dataframe
    if item_code not in train_similarity_df.columns:
        return pd.Series(dtype=float)
    similar_items = train_similarity_df[item_code].sort_values(ascending=False)
    # Get items the user has already purchased
    # Check if user is in training matrix
    if user not in train_matrix.index:
        return similar_items.head(10)
    user_purchases = train_matrix.loc[user]
    purchased_items = user_purchases[user_purchases > 0].index.tolist()
    # Filter out items the user has already purchased (offset by one to not get the item itself)
    recommended_items = similar_items[~similar_items.index.isin(purchased_items)]
    return recommended_items.head(10)

In [175]:
item_based_cf(111112360, 10001)

code
111112360     1.000000
6971911131    0.147442
5100012013    0.054215
9999985006    0.016835
5100001588    0.013857
9999985027    0.010366
5100012911    0.010348
7130000008    0.009225
2700042273    0.006487
5100002794    0.006249
Name: 111112360, dtype: float64