In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import root_mean_squared_error


# Exploratory Data Analysis

In [2]:
items = pd.read_csv('data/item.csv')
items.head()

Unnamed: 0,code,descrption,type,brand,size
0,3000005040,AUNT JEM ORIGINAL PANCAKE MIX,Type 1,Aunt Jemima,2 LB
1,3000005070,A/JEM COMPLETE PANCAKE MI,Type 1,Aunt Jemima,32 OZ
2,3000005300,AJ BUTTERMILK PANCAKE MIX,Type 1,Aunt Jemima,32 OZ
3,3000005350,A J BTRMLK COMP PNCK MIX,Type 1,Aunt Jemima,1 LB
4,1600015760,BC PANCAKE MIX BUTTERMILK,Type 1,Bisquick,6.75 OZ


In [683]:
# Count distinct values in each column
items.shape

(927, 5)

In [3]:
promo = pd.read_csv('data/promotion.csv')
promo.head()

Unnamed: 0,code,supermarkets,week,feature,display,province
0,2700042240,285,91,Not on Feature,Mid-Aisle End Cap,2
1,2700042292,285,92,Interior Page Feature,Not on Display,2
2,2700042274,285,92,Interior Page Feature,Not on Display,2
3,2700042273,285,92,Interior Page Feature,Not on Display,2
4,2700042254,285,92,Interior Page Feature,Not on Display,2


In [685]:
promo.shape

(351372, 6)

In [2]:
sales = pd.read_csv('data/sales.csv')
sales.head()

Unnamed: 0,code,amount,units,time,province,week,customerId,supermarket,basket,day,voucher
0,7680850106,0.8,1,1100,2,1,125434,244,1,1,0
1,3620000470,3.59,1,1100,2,1,125434,244,1,1,0
2,1800028064,2.25,1,1137,2,1,108320,244,2,1,0
3,9999985067,0.85,1,1148,2,1,162016,244,3,1,0
4,9999985131,2.19,1,1323,2,1,89437,244,4,1,0


In [3]:
sales.shape

(1048575, 11)

In [4]:
# Pivot sales to have basket as rows and code as columns
basket = sales.pivot_table(index='basket', columns='code', values='units', fill_value=0)
basket.head()

code,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
basket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Get baskets where code 111112360 was purchased
basket_111112360 = basket[basket[111112360] > 0]
basket_111112360.head()

code,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
basket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
330724,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
487026,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
522090,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
550468,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
564425,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Normalize all quantity values >=1 to 1
basket_normalized = basket.applymap(lambda x: 1 if x >= 1 else 0)
basket_normalized.head()

  basket_normalized = basket.applymap(lambda x: 1 if x >= 1 else 0)


code,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
basket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Write basket normalized to CSV
basket_normalized.to_csv('data/basket_normalized.csv')

In [3]:
# Load basket normalized from CSV
basket_normalized = pd.read_csv('data/basket_normalized.csv', index_col=0)
basket_normalized.head()

Unnamed: 0_level_0,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
basket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Compute pairwise cosine similarity between basket1 and every other basket
from sklearn.metrics.pairwise import cosine_similarity
basket1 = basket_normalized.iloc[0:1]
similarities = cosine_similarity(basket1, basket_normalized)
similarities_df = pd.DataFrame(similarities.T, index=basket_normalized.index, columns=['similarity'])
similarities_df.head()

Unnamed: 0_level_0,similarity
basket,Unnamed: 1_level_1
1,1.0
2,0.0
3,0.0
4,0.0
5,0.0


In [11]:
# Get top 10 most similar baskets that are less similar than basket1 itself
top_similar_baskets = similarities_df[similarities_df['similarity'] < 0.99].nlargest(10, 'similarity')
top_similar_baskets

Unnamed: 0_level_0,similarity
basket,Unnamed: 1_level_1
129947,0.816497
197019,0.816497
241650,0.816497
314978,0.816497
414093,0.816497
445410,0.816497
565474,0.816497
616834,0.816497
640172,0.816497
254,0.707107


In [688]:
supermarkets = pd.read_csv('data/supermarkets.csv')
supermarkets.head()

Unnamed: 0,supermarket_No,postal-code
0,199,30319
1,200,30134
2,201,30066
3,202,31093
4,203,30542


In [689]:
supermarkets.shape

(387, 2)

# Data Cleaning
Data is not in a great format for our purposes
1. ~~Encode categorical variables (brands, types, features, etc.) to not be text~~
2. ~~Are joins needed?~~ I can make any needed joins at the end I think
3. ~~Remove null values~~

In [690]:
items.dropna()
promo.dropna()
sales.dropna()
supermarkets.dropna()

items.shape, promo.shape, sales.shape, supermarkets.shape

((927, 5), (351372, 6), (1048575, 11), (387, 2))

In [691]:
from sklearn.preprocessing import LabelEncoder

# Encode items categorical variables (type and brand) using label encoding
le = LabelEncoder()
items['type'] = le.fit_transform(items['type'])
items['brand'] = le.fit_transform(items['brand'])
items.head()

Unnamed: 0,code,descrption,type,brand,size
0,3000005040,AUNT JEM ORIGINAL PANCAKE MIX,0,8,2 LB
1,3000005070,A/JEM COMPLETE PANCAKE MI,0,8,32 OZ
2,3000005300,AJ BUTTERMILK PANCAKE MIX,0,8,32 OZ
3,3000005350,A J BTRMLK COMP PNCK MIX,0,8,1 LB
4,1600015760,BC PANCAKE MIX BUTTERMILK,0,15,6.75 OZ


In [692]:
# Enconde promo categorical variables (type and channel) using label encoding
le = LabelEncoder()
promo['feature'] = le.fit_transform(promo['feature'])
promo['display'] = le.fit_transform(promo['display'])
promo.head()

Unnamed: 0,code,supermarkets,week,feature,display,province
0,2700042240,285,91,4,3,2
1,2700042292,285,92,2,4,2
2,2700042274,285,92,2,4,2
3,2700042273,285,92,2,4,2
4,2700042254,285,92,2,4,2


# Item-Item Collaborative Filtering
Generate a sparse matrix where each row is a user and each column represents an item. The value in each column will be the number of times each user bought each item. This is normally done with ratings, but I will use number of times purchased as a rating proxy.

## Split Data for Train and Test

In [693]:
# Split sales data into train and test sets based on date
sorted_sales = sales.sort_values(by='week')

# Split data 80/20 train, test based on week number
split_index = int(len(sorted_sales['week'].unique()) * 0.8)
train_weeks = sorted_sales['week'].unique()[:split_index]
test_weeks = sorted_sales['week'].unique()[split_index:]

# Create train and test datasets
train_data = sorted_sales[sorted_sales['week'].isin(train_weeks)]
test_data = sorted_sales[sorted_sales['week'].isin(test_weeks)]

In [694]:
train_data.head()

Unnamed: 0,code,amount,units,time,province,week,customerId,supermarket,basket,day,voucher
0,7680850106,0.8,1,1100,2,1,125434,244,1,1,0
351429,7680851433,0.8,1,1949,1,1,202867,170,225560,3,0
351430,7680850294,0.8,1,1949,1,1,202867,170,225560,3,0
351431,7130000075,0.99,1,2022,1,1,296714,170,225561,3,0
351432,9999985070,0.89,1,2102,1,1,2264,170,225562,3,0


In [695]:
test_data.head()

Unnamed: 0,code,amount,units,time,province,week,customerId,supermarket,basket,day,voucher
966143,4144900110,2.79,1,1502,2,24,7329,258,613341,167,0
966167,7680850294,0.99,1,1447,2,24,7471,241,613355,167,0
966168,3620000444,2.19,1,1447,2,24,7471,241,613355,167,0
966174,9999985132,0.79,1,1528,2,24,26075,241,613358,167,0
966173,1510000007,0.99,1,1522,2,24,19511,241,613357,167,0


In [696]:
# Get train data week range
train_data['week'].min(), train_data['week'].max()

(np.int64(1), np.int64(23))

In [697]:
# Get test data week range
test_data['week'].min(), test_data['week'].max()

(np.int64(24), np.int64(28))

In [698]:
# Create collaborative filtering matrix
# Pivot sales data to create a user-item matrix
# Rows are customerId, columns are item codes, values are number of units purchased
train_matrix = train_data.pivot_table(index='customerId', columns='code', values='units', aggfunc='sum', fill_value=0)
train_matrix.head()

code,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [699]:
train_item_user_matrix = train_data.pivot_table(index='code', columns='customerId', values='units', aggfunc='sum', fill_value=0)
train_item_user_matrix.head()

customerId,1,2,5,7,9,19,22,23,25,29,...,510011,510013,510015,510016,510018,510019,510020,510021,510022,510023
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [700]:
# Get average number of purchases of a specific item from train_item_user_matrix

item_purchases = train_item_user_matrix.loc[1480071124]
item_purchases.mean()

np.float64(0.004511601916400245)

In [701]:
# Locate items where number of units purchased is greater than 0 for a specific user
train_matrix.loc[1][train_matrix.loc[1] > 0]

code
4420930142    1
9999967727    1
Name: 1, dtype: int64

In [702]:
# Create collaborative filtering matrix for test data
test_matrix = test_data.pivot_table(index='customerId', columns='code', values='units', aggfunc='sum', fill_value=0)
test_matrix.head()

code,111112360,566300023,566300028,566300035,601011292,601011293,601011294,601011295,601011296,601011297,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [703]:
test_item_user_matrix = test_data.pivot_table(index='code', columns='customerId', values='units', aggfunc='sum', fill_value=0)
test_item_user_matrix.head()

customerId,2,6,22,36,47,50,66,84,95,101,...,509983,509995,510001,510007,510008,510011,510015,510016,510018,510027
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
601011292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [704]:
# Get user item matrix entries where 111112360 is greater than 0 (just to sanity check that the matrix is correct)
test_matrix[test_matrix[111112360] > 0]

code,111112360,566300023,566300028,566300035,601011292,601011293,601011294,601011295,601011296,601011297,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
175058,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
215056,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
215652,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
364919,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [705]:
train_matrix.shape, test_matrix.shape

((218326, 755), (107500, 685))

# Generate item-item matrix using cosine similarity

In [706]:
from sklearn.metrics.pairwise import cosine_similarity
# Find similar items in the training set using cosine similarity
train_similarity = cosine_similarity(train_matrix.T)
train_similarity_df = pd.DataFrame(train_similarity, index=train_matrix.columns, columns=train_matrix.columns)
train_similarity_df.head()

code,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300023,0.0,1.0,0.032505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002207,0.0,0.0,0.0,0.006011,0.008763,0.0,0.0
566300028,0.0,0.032505,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000393,...,0.001259,0.0,0.001906,0.0,0.0,0.0,0.002884,0.006727,0.0,0.0
566300029,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300035,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [707]:
# Find similar items in the test set using cosine similarity
test_similarity = cosine_similarity(test_matrix.T)
test_similarity_df = pd.DataFrame(test_similarity, index=test_matrix.columns, columns=test_matrix.columns)
test_similarity_df.head()

code,111112360,566300023,566300028,566300035,601011292,601011293,601011294,601011295,601011296,601011297,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300023,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300028,0.0,0.0,1.0,0.0,0.010143,0.0,0.0,0.0,0.0,0.0,...,0.006241,0.0,0.0,0.0,0.0,0.0,0.0,0.01308,0.0,0.0
566300035,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
601011292,0.0,0.0,0.010143,0.0,1.0,0.039056,0.016434,0.051033,0.073609,0.072993,...,0.007478,0.007173,0.004933,0.024985,0.0,0.003334,0.001508,0.004644,0.006917,0.0


In [708]:
def get_popular_items(train_matrix, n=10):
    item_popularity = (train_matrix > 0).sum(axis=0)
    return item_popularity.sort_values(ascending=False).head(n)

In [709]:
def get_similar_items(item_code, similarity_df=train_similarity_df, n=10):
    if item_code not in similarity_df.columns:
        return get_popular_items(train_matrix)
    
    similar_items = similarity_df[item_code].sort_values(ascending=False)
    similar_items = similar_items.drop(item_code, errors='ignore')
    
    return similar_items.head(n)

In [738]:
def item_based_cf(item_code, user, train_matrix=train_matrix, similarity_df=train_similarity_df):
    # Cold starts
    # If the item or the user does not exist in the training matrix, default to returning 0
    if item_code not in similarity_df.columns:
        return 0
    if user not in train_matrix.index:
        return 0
    
    # Get similar items (excluding the item itself)
    similar_items = get_similar_items(item_code, similarity_df)

    weighted_score = 0
    similarity_score = 0
    # NOTE: This will not cause an error because if the user does not exist in the matrix, we return popular items above
    user_purchases = train_matrix.loc[user]
    item_purchases = train_item_user_matrix.loc[item_code].mean()

    for item in similar_items:
        similarity = item
        weighted_score += similarity * item_purchases
        similarity_score += abs(similarity)

    if similarity_score == 0:
        return 0
    return round(user_purchases.get(item_code, 0) + weighted_score / similarity_score)

In [739]:
user_count = 0
cf_predictions = []
for user in test_matrix.index:
    for item in test_matrix.columns:
        pred = item_based_cf(item, user, train_matrix, train_similarity_df)
        cf_predictions.append({
            'customerId': user,
            'code': item,
            'predicted_units': pred
        })
    user_count += 1
    if user_count == 100:
        break

In [740]:
# Get all predictions for first where predicted_units > 0
cf_results = pd.DataFrame(cf_predictions)
cf_results

Unnamed: 0,customerId,code,predicted_units
0,2,111112360,0
1,2,566300023,0
2,2,566300028,0
3,2,566300035,0
4,2,601011292,0
...,...,...,...
68495,713,9999985217,0
68496,713,9999985260,0
68497,713,9999985261,0
68498,713,9999985488,0


In [741]:
# Get RMSE for collaborative filtering predictions on customers in cf_results
# Get unique users in cf_results
customers_in_cf = cf_results['customerId'].unique()

# Get cf results for customers_in_cf
cf_results_filtered = cf_results[cf_results['customerId'].isin(customers_in_cf)]
merged_customers = pd.merge(cf_results_filtered, test_data, on=['customerId', 'code'], how='left')
merged_customers['units'] = merged_customers['units'].fillna(0)
rmse_cf = root_mean_squared_error(merged_customers['units'], merged_customers['predicted_units'])
rmse_cf


0.12374108790667744

In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-4.1.0.tar.gz (455.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m455.3/455.3 MB[0m [31m15.2 MB/s[0m  [33m0:00:26[0m:00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting py4j<0.10.9.10,>=0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading py4j-0.10.9.9-py2.py3-none-any.whl (203 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (pyproject.toml) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-4.1.0-py2.py3-none-any.whl size=455986285 sha256=0fd2d3af107f90f2c089f86991250c8fbfea7b7cda6285d5e82951c25978c28b
  Stored in directory: /home/jovyan/.cache/pip/wheels/ba/c7/71/d0e2427d6c47e04bbf39b97f2b4dfc16820aa875cfc981a6cd
Successfully built pyspark
Installing collected package

In [1]:
from pyspark.sql import SparkSession

# Initialize Spark
spark = SparkSession.builder \
    .appName("App") \
    .config("spark.driver.memory", "2g") \
    .config("spark.executor.memory", "2g") \
    .getOrCreate()

# Create a small sample DataFrame
data = [("Alice", 34), ("Bob", 45), ("Charlie", 29)]
df = spark.createDataFrame(data, ["Name", "Age"])

df.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 34|
|    Bob| 45|
|Charlie| 29|
+-------+---+



In [2]:
# Read data/sales.csv with spark as a dataframe
sales = spark.read.format('csv') \
    .option('header', 'true') \
    .option('inferSchema', 'true') \
    .load('data/sales.csv')
# display first 5 rows of sales
sales.show(5)


+----------+------+-----+----+--------+----+----------+-----------+------+---+-------+
|      code|amount|units|time|province|week|customerId|supermarket|basket|day|voucher|
+----------+------+-----+----+--------+----+----------+-----------+------+---+-------+
|7680850106|   0.8|    1|1100|       2|   1|    125434|        244|     1|  1|      0|
|3620000470|  3.59|    1|1100|       2|   1|    125434|        244|     1|  1|      0|
|1800028064|  2.25|    1|1137|       2|   1|    108320|        244|     2|  1|      0|
|9999985067|  0.85|    1|1148|       2|   1|    162016|        244|     3|  1|      0|
|9999985131|  2.19|    1|1323|       2|   1|     89437|        244|     4|  1|      0|
+----------+------+-----+----+--------+----+----------+-----------+------+---+-------+
only showing top 5 rows


In [3]:
# Pivot sales to have basket as rows and code as columns
basket = sales.groupBy('basket').pivot('code').sum('units').fillna(0)
basket.show(5)

+------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+--

In [4]:
# Normalize all quantity values >=1 to 1
from pyspark.sql.functions import when, col
basket_normalized = basket.select(
    [col(c) if c == 'basket' else when(col(c) >= 1, 1).otherwise(0).alias(c) for c in basket.columns]
)
basket_normalized.show(5)

+------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+--

In [5]:
# Compute pairwise cosine similarity between all baskets in basket_normalized resulting in an adjacency matrix
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Summarizer
from pyspark.ml.feature import Normalizer   
assembler = VectorAssembler(inputCols=basket_normalized.columns[1:], outputCol="features")
basket_vector = assembler.transform(basket_normalized).select('basket', 'features')
basket_vector.show(5)

+------+--------------------+
|basket|            features|
+------+--------------------+
|   148|   (782,[749],[1.0])|
|  1238|(782,[84,178],[1....|
|  1645|(782,[12,627],[1....|
|  4101|   (782,[254],[1.0])|
|  5300|   (782,[232],[1.0])|
+------+--------------------+
only showing top 5 rows


In [6]:
#Get first row of basket vector , convert to pandas df, and save it to a file
sample = basket_vector.filter(col('basket') == 1)
sample.show()
sample.toPandas().to_csv('data/sample_basket_vector.csv', index=False)

+------+--------------------+
|basket|            features|
+------+--------------------+
|     1|(782,[236,601],[1...|
+------+--------------------+



In [7]:
# Compute pairwise cosine similarity between all baskets
from pyspark.sql.functions import udf, col
from pyspark.sql.types import DoubleType
from math import sqrt

# Define UDF to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    dot_product = float(vec1.dot(vec2))
    norm1 = sqrt(float(vec1.dot(vec1)))
    norm2 = sqrt(float(vec2.dot(vec2)))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot_product / (norm1 * norm2)

cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Create a cross join to compute similarity between all pairs of baskets
basket_vector_1 = basket_vector.withColumnRenamed('basket', 'basket_1').withColumnRenamed('features', 'features_1')
basket_vector_2 = basket_vector.withColumnRenamed('basket', 'basket_2').withColumnRenamed('features', 'features_2')

similarity_matrix = basket_vector_1.crossJoin(basket_vector_2) \
    .withColumn('similarity', cosine_similarity_udf(col('features_1'), col('features_2'))) \
    .select('basket_1', 'basket_2', 'similarity')

similarity_matrix.show(10)


+--------+--------+----------+
|basket_1|basket_2|similarity|
+--------+--------+----------+
|     148|     148|       1.0|
|     148|    1238|       0.0|
|     148|    1645|       0.0|
|     148|    4101|       0.0|
|     148|    5300|       0.0|
|     148|    5518|       0.0|
|     148|    7240|       0.0|
|     148|    7253|       0.0|
|     148|    7880|       0.0|
|     148|    8086|       0.0|
+--------+--------+----------+
only showing top 10 rows


In [9]:
# Save similarity matrix to CSV
similarity_matrix.toPandas().to_csv('data/similarity_matrix.csv', index=False)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.13/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.13/site-packages/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ~~~~~~~~~~~~~~~~~~~~^^
  File "/opt/conda/lib/python3.13/socket.py", line 719, in readinto
    return self._sock.recv_into(b)
           ~~~~~~~~~~~~~~~~~~~~^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [8]:
# Get dimensions of similarity matrix
similarity_matrix.count()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.13/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.13/site-packages/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ~~~~~~~~~~~~~~~~~~~~^^
  File "/opt/conda/lib/python3.13/socket.py", line 719, in readinto
    return self._sock.recv_into(b)
           ~~~~~~~~~~~~~~~~~~~~^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [10]:
# Pivot similarity matrix to have rows and columns be baskets and the asssociated entry be the similarity
similarity_pivot = similarity_matrix.groupBy('basket_1').pivot('basket_2').agg({'similarity': 'first'}).fillna(0)
similarity_pivot.show(5)


ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.13/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.13/site-packages/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ~~~~~~~~~~~~~~~~~~~~^^
  File "/opt/conda/lib/python3.13/socket.py", line 719, in readinto
    return self._sock.recv_into(b)
           ~~~~~~~~~~~~~~~~~~~~^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [10]:
# Pivot in Spark to have baskets as rows and columns, then convert to pandas and save as CSV
adjacency_matrix_spark = similarity_matrix.pivot('basket_1', 'basket_2').agg({'similarity': 'first'})
adjacency_matrix_df = adjacency_matrix_spark.toPandas()
adjacency_matrix_df.to_csv('data/adjacency_matrix.csv')
print(f"Adjacency matrix saved with shape: {adjacency_matrix_df.shape}")
print(adjacency_matrix_df.head(10))

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.13/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.13/site-packages/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ~~~~~~~~~~~~~~~~~~~~^^
  File "/opt/conda/lib/python3.13/socket.py", line 719, in readinto
    return self._sock.recv_into(b)
           ~~~~~~~~~~~~~~~~~~~~^^^
KeyboardInterrupt
ERROR:py4j.clientserver:Exception occurred while shutting down connection
Traceback (most recent call last):
  File "/opt/conda/lib/python3.13/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.13/site-packages/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
   

KeyboardInterrupt: 

In [12]:
print("Spark processing complete.")

Spark processing complete.


In [None]:
# Sample for commit