In [311]:
import pandas as pd
import numpy as np
import sklearn
import math


# Exploratory Data Analysis

In [199]:
items = pd.read_csv('data/item.csv')
items.head()

Unnamed: 0,code,descrption,type,brand,size
0,3000005040,AUNT JEM ORIGINAL PANCAKE MIX,Type 1,Aunt Jemima,2 LB
1,3000005070,A/JEM COMPLETE PANCAKE MI,Type 1,Aunt Jemima,32 OZ
2,3000005300,AJ BUTTERMILK PANCAKE MIX,Type 1,Aunt Jemima,32 OZ
3,3000005350,A J BTRMLK COMP PNCK MIX,Type 1,Aunt Jemima,1 LB
4,1600015760,BC PANCAKE MIX BUTTERMILK,Type 1,Bisquick,6.75 OZ


In [200]:
# Count distinct values in each column
items.shape

(927, 5)

In [201]:
promo = pd.read_csv('data/promotion.csv')
promo.head()

Unnamed: 0,code,supermarkets,week,feature,display,province
0,2700042240,285,91,Not on Feature,Mid-Aisle End Cap,2
1,2700042292,285,92,Interior Page Feature,Not on Display,2
2,2700042274,285,92,Interior Page Feature,Not on Display,2
3,2700042273,285,92,Interior Page Feature,Not on Display,2
4,2700042254,285,92,Interior Page Feature,Not on Display,2


In [202]:
promo.shape

(351372, 6)

In [203]:
sales = pd.read_csv('data/sales.csv')
sales.head()

Unnamed: 0,code,amount,units,time,province,week,customerId,supermarket,basket,day,voucher
0,7680850106,0.8,1,1100,2,1,125434,244,1,1,0
1,3620000470,3.59,1,1100,2,1,125434,244,1,1,0
2,1800028064,2.25,1,1137,2,1,108320,244,2,1,0
3,9999985067,0.85,1,1148,2,1,162016,244,3,1,0
4,9999985131,2.19,1,1323,2,1,89437,244,4,1,0


In [204]:
sales.shape

(1048575, 11)

In [205]:
supermarkets = pd.read_csv('data/supermarkets.csv')
supermarkets.head()

Unnamed: 0,supermarket_No,postal-code
0,199,30319
1,200,30134
2,201,30066
3,202,31093
4,203,30542


In [206]:
supermarkets.shape

(387, 2)

# Data Cleaning
Data is not in a great format for our purposes
1. ~~Encode categorical variables (brands, types, features, etc.) to not be text~~
2. ~~Are joins needed?~~ I can make any needed joins at the end I think
3. ~~Remove null values~~

In [207]:
items.dropna()
promo.dropna()
sales.dropna()
supermarkets.dropna()

items.shape, promo.shape, sales.shape, supermarkets.shape

((927, 5), (351372, 6), (1048575, 11), (387, 2))

In [208]:
from sklearn.preprocessing import LabelEncoder

# Encode items categorical variables (type and brand) using label encoding
le = LabelEncoder()
items['type'] = le.fit_transform(items['type'])
items['brand'] = le.fit_transform(items['brand'])
items.head()

Unnamed: 0,code,descrption,type,brand,size
0,3000005040,AUNT JEM ORIGINAL PANCAKE MIX,0,8,2 LB
1,3000005070,A/JEM COMPLETE PANCAKE MI,0,8,32 OZ
2,3000005300,AJ BUTTERMILK PANCAKE MIX,0,8,32 OZ
3,3000005350,A J BTRMLK COMP PNCK MIX,0,8,1 LB
4,1600015760,BC PANCAKE MIX BUTTERMILK,0,15,6.75 OZ


In [209]:
# Enconde promo categorical variables (type and channel) using label encoding
le = LabelEncoder()
promo['feature'] = le.fit_transform(promo['feature'])
promo['display'] = le.fit_transform(promo['display'])
promo.head()

Unnamed: 0,code,supermarkets,week,feature,display,province
0,2700042240,285,91,4,3,2
1,2700042292,285,92,2,4,2
2,2700042274,285,92,2,4,2
3,2700042273,285,92,2,4,2
4,2700042254,285,92,2,4,2


# Item-Item Collaborative Filtering
Generate a sparse matrix where each row is a user and each column represents an item. The value in each column will be the number of times each user bought each item. This is normally done with ratings, but I will use number of times purchased as a rating proxy.

## Split Data for Train and Test

In [210]:
# Split sales data into train and test sets based on date
sorted_sales = sales.sort_values(by='week')

# Split data 80/20 train, test based on week number
split_index = int(len(sorted_sales['week'].unique()) * 0.8)
train_weeks = sorted_sales['week'].unique()[:split_index]
test_weeks = sorted_sales['week'].unique()[split_index:]

# Create train and test datasets
train_data = sorted_sales[sorted_sales['week'].isin(train_weeks)]
test_data = sorted_sales[sorted_sales['week'].isin(test_weeks)]

In [211]:
train_data.head()

Unnamed: 0,code,amount,units,time,province,week,customerId,supermarket,basket,day,voucher
0,7680850106,0.8,1,1100,2,1,125434,244,1,1,0
351429,7680851433,0.8,1,1949,1,1,202867,170,225560,3,0
351430,7680850294,0.8,1,1949,1,1,202867,170,225560,3,0
351431,7130000075,0.99,1,2022,1,1,296714,170,225561,3,0
351432,9999985070,0.89,1,2102,1,1,2264,170,225562,3,0


In [212]:
test_data.head()

Unnamed: 0,code,amount,units,time,province,week,customerId,supermarket,basket,day,voucher
966143,4144900110,2.79,1,1502,2,24,7329,258,613341,167,0
966167,7680850294,0.99,1,1447,2,24,7471,241,613355,167,0
966168,3620000444,2.19,1,1447,2,24,7471,241,613355,167,0
966174,9999985132,0.79,1,1528,2,24,26075,241,613358,167,0
966173,1510000007,0.99,1,1522,2,24,19511,241,613357,167,0


In [213]:
# Get train data week range
train_data['week'].min(), train_data['week'].max()

(np.int64(1), np.int64(23))

In [214]:
# Get test data week range
test_data['week'].min(), test_data['week'].max()

(np.int64(24), np.int64(28))

In [215]:
# Create collaborative filtering matrix
# Pivot sales data to create a user-item matrix
# Rows are customerId, columns are item codes, values are number of units purchased
train_matrix = train_data.pivot_table(index='customerId', columns='code', values='units', aggfunc='sum', fill_value=0)
train_matrix.head()

code,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [295]:
train_item_user_matrix = train_data.pivot_table(index='code', columns='customerId', values='units', aggfunc='sum', fill_value=0)
train_item_user_matrix.head()

customerId,1,2,5,7,9,19,22,23,25,29,...,510011,510013,510015,510016,510018,510019,510020,510021,510022,510023
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300029,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [316]:
# Get average number of purchases of a specific item from train_item_user_matrix

item_purchases = train_item_user_matrix.loc[1480071124]
item_purchases.mean()

np.float64(0.004511601916400245)

In [248]:
# Locate items where number of units purchased is greater than 0 for a specific user
train_matrix.loc[1][train_matrix.loc[1] > 0]

code
4420930142    1
9999967727    1
Name: 1, dtype: int64

In [216]:
# Create collaborative filtering matrix for test data
test_matrix = test_data.pivot_table(index='customerId', columns='code', values='units', aggfunc='sum', fill_value=0)
test_matrix.head()

code,111112360,566300023,566300028,566300035,601011292,601011293,601011294,601011295,601011296,601011297,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [296]:
test_item_user_matrix = test_data.pivot_table(index='code', columns='customerId', values='units', aggfunc='sum', fill_value=0)
test_item_user_matrix.head()

customerId,2,6,22,36,47,50,66,84,95,101,...,509983,509995,510001,510007,510008,510011,510015,510016,510018,510027
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300028,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
566300035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
601011292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [217]:
# Get user item matrix entries where 111112360 is greater than 0 (just to sanity check that the matrix is correct)
test_matrix[test_matrix[111112360] > 0]

code,111112360,566300023,566300028,566300035,601011292,601011293,601011294,601011295,601011296,601011297,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
customerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
175058,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
215056,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
215652,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
364919,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [218]:
train_matrix.shape, test_matrix.shape

((218326, 755), (107500, 685))

# Generate item-item matrix using cosine similarity

In [219]:
from sklearn.metrics.pairwise import cosine_similarity
# Find similar items in the training set using cosine similarity
train_similarity = cosine_similarity(train_matrix.T)
train_similarity_df = pd.DataFrame(train_similarity, index=train_matrix.columns, columns=train_matrix.columns)
train_similarity_df.head()

code,111112360,566300023,566300028,566300029,566300035,601011292,601011293,601011294,601011295,601011296,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300023,0.0,1.0,0.032505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.002207,0.0,0.0,0.0,0.006011,0.008763,0.0,0.0
566300028,0.0,0.032505,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000393,...,0.001259,0.0,0.001906,0.0,0.0,0.0,0.002884,0.006727,0.0,0.0
566300029,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300035,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [220]:
# Get most commonly bought items together with item 111112360 in the training set
train_111112360_similar = train_similarity_df[111112360].sort_values(ascending=False)
train_111112360_similar.head(10)

code
111112360     1.000000
6971911131    0.147442
5100012013    0.054215
9999985006    0.016835
5100001588    0.013857
9999985027    0.010366
5100012911    0.010348
7130000008    0.009225
2700042273    0.006487
5100002794    0.006249
Name: 111112360, dtype: float64

In [221]:
# Find similar items in the test set using cosine similarity
test_similarity = cosine_similarity(test_matrix.T)
test_similarity_df = pd.DataFrame(test_similarity, index=test_matrix.columns, columns=test_matrix.columns)
test_similarity_df.head()

code,111112360,566300023,566300028,566300035,601011292,601011293,601011294,601011295,601011296,601011297,...,9999985134,9999985137,9999985165,9999985215,9999985216,9999985217,9999985260,9999985261,9999985488,9999985766
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111112360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300023,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
566300028,0.0,0.0,1.0,0.0,0.010143,0.0,0.0,0.0,0.0,0.0,...,0.006241,0.0,0.0,0.0,0.0,0.0,0.0,0.01308,0.0,0.0
566300035,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
601011292,0.0,0.0,0.010143,0.0,1.0,0.039056,0.016434,0.051033,0.073609,0.072993,...,0.007478,0.007173,0.004933,0.024985,0.0,0.003334,0.001508,0.004644,0.006917,0.0


In [222]:
def get_popular_items(train_matrix, n=10):
    item_popularity = (train_matrix > 0).sum(axis=0)
    return item_popularity.sort_values(ascending=False).head(n)

In [255]:
def get_similar_items(item_code, similarity_df=train_similarity_df):
    if item_code not in similarity_df.columns:
        return get_popular_items(train_matrix)
    
    similar_items = similarity_df[item_code].sort_values(ascending=False)
    similar_items = similar_items.drop(item_code, errors='ignore')
    
    return similar_items

In [268]:
# Test similar items function
test = get_similar_items(111112360)
type(test)

pandas.core.series.Series

In [320]:
def item_based_cf(item_code, user, train_matrix=train_matrix, similarity_df=train_similarity_df):
    # Cold starts
    # If the item or the user does not exist in the training matrix, default to returning 0
    if item_code not in similarity_df.columns:
        print("Item not found in similarity dataframe")
        return 0
    if user not in train_matrix.index:
        print("User not found in training matrix")
        return 0
    
    # Get similar items (excluding the item itself)
    similar_items = get_similar_items(item_code, similarity_df)

    weighted_score = 0
    similarity_score = 0
    # NOTE: This will not cause an error because if the user does not exist in the matrix, we return popular items above
    user_purchases = train_matrix.loc[user]
    item_purchases = train_item_user_matrix.loc[item_code].mean()

    for item in similar_items:
        similarity = item
        weighted_score += similarity * item_purchases
        similarity_score += abs(similarity)

    if similarity_score == 0:
        return 0
    return user_purchases.get(item_code, 0) + weighted_score / similarity_score

    
    # # Filter by user's purchase history if user exists
    # if user in train_matrix.index:
    #     user_purchases = train_matrix.loc[user]
    #     purchased_items = user_purchases[user_purchases > 0].index.tolist()
    #     similar_items = similar_items[~similar_items.index.isin(purchased_items)]
    #     similar_items = similar_items.drop(item_code, errors='ignore')
    
    # top_items = similar_items.head(n)

    # # Filter by minimum similarity. If there are less than n items above the threshold return less than n
    # top_items = top_items[top_items >= min_similarity]
    
    # return pd.DataFrame({
    #     'item_code': top_items.index,
    #     'similarity': top_items.values,
    #     'reason': 'item_similarity'
    # })
# return average number of purchases per user for the item + weighted_score/similarity_score (if similarity_score > 0 else 0) and round to nearest integer

In [321]:
user_purchases = train_matrix.loc[2]
user_purchases.get(1480071124, 0)

np.int64(1)

In [322]:
# test item_based_cf function witha  valid user and item

item_based_cf(1480071124, 2, train_matrix, train_similarity_df)

np.float64(1.0045116019164002)

In [323]:
cf_predictions = []
for user in test_matrix.index:
    for item in test_matrix.columns:
        pred = item_based_cf(item, user, train_matrix, train_similarity_df)
        cf_predictions.append({
            'customerId': user,
            'code': item,
            'predicted_units': pred
        })
    break

Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similarity dataframe
Item not found in similar

In [None]:
# Get all predictions for first where predicted_units > 0
cf_results = pd.DataFrame(cf_predictions)
cf_results[cf_results['predicted_units'] >= 1]

Unnamed: 0,customerId,code,predicted_units
0,2,111112360,0.000009
1,2,566300023,0.000128
2,2,566300028,0.001310
3,2,566300035,0.000027
4,2,601011292,0.014089
...,...,...,...
680,2,9999985217,0.001782
681,2,9999985260,0.012261
682,2,9999985261,0.019888
683,2,9999985488,0.001704


In [286]:
# Get list of items purchased by user 2
train_matrix.loc[2][train_matrix.loc[2] > 0]

code
1480071124    1
2409407012    1
3000005040    1
3340060980    1
3340061283    1
3620000470    1
3620000474    1
4112907763    1
4300000039    1
5100002511    2
5100005044    2
7680851708    1
9999985093    1
Name: 2, dtype: int64