# Collaborative Filtering
## Quickstart

### Load the Data

In [2]:
import pandas as pd
import numpy as np
from resype.collab_filtering import CollabFilteringModel

%load_ext autoreload
%autoreload 2 

In [3]:
# load transaction list
# Keep only the user / movie / rating columns, draw a small 20-row sample for the
# quickstart, and rename the id columns by name (not by position) to snake_case.
# random_state pins the sample so the notebook is reproducible under
# Restart & Run All; the recorded outputs below came from an unseeded sample and
# will differ until the notebook is re-run.
transaction_list = (
    pd.read_csv("sample_data/ratings.csv")[['userId', 'movieId', 'rating']]
    .sample(n=20, random_state=42)
    .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
)

### Preprocess

In [5]:
# Wrap the sampled transactions in the collaborative-filtering model.
# NOTE(review): the name `re` would shadow the stdlib `re` module if it were ever
# imported in this notebook; every later cell uses `re`, so rename them all together.
re = CollabFilteringModel(transaction_list)
# Peek at the transactions held by the model (last expression is displayed).
re.transaction_list.head(3)

Unnamed: 0,user_id,item_id,rating
58565,381,78266,3.0
18628,119,54272,4.5
35027,234,2123,3.0


In [6]:
# construct utility matrix
# The call's return value is not used here; the matrix is read back from
# `re.utility_matrix`. In the recorded output it has user_id rows and item_id
# columns, with blank (missing) entries where the sample has no rating.
re.construct_utility_matrix()
re.utility_matrix.head(3)

item_id,196,208,266,355,434,586,588,1701,1856,2123,2455,2502,2867,4025,6188,54272,78266,80463,106100,157108
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
18,,,,,,,,,,,,,,,,,,,,4.5
107,,3.0,,,,,,,,,,,,,,,,,,
111,,,,,,,,,4.0,,,,,,,,,,,


### Train on Unclustered Matrix

#### Iterative Approach

In [7]:
# import sklearn Model
from sklearn.ensemble import RandomForestRegressor

# Random forests draw bootstrap samples at random, so the estimator is seeded to
# make the fitted predictions identical on every run of the notebook.
model = RandomForestRegressor(random_state=42)

In [8]:
# fit and predict
# Fit the recommender with the estimator defined above using the 'iterative'
# method, then display the first rows of the predicted utility matrix.
# NOTE(review): every displayed value in the recorded output is 0.0 — presumably an
# artifact of the tiny 20-row sample; confirm the predictions are meaningful on
# the full data.
re.fit(model, method='iterative')
re.utility_matrix_preds.head(3)

item_id,196,208,266,355,434,586,588,1701,1856,2123,2455,2502,2867,4025,6188,54272,78266,80463,106100,157108
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# recommend
# `user_list` holds positional row indices, not user_id values: in the recorded
# output, indices 0, 1, 2 correspond to user_id 18, 107 and 111.
user_list = [0, 1, 2] # indices
# Number of ranked recommendations per user (rank_1 .. rank_10 in the output).
top_n = 10
# The return value is not used here; the result table is read from `re.df_rec`.
re.get_rec(user_list, top_n)
re.df_rec

Unnamed: 0,user_id,rank_1,rank_2,rank_3,rank_4,rank_5,rank_6,rank_7,rank_8,rank_9,rank_10
0,18,2123.0,106100.0,208.0,266.0,355.0,434.0,586.0,588.0,1701.0,1856.0
1,107,157108.0,106100.0,266.0,355.0,434.0,586.0,588.0,1701.0,1856.0,2123.0
2,111,157108.0,106100.0,208.0,266.0,355.0,434.0,586.0,588.0,1701.0,2123.0


### Train on Clustered Matrix

In [10]:
from sklearn.cluster import KMeans

# K-means starts from randomly chosen centroids, so both clusterers are seeded;
# otherwise cluster ids (and every clustered output below) change between runs.
km_users = KMeans(n_clusters=10, random_state=42)
km_items = KMeans(n_clusters=10, random_state=42)

# Each call returns three values. Judging by the names they are the fitted
# clusterer, an id -> cluster mapping and the utility matrix with cluster labels
# attached — TODO confirm against the resype documentation.
user_model, user_cluster_map, util_matrix_w_users = re.cluster_users(km_users)
item_model, item_cluster_map, util_matrix_w_items = re.cluster_items(km_items)

In [11]:
# Assign users/items to clusters, then aggregate the utility matrix per cluster
# using the mean on both the user axis (u_agg) and the item axis (i_agg).
re.cluster_assignment()
re.utility_matrix_agg(u_agg="mean", i_agg="mean")
# Per the recorded output, `re.utility_matrix` is now cluster-level
# (u_cluster rows x i_cluster columns) rather than the user_id x item_id matrix
# displayed earlier.
re.utility_matrix.head(3)

i_cluster,0,1,2,3,4,5,6,7,8,9
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.363636,0.272727,0.0,0.0,0.18595,0.0,0.0,0.0,0.0,0.363636
2,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# RandomForestRegressor is already imported in the unclustered section; the
# repeated import is harmless and is kept as in the original cell.
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so the cluster-level predictions are reproducible.
model1 = RandomForestRegressor(random_state=42)

# Fit on the cluster-level utility matrix with the 'iterative' method.
# NOTE(review): n_synth_data=5 and p=0.1 are passed straight through to
# CollabFilteringModel.fit; their exact meaning is not visible in this notebook —
# document them here once confirmed.
re.fit(model_object=model1, method='iterative', n_synth_data=5, p=0.1)
re.utility_matrix_preds.head(3)

i_cluster,0,1,2,3,4,5,6,7,8,9
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,-0.430173,-0.417114,-0.458333,3.538939,-0.458333,-0.458333,-0.458333,0.209448,-0.230966,-0.417366
1,0.225689,0.13478,-0.080534,-0.088732,0.067354,-0.099382,-0.114118,-0.137948,-0.116099,0.225689
2,-0.486881,-0.522222,4.477778,-0.522222,-0.522222,0.130154,-0.522222,-0.522222,-0.522222,-0.522222


In [13]:
# predict top item clusters per user index
# Same call as the unclustered recommendation, with `re.user_assignment` passed as
# a third argument; here the call is the cell's last expression, so its result is
# displayed directly. The ranks in the recorded output are item-cluster ids (0-9).
user_list = [0, 1, 2] # positional indices
top_n = 5 # top n clusters
re.get_rec(user_list, top_n, re.user_assignment)

Unnamed: 0,user_id,rank_1,rank_2,rank_3,rank_4,rank_5
0,0,2.0,9.0,7.0,6.0,5.0
1,1,2.0,3.0,5.0,6.0,8.0
2,2,5.0,9.0,0.0,1.0,3.0


In [14]:
# predict top items per user_id
# The recorded output lists item ids (not cluster ids) ranked for each user_id,
# so `top_n` counts items here.
top_n = 5 # top n items per user
re.get_rec_item(top_n)

Unnamed: 0,user_id,rank_1,rank_2,rank_3,rank_4,rank_5
0,18,2502.0,588.0,54272.0,106100.0,2455.0
1,107,2502.0,2867.0,2455.0,106100.0,157108.0
2,111,2455.0,588.0,1701.0,78266.0,2867.0
