# Demo: Iterative Approach to ML-based Item-wise Collaborative Filtering Applied to Clustered Data

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../resype')
%load_ext autoreload
%autoreload 2 

## Prepare data

In [2]:
movies_df = pd.DataFrame({'M1': [2, None, 3, None, 1, 5],
                          'M2': [None, 1, None, 5, 1, None],
                          'M3': [5, 4, 1, 1, 4, None],
                          'M4': [5, 4, None, None, None, 1]},
                         index=[f'U{i+1}' for i in range(6)])
movies_df

Unnamed: 0,M1,M2,M3,M4
U1,2.0,,5.0,5.0
U2,,1.0,4.0,4.0
U3,3.0,,1.0,
U4,,5.0,1.0,
U5,1.0,1.0,4.0,
U6,5.0,,,1.0


In [3]:
np.random.seed(202109)
rating_vals = np.hstack([np.arange(1,6), [np.nan]])
rating_vals

array([ 1.,  2.,  3.,  4.,  5., nan])

In [4]:
userids = np.arange(1000)
itemids = np.arange(1000)
random_ratings = np.random.choice(rating_vals, size=len(userids)*len(itemids))

In [5]:
transactions = pd.DataFrame(
    {'user_id': userids.repeat(len(itemids)),
     'item_id': itemids.reshape((-1, 1)).repeat(len(userids), axis=1).T.flatten(),
     'rating': random_ratings}).drop_duplicates()

In [6]:
transactions

Unnamed: 0,user_id,item_id,rating
0,0,0,2.0
1,0,1,
2,0,2,
3,0,3,5.0
4,0,4,4.0
...,...,...,...
999995,999,995,1.0
999996,999,996,3.0
999997,999,997,
999998,999,998,2.0


## Load resype

In [7]:
from resype import Resype

In [8]:
re = Resype(transactions)

In [9]:
utility_matrix = re.construct_utility_matrix()
utility_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.0,,,5.0,4.0,4.0,3.0,4.0,4.0,3.0,...,5.0,1.0,4.0,2.0,3.0,,2.0,,5.0,2.0
1,1.0,,4.0,5.0,3.0,2.0,1.0,3.0,1.0,,...,3.0,3.0,2.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0
2,3.0,4.0,4.0,4.0,2.0,4.0,2.0,4.0,1.0,4.0,...,5.0,4.0,3.0,1.0,,5.0,2.0,2.0,,5.0
3,5.0,2.0,1.0,,2.0,4.0,3.0,3.0,,1.0,...,2.0,,2.0,3.0,5.0,2.0,,5.0,,1.0
4,2.0,1.0,3.0,1.0,2.0,2.0,3.0,1.0,3.0,5.0,...,,3.0,1.0,4.0,4.0,1.0,2.0,1.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,5.0,,,2.0,1.0,5.0,1.0,2.0,2.0,3.0,...,,1.0,5.0,1.0,5.0,2.0,2.0,,3.0,5.0
996,1.0,5.0,5.0,2.0,5.0,4.0,1.0,,1.0,5.0,...,,,1.0,,,,4.0,5.0,1.0,2.0
997,2.0,5.0,,4.0,5.0,4.0,,3.0,,3.0,...,1.0,4.0,1.0,5.0,4.0,3.0,,5.0,2.0,2.0
998,3.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,5.0,1.0,...,3.0,4.0,4.0,5.0,5.0,,4.0,5.0,4.0,4.0


## Cluster data 

In [10]:
from sklearn.cluster import (KMeans, SpectralClustering,
                             AgglomerativeClustering, DBSCAN, OPTICS,
                             cluster_optics_dbscan, Birch)

model1 = KMeans(n_clusters = 15)
model2 = KMeans(n_clusters = 20)

In [11]:
x_u,y_u, df_u  = re.cluster_users(model1)
x_i,y_i, df_i  = re.cluster_items(model2)

## Generate new utility matrix based on clusters

In [12]:
# Running this overwrites the original utility matrix
Uc_df = re.utility_matrix_agg()
Uc_df

i_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,2.473856,2.553536,2.5,2.416667,2.498735,2.485294,2.525882,2.536415,2.44572,2.499222,2.473389,2.510558,2.412745,2.505957,2.518627,2.509635,2.555556,2.516256,2.47208,2.448495
1,2.450192,2.490895,2.470994,2.503831,2.591769,2.470443,2.573793,2.571429,2.668629,2.571976,2.512315,2.575597,2.649425,2.466608,2.455172,2.570749,2.433757,2.488064,2.522489,2.541039
2,2.515723,2.539114,2.550943,2.49109,2.562995,2.358491,2.432453,2.584906,2.462954,2.586703,2.436658,2.474601,2.515723,2.45904,2.507547,2.510085,2.432307,2.532817,2.550041,2.525113
3,2.515748,2.528355,2.514497,2.508749,2.476251,2.491001,2.500157,2.510686,2.514116,2.516935,2.551556,2.51272,2.509974,2.506229,2.565748,2.485474,2.512225,2.49041,2.515919,2.492071
4,2.471688,2.480121,2.502941,2.515759,2.479218,2.572802,2.554038,2.502747,2.521341,2.553266,2.514194,2.48003,2.485897,2.517892,2.4875,2.502321,2.507422,2.466469,2.503135,2.502167
5,2.5,2.522536,2.550642,2.505428,2.484613,2.615764,2.492874,2.502874,2.494253,2.419814,2.501368,2.489832,2.523372,2.48305,2.499425,2.544788,2.508167,2.461735,2.508246,2.523231
6,2.439964,2.486771,2.445161,2.482527,2.462539,2.483871,2.563548,2.53341,2.490952,2.438556,2.496928,2.515509,2.53871,2.47305,2.492339,2.527253,2.4618,2.529225,2.421459,2.554975
7,2.464006,2.44659,2.460149,2.552034,2.530214,2.638833,2.558873,2.477867,2.498798,2.478426,2.580818,2.485374,2.523944,2.543412,2.452113,2.5051,2.497158,2.500662,2.537355,2.472724
8,2.603175,2.507624,2.4,2.47619,2.520737,2.497449,2.562857,2.424745,2.468641,2.495465,2.442177,2.508242,2.382143,2.405515,2.470536,2.529557,2.581454,2.525641,2.541149,2.454225
9,2.691358,2.509613,2.429542,2.439506,2.612903,2.460317,2.606222,2.595238,2.533875,2.465608,2.435979,2.492308,2.504444,2.492546,2.52,2.46092,2.506823,2.488319,2.539614,2.499844


## Train iterative model using `train_model_iterative_cluster`

#### Create model object (load from sklearn)

In [13]:
from sklearn.ensemble import RandomForestRegressor
rs_model1 = RandomForestRegressor(random_state=202109)

#### Train model

In [14]:
re.utility_matrix

i_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,2.473856,2.553536,2.5,2.416667,2.498735,2.485294,2.525882,2.536415,2.44572,2.499222,2.473389,2.510558,2.412745,2.505957,2.518627,2.509635,2.555556,2.516256,2.47208,2.448495
1,2.450192,2.490895,2.470994,2.503831,2.591769,2.470443,2.573793,2.571429,2.668629,2.571976,2.512315,2.575597,2.649425,2.466608,2.455172,2.570749,2.433757,2.488064,2.522489,2.541039
2,2.515723,2.539114,2.550943,2.49109,2.562995,2.358491,2.432453,2.584906,2.462954,2.586703,2.436658,2.474601,2.515723,2.45904,2.507547,2.510085,2.432307,2.532817,2.550041,2.525113
3,2.515748,2.528355,2.514497,2.508749,2.476251,2.491001,2.500157,2.510686,2.514116,2.516935,2.551556,2.51272,2.509974,2.506229,2.565748,2.485474,2.512225,2.49041,2.515919,2.492071
4,2.471688,2.480121,2.502941,2.515759,2.479218,2.572802,2.554038,2.502747,2.521341,2.553266,2.514194,2.48003,2.485897,2.517892,2.4875,2.502321,2.507422,2.466469,2.503135,2.502167
5,2.5,2.522536,2.550642,2.505428,2.484613,2.615764,2.492874,2.502874,2.494253,2.419814,2.501368,2.489832,2.523372,2.48305,2.499425,2.544788,2.508167,2.461735,2.508246,2.523231
6,2.439964,2.486771,2.445161,2.482527,2.462539,2.483871,2.563548,2.53341,2.490952,2.438556,2.496928,2.515509,2.53871,2.47305,2.492339,2.527253,2.4618,2.529225,2.421459,2.554975
7,2.464006,2.44659,2.460149,2.552034,2.530214,2.638833,2.558873,2.477867,2.498798,2.478426,2.580818,2.485374,2.523944,2.543412,2.452113,2.5051,2.497158,2.500662,2.537355,2.472724
8,2.603175,2.507624,2.4,2.47619,2.520737,2.497449,2.562857,2.424745,2.468641,2.495465,2.442177,2.508242,2.382143,2.405515,2.470536,2.529557,2.581454,2.525641,2.541149,2.454225
9,2.691358,2.509613,2.429542,2.439506,2.612903,2.460317,2.606222,2.595238,2.533875,2.465608,2.435979,2.492308,2.504444,2.492546,2.52,2.46092,2.506823,2.488319,2.539614,2.499844


In [15]:
%%time
utility_matrix_imputed = re.train_model_iterative_cluster(
    re.utility_matrix, rs_model1)

CPU times: user 3min 5s, sys: 3.97 s, total: 3min 9s
Wall time: 3min 11s


#### Prediction

In [16]:
utility_matrix_imputed

i_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-0.013074,0.047755,-0.000197,-0.063121,0.008185,-0.010274,0.026295,0.03364,-0.037737,0.00711,-0.01518,0.017409,-0.062834,0.006098,0.019071,0.013096,0.049219,0.018724,-0.017654,-0.034735
1,-0.05813,-0.031214,-0.048271,-0.018687,0.055575,-0.04846,0.041292,0.041684,0.104034,0.033784,-0.012668,0.04062,0.092205,-0.045471,-0.06171,0.031975,-0.081858,-0.031116,-0.006979,0.011517
2,0.015007,0.029299,0.036348,-0.011963,0.047749,-0.107504,-0.051081,0.065343,-0.030539,0.062338,-0.052328,-0.018536,0.011183,-0.036614,0.00437,0.005215,-0.050679,0.023354,0.037037,0.015773
3,1e-05,0.013241,0.003283,-0.001857,-0.024702,-0.009766,-0.003744,0.003335,-0.001498,0.004976,0.028562,0.003296,-0.00333,-0.002897,0.038695,-0.020114,0.001203,-0.017379,0.003992,-0.010168
4,-0.02945,-0.021991,-0.009648,0.007248,-0.013089,0.052217,0.040277,-0.001234,0.01022,0.036666,0.006853,-0.013776,-0.017351,0.010696,-0.018043,-0.004046,-0.000745,-0.032726,-0.002449,-0.003915
5,-0.00603,0.012043,0.035477,-0.0006,-0.015259,0.072615,-0.009786,0.000763,-0.010416,-0.060482,-0.00304,-0.010555,0.012108,-0.018093,-0.005646,0.027322,-0.000689,-0.035961,0.002342,0.014981
6,-0.041004,-0.005529,-0.039716,-0.009789,-0.020175,-0.00902,0.062339,0.035669,-0.003145,-0.038011,0.000964,0.021474,0.032381,-0.01657,-0.003988,0.025014,-0.027033,0.027106,-0.053888,0.05035
7,-0.028728,-0.051517,-0.040756,0.037892,0.015321,0.099532,0.041812,-0.021972,-0.008818,-0.023684,0.049692,-0.013322,0.007654,0.026616,-0.051286,-0.005679,-0.010948,-0.011068,0.020483,-0.027615
8,0.09606,0.012174,-0.070912,-0.014624,0.027912,0.004316,0.057797,-0.039464,-0.016927,0.002212,-0.037594,0.015372,-0.091025,-0.065691,-0.016311,0.029176,0.070333,0.022332,0.043183,-0.030567
9,0.140798,-0.004173,-0.069121,-0.057765,0.081195,-0.041453,0.072236,0.061352,0.014553,-0.030646,-0.067015,-0.012377,-0.006969,-0.020119,0.001923,-0.038655,-0.005164,-0.01967,0.020513,-0.011848


## Train iterative model using `fit`

#### Train model

In [17]:
re.fit(rs_model1, method='iterative')

#### Prediction

In [18]:
re.utility_matrix_preds

i_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-0.018426,0.061254,-0.002642,-0.060105,0.010529,-0.013969,0.0336,0.035783,-0.046562,0.00924,-0.017462,0.017931,-0.079537,0.003223,0.009507,0.017353,0.063274,0.023974,-0.020202,-0.043787
1,-0.050972,-0.033474,-0.042363,-0.022855,0.051226,-0.050492,0.047107,0.0255,0.110782,0.02879,-0.01635,0.043333,0.070625,-0.060078,-0.055429,0.044063,-0.071287,-0.033504,-0.002171,0.012005
2,0.013615,0.019485,0.048835,-0.011018,0.05375,-0.143618,-0.069655,0.082797,-0.031042,0.084594,-0.065451,-0.002496,0.013615,-0.037577,0.005439,0.006994,-0.054074,0.030709,0.047933,0.023005
3,0.005788,0.018396,0.004538,-0.001211,-0.033709,-0.018958,-0.009802,0.003571,-0.004734,0.006976,0.030922,0.00276,1.4e-05,-0.00373,0.041779,-0.018234,0.002266,-0.01955,0.00596,-0.017889
4,-0.035198,-0.022608,-0.013008,0.008872,-0.00122,0.060041,0.047152,-0.004139,0.014455,0.04638,0.007308,-0.026857,-0.00864,0.011006,-0.019463,-0.004565,-0.002115,-0.040417,-0.001059,-0.004719
5,-0.005833,0.016704,0.044809,-0.000405,-0.02122,0.083547,-0.008647,0.00208,-0.01158,-0.086019,-0.003845,-0.016001,0.012235,-0.01803,-0.005361,0.038955,-0.001931,-0.044098,0.002413,0.017398
6,-0.051695,-0.004888,-0.046497,-0.009132,-0.020055,-0.015241,0.061003,0.041751,-0.000707,-0.036782,-0.003561,0.02385,0.047051,-0.018608,-0.00719,0.035594,-0.029859,0.025317,-0.0702,0.063316
7,-0.047664,-0.06508,-0.051521,0.040365,0.014529,0.127163,0.045979,-0.023769,-0.012872,-0.027041,0.052063,-0.026296,0.008801,0.031743,-0.032082,-0.00657,-0.014511,-0.009527,0.025685,-0.02457
8,0.111896,0.011342,-0.091279,-0.012836,0.029459,0.00617,0.071579,-0.02067,-0.022637,0.001164,-0.049102,0.018687,-0.109136,-0.068049,-0.014,0.023109,0.090175,0.034362,0.049871,-0.037053
9,0.176496,-0.005104,-0.073401,-0.075356,0.098041,-0.054545,0.079797,0.080376,0.019013,-0.049254,-0.078883,-0.001336,-0.014304,-0.019342,0.005138,-0.053942,-0.008039,-0.026543,0.022803,-0.015019
