# Demo: Iterative Approach to ML-based Item-wise Collaborative Filtering Applied to Clustered Data

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../resype')
%load_ext autoreload
%autoreload 2 

## Prepare data

In [3]:
# Seed NumPy's global RNG so the synthetic ratings drawn below are repeatable.
np.random.seed(202109)

# Pool of values to sample from: the ratings 1-5 plus NaN, which shows up as
# a blank (missing) rating in the tables further down.
rating_vals = np.append(np.arange(1, 6), np.nan)
rating_vals

array([ 1.,  2.,  3.,  4.,  5., nan])

In [4]:
# Synthetic id spaces: 1000 users and 1000 items, both numbered 0..999.
userids = np.arange(1000)
itemids = np.arange(1000)

# One random draw from rating_vals for every (user, item) combination.
random_ratings = np.random.choice(
    rating_vals,
    size=userids.size * itemids.size,
)

In [5]:
# Long-format ratings table with one row per (user_id, item_id) pair:
#   user_id -> each user id repeated once per item (0,0,...,1,1,...)
#   item_id -> the full item id range repeated once per user (0,1,...,0,1,...)
# Every (user_id, item_id) pair is unique by construction, so
# drop_duplicates() is not expected to remove any rows here.
transactions = pd.DataFrame({
    'user_id': np.repeat(userids, len(itemids)),
    'item_id': np.tile(itemids, len(userids)),
    'rating': random_ratings,
}).drop_duplicates()

In [6]:
# Preview the long-format frame (columns: user_id, item_id, rating);
# blank rating cells in the output are NaN draws.
transactions

Unnamed: 0,user_id,item_id,rating
0,0,0,2.0
1,0,1,
2,0,2,
3,0,3,5.0
4,0,4,4.0
...,...,...,...
999995,999,995,1.0
999996,999,996,3.0
999997,999,997,
999998,999,998,2.0


## Load resype

In [7]:
from resype import Resype

In [8]:
# Wrap the transactions frame in a Resype object; every later step in this
# notebook goes through this instance.
# NOTE(review): the name `re` shadows Python's standard-library `re` module in
# this kernel. Renaming it would require updating every later cell that uses it.
re = Resype(transactions)

In [9]:
# Build the user x item utility matrix from the transactions
# (rows are indexed by user_id, columns by item_id; blanks are missing ratings).
utility_matrix = re.construct_utility_matrix()
utility_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2.0,,,5.0,4.0,4.0,3.0,4.0,4.0,3.0,...,5.0,1.0,4.0,2.0,3.0,,2.0,,5.0,2.0
1,1.0,,4.0,5.0,3.0,2.0,1.0,3.0,1.0,,...,3.0,3.0,2.0,4.0,4.0,3.0,4.0,4.0,3.0,4.0
2,3.0,4.0,4.0,4.0,2.0,4.0,2.0,4.0,1.0,4.0,...,5.0,4.0,3.0,1.0,,5.0,2.0,2.0,,5.0
3,5.0,2.0,1.0,,2.0,4.0,3.0,3.0,,1.0,...,2.0,,2.0,3.0,5.0,2.0,,5.0,,1.0
4,2.0,1.0,3.0,1.0,2.0,2.0,3.0,1.0,3.0,5.0,...,,3.0,1.0,4.0,4.0,1.0,2.0,1.0,2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,5.0,,,2.0,1.0,5.0,1.0,2.0,2.0,3.0,...,,1.0,5.0,1.0,5.0,2.0,2.0,,3.0,5.0
996,1.0,5.0,5.0,2.0,5.0,4.0,1.0,,1.0,5.0,...,,,1.0,,,,4.0,5.0,1.0,2.0
997,2.0,5.0,,4.0,5.0,4.0,,3.0,,3.0,...,1.0,4.0,1.0,5.0,4.0,3.0,,5.0,2.0,2.0
998,3.0,2.0,2.0,1.0,1.0,4.0,2.0,1.0,5.0,1.0,...,3.0,4.0,4.0,5.0,5.0,,4.0,5.0,4.0,4.0


## Cluster data 

In [10]:
from sklearn.cluster import (KMeans, SpectralClustering,
                             AgglomerativeClustering, DBSCAN, OPTICS,
                             cluster_optics_dbscan, Birch)

# Clustering models: model1 (15 clusters) is passed to re.cluster_users and
# model2 (20 clusters) to re.cluster_items in the next cell.
# random_state is pinned so the cluster assignments do not depend on how much
# of NumPy's global RNG stream was consumed before this cell ran; re-running
# the cell then gives the same clusters. The seed value matches the one used
# for np.random.seed and RandomForestRegressor elsewhere in this notebook.
model1 = KMeans(n_clusters=15, random_state=202109)
model2 = KMeans(n_clusters=20, random_state=202109)

In [11]:
# Cluster the users with model1 and the items with model2.
# Each call returns a 3-tuple; only the Resype object's internal state is
# used by the following cells, the unpacked values are kept for inspection.
# NOTE(review): the exact contents of (x, y, df) are defined by Resype -
# confirm against its documentation.
x_u, y_u, df_u = re.cluster_users(model1)
x_i, y_i, df_i = re.cluster_items(model2)

## Generate new utility matrix based on clusters

In [12]:
# Aggregate the utility matrix to cluster level (u_cluster x i_cluster).
# Side effect: running this overwrites the original user x item utility matrix
# stored on the Resype object, so `re.utility_matrix` afterwards refers to the
# aggregated matrix.
Uc_df = re.utility_matrix_agg()
Uc_df

i_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,2.473856,2.553536,2.5,2.416667,2.498735,2.485294,2.525882,2.536415,2.44572,2.499222,2.473389,2.510558,2.412745,2.505957,2.518627,2.509635,2.555556,2.516256,2.47208,2.448495
1,2.450192,2.490895,2.470994,2.503831,2.591769,2.470443,2.573793,2.571429,2.668629,2.571976,2.512315,2.575597,2.649425,2.466608,2.455172,2.570749,2.433757,2.488064,2.522489,2.541039
2,2.515723,2.539114,2.550943,2.49109,2.562995,2.358491,2.432453,2.584906,2.462954,2.586703,2.436658,2.474601,2.515723,2.45904,2.507547,2.510085,2.432307,2.532817,2.550041,2.525113
3,2.515748,2.528355,2.514497,2.508749,2.476251,2.491001,2.500157,2.510686,2.514116,2.516935,2.551556,2.51272,2.509974,2.506229,2.565748,2.485474,2.512225,2.49041,2.515919,2.492071
4,2.471688,2.480121,2.502941,2.515759,2.479218,2.572802,2.554038,2.502747,2.521341,2.553266,2.514194,2.48003,2.485897,2.517892,2.4875,2.502321,2.507422,2.466469,2.503135,2.502167
5,2.5,2.522536,2.550642,2.505428,2.484613,2.615764,2.492874,2.502874,2.494253,2.419814,2.501368,2.489832,2.523372,2.48305,2.499425,2.544788,2.508167,2.461735,2.508246,2.523231
6,2.439964,2.486771,2.445161,2.482527,2.462539,2.483871,2.563548,2.53341,2.490952,2.438556,2.496928,2.515509,2.53871,2.47305,2.492339,2.527253,2.4618,2.529225,2.421459,2.554975
7,2.464006,2.44659,2.460149,2.552034,2.530214,2.638833,2.558873,2.477867,2.498798,2.478426,2.580818,2.485374,2.523944,2.543412,2.452113,2.5051,2.497158,2.500662,2.537355,2.472724
8,2.603175,2.507624,2.4,2.47619,2.520737,2.497449,2.562857,2.424745,2.468641,2.495465,2.442177,2.508242,2.382143,2.405515,2.470536,2.529557,2.581454,2.525641,2.541149,2.454225
9,2.691358,2.509613,2.429542,2.439506,2.612903,2.460317,2.606222,2.595238,2.533875,2.465608,2.435979,2.492308,2.504444,2.492546,2.52,2.46092,2.506823,2.488319,2.539614,2.499844


## Train iterative model using `train_model_iterative_cluster`

#### Create model object (imported from scikit-learn)

In [13]:
from sklearn.ensemble import RandomForestRegressor
# Regressor handed to Resype for the iterative training below; the fixed
# random_state keeps the forest reproducible between runs.
rs_model1 = RandomForestRegressor(random_state=202109)

#### Train model

In [14]:
# Inspect the matrix that will be used for training: after
# utility_matrix_agg() this is the cluster-level (u_cluster x i_cluster) matrix.
re.utility_matrix

i_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,2.473856,2.553536,2.5,2.416667,2.498735,2.485294,2.525882,2.536415,2.44572,2.499222,2.473389,2.510558,2.412745,2.505957,2.518627,2.509635,2.555556,2.516256,2.47208,2.448495
1,2.450192,2.490895,2.470994,2.503831,2.591769,2.470443,2.573793,2.571429,2.668629,2.571976,2.512315,2.575597,2.649425,2.466608,2.455172,2.570749,2.433757,2.488064,2.522489,2.541039
2,2.515723,2.539114,2.550943,2.49109,2.562995,2.358491,2.432453,2.584906,2.462954,2.586703,2.436658,2.474601,2.515723,2.45904,2.507547,2.510085,2.432307,2.532817,2.550041,2.525113
3,2.515748,2.528355,2.514497,2.508749,2.476251,2.491001,2.500157,2.510686,2.514116,2.516935,2.551556,2.51272,2.509974,2.506229,2.565748,2.485474,2.512225,2.49041,2.515919,2.492071
4,2.471688,2.480121,2.502941,2.515759,2.479218,2.572802,2.554038,2.502747,2.521341,2.553266,2.514194,2.48003,2.485897,2.517892,2.4875,2.502321,2.507422,2.466469,2.503135,2.502167
5,2.5,2.522536,2.550642,2.505428,2.484613,2.615764,2.492874,2.502874,2.494253,2.419814,2.501368,2.489832,2.523372,2.48305,2.499425,2.544788,2.508167,2.461735,2.508246,2.523231
6,2.439964,2.486771,2.445161,2.482527,2.462539,2.483871,2.563548,2.53341,2.490952,2.438556,2.496928,2.515509,2.53871,2.47305,2.492339,2.527253,2.4618,2.529225,2.421459,2.554975
7,2.464006,2.44659,2.460149,2.552034,2.530214,2.638833,2.558873,2.477867,2.498798,2.478426,2.580818,2.485374,2.523944,2.543412,2.452113,2.5051,2.497158,2.500662,2.537355,2.472724
8,2.603175,2.507624,2.4,2.47619,2.520737,2.497449,2.562857,2.424745,2.468641,2.495465,2.442177,2.508242,2.382143,2.405515,2.470536,2.529557,2.581454,2.525641,2.541149,2.454225
9,2.691358,2.509613,2.429542,2.439506,2.612903,2.460317,2.606222,2.595238,2.533875,2.465608,2.435979,2.492308,2.504444,2.492546,2.52,2.46092,2.506823,2.488319,2.539614,2.499844


In [15]:
%%time
# Train the iterative model on the cluster-level utility matrix.
# This is the slow step of the notebook: the recorded run took about
# 3 minutes of wall time.
utility_matrix_imputed = re.train_model_iterative_cluster(
    re.utility_matrix, rs_model1)

CPU times: user 3min 5s, sys: 3.97 s, total: 3min 9s
Wall time: 3min 11s


#### Prediction

In [16]:
# Output of train_model_iterative_cluster.
# NOTE(review): the recorded values are centred near zero rather than on the
# 1-5 rating scale - presumably mean-centred ratings; confirm against Resype.
utility_matrix_imputed

i_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-0.013074,0.047755,-0.000197,-0.063121,0.008185,-0.010274,0.026295,0.03364,-0.037737,0.00711,-0.01518,0.017409,-0.062834,0.006098,0.019071,0.013096,0.049219,0.018724,-0.017654,-0.034735
1,-0.05813,-0.031214,-0.048271,-0.018687,0.055575,-0.04846,0.041292,0.041684,0.104034,0.033784,-0.012668,0.04062,0.092205,-0.045471,-0.06171,0.031975,-0.081858,-0.031116,-0.006979,0.011517
2,0.015007,0.029299,0.036348,-0.011963,0.047749,-0.107504,-0.051081,0.065343,-0.030539,0.062338,-0.052328,-0.018536,0.011183,-0.036614,0.00437,0.005215,-0.050679,0.023354,0.037037,0.015773
3,1e-05,0.013241,0.003283,-0.001857,-0.024702,-0.009766,-0.003744,0.003335,-0.001498,0.004976,0.028562,0.003296,-0.00333,-0.002897,0.038695,-0.020114,0.001203,-0.017379,0.003992,-0.010168
4,-0.02945,-0.021991,-0.009648,0.007248,-0.013089,0.052217,0.040277,-0.001234,0.01022,0.036666,0.006853,-0.013776,-0.017351,0.010696,-0.018043,-0.004046,-0.000745,-0.032726,-0.002449,-0.003915
5,-0.00603,0.012043,0.035477,-0.0006,-0.015259,0.072615,-0.009786,0.000763,-0.010416,-0.060482,-0.00304,-0.010555,0.012108,-0.018093,-0.005646,0.027322,-0.000689,-0.035961,0.002342,0.014981
6,-0.041004,-0.005529,-0.039716,-0.009789,-0.020175,-0.00902,0.062339,0.035669,-0.003145,-0.038011,0.000964,0.021474,0.032381,-0.01657,-0.003988,0.025014,-0.027033,0.027106,-0.053888,0.05035
7,-0.028728,-0.051517,-0.040756,0.037892,0.015321,0.099532,0.041812,-0.021972,-0.008818,-0.023684,0.049692,-0.013322,0.007654,0.026616,-0.051286,-0.005679,-0.010948,-0.011068,0.020483,-0.027615
8,0.09606,0.012174,-0.070912,-0.014624,0.027912,0.004316,0.057797,-0.039464,-0.016927,0.002212,-0.037594,0.015372,-0.091025,-0.065691,-0.016311,0.029176,0.070333,0.022332,0.043183,-0.030567
9,0.140798,-0.004173,-0.069121,-0.057765,0.081195,-0.041453,0.072236,0.061352,0.014553,-0.030646,-0.067015,-0.012377,-0.006969,-0.020119,0.001923,-0.038655,-0.005164,-0.01967,0.020513,-0.011848


## Train iterative model using `fit`

#### Train model

In [21]:
# Same iterative training, driven through the higher-level fit() entry point.
# NOTE(review): this cell's recorded execution count (21) is higher than that
# of the display cell below (20), i.e. the cells were last run out of order.
# Re-run with Restart Kernel -> Run All before trusting the saved output.
re.fit(rs_model1, method='iterative')

#### Prediction

In [20]:
# Predictions stored on the Resype object (presumably populated by fit() -
# confirm against Resype).
re.utility_matrix_preds

i_cluster,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
u_cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,-0.019075,0.060605,0.007069,-0.076265,0.005804,-0.007637,0.032951,0.043483,-0.047211,0.006291,-0.019542,0.017627,-0.080186,0.013026,0.025696,0.016704,0.062624,0.023325,-0.020851,-0.044436
1,-0.078767,-0.038063,-0.057964,-0.025127,0.06281,-0.058515,0.044835,0.04247,0.139671,0.043018,-0.016643,0.046638,0.120467,-0.06235,-0.073786,0.041791,-0.095202,-0.040895,-0.00647,0.012081
2,0.014258,0.037649,0.049478,-0.010375,0.061529,-0.142975,-0.069012,0.083441,-0.038511,0.085237,-0.064807,-0.026864,0.014258,-0.042425,0.006082,0.008619,-0.069158,0.031352,0.048576,0.023648
3,0.004807,0.017414,0.003556,-0.002192,-0.03469,-0.01994,-0.010784,-0.000255,0.003175,0.005994,0.040615,0.001778,-0.000967,-0.004712,0.054807,-0.025467,0.001284,-0.020531,0.004978,-0.018871
4,-0.03436,-0.025927,-0.003106,0.009711,-0.026829,0.066755,0.047991,-0.0033,0.015294,0.047219,0.008147,-0.026018,-0.02015,0.011844,-0.018548,-0.003727,0.001375,-0.039578,-0.002912,-0.003881
5,-0.006601,0.015936,0.044042,-0.001173,-0.021988,0.109163,-0.013727,-0.003727,-0.012348,-0.086787,-0.005232,-0.016769,0.016771,-0.023551,-0.007175,0.038187,0.001566,-0.044866,0.001645,0.016631
6,-0.051963,-0.005157,-0.046766,-0.0094,-0.029388,-0.008056,0.071621,0.041483,-0.000975,-0.053371,0.005001,0.023581,0.046782,-0.018877,0.000411,0.035325,-0.030128,0.037298,-0.070469,0.063048
7,-0.046216,-0.063633,-0.050073,0.041812,0.019991,0.128611,0.048651,-0.032355,-0.011425,-0.031796,0.070596,-0.024849,0.013721,0.03319,-0.05811,-0.005123,-0.013064,-0.00956,0.027132,-0.037499
8,0.113298,0.017748,-0.089876,-0.013686,0.030861,0.007573,0.072981,-0.065131,-0.021235,0.005589,-0.047699,0.018366,-0.107733,-0.084361,-0.01934,0.039681,0.091578,0.035765,0.051273,-0.035651
9,0.177109,-0.004636,-0.084706,-0.074743,0.098654,-0.053932,0.091973,0.080989,0.019626,-0.048641,-0.07827,-0.021941,-0.009805,-0.021703,0.005751,-0.053329,-0.007426,-0.02593,0.025365,-0.014405
