In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from recommenders import ClusteringBasedRecommender, Data, Apriori
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering, OPTICS, Birch
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer

In [2]:
import random 
def test_error(epochs, recommender,data:Data, seed=196, sample_size=1000):
    random.seed(seed)
    rand_seeds = [random.randint(0, 1000) for _ in range(epochs)]
    avg_error = 0
    for i in range(epochs):
        test_data = data.test_data.sample(n=sample_size, random_state=rand_seeds[i])
        test_data['predicted_rating'] = test_data.apply(
            lambda row: recommender.predict(int(row['userId']), int(row['movieId'])), axis=1
            )
        avg_error += np.mean(np.abs(test_data['rating'] - test_data['predicted_rating']))
    return avg_error / epochs

In [8]:
imputers = [SimpleImputer(strategy='mean'),
            SimpleImputer(strategy='median'),
            KNNImputer(n_neighbors=4, weights='uniform')]
preprocessors = [[], [MinMaxScaler()],
                 [StandardScaler()], [Normalizer()],
                 [StandardScaler()],
                 [MinMaxScaler(), Normalizer()]]
clusterers = [KMeans(n_clusters=20), 
              DBSCAN(eps=0.5, min_samples=5),
              AgglomerativeClustering(n_clusters=20),
              SpectralClustering(n_clusters=20),
              OPTICS(min_samples=5),
              Birch(n_clusters=10)]
errors = {}

for i,imputer in enumerate(tqdm(imputers)):
    for preprocessor in preprocessors:
        for clusterer in clusterers:
            data = Data('ml-latest-small', 
                        imputer=imputer,
                        preprocessors=preprocessor)

            recommender = ClusteringBasedRecommender(
                data=data.train_data_table_for_clustering_normalized,
                data_unnormalized=data.train_data_table_for_clustering,
                movie_genres=data.movie_genres,
                clusterer=clusterer,apriori=False
                )
            recommender.train()
            errors[(imputer, tuple(preprocessor), clusterer)] = test_error(2, recommender,data,2, sample_size=100)


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]




[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]




[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]




[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]




[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]




[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]


  super()._check_params_vs_input(X, default_n_init=10)


[978, 883]
[978, 883]
[978, 883]
[978, 883]
[978, 883]




[978, 883]


100%|██████████| 3/3 [07:04<00:00, 141.48s/it]


In [3]:
'''best option tried with assosciation rules apriori
watch out- process like 30 mins
printing, the sizes of frequent  itemsets generated, to see progress
and printing the matched assosciation rules to have some comparison '''
imputer = SimpleImputer(strategy='median')
preprocessor = []
clusterer = AgglomerativeClustering(n_clusters=20)
#clusterer =KMeans(n_clusters=20)
data = Data('ml-latest-small', 
                        imputer= imputer,
                        preprocessors=preprocessor)
errors = {}

recommender = ClusteringBasedRecommender(
    data=data.train_data_table_for_clustering_normalized,
    data_unnormalized=data.train_data_table_for_clustering,
    movie_genres=data.movie_genres,
    clusterer=clusterer,apriori=True)
recommender.train()
errors[(imputer, tuple(preprocessor), clusterer)] = test_error(2, recommender,data,2, sample_size=100)
print("average error when predicting ratings:")
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print(value, key)

  0%|          | 0/20 [00:00<?, ?it/s]

980


  5%|▌         | 1/20 [00:55<17:28, 55.18s/it]

623
788


 10%|█         | 2/20 [00:56<06:58, 23.26s/it]

881
1610


 15%|█▌        | 3/20 [06:38<47:51, 168.91s/it]

773


 20%|██        | 4/20 [06:38<27:19, 102.48s/it]

531
455
1233


 25%|██▌       | 5/20 [09:20<30:57, 123.85s/it]

690
998


 30%|███       | 6/20 [09:46<21:05, 90.39s/it] 

824
1309


 35%|███▌      | 7/20 [10:04<14:27, 66.72s/it]

962
1178


 40%|████      | 8/20 [10:22<10:16, 51.34s/it]

935
898


 45%|████▌     | 9/20 [10:23<06:31, 35.61s/it]

708
1471


 50%|█████     | 10/20 [18:29<29:07, 174.71s/it]

582
1082


 55%|█████▌    | 11/20 [18:35<18:26, 122.93s/it]

1487
280
218
617


 65%|██████▌   | 13/20 [18:35<07:39, 65.66s/it] 

492
814


 70%|███████   | 14/20 [18:45<05:09, 51.66s/it]

1592
1207


 75%|███████▌  | 15/20 [18:49<03:16, 39.28s/it]

1351
915


 80%|████████  | 16/20 [18:51<01:55, 28.97s/it]

1024
1543


 85%|████████▌ | 17/20 [20:11<02:09, 43.30s/it]

862
1474


 90%|█████████ | 18/20 [29:11<06:10, 185.35s/it]

676
1059


 95%|█████████▌| 19/20 [29:38<02:19, 139.43s/it]

1062
354
319


100%|██████████| 20/20 [29:38<00:00, 88.92s/it] 


('Sci-Fi', 'rating_4.0', 'userId_186')
('rating_3.0', 'userId_54')
('355', 'rating_3.0')
('159093', 'rating_4.0')
('Comedy', 'rating_3.0', 'userId_436')
('2600', 'rating_4.0')
('2291', 'rating_4.0')
('81834', 'rating_3.0')
('rating_3.0', 'userId_109')
('50', 'rating_2.0')
average error when predicting ratings:
0.6469251206559179 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20))


In [9]:
print("average error when predicting ratings:")
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print(value, key)

average error when predicting ratings:
0.6691391529062851 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20))
0.6721764552273872 (SimpleImputer(strategy='median'), (), KMeans(n_clusters=20))
0.6750110407597765 (SimpleImputer(strategy='median'), (MinMaxScaler(),), KMeans(n_clusters=20))
0.6761412865641284 (SimpleImputer(strategy='median'), (MinMaxScaler(),), AgglomerativeClustering(n_clusters=20))
0.6761706837328801 (SimpleImputer(strategy='median'), (StandardScaler(),), KMeans(n_clusters=20))
0.6762948602212068 (SimpleImputer(), (), KMeans(n_clusters=20))
0.6768241960831751 (SimpleImputer(), (), Birch(n_clusters=10))
0.6774146558400831 (SimpleImputer(), (MinMaxScaler(),), KMeans(n_clusters=20))
0.6802747075594189 (SimpleImputer(), (StandardScaler(),), KMeans(n_clusters=20))
0.6809074763464446 (KNNImputer(n_neighbors=4), (StandardScaler(),), KMeans(n_clusters=20))
0.6810594346258958 (SimpleImputer(), (StandardScaler(),), KMeans(n_clusters=20))
0.68220609473333