In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from recommenders import ClusteringBasedRecommender, Data, Apriori
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering, OPTICS, Birch
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer

In [2]:
import random 
def test_error(epochs, recommender,data:Data, seed=196, sample_size=1000):
    random.seed(seed)
    rand_seeds = [random.randint(0, 1000) for _ in range(epochs)]
    avg_error = 0
    for i in range(epochs):
        test_data = data.test_data.sample(n=sample_size, random_state=rand_seeds[i])
        test_data['predicted_rating'] = test_data.apply(
            lambda row: recommender.predict(int(row['userId']), int(row['movieId'])), axis=1
            )
        avg_error += np.mean(np.abs(test_data['rating'] - test_data['predicted_rating']))
    return avg_error / epochs

In [4]:
'''testing without apriori'''
imputers = [SimpleImputer(strategy='mean'),
            SimpleImputer(strategy='median'),
            KNNImputer(n_neighbors=4, weights='uniform')]
preprocessors = [[], [MinMaxScaler()],
                 [StandardScaler()], [Normalizer()],
                 [StandardScaler()],
                 [MinMaxScaler(), Normalizer()]]
clusterers = [KMeans(n_clusters=20), 
              DBSCAN(eps=0.5, min_samples=5),
              AgglomerativeClustering(n_clusters=20),
              SpectralClustering(n_clusters=20),
              OPTICS(min_samples=5),
              Birch(n_clusters=10)]
errors = {}

for i,imputer in enumerate(tqdm(imputers)):
    for preprocessor in preprocessors:
        for clusterer in clusterers:
            data = Data('ml-latest-small', 
                        imputer=imputer,
                        preprocessors=preprocessor)

            recommender = ClusteringBasedRecommender(
                data=data.train_data_table_for_clustering_normalized,
                data_unnormalized=data.train_data_table_for_clustering,
                movie_genres=data.movie_genres,
                clusterer=clusterer,apriori=False
                )
            recommender.train()
            errors[(imputer, tuple(preprocessor), clusterer)] = test_error(2, recommender,data,2, sample_size=100)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
100%|█████

In [3]:
'''best option tried with assosciation rules apriori
watch out- process like 30 mins
printing, the sizes of frequent  itemsets generated, to see progress
and printing the matched assosciation rules to have some comparison '''
imputer = SimpleImputer(strategy='median')
preprocessor = []
clusterer = AgglomerativeClustering(n_clusters=20)
#clusterer =KMeans(n_clusters=20)
data = Data('ml-latest-small', 
                        imputer= imputer,
                        preprocessors=preprocessor)
errors = {}

recommender = ClusteringBasedRecommender(
    data=data.train_data_table_for_clustering_normalized,
    data_unnormalized=data.train_data_table_for_clustering,
    movie_genres=data.movie_genres,
    clusterer=clusterer,apriori=True)
recommender.train()
errors[(imputer, tuple(preprocessor), clusterer)] = test_error(2, recommender,data,2, sample_size=100)
print("average error when predicting ratings:")
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print(value, key)

  0%|          | 0/20 [00:00<?, ?it/s]

662


  5%|▌         | 1/20 [00:05<01:44,  5.49s/it]

130
539


 10%|█         | 2/20 [00:06<00:46,  2.58s/it]

699
872


 15%|█▌        | 3/20 [01:27<10:53, 38.42s/it]

277
531


 20%|██        | 4/20 [01:27<06:13, 23.34s/it]

455
751


 25%|██▌       | 5/20 [01:49<05:45, 23.05s/it]

188
703


 30%|███       | 6/20 [01:56<04:04, 17.44s/it]

268
857


 35%|███▌      | 7/20 [02:02<02:58, 13.71s/it]

386
799


 40%|████      | 8/20 [02:08<02:13, 11.12s/it]

376
561


 45%|████▌     | 9/20 [02:08<01:25,  7.78s/it]

605
855


 50%|█████     | 10/20 [03:19<04:34, 27.40s/it]

168
696


 60%|██████    | 12/20 [03:21<01:49, 13.67s/it]

626
280
218


 65%|██████▌   | 13/20 [03:22<01:07,  9.62s/it]

617
492
631


 70%|███████   | 14/20 [03:24<00:45,  7.53s/it]

593
760


 75%|███████▌  | 15/20 [03:26<00:28,  5.77s/it]

616
620


 80%|████████  | 16/20 [03:27<00:17,  4.39s/it]

895
871


 85%|████████▌ | 17/20 [03:49<00:28,  9.54s/it]

304
884


 90%|█████████ | 18/20 [04:59<00:55, 27.68s/it]

185
751


100%|██████████| 20/20 [05:04<00:00, 15.23s/it]

329
354
319





('Sci-Fi', 'rating_4.0', 'userId_186')
('rating_3.0', 'userId_54')
('159093', 'rating_4.0')
('2291', 'rating_4.0')
('rating_3.0', 'userId_109')
('50', 'rating_2.0')
average error when predicting ratings:
0.6375 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20))


In [5]:
print("average error when predicting ratings:")
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print(value, key)

average error when predicting ratings:
0.6637463398206978 (SimpleImputer(strategy='median'), (), KMeans(n_clusters=20))
0.6691391529062851 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20))
0.6695451222899327 (SimpleImputer(), (), KMeans(n_clusters=20))
0.6724078684063539 (SimpleImputer(strategy='median'), (StandardScaler(),), KMeans(n_clusters=20))
0.6740757132654545 (SimpleImputer(), (MinMaxScaler(),), KMeans(n_clusters=20))
0.6742554176260739 (SimpleImputer(strategy='median'), (MinMaxScaler(),), KMeans(n_clusters=20))
0.6761412865641284 (SimpleImputer(strategy='median'), (MinMaxScaler(),), AgglomerativeClustering(n_clusters=20))
0.6768241960831751 (SimpleImputer(), (), Birch(n_clusters=10))
0.6783108620864392 (KNNImputer(n_neighbors=4), (MinMaxScaler(),), KMeans(n_clusters=20))
0.6798312898108865 (KNNImputer(n_neighbors=4), (StandardScaler(),), KMeans(n_clusters=20))
0.6822060947333347 (KNNImputer(n_neighbors=4), (MinMaxScaler(),), AgglomerativeClustering