In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from recommenders import ClusteringBasedRecommender,ClusteringAndAprioriBasedRecommender, Data, Apriori
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering, OPTICS, Birch
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer

In [2]:
import random 
def test_error(epochs, recommender,data:Data, seed=196, sample_size=1000):
    random.seed(seed)
    rand_seeds = [random.randint(0, 1000) for _ in range(epochs)]
    avg_error = 0
    for i in range(epochs):
        test_data = data.test_data.sample(n=sample_size, random_state=rand_seeds[i])
        test_data['predicted_rating'] = test_data.apply(
            lambda row: recommender.predict(int(row['userId']), int(row['movieId'])), axis=1
            )
        avg_error += np.mean(np.abs(test_data['rating'] - test_data['predicted_rating']))
    return avg_error / epochs

In [3]:
'''testing without apriori'''
imputers = [SimpleImputer(strategy='mean'),
            SimpleImputer(strategy='median'),
            KNNImputer(n_neighbors=4, weights='uniform')]
preprocessors = [[], [MinMaxScaler()],
                 [StandardScaler()], [Normalizer()],
                 [StandardScaler()],
                 [MinMaxScaler(), Normalizer()]]
clusterers = [KMeans(n_clusters=20), 
              DBSCAN(eps=0.5, min_samples=5),
              AgglomerativeClustering(n_clusters=20),
              SpectralClustering(n_clusters=20),
              OPTICS(min_samples=5),
              Birch(n_clusters=10)]
errors = {}

for i,imputer in enumerate(tqdm(imputers)):
    for preprocessor in preprocessors:
        for clusterer in clusterers:
            data = Data('ml-latest-small', 
                        imputer=imputer,
                        preprocessors=preprocessor)

            recommender = ClusteringBasedRecommender(
                data=data.train_data_table_for_clustering_normalized,
                data_unnormalized=data.train_data_table_for_clustering,
                movie_genres=data.movie_genres,
                clusterer=clusterer
                )
            recommender.train()
            errors[(imputer, tuple(preprocessor), clusterer)] = test_error(2, recommender,data,2, sample_size=100)


  0%|          | 0/3 [00:00<?, ?it/s]

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
100%|█████

In [8]:
'''best option tried with assosciation rules apriori
printing, the sizes of frequent  itemsets generated, to see progress
and printing the matched assosciation rules to have some comparison '''
imputer = SimpleImputer(strategy='median')
preprocessor = []
clusterer = AgglomerativeClustering(n_clusters=20)
#clusterer =KMeans(n_clusters=20)
data = Data('ml-latest-small', 
                        imputer= imputer,
                        preprocessors=preprocessor)
errors = {}

recommender = ClusteringAndAprioriBasedRecommender(
    data=data.train_data_table_for_clustering_normalized,
    data_unnormalized=data.train_data_table_for_clustering,
    movie_genres=data.movie_genres,
    clusterer=clusterer)
recommender.train()
errors[(imputer, tuple(preprocessor), clusterer)] = test_error(2, recommender,data,2, sample_size=100)
print("average error when predicting ratings:")
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print(value, key)

  0%|          | 0/20 [00:00<?, ?it/s]

662


  5%|▌         | 1/20 [00:05<01:45,  5.57s/it]

130
539


 10%|█         | 2/20 [00:06<00:47,  2.62s/it]

699
872


 15%|█▌        | 3/20 [01:26<10:45, 37.99s/it]

277
531


 20%|██        | 4/20 [01:26<06:09, 23.09s/it]

455
751


 25%|██▌       | 5/20 [01:49<05:45, 23.01s/it]

188
703


 30%|███       | 6/20 [01:55<04:03, 17.39s/it]

268
857


 35%|███▌      | 7/20 [02:01<02:57, 13.68s/it]

386
799


 40%|████      | 8/20 [02:07<02:12, 11.07s/it]

376
561


 45%|████▌     | 9/20 [02:07<01:25,  7.73s/it]

605
855


 50%|█████     | 10/20 [03:15<04:24, 26.42s/it]

168
696


 55%|█████▌    | 11/20 [03:17<02:49, 18.82s/it]

626
280
218
617


 65%|██████▌   | 13/20 [03:17<01:10, 10.11s/it]

492
631


 70%|███████   | 14/20 [03:20<00:49,  8.21s/it]

593
760


 75%|███████▌  | 15/20 [03:21<00:32,  6.44s/it]

616
620


 80%|████████  | 16/20 [03:22<00:19,  4.97s/it]

895
871


 85%|████████▌ | 17/20 [03:42<00:27,  9.16s/it]

304
884


 90%|█████████ | 18/20 [04:48<00:50, 25.21s/it]

185
751


100%|██████████| 20/20 [04:54<00:00, 14.72s/it]

329
354
319





('Sci-Fi', 'rating_4.0', 'userId_186')
('rating_3.0', 'userId_54')
('159093', 'rating_4.0')
('2291', 'rating_4.0')
('rating_3.0', 'userId_109')
('50', 'rating_2.0')
average error when predicting ratings:
0.6375 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20))


In [4]:
print("average error when predicting ratings:")
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print(value, key)

average error when predicting ratings:
0.65 (SimpleImputer(), (), KMeans(n_clusters=20))
0.6525000000000001 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20))
0.6575 (SimpleImputer(), (), Birch(n_clusters=10))
0.6575 (KNNImputer(n_neighbors=4), (MinMaxScaler(),), AgglomerativeClustering(n_clusters=20))
0.66 (SimpleImputer(strategy='median'), (), KMeans(n_clusters=20))
0.66 (KNNImputer(n_neighbors=4), (StandardScaler(),), KMeans(n_clusters=20))
0.6625 (SimpleImputer(strategy='median'), (MinMaxScaler(),), KMeans(n_clusters=20))
0.6675 (SimpleImputer(strategy='median'), (StandardScaler(),), KMeans(n_clusters=20))
0.6699999999999999 (SimpleImputer(strategy='median'), (MinMaxScaler(),), AgglomerativeClustering(n_clusters=20))
0.67 (KNNImputer(n_neighbors=4), (), Birch(n_clusters=10))
0.675 (SimpleImputer(strategy='median'), (), Birch(n_clusters=10))
0.675 (KNNImputer(n_neighbors=4), (), KMeans(n_clusters=20))
0.6775 (SimpleImputer(), (MinMaxScaler(),), KMeans(n_c