In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from recommenders import ClusteringBasedRecommender, Data
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering, OPTICS, Birch
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer

In [4]:
import random 
def test_error(epochs, recommender,data:Data, seed=196, sample_size=1000):
    random.seed(seed)
    rand_seeds = [random.randint(0, 1000) for _ in range(epochs)]
    avg_error = 0
    for i in range(epochs):
        test_data = data.test_data.sample(n=sample_size, random_state=rand_seeds[i])
        test_data['predicted_rating'] = test_data.apply(
            lambda row: recommender.predict(int(row['userId']), int(row['movieId'])), axis=1
            )
        avg_error += np.mean(np.abs(test_data['rating'] - test_data['predicted_rating']))
    return avg_error / epochs

In [9]:
imputers = [SimpleImputer(strategy='mean'),
            SimpleImputer(strategy='median'),
            KNNImputer(n_neighbors=4, weights='uniform')]
preprocessors = [[], [MinMaxScaler()],
                 [StandardScaler()], [Normalizer()],
                 [MinMaxScaler(), StandardScaler()],
                 [MinMaxScaler(), Normalizer()],
                 [StandardScaler(), Normalizer()]]
clusterers = [KMeans(n_clusters=20), 
              DBSCAN(eps=0.5, min_samples=5),
              AgglomerativeClustering(n_clusters=20),
              SpectralClustering(n_clusters=20),
              OPTICS(min_samples=5),
              Birch(n_clusters=10)]
errors = {}

for i,imputer in enumerate(tqdm(imputers)):
    for preprocessor in preprocessors:
        for clusterer in clusterers:
            data = Data('ml-latest-small', 
                        imputer=imputer,
                        preprocessors=preprocessor)

            recommender = ClusteringBasedRecommender(
                data=data.train_data_table_for_clustering_normalized,
                data_unnormalized=data.train_data_table_for_clustering,
                movie_genres=data.movie_genres,
                clusterer=clusterer
                )
            recommender.train()
            errors[(imputer, tuple(preprocessor), clusterer)] = test_error(2, recommender,data,2, sample_size=100)

100%|██████████| 7/7 [00:28<00:00,  4.08s/it]
100%|██████████| 7/7 [00:28<00:00,  4.13s/it]
100%|██████████| 7/7 [00:29<00:00,  4.24s/it]


In [12]:
print("average error when predicting ratings:")
for key, value in sorted(errors.items(), key=lambda x: x[1]):
    print(value, key)

average error when predicting ratings:
0.6668205613686213 (SimpleImputer(strategy='median'), (), KMeans(n_clusters=20))
0.6691391529062851 (SimpleImputer(strategy='median'), (), AgglomerativeClustering(n_clusters=20))
0.6730680179011717 (SimpleImputer(), (MinMaxScaler(),), KMeans(n_clusters=20))
0.6745119889146399 (SimpleImputer(), (), KMeans(n_clusters=20))
0.6752099335807624 (SimpleImputer(strategy='median'), (StandardScaler(), Normalizer()), SpectralClustering(n_clusters=20))
0.6761412865641284 (SimpleImputer(strategy='median'), (MinMaxScaler(),), AgglomerativeClustering(n_clusters=20))
0.6768241960831751 (SimpleImputer(), (), Birch(n_clusters=10))
0.6792811418060363 (KNNImputer(n_neighbors=4), (StandardScaler(), Normalizer()), AgglomerativeClustering(n_clusters=20))
0.680371007312158 (SimpleImputer(strategy='median'), (MinMaxScaler(),), KMeans(n_clusters=20))
0.6806867398047693 (SimpleImputer(strategy='median'), (StandardScaler(), Normalizer()), AgglomerativeClustering(n_clusters=2