In [11]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from recommenders import ClusteringBasedRecommender, Data
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer

In [19]:
import random 
def test_error(epochs, recommender,data:Data, seed=196, sample_size=1000):
    random.seed(seed)
    rand_seeds = [random.randint(0, 1000) for _ in range(epochs)]
    avg_error = 0
    for i in range(epochs):
        test_data = data.test_data.sample(n=sample_size, random_state=rand_seeds[i])
        test_data['predicted_rating'] = test_data.apply(
            lambda row: recommender.predict(int(row['userId']), int(row['movieId'])), axis=1
            )
        avg_error += np.mean(np.abs(test_data['rating'] - test_data['predicted_rating']))
    return avg_error / epochs

In [24]:
# Sweep every (imputer, preprocessing chain) combination, train a KMeans-based
# recommender on the resulting tables and record its test error.

# Settings of the sweep, named so they can be tuned in one place.
N_CLUSTERS = 20
EVAL_EPOCHS = 2
EVAL_SEED = 2
EVAL_SAMPLE_SIZE = 100

# Candidate strategies handed to Data(imputer=...).
imputers = [
    SimpleImputer(strategy='mean'),
    SimpleImputer(strategy='constant', fill_value=0),
    SimpleImputer(strategy='median'),
    KNNImputer(n_neighbors=2, weights='uniform'),
]

# Candidate chains handed to Data(preprocessors=...); the empty list means
# "no preprocessing".
preprocessors = [
    [],
    [MinMaxScaler()],
    [StandardScaler()],
    [Normalizer()],
    [MinMaxScaler(), StandardScaler()],
    [MinMaxScaler(), Normalizer()],
    [StandardScaler(), Normalizer()],
    [MinMaxScaler(), StandardScaler(), Normalizer()],
]

# Maps (imputer, tuple(preprocessor chain)) -> error returned by test_error.
errors = {}

# Wrap the list itself (not enumerate(...)) so tqdm knows the total and can
# show a percentage and ETA.
for imputer in tqdm(imputers, desc="imputers"):
    for preprocessor in preprocessors:
        data = Data('ml-latest-small',
                    imputer=imputer,
                    preprocessors=preprocessor)

        kmeans = ClusteringBasedRecommender(
            data=data.train_data_table_for_clustering_normalized,
            data_unnormalized=data.train_data_table_for_clustering,
            movie_genres=data.movie_genres,
            clusterer=KMeans(n_clusters=N_CLUSTERS, random_state=1)
            )
        kmeans.train()

        # The chain is converted to a tuple because dict keys must be hashable.
        errors[(imputer, tuple(preprocessor))] = test_error(
            EVAL_EPOCHS, kmeans, data, seed=EVAL_SEED, sample_size=EVAL_SAMPLE_SIZE
        )

4it [00:14,  3.61s/it]


In [28]:
# List every evaluated configuration from lowest to highest error.
for key in sorted(errors, key=errors.get):
    value = errors[key]
    print(value, key)

0.6560681155458934 (SimpleImputer(), (MinMaxScaler(),))
0.6752668422810978 (SimpleImputer(strategy='median'), (StandardScaler(), Normalizer()))
0.6752668422810978 (SimpleImputer(strategy='median'), (MinMaxScaler(), StandardScaler(), Normalizer()))
0.6856638854238828 (KNNImputer(n_neighbors=2), (StandardScaler(),))
0.6856638854238828 (KNNImputer(n_neighbors=2), (MinMaxScaler(), StandardScaler()))
0.6872638547713823 (SimpleImputer(), ())
0.6901642302094824 (SimpleImputer(strategy='median'), (StandardScaler(),))
0.6901642302094824 (SimpleImputer(strategy='median'), (MinMaxScaler(), StandardScaler()))
0.6910879258398093 (KNNImputer(n_neighbors=2), (MinMaxScaler(),))
0.6945610840316233 (KNNImputer(n_neighbors=2), ())
0.6950767669715316 (SimpleImputer(), (StandardScaler(), Normalizer()))
0.6950767669715316 (SimpleImputer(), (MinMaxScaler(), StandardScaler(), Normalizer()))
0.6958196424882629 (SimpleImputer(strategy='median'), ())
0.6978398546432227 (SimpleImputer(), (StandardScaler(),))
0.69

In [6]:
# KMeans (10 clusters) on data loaded with normalization_type="l1".
# NOTE(review): this cell passes `normalization_type=`, `Clusterer=` and
# `clusterer_params=`, while the grid-search cell in this notebook passes
# `imputer=`/`preprocessors=` and a ready-made `clusterer=` instance — confirm
# which signature the current `recommenders` module accepts.
data = Data('ml-latest-small', normalization_type="l1")
recommender_kwargs = dict(
    data=data.train_data_table_for_clustering_normalized,
    data_unnormalized=data.train_data_table_for_clustering,
    movie_genres=data.movie_genres,
    Clusterer=KMeans,
    clusterer_params=dict(n_clusters=10, random_state=1),
)
kmeans = ClusteringBasedRecommender(**recommender_kwargs)
kmeans.train()
# Last expression of the cell, so the error is shown as the cell output.
test_error(epochs=10, recommender=kmeans, data=data, seed=2)

100%|██████████| 10/10 [00:07<00:00,  1.42it/s]


0.7972607871884272

In [7]:
# KMeans (10 clusters) on data loaded with normalization_type="l2".
# NOTE(review): this cell passes `normalization_type=`, `Clusterer=` and
# `clusterer_params=`, while the grid-search cell in this notebook passes
# `imputer=`/`preprocessors=` and a ready-made `clusterer=` instance — confirm
# which signature the current `recommenders` module accepts.
data = Data('ml-latest-small', normalization_type="l2")
recommender_kwargs = dict(
    data=data.train_data_table_for_clustering_normalized,
    data_unnormalized=data.train_data_table_for_clustering,
    movie_genres=data.movie_genres,
    Clusterer=KMeans,
    clusterer_params=dict(n_clusters=10, random_state=1),
)
kmeans = ClusteringBasedRecommender(**recommender_kwargs)
kmeans.train()
# Last expression of the cell, so the error is shown as the cell output.
test_error(epochs=10, recommender=kmeans, data=data, seed=2)

100%|██████████| 10/10 [00:06<00:00,  1.45it/s]


0.7982009049975425

In [8]:
# KMeans (10 clusters) on data loaded with normalization_type="max".
# NOTE(review): this cell passes `normalization_type=`, `Clusterer=` and
# `clusterer_params=`, while the grid-search cell in this notebook passes
# `imputer=`/`preprocessors=` and a ready-made `clusterer=` instance — confirm
# which signature the current `recommenders` module accepts.
data = Data('ml-latest-small', normalization_type="max")
recommender_kwargs = dict(
    data=data.train_data_table_for_clustering_normalized,
    data_unnormalized=data.train_data_table_for_clustering,
    movie_genres=data.movie_genres,
    Clusterer=KMeans,
    clusterer_params=dict(n_clusters=10, random_state=1),
)
kmeans = ClusteringBasedRecommender(**recommender_kwargs)
kmeans.train()
# Last expression of the cell, so the error is shown as the cell output.
test_error(epochs=10, recommender=kmeans, data=data, seed=2)

100%|██████████| 10/10 [00:06<00:00,  1.51it/s]


0.7884868973623075