In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# import os
from surprise import BaselineOnly, Dataset, Reader, SVD, NMF, SVDpp, accuracy, PredictionImpossible, KNNWithMeans, KNNBasic, NormalPredictor, KNNWithZScore, KNNBaseline, SlopeOne, CoClustering
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV, PredefinedKFold
from surprise.model_selection.split import LeaveOneOut
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict
import pandas as pd
import tempfile

In [None]:

np.random.seed(42)

genre_cols = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 
              'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 
              'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

column_names = ["item","title","genres","movie_name","movie_year","(no genres listed)","Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","IMAX","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western","user","rating","rating_timestamp","rating_year","rating_month","rating_season,tag","tag_timestamp","cleaned_tag","tag_length","tag_year"]

data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)

reader = Reader(rating_scale=(1, 5))

ratings = data[['user', 'item', 'rating']]
ratings = ratings.iloc[1:]
ratings["rating"] = ratings["rating"].astype(float)


  data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)
  data = pd.read_csv('../samples/combined_movies_ratings_tags.csv', names=column_names, index_col=False, skiprows=0)


For initial testing creating a dataset of 10k users

In [3]:
custom_data = Dataset.load_from_df(ratings, reader)

print(custom_data)

<surprise.dataset.DatasetAutoFolds object at 0x16d0fbc70>


In [None]:
# Train-test split
trainset, testset = train_test_split(custom_data, test_size=0.2)

# Convert trainset to dataframe (for content-based model)
train_df = pd.DataFrame(trainset.build_testset(), columns=['user', 'item', 'rating'])
test_df = pd.DataFrame(testset, columns=['user', 'item', 'rating'])


# Step 1: Filter users with >= 5 test ratings
test_user_counts = test_df['user'].value_counts()
eligible_users = test_user_counts[test_user_counts >= 5].index.tolist()

# Different number of known ratings to test
known_ratings_list = [5]
# known_ratings_list = [5, 10, 15, 20]

Function to get the best params for a given model

In [5]:

measures=["rmse", "mae", "mse"]

def perform_grid_search(algo, params):
    
    gs = GridSearchCV(algo, params, measures=measures, cv=3, n_jobs=-1, joblib_verbose=0)

    gs.fit(custom_data)

    
    # best RMSE score
    print(gs.best_score)


    # combination of parameters that gave the best measure score
    print(gs.best_params)
    return gs.best_estimator["rmse"]

Function to run the cold start train test split 

In [None]:
def run_cold_start_models(models, model_names, train_df, test_df, eligible_users, known_ratings_list, k=10):
    all_results = []

    for algo, model_name in zip(models, model_names):
        rmse_results = []
        mae_results = []
        mse_results = []
        ndcg_results = []

        # Iterate over each number of known ratings
        for known_ratings in known_ratings_list:
            print(f"Processing for {known_ratings} known ratings per user with {model_name}...")
            
            # Step 3: Reduce training data to 'known_ratings' ratings per user for these test users (simulate cold start)
            limited_train_rows = []
            for user in eligible_users:
                user_ratings = train_df[train_df['user'] == user]
                if len(user_ratings) > known_ratings:
                    sampled = user_ratings.sample(known_ratings, random_state=42)
                else:
                    sampled = user_ratings
                limited_train_rows.append(sampled)

            # Step 4: Add all training data from non-eligible users (normal users)
            non_eligible_users_df = train_df[~train_df['user'].isin(eligible_users)]
            cold_start_train_df = pd.concat(limited_train_rows + [non_eligible_users_df], ignore_index=True)

            # Build training set for Surprise
            reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
            cold_start_data = Dataset.load_from_df(cold_start_train_df[['user', 'item', 'rating']], reader)
            cold_start_trainset = cold_start_data.build_full_trainset()

            # Train the model
            algo.fit(cold_start_trainset)

            # Build the final test set for Surprise
            final_testset = [tuple(x) for x in test_df.to_numpy()]
            predictions = algo.test(final_testset)

            # Evaluate
            rmse = accuracy.rmse(predictions, verbose=False)
            mae = accuracy.mae(predictions, verbose=False)
            mse = accuracy.mse(predictions, verbose=False)
            
            rmse_results.append((known_ratings, rmse))
            mae_results.append((known_ratings, mae))
            mse_results.append((known_ratings, mse))
            
            print(f"RMSE on filtered cold-start test users (with {known_ratings} training ratings each) for {model_name}: {rmse}")
            print(f"MAE on filtered cold-start test users (with {known_ratings} training ratings each) for {model_name}: {mae}")
            print(f"MSE on filtered cold-start test users (with {known_ratings} training ratings each) for {model_name}: {mse}")

            # Calculate NDCG@K
            # Calculate NDCG@K
            ndcg_scores = []
            for user_id in eligible_users:
                user_ratings = train_df[train_df['user'] == user_id]
                
                # Predict ratings for unseen movies
                all_movies = set(train_df['item'].unique())
                seen_movies = set(user_ratings['item'])
                unseen_movies = list(all_movies - seen_movies)
                
                # Filter test data for the current user
                user_test_ratings = test_df[test_df['user'] == user_id]
                if user_test_ratings.empty:
                    continue  # Skip user if no test data
                
                predictions = [ (movie_id, algo.predict(user_id, movie_id).est) for movie_id in unseen_movies ]
                predictions.sort(key=lambda x: x[1], reverse=True)
                
                # Top-K recommended movies
                top_k_movies = [movie_id for movie_id, _ in predictions[:k]]
                
                # Movies the user actually liked (rating >= 4) in the test set
                true_liked_movies = user_test_ratings[user_test_ratings['rating'] >= 4]['item'].tolist()
                
                if not true_liked_movies:
                    continue  # Skip user if no strong likes in the test set
                
                # --- Calculate NDCG@K ---
                dcg = 0.0
                for idx, movie_id in enumerate(top_k_movies):
                    if movie_id in true_liked_movies:
                        dcg += 1.0 / np.log2(idx + 2)  # +2 because index starts from 0
                
                # Ideal DCG (IDCG)
                ideal_relevant = min(len(true_liked_movies), k)
                idcg = sum(1.0 / np.log2(i + 2) for i in range(ideal_relevant))
                
                ndcg = dcg / idcg if idcg > 0 else 0.0
                ndcg_scores.append(ndcg)

            average_ndcg = np.mean(ndcg_scores)
            ndcg_results.append((known_ratings, average_ndcg))
            print(f"NDCG@{k} on filtered cold-start test users (with {known_ratings} training ratings each) for {model_name}: {average_ndcg}")

        # Create a DataFrame to store the results for the current model
        results_df = pd.DataFrame({
            'Model': model_name,
            'Known Ratings': [r[0] for r in rmse_results],
            'RMSE': [r[1] for r in rmse_results],
            'MAE': [r[1] for r in mae_results],
            'MSE': [r[1] for r in mse_results],
            'NDCG@K': [r[1] for r in ndcg_results]
        })

        # Append the results to the all_results list
        all_results.append(results_df)

    # Concatenate all results into a single DataFrame
    final_results_df = pd.concat(all_results, ignore_index=True)
    return final_results_df



In [None]:
models = []
model_names = []

<h3>Grid search for best params<h3>

First create a reusable function to run grid search for any model

<h4>SVD</h4>

In [8]:
param_grid_svd = {
    'n_factors': [20, 50, 100, 150, 200],
    'n_epochs': [20, 30, 40, 50, 60],
    'lr_all': [0.002, 0.005, 0.007, 0.01, 0.02],
    'reg_all': [0.02, 0.05, 0.1, 0.15, 0.2]
}

best_svd = perform_grid_search(SVD, param_grid_svd)

models += [best_svd]
model_names += ['SVD']



{'rmse': 0.8199089667341454, 'mae': 0.6310292204161084, 'mse': 0.6722530426724705}
{'rmse': {'n_factors': 200, 'n_epochs': 60, 'lr_all': 0.007, 'reg_all': 0.05}, 'mae': {'n_factors': 200, 'n_epochs': 60, 'lr_all': 0.007, 'reg_all': 0.05}, 'mse': {'n_factors': 200, 'n_epochs': 60, 'lr_all': 0.007, 'reg_all': 0.05}}


<h4>SVD++</h4>

In [None]:
param_grid_svdpp = {
    'n_factors': [20, 50, 100, 150, 200],
    'n_epochs': [10, 20, 30, 40, 50],
    'lr_all': [0.002, 0.005, 0.007, 0.01, 0.02],
    'reg_all': [0.02, 0.05, 0.1, 0.15, 0.2]
}

best_svdpp = perform_grid_search(SVDpp, param_grid_svdpp)

models += [best_svdpp]
model_names += ['SVD++']

python(90652) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90653) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90654) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90655) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90656) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90657) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90658) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90659) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90660) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90661) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(90662) Malloc

{'rmse': 1.0068940245308162, 'mae': 0.8043210054859121, 'mse': 1.0139565514290376}
{'rmse': {'n_factors': 20, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.2}, 'mae': {'n_factors': 50, 'n_epochs': 20, 'lr_all': 0.02, 'reg_all': 0.1}, 'mse': {'n_factors': 20, 'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.2}}


<h4>NMF</h4>

In [None]:
param_grid_nmf = {
    'n_factors': [15, 30, 50, 100, 150],
    'n_epochs': [50, 100, 150, 200, 300],
    'reg_pu': [0.06, 0.08, 0.1, 0.2, 0.3],
    'reg_qi': [0.06, 0.08, 0.1, 0.2, 0.3]
}

best_nmf = perform_grid_search(NMF, param_grid_nmf)

models += [best_nmf]
model_names += ['NMF']

<h4>KNN basic</h4>

In [None]:
param_grid_knn_basic = {
    'k': [10, 20, 30, 40, 50],
    'min_k': [1, 3, 5, 7, 10],
    'sim_options': {
        'name': ['cosine', 'pearson', 'pearson_baseline'],
        'user_based': [True, False]
    }
}

best_knn_basic = perform_grid_search(KNNBasic, param_grid_knn_basic)

models += [best_knn_basic]
model_names += ['KNNBasic']

python(25545) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25546) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25547) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25548) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25549) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25550) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25551) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25552) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25553) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25554) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(25555) Malloc

Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Comput

python(25718) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Computing the pearson similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity m

<h4>KNN with means</h4>

In [None]:
param_grid_knn_means = {
    'k': [10, 20, 30, 40, 50],
    'min_k': [1, 3, 5, 7, 10],
    'sim_options': {
        'name': ['cosine', 'pearson_baseline'],
        'user_based': [True, False]
    }
}

best_knn_with_means = perform_grid_search(KNNWithMeans, param_grid_knn_means)

models += [best_knn_with_means]
model_names += ['KNNWithMeans']

Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Estimating biases using als...


python(23918) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cos

python(23946) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimati

<h4>KNN with Z Score</h4>

In [None]:
param_grid_knn_zscore = {
    'k': [10, 20, 30, 40, 50],
    'min_k': [1, 3, 5, 7, 10],
    'sim_options': {
        'name': ['cosine', 'pearson'],
        'user_based': [True, False]
    }
}

best_knn_z_score = perform_grid_search(KNNWithZScore, param_grid_knn_zscore)

models += [best_knn_z_score]
model_names += ['KNNWithZScore']

Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done com

python(23783) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done c

python(23790) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the pearson similarity matrix...
Compu

python(23798) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...


python(23801) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(23803) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done c

python(23812) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Comput

python(23834) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson similarity matrix...
Computing the pearson similarity matrix...
Computing the pearson similarity matrix

<h4>KNN with Baseline</h4>

In [None]:
param_grid_knn_baseline = {
    'k': [10, 20, 30, 40, 50],
    'min_k': [1, 3, 5, 7, 10],
    'sim_options': {
        'name': ['pearson_baseline', 'cosine'],
        'user_based': [True, False]
    },
    'bsl_options': {
        'method': ['sgd', 'als'],
        'reg': [0.01, 0.02, 0.05, 0.1],
        'learning_rate': [0.002, 0.005, 0.01]
    }
}

best_knn_baseline = perform_grid_search(KNNBaseline, param_grid_knn_baseline)

models += [best_knn_baseline]
model_names += ['KNNBaseline']

Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done



Done computing similarity matrix.


python(22033) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Estimating biases using als...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Estimating biases using als...
Estimating biases using als...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Computing the cosine similarity matrix...
Estimating biases using als...
Estimating biases using als...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix

python(22119) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Estimating bi

python(22174) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Com

python(22182) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Done computing 

python(22217) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Computing the cosine similarity matrix...
Estimating biases using als...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Estimating biases using als...
Computing the cosine similarity matrix...
Estimating biases using als...
Computing the cosine similarity matrix...
Estimating biases using als...
Computing the cosine similarity matrix...


python(22222) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using sgd...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the pearson_baseline si

python(22386) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.


python(22422) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...


python(22442) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Est

python(23041) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Computing the pearson_baseline similarity matrix...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matr

python(23638) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


sgd...
Done computing similarity matrix.
Estimating biases using sgd...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Estimating biases using sgd...
Computing the cosine similarity matrix...
Done computing similarity matrix.
Done computing similarity matrix.
Done computing similarity matrix.
Done computing si

<h4>CoClustering</h4>

In [72]:
param_grid_coclustering = {
    'n_cltr_u': [3, 5, 10, 15, 20],
    'n_cltr_i': [3, 5, 10, 15, 20],
    'n_epochs': [10, 20, 30, 40, 50]
}

best_coclustering = perform_grid_search(CoClustering, param_grid_coclustering)

models += [best_coclustering]
model_names += ['CoClustering']

python(84144) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84148) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84149) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84150) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84151) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84152) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84153) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84154) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84155) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84156) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(84157) Malloc

{'rmse': 1.1390920740479635, 'mae': 0.9133499022692373, 'mse': 1.2975325019391735}
{'rmse': {'n_cltr_u': 5, 'n_cltr_i': 5, 'n_epochs': 50}, 'mae': {'n_cltr_u': 5, 'n_cltr_i': 5, 'n_epochs': 50}, 'mse': {'n_cltr_u': 5, 'n_cltr_i': 5, 'n_epochs': 50}}


<h4>Baseline Only</h4>

In [76]:
param_grid_baseline_only = {
    'bsl_options': {
        'method': ['sgd', 'als'],
        'reg': [0.01, 0.02, 0.05, 0.1],
        'learning_rate': [0.002, 0.005, 0.01],
        'n_epochs': [5, 10, 20, 30, 40]
    }
}

best_baseline_only = perform_grid_search(BaselineOnly, param_grid_baseline_only)

models += [best_baseline_only]
model_names += ['BaselineOnly']

python(7552) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7553) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7554) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7555) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7556) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7557) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7558) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7559) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7560) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7561) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(7562) MallocStackLoggin

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimati

<h4>Models with no parameters</h4>

In [79]:
slopeone_algo = SlopeOne()
normalPredictor_algo = NormalPredictor()


models += [slopeone_algo, normalPredictor_algo]
model_names += ['SlopeOne', "NormalPredictor"]

<h4>Run best models with cold start data</h4>

In [16]:
final_results_df = run_cold_start_models(models, model_names, train_df, test_df, eligible_users, known_ratings_list)
print(final_results_df)

Processing for 5 known ratings per user with SVD...
RMSE on filtered cold-start test users (with 5 training ratings each) for SVD: 0.9465914775165377
MAE on filtered cold-start test users (with 5 training ratings each) for SVD: 0.7401447051475848
MSE on filtered cold-start test users (with 5 training ratings each) for SVD: 0.896035425306942
NDCG@20 on filtered cold-start test users (with 5 training ratings each) for SVD: 0.020987686428063517
Processing for 10 known ratings per user with SVD...
RMSE on filtered cold-start test users (with 10 training ratings each) for SVD: 0.9234600022704148
MAE on filtered cold-start test users (with 10 training ratings each) for SVD: 0.7204859257991142
MSE on filtered cold-start test users (with 10 training ratings each) for SVD: 0.8527783757932745
NDCG@20 on filtered cold-start test users (with 10 training ratings each) for SVD: 0.017716831459135548
Processing for 20 known ratings per user with SVD...
RMSE on filtered cold-start test users (with 20 t

In [None]:
# Example usage:

# results_df = run_cold_start_models(models, model_names)
# print(results_df)

Processing for 5 known ratings per user with Knn Basic...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE on filtered cold-start test users (with 5 training ratings each) for Knn Basic: 0.11094367579950253
MAE on filtered cold-start test users (with 5 training ratings each) for Knn Basic: 0.02965094457650443
MSE on filtered cold-start test users (with 5 training ratings each) for Knn Basic: 0.012308499199905122
Processing for 10 known ratings per user with Knn Basic...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE on filtered cold-start test users (with 10 training ratings each) for Knn Basic: 0.11087431866284375
MAE on filtered cold-start test users (with 10 training ratings each) for Knn Basic: 0.029632408102025515
MSE on filtered cold-start test users (with 10 training ratings each) for Knn Basic: 0.012293114538949823
Processing for 20 known ratings per user with Knn Basic...
Computing the pearson similarity matri