## Libraries 

### General Libraries and settings

In [1]:
import pandas as pd
import re
import numpy as np

# Print longer cells in pd
pd.options.display.max_colwidth = 1000
pd.options.display.width = 10000
# Print all rows
pd.options.display.max_rows = None
pd.options.display.max_columns = 1000

###  Preprocessing Libraries


In [2]:
# ”#$%&'()*+,-./:;?@[\]^_`{|}~
# string.punctuation
import string

# i'm
import contractions

# i, am, he, she, on, at
import nltk
nltk.download('stopwords')

# for stopwords
from nltk.corpus import stopwords
nltk.download('wordnet')

# for pos_tag
nltk.download('punkt_tab') 
nltk.download('averaged_perceptron_tagger_eng')
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# english
stopwords = stopwords.words('english')
stopwords.append("\'s")
#importing the Stemming function from nltk library
from nltk.stem import WordNetLemmatizer
#defining the object for stemming
# stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# stem instead of lemmanize because simple, interested - interesting similar
# lemma instead of stem because of nlp word2vec word embedding

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ngpbm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ngpbm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ngpbm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ngpbm\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


## S1: Normalize User Preferences 


In [3]:
# Load user preference data
user_pref_df = pd.read_csv("../user_preferences.csv")
user_pref_df.head(3)

Unnamed: 0,preference
0,"I’m interested in a Master of Science in Computer Science, ideally in Berlin, where I can gain exposure to cutting-edge AI technologies."
1,A Master of Arts in Business Administration with an international focus would align perfectly with my career goals in global management.
2,"I’m looking for a Master of Science in Environmental Science in Hamburg, especially one that emphasizes sustainability and climate research."


In [4]:
# Helper function to map POS tag to WordNet POS
def get_wordnet_pos(tag):
    if tag.startswith('J'):  # Adjective
        return 'a'
    elif tag.startswith('V'):  # Verb
        return 'v'
    elif tag.startswith('N'):  # Noun
        return 'n'
    elif tag.startswith('R'):  # Adverb
        return 'r'
    else:
        return None

In [5]:
# contraction
# lowercase
# tokenization i am -> ['i','am']
# POS_tag
# remove punctuation
# remove stop words i, will, am, he, a, an, the..., && numbers && Adv
# lemmanization

def stop_words_removal_then_lemmatize(pos_tags):
    processed_tokens = []
    for word, tag in pos_tags:
        #check punctuation                          stopwords               digits
        if word not in string.punctuation and word not in stopwords and not re.search(r'\d', word):
            # if tag not in ['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']:  # Remove adjectives & adverbs
            if tag not in ['RB', 'RBR', 'RBS']:  # Remove adverbs
                pos = get_wordnet_pos(tag)
                lemma = lemmatizer.lemmatize(word, pos) if pos else lemmatizer.lemmatize(word)
                processed_tokens.append(lemma)
                # print(f"{word}, {tag}")
    return processed_tokens


def normalize_tokenize(str):
    str = contractions.fix(str)
    str = str.lower()

    tokens = word_tokenize(str)
    pos_tags = pos_tag(tokens)

    processed_tokens = stop_words_removal_then_lemmatize(pos_tags)
    
    return processed_tokens

In [6]:
user_pref_df['normalize_tokenize'] = user_pref_df['preference'].apply(lambda x: normalize_tokenize(x))
user_pref_df.head(3)

Unnamed: 0,preference,normalize_tokenize
0,"I’m interested in a Master of Science in Computer Science, ideally in Berlin, where I can gain exposure to cutting-edge AI technologies.","[interested, master, science, computer, science, berlin, gain, exposure, cutting-edge, ai, technology]"
1,A Master of Arts in Business Administration with an international focus would align perfectly with my career goals in global management.,"[master, art, business, administration, international, focus, would, align, career, goal, global, management]"
2,"I’m looking for a Master of Science in Environmental Science in Hamburg, especially one that emphasizes sustainability and climate research.","[look, master, science, environmental, science, hamburg, one, emphasize, sustainability, climate, research]"


## S2: Normalize Uni Data

In [7]:
university_df = pd.read_csv("../university_data.csv")
university_df.iloc[[100, 3000, 8000]]

Unnamed: 0,program_name,program_url,university,location,duration,degreeType,language,subject,studyMode,admission_Modus,admission_Requirements,overview,teaching,researching
100,Aesthetics and Media Science,http://www.uni-oldenburg.de/nc/studium/studiengang/?id_studg=312,University of Oldenburg,Oldenburg,4 semesters,Master of Arts,German,Art Studies,full time,without admission restriction,1. Bachelor's degree or equivalent degree in the field of Arts and Media Studies or another subject-specific degree program2. at least 30 credit points for subject-related and didactic content.more information regarding admission requirements. Bachelor/Bakkalaureus,,,
3000,Economics,https://www.uni-heidelberg.de/de/studium/alle-studienfaecher/economicspolitische-oekonomik/wirtschaftswissenschaft-teilstudiengang-im-master-education,Heidelberg University,Heidelberg,4 semesters,Master of Education,German,"Economic Sciences, Economics",full time,without admission restriction,"Admission restrictions, see admission regulations\;more information regarding admission requirements. Bachelor/Bakkalaureus",,,
8000,Physics,https://www.zsb.uni-wuppertal.de/studieninfos/studieninfos/master/physik-msc.html,University of Wuppertal,Wuppertal,4 semesters,Master of Science,German,Physics,full time,without admission restriction,"Bachelor of Science' or equivalent degree in the Physics course or in a course recognised as equivalent at an institute of higher education in thearea of validity of Germany's Basic Law at least with the grade Satisfactory (3.0) orhas acquired a 'Bachelor of Science' or equivalent degree in the Physics course or in a course recognised as equivalent at an institute of higher education without or outside the area of validity of Germany's Basic Law anda) oral entrance exam lasting 20 to 40 minutes orb) the Graduate Record Examinations Subject (GRE) Test in Physics. Bachelor/Bakkalaureus(and other qualifications, provided that they are recognised as being equivalent)",,,


In [None]:
# merge: omit url, duration, admission1&2 from merge (no need)
university_df['merge_raw'] = university_df[university_df.columns[[0,2,3,5,6,7,8,11,12,13]]].apply(
    lambda x: '. '.join(x.dropna().astype(str)), axis=1
)
university_df['merge_normalize_tokenize'] = university_df['merge_raw'].apply(lambda x: normalize_tokenize(x))
university_df['merge_normalize_tokenize'].head(3)

## S3: Feature Extraction using Word2Vec Word Embeddings

### Train a custom Word2Vec Model using 2 datasets
1. Read data
2. Hypertuning word2vec

Word2Vec is an unsupervised learning algorithm --> no need train/test split

In [None]:
# import gensim
from gensim.models import Word2Vec
# import gensim.downloader as api
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# vector_size (int, optional) – Dimensionality of the word vectors.
# window (int, optional) – Maximum distance between the current and predicted word within a sentence.
# min_count (int, optional) – Ignores all words with total frequency lower than this.
# workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).
# w2v_model = Word2Vec(documents['tokenized'], vector_size=300, window=8, workers=5, min_count=1)


documents = pd.DataFrame()
documents['tokenized'] = pd.concat([university_df['merge_normalize_tokenize'], user_pref_df['normalize_tokenize']])
corpus = documents['tokenized'].tolist()

In [None]:
# Base model Word2Vec
base_model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, sg=0, epochs=10)

# Hyperparameter combinations
param_grid = {
    'vector_size': [100, 200, 300],
    'window': [3, 5, 10],
    'min_count': [1, 2],
    'sg': [0, 1],
    'epochs': [15, 25],
    'alpha': [0.025, 0.05]
}
# Generate all combinations of hyperparameters
param_combinations = list(product(
    param_grid['vector_size'],
    param_grid['window'],
    param_grid['min_count'],
    param_grid['sg'],
    param_grid['epochs'],
    param_grid['alpha']
))

# Store results
results = []
intrisic_pairs = [('computer','science'),('economics','finance'),('engineering','technology'),('history','sociology'),('psychology','neuroscience'),('philosophy','ethic'),('art','design'),('business','management'),('medicine','healthcare'),('education','teaching')]
for params in param_combinations:
    vector_size, window, min_count, sg, epochs, alpha = params
    print(f"Training model with: vector_size={vector_size}, window={window}, min_count={min_count}, sg={sg}, epochs={epochs}, alpha={alpha}")
    model = Word2Vec(corpus, vector_size=vector_size, window=window, min_count=min_count, sg=sg, epochs=epochs, alpha=alpha, workers=5)
    
    # Evaluate the model (example: intrinsic evaluation or similarity tasks)
    similarity_point = 0
    for a, b in intrisic_pairs:
        similarity_point += model.wv.similarity(a,b)  # Example similarity tasks
    results.append((params, similarity_point))

In [None]:
# Save all hyperparams to csv file
results_df = pd.DataFrame(results)
results_df.columns = ['params', 'similarity_point']
with open('Word2Vec_hyperparam.csv', 'a') as file:
    results_df.to_csv(file, index=False)

In [None]:
# Find the best parameters
best_params, best_score = max(results, key=lambda x: x[1])
print(f"Best parameters: {best_params} with score: {best_score}")

# Retrain with best model
vector_size, window, min_count, sg, epochs, alpha = best_params
# better core can go up to 16 worker
best_model = Word2Vec(corpus, vector_size=vector_size, window=window, min_count=min_count, sg=sg, epochs=epochs, alpha=alpha, workers=5)

# Save the model
best_model.save("word2vec_best_model.model")

In [None]:
# Load the model when needed
model = Word2Vec.load("word2vec_best_model.model")

# model = Word2Vec(
#     corpus,
#     vector_size=200, 
#     window=10, 
#     min_count=1, 
#     sg=0, 
#     epochs=5, 
#     alpha=0.025, 
#     workers=5)

### Compute TF-IDF Weights

- Emphasizes Important Words: Words with higher TF-IDF scores contribute more to the sentence vector.
- Reduces Noise: Less important words have less impact on the representation.\
Example: Words like "computer" and "science" in your example sentence will have higher weights, while words like "gain" and "in" will have lower influence.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Combine all text for TF-IDF computation
all_text_contracted = pd.concat([user_pref_df['preference'], university_df['merge_raw']])
all_text = all_text_contracted.apply(lambda x: contractions.fix(x))
# all_text.to_list()

# Init TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
# Compute
tfidf_vectorizer.fit_transform(all_text)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create a dictionary mapping each word to its IDF weight
tfidf_dict = dict(zip(feature_names, tfidf_vectorizer.idf_))

### Compute Sentence Vectors with TF-IDF Weighted Averaging

In [None]:
def tfidf_weighted_vector(tokens, model, tfidf_dict):
    word_vectors = []
    total_weight = 0
    
    for word in tokens:
        if word in model.wv and word in tfidf_dict:  # Ensure the word is in both Word2Vec and TF-IDF
            weight = tfidf_dict[word]
            word_vectors.append(model.wv[word] * weight)
            total_weight += weight

    if word_vectors:  # If there are valid word vectors
        return np.sum(word_vectors, axis=0) / total_weight
    else:  # Return a zero vector if no valid tokens are found
        return np.zeros(model.vector_size)
    
documents['sentence_vector'] = documents['tokenized'].apply(lambda x: tfidf_weighted_vector(x, model, tfidf_dict))

### Compute Similarities with cosine similarity

- TESTING: try one user with all programs

In [None]:
test_user_id = 500
test_user = user_pref_df['normalize_tokenize'].iloc[test_user_id]

# txt = "An English-taught Master of Science in Renewable Energy Systems in Hamburg would help me focus on solar and wind energy technologies, preparing me for a global career in green energy"
# test_user = normalize_tokenize(txt)

# Compute TF-IDF weighted vectors for user preference
test_user_vector = tfidf_weighted_vector(test_user, model, tfidf_dict)
test_user_vector = np.array(test_user_vector).reshape(1, -1) # (1, vector dimension)

# Compute TF-IDF weighted vectors for university programs
university_df['tfidf_vector'] = university_df['merge_normalize_tokenize'].apply(lambda x: tfidf_weighted_vector(x, model, tfidf_dict))
program_vectors = np.stack(university_df['tfidf_vector'].values)

# Compute similarity between user1 and all programs
similarities = cosine_similarity(test_user_vector, program_vectors)[0]

In [None]:
test_df = pd.DataFrame()
test_df['similarity'] = similarities
test_df_sorted = test_df.sort_values(by='similarity', ascending=False)
rankingID_top = test_df_sorted.head(5).index.to_list()

print(f"User {test_user_id} Preference: {user_pref_df['preference'].iloc[test_user_id]}")
# print(txt)
print(f"Top 5: {university_df[university_df.columns[[0,2,3,5,6]]].iloc[rankingID_top]}")

User 500 Preference: A Master's in blockchain technology appeals to me, especially with applications in finance.
Top 5:                                         program_name                                                                      university    location         degreeType language
2572                  Data Engineering and Analytics                                                 Munich University of Technology    Garching  Master of Science  English
8635           Quantitative Finance and Data Science                                         Berlin School of Applied Sciences (HTW)      Berlin  Master of Science   German
6530      Mathematical Finance and Actuarial Science                                                 Munich University of Technology    Garching  Master of Science  English
2409  Corporate Sustainability & Sustainable Finance                                          University of Applied Sciences Kempten     Kempten     Master of Arts  English
1178           

### Map Similarity Scores to Synthetic Ratings

In [None]:
def map_similarity_to_rating(similarities):
    # Scale similarity (0-1) to ratings (1-5)
    ratings = similarities * 4 + 1  # Scale to 1-5
    # Introduce randomness
    noise = np.random.normal(0, 0.5, size=similarities.shape)  # Adjust standard deviation as needed
    ratings += noise
    # Ensure ratings is within bounds (1,5)
    ratings = np.clip(ratings, 1, 5)
    # Round to nearest half
    ratings = np.round(ratings, 2)
    return ratings 

### Generate Synthetic Ratings

In [None]:
user_columns = []
uni_id = [f'uni_id_{i}' for i in range(similarities.shape[0])]

for user_index, user_preference in enumerate(user_pref_df['normalize_tokenize']):
    # Compute TF-IDF weighted vectors for user preference
    user_vector = tfidf_weighted_vector(user_preference, model, tfidf_dict)
    user_vector = np.array(user_vector).reshape(1, -1) # (1, vector dimension)

    similarities = cosine_similarity(user_vector, program_vectors)[0]

    rating = map_similarity_to_rating(similarities)
    user_columns.append(pd.Series(rating, index=uni_id, name=f'userid_{user_index}'))

user_ratings = pd.concat(user_columns, axis=1)
user_ratings.to_csv('../university_user_ratings.csv',index=True)

## S4: Introduce Data Sparsity

To simulate real-world data where users rate only a subset of items, have each user rate a random selection of programs.

In [None]:
import random
import pandas as pd

In [None]:
user_ratings = pd.read_csv('../university_user_ratings.csv').set_index('Unnamed: 0')
user_ratings.index.names = ['index']

data_sparse_user_ratings = pd.DataFrame().reindex_like(user_ratings)

for user_index in range(user_ratings.shape[1]):
    # Each user randomly rate K (10-30) items
    k = random.randint(500,2000)
    uni_id_to_rate = random.sample(list(range(user_ratings.shape[0])), k=k) # choose which uni_ids will be evaluated and put in list
    for uni_id in uni_id_to_rate:
        data_sparse_user_ratings.loc[f'uni_id_{uni_id}', f'userid_{user_index}'] \
            = user_ratings.loc[f'uni_id_{uni_id}', f'userid_{user_index}']


In [None]:
# user_ratings = pd.read_csv('../university_user_ratings.csv').set_index('Unnamed: 0')
# user_ratings.index.names = ['index']
# data_sparse_user_ratings = user_ratings.copy()

# for user_index in range(user_ratings.shape[1]):
#     # Each user randomly rate K (10-30) items
#     k = random.randint(100,200)
#     uni_id_to_rate = random.sample(list(range(user_ratings.shape[0])), k=k) # choose which uni_ids will be evaluated and put in list
#     for uni_id in uni_id_to_rate:
#         data_sparse_user_ratings.loc[f'uni_id_{uni_id}', f'userid_{user_index}'] \
#             = None


In [None]:
melted_df_ratings = data_sparse_user_ratings.reset_index().melt(id_vars=['index'], var_name='user_id', value_name='ratings')
melted_df_ratings.rename(columns={'index': 'uni_id'}, inplace=True)
melted_df_ratings.to_csv('../user_item_df.csv', index=False)

# calculate density & sparsity
tmp = melted_df_ratings.count()
actual_ratings = tmp['ratings']
total_possible_entries = tmp['user_id']
sparsity = 1 - (actual_ratings / total_possible_entries)
density = actual_ratings / total_possible_entries
print(f"Sparsity: {sparsity:.4f}")
print(f"Density: {density:.4f}")

ratings_matrix = melted_df_ratings.pivot(index='uni_id', columns='user_id', values='ratings')
# print(ratings_matrix)
print('User-Item Matrix:')
print(ratings_matrix.head(5))

Sparsity: 0.8800
Density: 0.1200
User-Item Matrix:
user_id      userid_0  userid_1  userid_10  userid_100  userid_101  \
uni_id                                                               
uni_id_0          NaN       NaN        NaN         NaN         NaN   
uni_id_1          NaN       NaN        NaN         NaN         NaN   
uni_id_10         NaN       NaN        NaN         NaN         NaN   
uni_id_100        NaN       4.0        NaN         NaN         NaN   
uni_id_1000       3.5       NaN        NaN         2.5         NaN   

user_id      userid_102  userid_103  userid_104  userid_105  userid_106  ...  \
uni_id                                                                   ...   
uni_id_0            NaN         NaN         3.5         NaN         5.0  ...   
uni_id_1            NaN         NaN         NaN         NaN         3.5  ...   
uni_id_10           NaN         NaN         NaN         NaN         NaN  ...   
uni_id_100          NaN         NaN         NaN         3.

## Memory-based Collaborative Filtering

In [None]:
# Surprise is a Python scikit for building and analyzing recommender systems that deal with explicit rating data.
# $ conda install -c conda-forge scikit-surprise
from surprise import KNNBasic
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVD, NMF


### Build Dataset for prediction

In [None]:
data = pd.read_csv('../user_item_df.csv').dropna()

reader = Reader() #default is already 1-5
dataset = Dataset.load_from_df(data[['user_id','uni_id','ratings']], reader) #It must have three columns, corresponding to the user (raw) ids, the item (raw) ids, and the ratings, in this order.


In [None]:
data.count()

uni_id     5763816
user_id    5763816
ratings    5681007
dtype: int64

### User-based CF using kNN Algorithm

In [None]:
sim_options = {
    'name': 'cosine',  # Use cosine similarity
    'user_based': True,  # User-based collaborative filtering
    'min_support': 3,   # Minimum number of common items for similarity
    'shrinkage': 100    # Shrinkage parameter in case of sparse data
}

# Define the algorithm
user_cf = KNNBasic(k=20, min_k=1,sim_options=sim_options,verbose=True)

# Perform 5-fold cross-validation
user_cf_cv_results = cross_validate(user_cf, dataset, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.5752  0.5741  0.5751  0.5748  0.0005  
MAE (testset)     0.4589  0.4581  0.4592  0.4587  0.0005  
Fit time          1.77    1.97    2.12    1.96    0.14    
Test time         17.75   17.94   17.05   17.58   0.38    


### Item-based CF

In [None]:
# Define Item-Based CF algorithm
sim_options = {
    'name': 'cosine',  # Use cosine similarity
    'user_based': False,  # Item-based collaborative filtering
    'min_support': 5,   # Minimum number of common items for similarity
    'shrinkage': 100    # Shrinkage parameter in case of sparse data
}
item_cf = KNNBasic(k=20, min_k=1,sim_options=sim_options, verbose=True)

# Perform 5-fold cross-validation
item_cf_cv_results = cross_validate(item_cf, dataset, measures=['RMSE', 'MAE'], cv=3, verbose=True)


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.5909  0.5917  0.5911  0.5912  0.0003  
MAE (testset)     0.4714  0.4716  0.4718  0.4716  0.0002  
Fit time          75.10   65.70   65.99   68.93   4.36    
Test time         159.62  173.46  171.87  168.32  6.19    


## Model-based Collaborative Filtering

### Singular Vector Decomposition (SVD)

In [None]:
# Define SVD algorithm
svd_algo = SVD(n_factors=10, n_epochs=20,verbose=True)

# Perform 5-fold cross-validation
svd_cv_results = cross_validate(svd_algo, dataset, measures=['RMSE', 'MAE'], cv=3, verbose=True)


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing

### Non-Negative Matrix Factorization

In [None]:
# Define NMF algorithm
nmf_algo = NMF(n_factors=10, n_epochs=20,biased=False)

# Perform 5-fold cross-validation
nmf_cv_results = cross_validate(nmf_algo, dataset, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.6708  0.6726  0.6770  0.6735  0.0026  
MAE (testset)     0.5397  0.5411  0.5444  0.5417  0.0019  
Fit time          3.90    3.93    3.80    3.88    0.06    
Test time         2.16    2.54    2.10    2.27    0.20    


## Comparison

In [None]:
def print_cv_results(algo_name, cv_results):
    mean_rmse = cv_results['test_rmse'].mean()
    mean_mae = cv_results['test_mae'].mean()
    std_rmse = cv_results['test_rmse'].std()
    std_mae = cv_results['test_mae'].std()
    print(f"{algo_name} - RMSE: {mean_rmse:.4f} (± {std_rmse:.4f}), MAE: {mean_mae:.4f} (± {std_mae:.4f})")

print_cv_results("User-Based CF", user_cf_cv_results)
print_cv_results("Item-Based CF", item_cf_cv_results)
print_cv_results("SVD", svd_cv_results)
print_cv_results("NMF", nmf_cv_results)

User-Based CF - RMSE: 0.5748 (± 0.0005), MAE: 0.4587 (± 0.0005)
Item-Based CF - RMSE: 0.5912 (± 0.0003), MAE: 0.4716 (± 0.0002)
SVD - RMSE: 0.5527 (± 0.0007), MAE: 0.4413 (± 0.0005)
NMF - RMSE: 0.6735 (± 0.0026), MAE: 0.5417 (± 0.0019)
