$$ ITI \space AI: \space Intake \space 45 $$
$$ Recommender \space Systems $$
$$ Lab \space no. \space 1 $$

# `01` Import Necessary Libraries

## `i` Default Libraries

In [185]:
import numpy as np
import pandas as pd

## `ii` Additional Libraries
Add imports for additional libraries you used throughout the notebook

----------------------------

# `02` Load Data

In [186]:
ratings = pd.read_csv("Data/songsDataset.csv", names=['userID', 'songID', 'rating'], skiprows=[0])
ratings.head()

Unnamed: 0,userID,songID,rating
0,0,90409,5
1,4,91266,1
2,5,8063,2
3,5,24427,4
4,5,105433,4


---------------------------------

# `03` Similarity Metrics

## `0` Utility Matrix
Construct utility matrix for the loaded data `ratings`
- Users as Index
- Songs as Columns

**Hint**: you can use `pandas.DataFram.pivot` method (see [Documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot.html))

In [187]:
utility_matrix = ratings.pivot(index='userID', columns='songID', values='rating')
utility_matrix.head()

songID,2263,2726,3785,8063,12709,13859,16548,17029,19299,19670,...,113954,119103,120147,122065,123176,125557,126757,131048,132189,134732
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,2.0,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,3.0
14,,,,,,,,,,,...,,,,,,,,,,


## `i` Cosine Similarity
Finish implmenting the function below to calculate `Cosine Similarity` between two vectors

In [188]:
def cosine_sim(vec_a, vec_b):
    """
    Returns the raw cosine similarity score between two vectors.

            Parameters:
                vec_a (pandas.Series): Vector A
                vec_b (pandas.Series): Vector B

            Returns:
                sim_score (float): Similarity score between vectors vec_a and vec_b 
    """
    vec_a = vec_a.fillna(0)
    vec_b = vec_b.fillna(0)

    sim_score = (vec_a@vec_b)/(np.linalg.norm(vec_a)*np.linalg.norm(vec_b))

    return sim_score

In [189]:
print(f'Cosine Similarity between userID 56 and userID 227 is: {cosine_sim(utility_matrix.iloc[56].copy(), utility_matrix.iloc[227].copy())}')

Cosine Similarity between userID 56 and userID 227 is: 0.7808688094430304


## `ii` Adjusted Cosine Similarity
Finish implmenting the function below to calculate `Adjusted Cosine Similarity` between two vectors

In [190]:
def adjusted_cosine_sim(vec_a, vec_b):
    """
    Returns the adjusted cosine similarity score between two vectors.

            Parameters:
                vec_a (pandas.Series): Vector A
                vec_b (pandas.Series): Vector B

            Returns:
                sim_score (float): Similarity score between vectors vec_a and vec_b 
    """
    vec_a = vec_a.fillna(0)
    vec_b = vec_b.fillna(0)

    numerator = (vec_a-vec_a.mean())@(vec_b-vec_b.mean())
    denominator = (np.linalg.norm(vec_a-vec_a.mean()))*(np.linalg.norm(vec_b-vec_b.mean()))
    
    sim_score = numerator/denominator

    return sim_score

In [191]:
print(f'Adjusted Cosine Similarity between userID 56 and userID 227 is: {adjusted_cosine_sim(utility_matrix.iloc[56].copy(), utility_matrix.iloc[227].copy())}')

Adjusted Cosine Similarity between userID 56 and userID 227 is: 0.7764278070396685


## `iii` Pearson Correlation Coefficient
Finish implmenting the function below to calculate `Pearson Correlation Coefficient` between two vectors

In [192]:
def pearson_sim(vec_a, vec_b):
    """
    Returns the pearson similarity score between two vectors.

            Parameters:
                vec_a (pandas.Series): Vector A
                vec_b (pandas.Series): Vector B

            Returns:
                sim_score (float): Similarity score between vectors vec_a and vec_b 
    """
    vec_a = vec_a.fillna(0)
    vec_b = vec_b.fillna(0)
    
    numerator = (vec_a-vec_a.mean())@(vec_b-vec_b.mean())
    denominator = (np.linalg.norm(vec_a-vec_a.mean()))*(np.linalg.norm(vec_b-vec_b.mean()))
    
    sim_score = numerator/denominator

    return sim_score

In [193]:
print(f'Pearson Similarity between songID 3785 and songID 17029 is: {pearson_sim(utility_matrix[3785].copy(), utility_matrix[17029].copy())}')

Pearson Similarity between songID 3785 and songID 17029 is: -0.015085785303531152


## `iv` Mean Squared Difference
Finish implmenting the function below to calculate `Mean Squared Difference` between two vectors

**Note**: Make sure you calculate the difference for common dimensions only (i.e. the dimensions both items/users have non-zero values in)

In [194]:
def msd_sim(vec_a, vec_b):
    """
    Returns the mean squared difference similarity score between two vectors.
    Note: Only consider common items between the two vectors

            Parameters:
                vec_a (pandas.Series): Vector A
                vec_b (pandas.Series): Vector B

            Returns:
                sim_score (float): Similarity score between vectors vec_a and vec_b 
    """
    common_mask = vec_a.notna() & vec_b.notna()
    
    common_a = vec_a[common_mask]
    common_b = vec_b[common_mask]

    MSD = np.mean((common_a - common_b)**2)

    sim_score = 1/(1+MSD)

    return sim_score

In [195]:
print(f'MSD Similarity between userID 56 and userID 227 is: {msd_sim(utility_matrix.iloc[56].copy(), utility_matrix.iloc[227].copy())}')
print(f'MSD Similarity between songID 3785 and songID 17029 is: {msd_sim(utility_matrix[3785].copy(), utility_matrix[17029].copy())}')

MSD Similarity between userID 56 and userID 227 is: 1.0
MSD Similarity between songID 3785 and songID 17029 is: 0.6363636363636364


--------------------------

# `04` Collaborative Filtering

Practice for item-based collaborative filtering

## `0` Utility Matrix
Construct utility matrix for the loaded data `ratings`
- Songs as Index
- Users as Columns

In [196]:
utility_matrix = ratings.pivot(index='songID',columns='userID',values='rating')
utility_matrix = utility_matrix.fillna(0)

In [197]:
utility_matrix.head()

userID,0,4,5,7,14,20,31,33,40,46,...,199956,199969,199973,199974,199975,199976,199980,199988,199990,199996
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
3785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8063,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## `i` Item-Item Similarity Matrix

Construct item-item (Cosine/Adjusted Cosine) similarity matrix from the utility matrix  above.

In [198]:
num_songs = utility_matrix.shape[0]
sim_mat = np.zeros((num_songs, num_songs))

for i in range(num_songs):
    for j in range(num_songs):
        sim_mat[i, j] = adjusted_cosine_sim(utility_matrix.iloc[i].copy(), utility_matrix.iloc[j].copy())

In [199]:
sim_df = pd.DataFrame(sim_mat, index=utility_matrix.index, columns=utility_matrix.index)
sim_df.head()

songID,2263,2726,3785,8063,12709,13859,16548,17029,19299,19670,...,113954,119103,120147,122065,123176,125557,126757,131048,132189,134732
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2263,1.0,-0.0065,-0.017511,-0.016326,-0.01752,-0.013347,-0.022847,-0.007725,-0.017581,-0.017882,...,-0.007331,-0.008085,-0.009286,-0.004556,-0.020674,-0.014522,-0.011948,-0.013081,-0.017415,-0.012329
2726,-0.0065,1.0,-0.016699,-0.01094,-0.016806,-0.011452,-0.023635,0.01324,-0.019354,-0.020125,...,-0.000729,0.00947,0.013797,-0.016811,-0.018107,-0.009166,-0.011642,-0.012274,-0.02302,-0.007772
3785,-0.017511,-0.016699,1.0,0.001511,-0.002429,-0.007363,-0.010149,-0.015086,-0.013344,-0.014637,...,-0.015709,-0.00797,-0.015821,-0.015284,-0.005766,-0.010261,-0.0133,-0.007578,-0.00749,-0.003461
8063,-0.016326,-0.01094,0.001511,1.0,-0.003506,-0.001862,-0.013025,-0.005731,0.007944,-0.016066,...,-0.01948,-0.001559,-0.014644,-0.015865,-0.004209,-0.006944,-0.011152,-0.006553,-0.013862,0.005777
12709,-0.01752,-0.016806,-0.002429,-0.003506,1.0,-0.011653,-0.014726,-0.004692,-0.002641,-0.006035,...,-0.014878,-0.011811,-0.006868,-0.007521,-0.013235,-0.011558,-0.016553,-0.009346,0.000393,-0.005


## `ii` Candidate Generation and Filtering

Filter out items (user 199988) has rated from the similarity matrix above.

In [200]:
user_id = 199988
potential_items = utility_matrix[user_id][utility_matrix[user_id] != 0].index
potential_items
#songs ids rated by user 199988

Index([2726, 19299, 43267, 56660], dtype='int64', name='songID')

In [201]:
filtered_sim_df = sim_df.loc[potential_items]
filtered_sim_df.drop(potential_items,axis=1,inplace=True)
# filter similarity difference matrix by the potential items and drop them columns to make it easy to the next step

In [202]:
filtered_sim_df

songID,2263,3785,8063,12709,13859,16548,17029,19670,22763,24427,...,113954,119103,120147,122065,123176,125557,126757,131048,132189,134732
songID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2726,-0.0065,-0.016699,-0.01094,-0.016806,-0.011452,-0.023635,0.01324,-0.020125,-0.023036,-0.017468,...,-0.000729,0.00947,0.013797,-0.016811,-0.018107,-0.009166,-0.011642,-0.012274,-0.02302,-0.007772
19299,-0.017581,-0.013344,0.007944,-0.002641,-0.003426,-0.001161,-0.012135,-0.004975,0.005505,0.001551,...,-0.018252,-0.005171,-0.015623,-0.002624,0.014827,-0.013707,-0.017614,-0.000595,-0.011755,0.00685
43267,-0.009534,-0.008429,-0.010259,-0.013956,-0.012019,-0.016373,0.007456,-0.018029,-0.01754,-0.007722,...,-0.00747,0.016524,0.018883,-0.015632,-0.01338,-0.006118,-0.003468,-0.010407,-0.018965,0.004009
56660,-0.016032,-0.007015,-0.009887,0.004105,-0.014507,-0.020362,-0.007994,-0.00904,-0.008463,-0.00702,...,-0.019435,-0.007092,-0.011,-0.010886,-0.003124,-0.009896,-0.014101,-0.009387,0.002315,0.000836


## `iii` Top-K Candidate Selection

Selet top-K (a k of your choice) similar items for each item (user 199988) rated from the filtered similarity matrix above.

In [203]:
def top_k_candidates(potential_items,filtered_sim_df,top_n):
    candidate_items = {}
    for i in potential_items:
        for j in filtered_sim_df.loc[i].nlargest(top_n).index:
            if j not in candidate_items:
                candidate_items[j] = [i]
            else:    
                candidate_items[j].append(i)
    return candidate_items

In [204]:
user_id = 199988  
k = 5
top_candidates = top_k_candidates(potential_items, filtered_sim_df, k)

print(f"Top {k} candidates for user '{user_id}':\n{top_candidates}")

Top 5 candidates for user '199988':
{120147: [2726, 43267], 17029: [2726, 43267], 40712: [2726], 86341: [2726], 119103: [2726, 43267], 105433: [19299], 123176: [19299], 8063: [19299], 43827: [19299], 134732: [19299], 42906: [43267], 45026: [43267], 90409: [56660], 48731: [56660], 12709: [56660], 60465: [56660], 25182: [56660]}


## `iv` Candidate Rating Prediction

Calculate the predicted rating for each of the candidate items.

In [205]:
def get_predicted_rating(top_candidates):
    predicted_ratings = {}
    for i in top_candidates:
        predicted_ratings[i] = 0
        for j in top_candidates[i]:
            predicted_ratings[i] = predicted_ratings[i] +utility_matrix.loc[j][user_id]*sim_df.loc[j][i]
        predicted_ratings[i] = predicted_ratings[i]/sum(sim_df.loc[j][i] for j in top_candidates[i])
    return predicted_ratings

In [206]:
predicted_ratings = get_predicted_rating(top_candidates)
print(f'Predicted ratings for user {user_id} are: \n{predicted_ratings}')

Predicted ratings for user 199988 are: 
{120147: 3.8443619615832922, 17029: 4.279497123471393, 40712: 5.0, 86341: 5.0, 119103: 3.7286203728626686, 105433: 5.0, 123176: 5.0, 8063: 5.0, 43827: 5.000000000000001, 134732: 5.0, 42906: 3.0, 45026: 3.0, 90409: 5.0, 48731: 5.0, 12709: 5.0, 60465: 5.0, 25182: 5.0}


In [207]:
# Create the DataFrame from the predicted ratings dictionary
candidate_df = pd.DataFrame.from_dict(predicted_ratings, orient='index', columns=['predicted_rating'])
candidate_df.index.name = 'Candidate'

# Extract top candidates and their similarities and ratings
top_candidates_1 = [top_candidates[x][0] for x in candidate_df.index]
top_candidates_2 = [top_candidates[x][1] if len(top_candidates[x]) > 1 else '<NA>' for x in candidate_df.index]

# Create intermediate dictionaries for vectorized operations
ref_1_similarity_dict = {x: sim_df.loc[x, top_candidates[x][0]] for x in candidate_df.index}
ref_1_rating_dict = {x: utility_matrix.loc[top_candidates[x][0], user_id] for x in candidate_df.index}

ref_2_similarity_dict = {x: sim_df.loc[x, top_candidates[x][1]] if len(top_candidates[x]) > 1 else '<NA>' for x in candidate_df.index}
ref_2_rating_dict = {x: utility_matrix.loc[top_candidates[x][1], user_id] if len(top_candidates[x]) > 1 else '<NA>' for x in candidate_df.index}

# Add the top candidates and their similarities and ratings to the DataFrame
candidate_df['ref_1'] = top_candidates_1
candidate_df['ref_1_similarity'] = candidate_df.index.map(ref_1_similarity_dict)
candidate_df['ref_1_rating'] = candidate_df.index.map(ref_1_rating_dict)

candidate_df['ref_2'] = top_candidates_2
candidate_df['ref_2_similarity'] = candidate_df.index.map(ref_2_similarity_dict)
candidate_df['ref_2_rating'] = candidate_df.index.map(ref_2_rating_dict)

# Display the DataFrame
candidate_df

Unnamed: 0_level_0,predicted_rating,ref_1,ref_1_similarity,ref_1_rating,ref_2,ref_2_similarity,ref_2_rating
Candidate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
120147,3.844362,2726,0.013797,5.0,43267.0,0.018883,3.0
17029,4.279497,2726,0.01324,5.0,43267.0,0.007456,3.0
40712,5.0,2726,0.012574,5.0,,,
86341,5.0,2726,0.009534,5.0,,,
119103,3.72862,2726,0.00947,5.0,43267.0,0.016524,3.0
105433,5.0,19299,0.027774,5.0,,,
123176,5.0,19299,0.014827,5.0,,,
8063,5.0,19299,0.007944,5.0,,,
43827,5.0,19299,0.00763,5.0,,,
134732,5.0,19299,0.00685,5.0,,,


------------------------------------------------------

# `05` Additional Tasks

## `i` Explore Surprise Library

- Install Scikit Surprise library.
- Explore the Library Documentation

In [208]:
import surprise

## `ii` Implement Item-Based KNN Approach

- Follow the steps explained in the sessions to prepare the KNN approach.
- Generate prediction ratings for user $199988$ on all songs.

In [209]:
from surprise import KNNBasic, Dataset, Reader

ratings = pd.read_csv("Data/songsDataset.csv", names=['userID', 'songID', 'rating'], skiprows=[0])

reader = Reader(rating_scale=(1, 5))

data = Dataset.load_from_df(ratings[['userID', 'songID', 'rating']], reader)

trainset = data.build_full_trainset()

knn_model = KNNBasic(k=40, min_k=1, sim_options={'name': 'cosine', 'user_based': False}, verbose=True)

knn_model.fit(trainset)

all_songs = ratings['songID'].unique()

user_id = 199988

predicted_ratings = []
for song_id in all_songs:
    predicted_rating = knn_model.predict(uid=user_id, iid=song_id).est
    predicted_ratings.append((song_id, predicted_rating))

predicted_ratings_df = pd.DataFrame(predicted_ratings, columns=['songID', 'predicted_rating'])
predicted_ratings_df = predicted_ratings_df.sort_values(by='predicted_rating', ascending=False)

print(predicted_ratings_df)


Computing the cosine similarity matrix...
Done computing similarity matrix.
    songID  predicted_rating
36   48731          4.641854
25   52611          4.586935
19   71582          4.572546
33   56660          4.569170
11   72017          4.558660
16   68572          4.552532
41   43827          4.551698
54  122065          4.549041
49   94604          4.535097
1    91266          4.529862
17  112023          4.529725
32   40712          4.529466
38   19299          4.528764
9    86341          4.527983
22   22763          4.526030
29   16548          4.516918
0    90409          4.516248
26  126757          4.516032
4   105433          4.515403
27   55240          4.513969
37    2726          4.513252
39   36561          4.512271
48   25182          4.509537
7    19670          4.508931
15  119103          4.508537
6   105421          4.508134
35   72309          4.502709
40   12709          4.499484
13   54042          4.497531
50   94535          4.494997
28   92881          4.494

----------------------------------------------

$$ Wish \space you \space all \space the \space best \space ♡ $$
$$ Mahmoud \space Shawqi $$