## Dataset 1: MovieLens 100K (Explicit Ratings)

### 🔹 Setup Summary
- **Data type:** Explicit ratings (1–5 stars)
- **Training/Test Split:** [e.g., u1.base & u1.test (80/20 split)]
- **Parameters Tested:** 
- **Evaluation Metrics:** RMSE, MAE, Precision@5, Recall@5, Precision@10, Recall@10, NDCG@10


In [22]:
#import relevant libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from scipy.spatial.distance import pdist, squareform


# 1. Load data


In [None]:
#familiar with the train and test data
train = pd.read_csv('ml-100k/u1.base', sep='\t',
                    names=['user_id','item_id','rating','timestamp'])
test  = pd.read_csv('ml-100k/u1.test', sep='\t',
                    names=['user_id','item_id','rating','timestamp'])

print(train.shape, test.shape)
train.head()

(80000, 4) (20000, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [12]:
#familiar with the data
with open('ml-100k/u.info', 'r') as f:
    info = f.read()
print(info)

943 users
1682 items
100000 ratings



In [None]:
#familiar with the movie data
column_names = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's",
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
    'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
    'War', 'Western'
]

u_item = pd.read_csv('ml-100k/u.item', sep='|', names=column_names, encoding='latin-1')
print(u_item.head())

   movie_id              title release_date  video_release_date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb_URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://us.imdb.com/M/title-exact?Copycat%20(1995)        0       0   

   Adventure  Animation  Children's  ...  Fantasy  Film-Noir  Horror  Musical  \
0          0          1           1  ...        0          0       0        0

# 2. Create user-item matrix


In [38]:
all_users = np.union1d(train['user_id'].unique(), test['user_id'].unique())
all_items = np.union1d(train['item_id'].unique(), test['item_id'].unique())

# Pivot train matrix
train_matrix = train.pivot(index='user_id', columns='item_id', values='rating')
train_matrix = train_matrix.reindex(index=all_users, columns=all_items, fill_value=0)
train_matrix.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,,4.0,1.0,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


# 3. user-based collaborative filtering

In [39]:
def user_based_similarity(matrix, method='cosine'):
    if method == 'cosine':
        sim = cosine_similarity(matrix.values)
    elif method == 'pearson':
        sim = fast_pearson_similarity(matrix)
    elif method == 'euclidean':
        dist = squareform(pdist(matrix.values, metric='euclidean'))
        sim = 1 / (1 + dist)
    else:
        raise ValueError("Unknown similarity method")
    return pd.DataFrame(sim, index=matrix.index, columns=matrix.index)

def fast_pearson_similarity(matrix):
    X = matrix.values
    mean_user = X.mean(axis=1, keepdims=True)
    X_centered = X - mean_user
    numerator = X_centered @ X_centered.T
    denom = np.linalg.norm(X_centered, axis=1)
    denominator = np.outer(denom, denom)
    denominator[denominator == 0] = 1e-8
    return numerator / denominator

In [41]:
def user_based_collaborative_filtering(train_matrix, user_sim, k=None):
    X = train_matrix.values
    n_users, n_items = X.shape
    pred = np.zeros_like(X)
    
    for u in range(n_users):
        if k:
            top_k_users = np.argsort(user_sim[u, :])[-(k+1):]
            top_k_users = top_k_users[top_k_users != u]
        else:
            top_k_users = np.arange(n_users)
            top_k_users = top_k_users[top_k_users != u]
        
        for i in range(n_items):
            neighbors = [v for v in top_k_users if X[v, i] > 0]
            if neighbors:
                weights = user_sim[u, neighbors]
                pred[u, i] = np.dot(weights, X[neighbors, i]) / np.sum(np.abs(weights))
            else:
                pred[u, i] = 0
    return pd.DataFrame(pred, index=train_matrix.index, columns=train_matrix.columns)


# 4. evaluation

In [42]:
def rmse(pred_matrix, test_df):
    preds, trues = [], []
    for row in test_df.itertuples(index=False):
        user, item, rating = row.user_id, row.item_id, row.rating
        if user in pred_matrix.index and item in pred_matrix.columns:
            preds.append(pred_matrix.loc[user, item])
            trues.append(rating)
    return np.sqrt(np.mean((np.array(preds) - np.array(trues))**2))

def mae(pred_matrix, test_df):
    preds, trues = [], []
    for row in test_df.itertuples(index=False):
        user, item, rating = row.user_id, row.item_id, row.rating
        if user in pred_matrix.index and item in pred_matrix.columns:
            preds.append(pred_matrix.loc[user, item])
            trues.append(rating)
    return np.mean(np.abs(np.array(preds) - np.array(trues)))

def precision_recall_at_k(pred_matrix, test_df, k=5):
    precisions, recalls = [], []
    for user in test_df['user_id'].unique():
        if user not in pred_matrix.index:
            continue
        # Top-k recommended items
        user_preds = pred_matrix.loc[user].sort_values(ascending=False)
        top_k_items = user_preds.index[:k]
        true_items = test_df[test_df['user_id'] == user]['item_id'].values
        hits = len(set(top_k_items) & set(true_items))
        precisions.append(hits / k)
        recalls.append(hits / len(true_items))
    return np.mean(precisions), np.mean(recalls)

def ndcg_at_k(pred_matrix, test_df, k=10):
    ndcgs = []
    for user in test_df['user_id'].unique():
        if user not in pred_matrix.index:
            continue
        user_preds = pred_matrix.loc[user].sort_values(ascending=False)
        top_k_items = user_preds.index[:k]
        true_items = test_df[test_df['user_id'] == user]['item_id'].values
        dcg = sum([1 / np.log2(i + 2) if item in true_items else 0 for i, item in enumerate(top_k_items)])
        idcg = sum([1 / np.log2(i + 2) for i in range(min(k, len(true_items)))])
        ndcgs.append(dcg / idcg if idcg > 0 else 0)
    return np.mean(ndcgs)


# main

In [44]:
similarities = ['cosine', 'pearson', 'euclidean']
k_values = range(5, 31, 5)

results = []

for sim in similarities:
    print(f"\n--- Evaluating similarity: {sim} ---")
    user_sim_matrix = user_based_similarity(train_matrix, method=sim).values
    for k in k_values:
        pred_matrix = user_based_collaborative_filtering(train_matrix, user_sim_matrix, k=k)
        
        result = {
            'similarity': sim,
            'k': k,
            'RMSE': rmse(pred_matrix, test),
            'MAE': mae(pred_matrix, test),
            'P@5': precision_recall_at_k(pred_matrix, test, k=5)[0],
            'R@5': precision_recall_at_k(pred_matrix, test, k=5)[1],
            'P@10': precision_recall_at_k(pred_matrix, test, k=10)[0],
            'R@10': precision_recall_at_k(pred_matrix, test, k=10)[1],
            'NDCG@10': ndcg_at_k(pred_matrix, test, k=10)
        }
        results.append(result)
        print(f"k={k} → RMSE={result['RMSE']:.4f}, MAE={result['MAE']:.4f}, P@5={result['P@5']:.4f}")

  
# Convert to DataFrame
results_df = pd.DataFrame(results)
print(results_df)


--- Evaluating similarity: cosine ---


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').