<a href="https://colab.research.google.com/github/paulusshewamre/Content-Collab-Hybrid-recsys/blob/main/CollaborativeBased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install dependencies
!pip install implicit kagglehub pandas scipy --quiet

# Step 2: Imports
import pandas as pd
from scipy.sparse import csr_matrix
import implicit
import kagglehub
import os
import numpy as np

# Step 3: Download dataset from Kaggle
path = kagglehub.dataset_download("rounakbanik/the-movies-dataset")

# Step 4: Load ratings
# NOTE: ratings_small.csv identifies movies by their MovieLens `movieId`.
ratings_path = os.path.join(path, "ratings_small.csv")
df_ratings = pd.read_csv(ratings_path)
df_ratings = df_ratings[['userId', 'movieId', 'rating']]
print("Sample ratings data:")
print(df_ratings.head())

# Step 5: Load movies metadata
movies_path = os.path.join(path, "movies_metadata.csv")
df_movies = pd.read_csv(movies_path, low_memory=False)

# Step 6: Use proper movie ID column
if 'id' in df_movies.columns:
    # movies_metadata.csv's `id` column is a TMDB id, which is a different id
    # space from the MovieLens `movieId` used by the ratings file. Renaming it
    # to `movieId` directly would attach the wrong titles to the ratings, so
    # translate TMDB ids to MovieLens ids through links_small.csv first.
    links_path = os.path.join(path, "links_small.csv")
    df_links = pd.read_csv(links_path, usecols=['movieId', 'tmdbId'])
    df_links = df_links.dropna(subset=['tmdbId'])

    df_movies = df_movies[['id', 'title']].rename(columns={'id': 'tmdbId'})
    # Some metadata rows carry malformed ids; coerce them to NaN and drop them.
    df_movies = df_movies.assign(
        tmdbId=pd.to_numeric(df_movies['tmdbId'], errors='coerce')
    ).dropna(subset=['tmdbId'])
    df_movies = df_movies.merge(df_links, on='tmdbId', how='inner')[['movieId', 'title']]
else:
    # Metadata already keyed by MovieLens movieId: use it as-is.
    df_movies = df_movies[['movieId', 'title']]

# Step 7: Ensure movieId is integer and filter ratings
df_movies = df_movies.assign(
    movieId=pd.to_numeric(df_movies['movieId'], errors='coerce')
).dropna(subset=['movieId'])
df_movies = df_movies.assign(movieId=df_movies['movieId'].astype(int))
# Keep one title per movie so later lookups are unambiguous.
df_movies = df_movies.drop_duplicates(subset='movieId').reset_index(drop=True)

# .copy() so the index columns added below are written to an independent frame
# rather than to a view of the original ratings (avoids SettingWithCopyWarning).
df_ratings = df_ratings[df_ratings['movieId'].isin(df_movies['movieId'])].copy()

# Step 8: Map userId and movieId to zero-based indices
# Indices follow first-appearance order in the filtered ratings.
user_map = {user_id: idx for idx, user_id in enumerate(df_ratings['userId'].unique())}
movie_map = {movie_id: idx for idx, movie_id in enumerate(df_ratings['movieId'].unique())}
df_ratings['user_index'] = df_ratings['userId'].map(user_map)
df_ratings['movie_index'] = df_ratings['movieId'].map(movie_map)

# Step 9: Create user-item CSR matrix (users x items)
# implicit >= 0.5 expects rows = users and columns = items for both fit() and
# recommend(). Feeding an items x users matrix makes the model learn "users"
# that are really movies, so the recommendations come out meaningless.
n_users = len(user_map)
n_items = len(movie_map)
user_item_csr = csr_matrix(
    (
        df_ratings['rating'].astype(float).to_numpy(),
        (df_ratings['user_index'].to_numpy(), df_ratings['movie_index'].to_numpy()),
    ),
    shape=(n_users, n_items),
)
# Items x users view, kept under its original name for any later cell that uses it.
item_user_csr = user_item_csr.T.tocsr()

# Step 10: Train ALS model
# Ratings are scaled by alpha to act as confidence weights for implicit ALS.
alpha = 15
data_conf = (user_item_csr * alpha).astype('double')  # users x items
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)
model.fit(data_conf)

# Step 11: Prepare single-row CSR matrix for a user
user_idx = int(df_ratings['user_index'].iloc[0])  # example user
# Row of the user-item matrix: shape (1, n_items), the movies this user rated.
user_vector = user_item_csr[user_idx]

# Step 12: Make recommendations
# `userid` selects the user's learned factors, so it must be the model's user
# index (not a row position inside `user_vector`).
recommended = model.recommend(
    userid=user_idx,
    user_items=user_vector,
    N=5,
    filter_already_liked_items=True
)
# implicit >= 0.5 returns a pair of parallel arrays (item indices, scores),
# not a list of (item, score) tuples.
recommended_ids, recommended_scores = recommended

# Step 13: Map recommendations back to movie IDs and titles
movie_idx_reverse_map = {v: k for k, v in movie_map.items()}
# One title per movieId (first occurrence wins) for direct lookup.
title_by_movie_id = df_movies.drop_duplicates(subset='movieId').set_index('movieId')['title']

print(f"\nTop 5 recommendations for user {df_ratings['userId'].iloc[0]}:")
for movie_idx, score in zip(recommended_ids, recommended_scores):
    movie_id = movie_idx_reverse_map[int(movie_idx)]
    # Movies without a known title are skipped.
    if movie_id in title_by_movie_id.index:
        title = title_by_movie_id.loc[movie_id]
        print(f"Movie: {title}, Score: {score:.2f}")


Using Colab cache for faster access to the 'the-movies-dataset' dataset.
Sample ratings data:
   userId  movieId  rating
0       1       31     2.5
1       1     1029     3.0
2       1     1061     3.0
3       1     1129     2.0
4       1     1172     4.0


  0%|          | 0/20 [00:00<?, ?it/s]


Top 5 recommendations for user 1:
Movie: Terminator Salvation, Score: 524.00
Movie: Greed, Score: 1.23
