# In [None]:
"""
Hybrid Movie Recommender (User-based CF + Content-based genres)
Dataset expected: MovieLens 100K (folder path provided by user)
Files used:
 - u.data      (tab-separated: user_id, item_id, rating, timestamp)
 - u.item      (pipe-separated; fields include movie id, movie title, release, ... then 19 genre flags)

By default the script reads the dataset from the path below; pass --data_path to use another location:
D:\\Internship\\task_5\\ml-100k

Save this file and run with: python hybrid_recommender_ml100k.py
Requirements: pandas, numpy, scikit-learn
"""

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Tuple
import random
import argparse

# --------- Config ----------
# Default location of the ml-100k folder; the --data_path CLI option overrides it.
ML100K_PATH = r"D:\Internship\task_5\ml-100k"
# Seed for Python's `random`, NumPy's global RNG (both seeded below) and the
# per-user sampling done in the train/test split.
RANDOM_SEED = 42
HYBRID_ALPHA = 0.7        # weight for collaborative part (0..1). content weight = 1 - alpha
# ---------------------------

# Seed both global RNGs once at import time.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


def load_movielens_100k(path=ML100K_PATH):
    """Read the MovieLens 100K ratings and movie tables found under *path*.

    Returns:
        A tuple ``(udata, uitem, genre_cols)``:
        ``udata`` holds the rows of ``u.data`` (user_id, item_id, rating,
        timestamp); ``uitem`` holds the rows of ``u.item`` with the genre
        flags cast to int (missing flags become 0); ``genre_cols`` lists the
        names of the 19 genre flag columns (``genre_0`` .. ``genre_18``).
    """
    rating_fields = ["user_id", "item_id", "rating", "timestamp"]
    genre_cols = [f"genre_{i}" for i in range(19)]
    movie_fields = ["item_id", "title", "release_date", "video_release_date", "IMDb_URL"] + genre_cols

    # u.data is tab-separated; u.item is pipe-separated.
    udata = pd.read_csv(f"{path}/u.data", sep="\t", names=rating_fields, encoding='latin-1')
    uitem = pd.read_csv(f"{path}/u.item", sep="|", names=movie_fields, encoding='latin-1', engine='python')

    # Genre flags are used numerically later on, so normalise them to 0/1 ints.
    uitem[genre_cols] = uitem[genre_cols].fillna(0).astype(int)
    return udata, uitem, genre_cols


def train_test_split_per_user(ratings: pd.DataFrame, test_ratio=0.2, min_test_items=1) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Hold out a random share of every user's ratings as a test set.

    Args:
        ratings: Long-format ratings with a ``user_id`` column. Rows are
            selected and removed by index label, so the index is expected to
            be unique.
        test_ratio: Fraction of each user's ratings to hold out.
        min_test_items: Lower bound on the number of held-out ratings for a
            user who has more than one rating.

    Returns:
        ``(train, test)`` data frames, both with a fresh 0..n-1 index. A user
        with a single rating keeps it in ``train``; every other user always
        keeps at least one rating in ``train``.
    """
    train_parts = []
    test_parts = []
    # One pass over the data: sort=False keeps users in order of first
    # appearance, so the output order does not depend on the user id values.
    for _, user_rows in ratings.groupby('user_id', sort=False):
        n = len(user_rows)
        if n <= 1:
            train_parts.append(user_rows)
            continue
        k = max(min_test_items, int(np.floor(n * test_ratio)))
        k = min(k, n - 1)  # never move all of a user's ratings into test
        held_out = user_rows.sample(k, random_state=RANDOM_SEED).index
        test_parts.append(user_rows.loc[held_out])
        train_parts.append(user_rows.drop(held_out))

    # pd.concat refuses an empty list, so fall back to an empty frame.
    if train_parts:
        train = pd.concat(train_parts).reset_index(drop=True)
    else:
        train = pd.DataFrame(columns=ratings.columns)
    if test_parts:
        test = pd.concat(test_parts).reset_index(drop=True)
    else:
        test = pd.DataFrame(columns=ratings.columns)
    return train, test


def build_user_item_matrix(ratings: pd.DataFrame) -> pd.DataFrame:
    """Reshape long-format ratings into a user x item table.

    Rows are labelled by ``user_id``, columns by ``item_id`` and cells hold
    ``rating``; user/item pairs without a rating are NaN.
    """
    by_pair = ratings.set_index(['user_id', 'item_id'])['rating']
    return by_pair.unstack('item_id')


def compute_user_similarity(user_item_matrix: pd.DataFrame) -> pd.DataFrame:
    """Return the user-user cosine similarity of the rating rows.

    Missing ratings are treated as 0 before the similarity is computed. The
    result is a square frame labelled on both axes by the matrix's index.
    """
    labels = user_item_matrix.index
    dense = user_item_matrix.fillna(0).values
    return pd.DataFrame(cosine_similarity(dense), index=labels, columns=labels)


def predict_scores_user_based(user_id: int, user_item_matrix: pd.DataFrame, user_sim: pd.DataFrame, k_neigh=None) -> pd.Series:
    """Score every item for *user_id* as a similarity-weighted rating average.

    Args:
        user_id: Label of the target user; must be a row label of ``user_sim``.
        user_item_matrix: User x item ratings; NaN (no rating) counts as 0.
        user_sim: User x user similarity frame.
        k_neigh: If given, only the ``k_neigh`` most similar users contribute.

    Returns:
        A Series indexed by the columns of ``user_item_matrix``. All scores
        are 0 when the contributing similarities sum to 0.

    Raises:
        KeyError: If ``user_id`` is not a row label of ``user_sim``.
    """
    users = user_item_matrix.index
    items = user_item_matrix.columns

    # Align similarities with the rating rows by label rather than by
    # position, so a differently ordered user_sim cannot pair a weight with
    # the wrong user's ratings. Users absent from user_sim get weight 0.
    sim_vec = user_sim.loc[user_id].reindex(users).fillna(0.0)
    if user_id in sim_vec.index:
        sim_vec.loc[user_id] = 0.0  # a user is not their own neighbour

    if k_neigh is not None:
        neighbours = sim_vec.nlargest(k_neigh).index
        sim_vec = sim_vec.where(sim_vec.index.isin(neighbours), 0.0)

    weights = sim_vec.values
    denom = np.abs(weights).sum()
    if denom == 0:
        scores = np.zeros(len(items))
    else:
        filled = user_item_matrix.fillna(0).values
        scores = (weights.reshape(-1, 1) * filled).sum(axis=0) / denom
    return pd.Series(scores, index=items)


def build_item_genre_matrix(uitem: pd.DataFrame, genre_cols: List[str]) -> pd.DataFrame:
    """Return the genre flag columns as floats, indexed by ``item_id``."""
    by_item = uitem.set_index('item_id')
    return by_item.loc[:, genre_cols].astype(float)


def compute_user_genre_profile(user_id: int, user_item_matrix: pd.DataFrame, item_genres: pd.DataFrame) -> np.ndarray:
    """Build a unit-length genre preference vector for *user_id*.

    Each rated item's genre flags are weighted by how far the user's rating
    of that item lies from the user's mean rating, then summed and scaled to
    unit length. A zero vector (one entry per genre column) is returned when
    the user has no ratings, none of the rated items appear in
    ``item_genres``, or the weighted sum is all zeros.
    """
    n_genres = item_genres.shape[1]

    rated = user_item_matrix.loc[user_id].dropna()
    if rated.empty:
        return np.zeros(n_genres)

    shared = rated.index.intersection(item_genres.index)
    if len(shared) == 0:
        return np.zeros(n_genres)

    values = rated.loc[shared].astype(float)
    # Centre on the user's own mean so below-average ratings count against a genre.
    centred = (values - values.mean()).values.reshape(-1, 1)
    profile = (centred * item_genres.loc[shared].values).sum(axis=0)

    length = np.linalg.norm(profile)
    if length > 0:
        return profile / length
    return profile


def predict_scores_content_based(user_id: int, user_item_matrix: pd.DataFrame, item_genres: pd.DataFrame) -> pd.Series:
    """Score every item by the dot product of its genres with the user's profile.

    Returns a Series indexed like ``item_genres``; all scores are 0 when the
    user's genre profile is the zero vector.
    """
    taste = compute_user_genre_profile(user_id, user_item_matrix, item_genres)
    has_signal = np.linalg.norm(taste) != 0
    if has_signal:
        values = item_genres.values.dot(taste)
    else:
        values = np.zeros(item_genres.shape[0])
    return pd.Series(values, index=item_genres.index)


def hybrid_predict(user_id: int, user_item_matrix: pd.DataFrame, user_sim: pd.DataFrame, item_genres: pd.DataFrame, alpha=HYBRID_ALPHA) -> pd.Series:
    """Blend collaborative and content-based scores for *user_id*.

    Both score vectors are laid out over the union of the rated items and the
    items with genre data (missing entries count as 0), scaled to unit length
    when non-zero, and combined as ``alpha * cf + (1 - alpha) * content``.

    Returns:
        A Series of blended scores indexed by that item union.
    """
    def _unit_scale(values):
        # Leave an all-zero vector untouched instead of dividing by zero.
        length = np.linalg.norm(values)
        return values / length if length > 0 else values

    collaborative = predict_scores_user_based(user_id, user_item_matrix, user_sim)
    content = predict_scores_content_based(user_id, user_item_matrix, item_genres)

    catalogue = user_item_matrix.columns.union(item_genres.index)
    cf_vec = _unit_scale(collaborative.reindex(catalogue).fillna(0).values)
    cont_vec = _unit_scale(content.reindex(catalogue).fillna(0).values)

    blended = alpha * cf_vec + (1 - alpha) * cont_vec
    return pd.Series(blended, index=catalogue)


def recommend_for_user(user_id: int, user_item_matrix: pd.DataFrame, user_sim: pd.DataFrame, item_genres: pd.DataFrame, movies_df: pd.DataFrame, K=10, alpha=HYBRID_ALPHA) -> pd.DataFrame:
    """Return the top-K unseen items for *user_id* ranked by hybrid score.

    Args:
        user_id: Label of the target user.
        user_item_matrix: User x item ratings; items the user has rated are
            excluded from the result.
        user_sim: User x user similarity frame.
        item_genres: Item x genre flag frame.
        movies_df: Frame with ``item_id`` and ``title`` columns used to look
            up titles. May be None, in which case every title is "".
        K: Number of recommendations to return.
        alpha: Weight of the collaborative part of the hybrid score.

    Returns:
        A frame with columns ``item_id``, ``score`` and ``title``, best first.
    """
    scores = hybrid_predict(user_id, user_item_matrix, user_sim, item_genres, alpha=alpha)
    seen = user_item_matrix.loc[user_id].dropna().index if user_id in user_item_matrix.index else []
    scores = scores.drop(index=seen, errors='ignore')
    topk = scores.nlargest(K)

    # Titles are optional: precision_at_k_for_user forwards movies_df=None by
    # default, and only the item ids matter there.
    if movies_df is None:
        top_titles = [""] * len(topk)
    else:
        titles = movies_df.set_index('item_id')['title']
        top_titles = [titles.get(i, "") for i in topk.index]

    recs = pd.DataFrame({
        'item_id': topk.index,
        'score': topk.values,
        'title': top_titles,
    })
    return recs.reset_index(drop=True)


def precision_at_k_for_user(user_id: int, train_matrix: pd.DataFrame, test_df: pd.DataFrame, user_sim: pd.DataFrame, item_genres: pd.DataFrame, K=10, alpha=HYBRID_ALPHA, movies_df=None) -> float:
    """Compute Precision@K for one user against their held-out ratings.

    Args:
        user_id: Label of the user to evaluate.
        train_matrix: User x item rating matrix used to produce recommendations.
        test_df: Long-format held-out ratings with ``user_id`` and ``item_id``.
        user_sim: User x user similarity frame.
        item_genres: Item x genre flag frame.
        K: Number of recommendations; also the denominator of the precision.
        alpha: Weight of the collaborative part of the hybrid score.
        movies_df: Optional frame with ``item_id`` and ``title`` columns.
            Titles do not affect the metric, so it may be left as None.

    Returns:
        The number of recommended items present in the user's test set
        divided by K, or NaN when the user has no test items.
    """
    if movies_df is None:
        # recommend_for_user looks titles up in movies_df; give it an empty
        # table (with an integer item_id) instead of passing None through.
        movies_df = pd.DataFrame({
            'item_id': pd.Series(dtype='int64'),
            'title': pd.Series(dtype=object),
        })
    recs = recommend_for_user(user_id, train_matrix, user_sim, item_genres, movies_df=movies_df, K=K, alpha=alpha)
    test_items = test_df[test_df['user_id'] == user_id]['item_id'].unique()
    if len(test_items) == 0:
        return np.nan
    relevant = set(test_items.tolist())  # O(1) membership per recommendation
    hit = sum(1 for i in recs['item_id'] if i in relevant)
    return hit / K


def evaluate_precision_at_k(train_ratings: pd.DataFrame, test_ratings: pd.DataFrame, user_item_matrix: pd.DataFrame, user_sim: pd.DataFrame, item_genres: pd.DataFrame, K=10, alpha=HYBRID_ALPHA, movies_df=None) -> float:
    """Average Precision@K over all users that have held-out ratings.

    Args:
        train_ratings: Long-format training ratings; supplies the user ids to
            evaluate.
        test_ratings: Long-format held-out ratings.
        user_item_matrix: User x item rating matrix built from the training
            ratings; recommendations are generated from it.
        user_sim: User x user similarity frame.
        item_genres: Item x genre flag frame.
        K: Number of recommendations per user.
        alpha: Weight of the collaborative part of the hybrid score.
        movies_df: Optional movie table forwarded for title lookup.

    Returns:
        The mean per-user Precision@K, skipping users without test items;
        0.0 when no user could be evaluated.
    """
    precisions = []
    for u in train_ratings['user_id'].unique():
        # Recommendations need the user x item matrix, not the long-format
        # ratings table.
        p = precision_at_k_for_user(u, user_item_matrix, test_ratings, user_sim, item_genres, K=K, alpha=alpha, movies_df=movies_df)
        if np.isnan(p):
            continue
        precisions.append(p)
    if not precisions:
        return 0.0
    return float(np.mean(precisions))


if __name__ == "__main__":
    # ---- Command-line options (defaults come from the config constants) ----
    cli = argparse.ArgumentParser(description="Hybrid recommender using MovieLens 100K")
    cli.add_argument('--data_path', type=str, default=ML100K_PATH, help='Path to ml-100k folder')
    cli.add_argument('--alpha', type=float, default=HYBRID_ALPHA, help='Hybrid alpha (CF weight)')
    cli.add_argument('--K', type=int, default=10, help='Top-K recommendations and evaluation')
    cli.add_argument('--sample_user', type=int, default=1, help='User id to print sample recommendations for')
    args = cli.parse_args()

    ML100K_PATH = args.data_path
    HYBRID_ALPHA = args.alpha

    # ---- Load data ----
    print("Loading MovieLens 100K data from:", ML100K_PATH)
    udata, uitem, genre_cols = load_movielens_100k(ML100K_PATH)
    print(f"Ratings: {len(udata)}, Movies: {uitem.shape[0]}, Users: {udata['user_id'].nunique()}")

    # ---- Per-user holdout split ----
    print("Creating train/test split (per-user holdout)...")
    train_ratings, test_ratings = train_test_split_per_user(udata, test_ratio=0.2, min_test_items=1)
    print(f"Train ratings: {len(train_ratings)}, Test ratings: {len(test_ratings)}")

    # ---- Model inputs: rating matrix, user similarity, item genres ----
    print("Building user-item rating matrix (train set)...")
    all_users = sorted(udata['user_id'].unique())
    # Give every user in the full data set a row, in sorted order.
    user_item = build_user_item_matrix(train_ratings).reindex(all_users)

    print("Computing user-user cosine similarity...")
    user_sim = compute_user_similarity(user_item.fillna(0))

    print("Building item-genre matrix...")
    item_genres = build_item_genre_matrix(uitem, genre_cols)

    # ---- Sample recommendations ----
    sample_user = args.sample_user
    print(f"\nTop-{args.K} Recommendations for user {sample_user}:")
    recs = recommend_for_user(sample_user, user_item, user_sim, item_genres, uitem, K=args.K, alpha=HYBRID_ALPHA)
    print(recs[['item_id', 'title', 'score']].to_string(index=False))

    # ---- Evaluation ----
    print(f"\nEvaluating Precision@K (K={args.K}) across users...")
    prec = evaluate_precision_at_k(
        train_ratings,
        test_ratings,
        user_item,
        user_sim,
        item_genres,
        K=args.K,
        alpha=HYBRID_ALPHA,
        movies_df=uitem,
    )
    print(f"Average Precision@{args.K}: {prec:.4f}")
