In [3]:
# Load the MovieLens 25M ratings and movie metadata from the Kaggle input folder.
# pandas is imported in this cell as well because the cell sits above the
# notebook's main import cell; without it a fresh "Restart & Run All" stops
# here with a NameError on `pd`.
import pandas as pd

DATA_DIR = '/kaggle/input/movielens-25m-dataset/ml-25m'

ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')


In [27]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity


NEWWWWW

In [4]:
# Keep only the ratings of movies that have more than 100 ratings.
movie_counts = ratings['movieId'].value_counts()
popular_movies = movie_counts.loc[movie_counts.gt(100)].index
ratings = ratings.loc[ratings['movieId'].isin(popular_movies)]

# Of what remains, keep only the users that have more than 50 ratings.
user_counts = ratings['userId'].value_counts()
active_users = user_counts.loc[user_counts.gt(50)].index
ratings = ratings.loc[ratings['userId'].isin(active_users)]

# Attach the columns of `movies` to every rating row (inner join on movieId).
ratings = pd.merge(ratings, movies, on='movieId')


In [5]:
# Encode raw userId / movieId values as integer labels; these labels are used
# as the row / column coordinates of the sparse rating matrices.
user_encoder = LabelEncoder().fit(ratings['userId'])
movie_encoder = LabelEncoder().fit(ratings['movieId'])
ratings['user_idx'] = user_encoder.transform(ratings['userId'])
ratings['movie_idx'] = movie_encoder.transform(ratings['movieId'])

In [6]:
# Build the user x movie rating matrix from (rating, (user_idx, movie_idx)) triplets.
# It is converted to CSR right away: a COO matrix cannot be row-indexed, and
# recommend_for_user looks up a single user with `user_movie_sparse[user_idx]`.
user_movie_sparse = coo_matrix(
    (ratings['rating'], (ratings['user_idx'], ratings['movie_idx']))
).tocsr()

# Movie x user view of the same data, also in CSR so that one movie's row can be
# pulled out cheaply for item-based neighbour queries.
movie_user_sparse = user_movie_sparse.transpose().tocsr()


In [23]:
from scipy.sparse import coo_matrix, csr_matrix

# Build the user x movie rating matrix from (rating, (user_idx, movie_idx)) triplets.
user_movie_sparse_coo = coo_matrix((ratings['rating'], (ratings['user_idx'], ratings['movie_idx'])))
user_movie_sparse = user_movie_sparse_coo.tocsr()  # CSR supports row indexing; COO does not

# Transposing a CSR matrix yields CSC. Convert back to CSR so the movie x user
# matrix has the same row-oriented format as the user x movie one and
# `movie_user_sparse.getrow(...)` stays a cheap row lookup.
movie_user_sparse = user_movie_sparse.transpose().tocsr()


In [7]:
# Lookup tables between raw movie ids, encoded matrix indices and titles.
# movie_id_map:      movieId -> encoded index (position in movie_encoder.classes_)
# reverse_movie_map: encoded index -> movieId
# movie_title_map:   movieId -> title, taken from the full `movies` table
movie_id_map = dict(zip(movie_encoder.classes_, range(len(movie_encoder.classes_))))
reverse_movie_map = dict(enumerate(movie_encoder.classes_))
movie_title_map = {movie_id: title for movie_id, title in zip(movies['movieId'], movies['title'])}


In [8]:
# Item-based model: brute-force cosine nearest neighbours over the rows of the
# movie x user matrix (one row per movie).
knn_model = NearestNeighbors(algorithm='brute', metric='cosine').fit(movie_user_sparse)
knn_model  # show the fitted estimator as the cell output


In [15]:
# User-based model: brute-force cosine nearest neighbours over the rows of the
# user x movie matrix (one row per user). Fitted once and reused by every query.
user_knn = NearestNeighbors(algorithm='brute', metric='cosine').fit(user_movie_sparse)
user_knn  # show the fitted estimator as the cell output


In [16]:
# Item-based KNN recommendations
def knn_scores(movie_title, k=25):
    """Score the movies most similar to ``movie_title`` with the item-based KNN model.

    The title is matched case-insensitively with ``str.contains`` against
    ``movies['title']``; the first matching row is used as the query movie.

    Args:
        movie_title: Text to look for in the movie titles.
        k: Maximum number of neighbouring movies to return.

    Returns:
        dict mapping neighbour title -> cosine similarity (``1 - distance``),
        filled in the order the neighbours are returned by ``knn_model``.
        Empty when no title matches, when the matched movie is not in
        ``movie_id_map``, or when ``k`` is not positive.
    """
    if k <= 0:
        return {}
    matches = movies[movies['title'].str.contains(movie_title, case=False, na=False)]
    if matches.empty:
        return {}
    movie_id = matches.iloc[0]['movieId']
    if movie_id not in movie_id_map:
        return {}
    movie_idx = movie_id_map[movie_id]

    # Ask for one extra neighbour (the query movie is its own nearest neighbour),
    # but never more than the number of fitted movies: kneighbors raises a
    # ValueError when n_neighbors exceeds the number of fitted samples.
    n_neighbors = min(k + 1, movie_user_sparse.shape[0])
    distances, indices = knn_model.kneighbors(
        movie_user_sparse.getrow(movie_idx), n_neighbors=n_neighbors
    )

    scores = {}
    kept = 0
    for distance, neighbor_idx in zip(distances.ravel(), indices.ravel()):
        # Drop the query movie by identity instead of assuming it sits at
        # position 0; with tied distances it can appear at a later position.
        if neighbor_idx == movie_idx:
            continue
        if kept == k:
            break
        movie_id_neighbor = reverse_movie_map[neighbor_idx]
        title = movie_title_map[movie_id_neighbor]
        scores[title] = 1 - distance  # cosine distance -> similarity
        kept += 1
    return scores


In [49]:
# User-based collaborative filtering
def recommend_for_user(user_id, n_recommendations=10, n_similar_users=5):
    """Recommend movies that the most similar users rated highly.

    Args:
        user_id: Raw ``userId`` value as found in ``ratings``.
        n_recommendations: Maximum number of titles to return.
        n_similar_users: Number of nearest-neighbour users whose ratings are
            pooled (the user themself is not counted).

    Returns:
        DataFrame with a single ``title`` column, ordered from the highest to
        the lowest mean rating among the similar users. For a ``user_id`` that
        is not known to ``user_encoder`` a message is printed and a completely
        empty ``pd.DataFrame()`` (no columns) is returned.
    """
    if user_id not in user_encoder.classes_:
        print(f"‚ùå User ID {user_id} not in dataset.")
        return pd.DataFrame()

    user_idx = user_encoder.transform([user_id])[0]

    # One extra neighbour because the user is their own nearest neighbour;
    # clamp to the number of fitted users so kneighbors does not raise.
    n_neighbors = min(n_similar_users + 1, user_movie_sparse.shape[0])
    _, indices = user_knn.kneighbors(user_movie_sparse[user_idx], n_neighbors=n_neighbors)
    neighbor_idxs = indices.flatten()
    # Remove the user by identity rather than by position 0, which is not
    # guaranteed when another user is at the same distance.
    similar_user_idxs = neighbor_idxs[neighbor_idxs != user_idx][:n_similar_users]
    similar_user_ids = user_encoder.inverse_transform(similar_user_idxs)

    similar_ratings = ratings[ratings['userId'].isin(similar_user_ids)]
    seen_movies = set(ratings.loc[ratings['userId'] == user_id, 'movieId'])
    unseen = similar_ratings[~similar_ratings['movieId'].isin(seen_movies)]
    top_movies = (
        unseen.groupby('movieId')['rating']
        .mean()
        .sort_values(ascending=False)
        .head(n_recommendations)
    )

    # Look the titles up in ranking order. Filtering `movies` with isin() would
    # return them in the order of the `movies` table and lose the ranking that
    # callers rely on when they turn row position into a score.
    titles_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    ranked_titles = titles_by_id.reindex(top_movies.index).dropna()
    return ranked_titles.to_frame(name='title').reset_index(drop=True)


In [60]:
def tfidf_scores(movie_title, k=25):
    """Score the movies whose title + genres text is most similar to ``movie_title``.

    The title is matched case-insensitively with ``str.contains`` against
    ``movies['title']``; the first matching row is used as the query movie.
    A TF-IDF model is fitted on the combined title/genres text of every movie
    on each call.

    Side effect: writes the combined text to ``movies['combined']``.

    Args:
        movie_title: Text to look for in the movie titles.
        k: Maximum number of similar movies to return.

    Returns:
        dict mapping title -> cosine similarity to the query movie, filled from
        most to least similar. Empty when no title matches.
    """
    # Missing titles are treated as empty text so the vectorizer never sees NaN.
    movies['combined'] = movies['title'].fillna('') + ' ' + movies['genres'].fillna('')

    # Resolve the query movie before fitting, so a miss costs nothing.
    title_hits = movies['title'].str.contains(movie_title, case=False, na=False)
    hit_positions = np.flatnonzero(title_hits.to_numpy())
    if hit_positions.size == 0:
        return {}
    # Row *position* of the first match: rows of tfidf_matrix are positional, so
    # an index label would only be correct while `movies` keeps a default index.
    query_pos = hit_positions[0]

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(movies['combined'])
    cosine_sim = cosine_similarity(tfidf_matrix[query_pos], tfidf_matrix).flatten()

    ranked = cosine_sim.argsort()[::-1]
    # Drop the query movie by identity; when another movie has identical text
    # the query is not guaranteed to be ranked first.
    sim_positions = ranked[ranked != query_pos][:k]

    titles = movies['title'].to_numpy()
    return {titles[pos]: cosine_sim[pos] for pos in sim_positions}


In [19]:
# Score normalization
def normalize_scores(score_dict):
    """Min-max scale the values of ``score_dict`` into the range [0, 1].

    Args:
        score_dict: Mapping of key -> numeric score.

    Returns:
        A new dict with the same keys in the same order and scaled values.
        An empty or ``None`` input gives ``{}``. When every score is identical
        (including the single-entry case) each key gets 1.0 rather than 0.0, so
        that candidates which are present are not scored like absent ones.
    """
    if not score_dict:
        return {}
    values = np.asarray(list(score_dict.values()), dtype=float)
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # No spread to scale by: treat all candidates as equally (fully) relevant.
        scaled = np.ones_like(values)
    else:
        scaled = (values - lowest) / span
    return dict(zip(score_dict.keys(), scaled))

In [51]:
def super_hybrid_recommend(movie_title, user_id, top_n=10, 
                           weight_item=0.4, weight_user=0.3, weight_content=0.5):
    """Blend item-based, user-based and content-based scores into one ranking.

    Each source is normalised with ``normalize_scores`` and the three are
    combined as a weighted sum per title; a title missing from a source
    contributes 0 for that source. With the default weights the sum of weights
    is 1.2, so a hybrid score can exceed 1.

    Args:
        movie_title: Title text passed to ``knn_scores`` and ``tfidf_scores``.
        user_id: User passed to ``recommend_for_user``.
        top_n: Number of rows to return; the user-based and content-based
            sources are each asked for ``top_n * 3`` candidates.
        weight_item: Weight of the item-based KNN score.
        weight_user: Weight of the user-based score.
        weight_content: Weight of the TF-IDF content score.

    Returns:
        DataFrame with columns ``Title`` and ``Hybrid Score``, sorted by score
        in descending order and limited to ``top_n`` rows.
    """
    candidate_pool = top_n * 3

    # Source 1: item-based collaborative filtering.
    item_scores = normalize_scores(knn_scores(movie_title))

    # Source 2: user-based collaborative filtering. Only titles come back, so
    # the row position is turned into a linearly decreasing score.
    user_recs = recommend_for_user(user_id, n_recommendations=candidate_pool)
    user_scores = {}
    if not user_recs.empty:
        n_recs = len(user_recs)
        positional = {
            title: 1.0 - position / n_recs
            for position, title in enumerate(user_recs['title'])
        }
        user_scores = normalize_scores(positional)

    # Source 3: content similarity on title + genres.
    content_scores = normalize_scores(tfidf_scores(movie_title, k=candidate_pool))

    def blend(title):
        return (
            weight_item * item_scores.get(title, 0) +
            weight_user * user_scores.get(title, 0) +
            weight_content * content_scores.get(title, 0)
        )

    combined = {
        title: blend(title)
        for title in set(item_scores) | set(user_scores) | set(content_scores)
    }
    ranking = sorted(combined.items(), key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranking[:top_n], columns=["Title", "Hybrid Score"])


In [52]:
# Try the blended recommender for one title / user pair and print the table.
hybrid_preview = super_hybrid_recommend("Avengers", user_id=2, top_n=10)
print(hybrid_preview)


                                     Title  Hybrid Score
0                     Avengers, The (2012)      0.500000
1                     Lost in Space (1998)      0.443409
2                    Avengers Grimm (2015)      0.429325
3                        3 Avengers (1964)      0.425606
4                  Shaolin Avengers (1994)      0.384148
5   Avengers: Infinity War - Part I (2018)      0.380935
6              The Shaolin Avengers (1976)      0.377918
7                   Masked Avengers (1981)      0.325485
8  Avengers: Infinity War - Part II (2019)      0.316125
9               Ultimate Avengers 2 (2006)      0.312406


In [53]:
def show_all_recommendations(movie_title, user_id, top_n=10):
    """Print the recommendations of every recommender, then the hybrid blend.

    Args:
        movie_title: Title text used by the item-based, content-based and
            hybrid recommenders.
        user_id: User used by the user-based and hybrid recommenders.
        top_n: Number of rows to show per section.

    Returns:
        None; the four tables are printed.
    """
    # The item- and content-based scorers default to 25 candidates. Ask for at
    # least top_n so that a top_n above 25 is not silently cut short, while a
    # top_n of 25 or less requests exactly what the defaults would.
    n_candidates = max(top_n, 25)

    print(f"üé¨ Recommendations for: **{movie_title}** | User ID: **{user_id}**\n")

    # 1. Item-based KNN
    print("üìå Item-based Collaborative Filtering:")
    item_scores = knn_scores(movie_title, k=n_candidates)
    item_df = pd.DataFrame(item_scores.items(), columns=["Title", "Item_Score"]).head(top_n)
    print(item_df, "\n")

    # 2. User-based Collaborative Filtering
    print("üë• User-based Collaborative Filtering:")
    user_df = recommend_for_user(user_id, n_recommendations=top_n)
    print(user_df, "\n")

    # 3. Content-based TF-IDF
    print("üß† Content-based Filtering (TF-IDF):")
    content_scores = tfidf_scores(movie_title, k=n_candidates)
    tfidf_df = pd.DataFrame(content_scores.items(), columns=["Title", "TFIDF_Score"]).head(top_n)
    print(tfidf_df, "\n")

    # 4. Hybrid Recommendation
    print("üîó Hybrid Recommendation:")
    hybrid_df = super_hybrid_recommend(movie_title, user_id=user_id, top_n=top_n)
    print(hybrid_df)


In [61]:
    # Demo run of all four recommenders for one title / user pair. The recorded
    # output below reports that user 83 is not in the dataset, so the
    # user-based section of this run is empty.
    show_all_recommendations("Avengers", user_id=83, top_n=10)


üé¨ Recommendations for: **Avengers** | User ID: **83**

üìå Item-based Collaborative Filtering:
                       Title  Item_Score
0       Lost in Space (1998)    0.286189
1      Batman & Robin (1997)    0.262523
2          Saint, The (1997)    0.252303
3          Dick Tracy (1990)    0.249650
4      Batman Returns (1992)    0.248326
5      Rocketeer, The (1991)    0.242684
6  Mask of Zorro, The (1998)    0.241427
7       U.S. Marshals (1998)    0.239765
8            Godzilla (1998)    0.238823
9              Sphere (1998)    0.232683 

üë• User-based Collaborative Filtering:
‚ùå User ID 83 not in dataset.
Empty DataFrame
Columns: []
Index: [] 

üß† Content-based Filtering (TF-IDF):
                                     Title  TFIDF_Score
0                     Avengers, The (2012)     0.649091
1                    Avengers Grimm (2015)     0.600057
2                        3 Avengers (1964)     0.597476
3                  Shaolin Avengers (1994)     0.568712
4   Avengers: Inf