# Movie Recommendation System (Advanced)
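Build a hybrid recommender that blends content-based and collaborative filtering. This notebook ships a small inline sample dataset and demonstrates TF-IDF content similarity, simple user-user collaborative filtering, an optional SVD model (via Surprise), and a weighted hybrid of the content and collaborative scores.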

In [None]:
# Uncomment to install the dependencies into the active kernel's environment:
# %pip install pandas numpy scikit-learn scikit-surprise joblib matplotlib seaborn


In [None]:
import pandas as pd
# Toy catalogue: one row per movie; genres are pipe-separated.
movies = pd.DataFrame(
    [
        (1, "Toy Story (1995)", "Adventure|Animation|Children|Comedy|Fantasy", "Toys come to life."),
        (2, "Jumanji (1995)", "Adventure|Children|Fantasy", "Magical board game."),
        (3, "Grumpier Old Men (1995)", "Comedy|Romance", "Neighbors feud."),
        (4, "Waiting to Exhale (1995)", "Comedy|Drama|Romance", "Friendship of four women."),
        (5, "Father of the Bride Part II (1995)", "Comedy", "Family comedy sequel."),
        (6, "Inception (2010)", "Action|Sci-Fi", "Dream heist."),
    ],
    columns=["movieId", "title", "genres", "description"],
)

# Explicit feedback in long format: one (user, movie, rating) row per rating.
ratings = pd.DataFrame(
    [
        (1, 6, 5),
        (1, 1, 4),
        (2, 1, 5),
        (2, 2, 3),
        (3, 3, 4),
        (3, 5, 3.5),
        (4, 6, 4),
        (4, 3, 4.5),
        (5, 2, 4),
        (5, 4, 4.5),
    ],
    columns=["userId", "movieId", "rating"],
)

# Preview both frames as the cell output.
movies.head(), ratings.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Content features: one lower-cased text blob per movie (description followed by
# genres; missing values become empty strings), stored in movies['content'].
movies['content'] = (
    movies['description'].fillna('') + ' ' + movies['genres'].fillna('')
).str.lower()

# Vectorise the blobs with TF-IDF (English stop words removed, vocabulary capped
# at 1000 terms); tfidf_matrix has one row per row of `movies`.
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['content'])

def recommend_content(title, k=5):
    """Return the `k` movies whose content is most similar to `title`.

    Similarity is the cosine similarity between the TF-IDF row of the matched
    movie and every row of `tfidf_matrix`.

    Args:
        title: Movie title to look up in `movies['title']` (case-insensitive,
            exact match). If several rows match, the first one is used.
        k: Maximum number of recommendations to return.

    Returns:
        DataFrame with columns ['movieId', 'title', 'score'], ordered by
        descending similarity. The query movie itself is never included.

    Raises:
        ValueError: If no movie has the given title.
    """
    # Work with the row *position*, not the index label: tfidf_matrix rows are
    # positional, so this stays correct even if `movies` has a non-default index.
    matches = (movies['title'].str.lower() == title.lower()).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise ValueError("Title not found")
    pos = matches[0]

    sims = cosine_similarity(tfidf_matrix[pos:pos + 1], tfidf_matrix).flatten()

    # Drop the query movie explicitly instead of assuming it sorts first: with
    # ties (identical content, or an all-zero TF-IDF row) it may not be first.
    order = sims.argsort()[::-1]
    order = order[order != pos][:max(k, 0)]
    return movies.iloc[order][['movieId', 'title']].assign(score=sims[order])

# Example: the four movies closest in content to "Inception (2010)"
recommend_content(title="Inception (2010)", k=4)

In [None]:
# Collaborative-filtering inputs: a user x movie rating matrix and the pairwise
# cosine similarity between its rows.
from sklearn.metrics.pairwise import cosine_similarity

# Rows are users, columns are movies; NaN marks a movie the user has not rated.
user_item = ratings.pivot_table(values='rating', index='userId', columns='movieId')

# Unrated cells are treated as 0 when computing similarity.
user_item_filled = user_item.fillna(0)
user_sim = cosine_similarity(user_item_filled)

# Label the similarity matrix with user ids on both axes for easy lookup.
user_sim_df = pd.DataFrame(data=user_sim, index=user_item.index, columns=user_item.index)

# Recommend to a user from the ratings of their most similar neighbours
# (similarity-weighted average of the neighbours' ratings).
def recommend_cf(target_user, top_k_users=2, n_recs=5):
    """Recommend unrated movies to `target_user` via user-user collaborative filtering.

    The `top_k_users` other users with the highest similarity in `user_sim_df`
    are taken as neighbours; neighbours whose similarity is not positive are
    ignored. Every movie the neighbours rated and the target has not is scored
    with the similarity-weighted average of the neighbours' ratings.

    Args:
        target_user: User id; must be present in `user_sim_df`.
        top_k_users: Maximum number of neighbours to use.
        n_recs: Maximum number of recommendations to return.

    Returns:
        DataFrame with columns ['movieId', 'title', 'score'], best score first.
        The frame is empty (same columns) when there is nothing to recommend.
        `title` is NaN for a movie id that is missing from `movies`.

    Raises:
        ValueError: If `target_user` is not in `user_sim_df`.
    """
    if target_user not in user_sim_df.index:
        raise ValueError("User not found")

    # Same schema as the non-empty result so callers can treat both alike.
    empty = pd.DataFrame(columns=['movieId', 'title', 'score'])

    # Most similar other users; a neighbour with similarity <= 0 carries no
    # information about the target's taste, so it is dropped.
    neighbour_sims = (
        user_sim_df[target_user]
        .drop(target_user)
        .sort_values(ascending=False)
        .head(top_k_users)
    )
    neighbour_sims = neighbour_sims[neighbour_sims > 0]
    if neighbour_sims.empty:
        return empty

    # Neighbours' ratings for movies the target has not rated yet.
    target_rated = user_item.loc[target_user].dropna().index
    candidates = ratings[
        ratings['userId'].isin(neighbour_sims.index)
        & ~ratings['movieId'].isin(target_rated)
    ]
    if candidates.empty:
        return empty

    # Similarity-weighted average rating per movie. All weights are > 0, so the
    # denominator can never be zero.
    weights = candidates['userId'].map(neighbour_sims)
    weighted_sum = (candidates['rating'] * weights).groupby(candidates['movieId']).sum()
    weight_total = weights.groupby(candidates['movieId']).sum()
    scores = (weighted_sum / weight_total).sort_values(ascending=False)

    # Left-merge keeps the score ordering and tolerates ids absent from `movies`.
    recs = scores.rename('score').reset_index()
    recs = recs.merge(movies[['movieId', 'title']], on='movieId', how='left')
    return recs[['movieId', 'title', 'score']].head(n_recs)

# Example: collaborative-filtering recommendations for user 1
recommend_cf(target_user=1)

In [None]:
# Optional model-based recommender: matrix factorisation (SVD) via scikit-surprise.
# Best-effort by design: if Surprise is missing or anything in the cell fails,
# the problem is printed and the rest of the notebook still runs.
try:
    from surprise import Dataset, Reader, SVD
    from surprise.model_selection import train_test_split
    # NOTE(review): the rating scale is taken from the observed min/max rating,
    # so predictions are presumably limited to that range - confirm this is
    # intended rather than the dataset's nominal scale.
    reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
    data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader)
    # Train on every available rating (no hold-out split in this demo).
    trainset = data.build_full_trainset()
    algo = SVD(random_state=42)
    algo.fit(trainset)
    def recommend_svd(user_id, n=5):
        """Return up to `n` movies `user_id` has not rated, ranked by SVD estimate.

        Args:
            user_id: User to recommend for.
            n: Maximum number of rows to return.

        Returns:
            DataFrame with columns ['movieId', 'score', 'title'], highest
            predicted rating first.
        """
        seen = set(ratings[ratings['userId']==user_id]['movieId'])
        candidates = [mid for mid in movies['movieId'] if mid not in seen]
        preds = [(mid, algo.predict(user_id, mid).est) for mid in candidates]
        preds.sort(key=lambda x: x[1], reverse=True)
        return pd.DataFrame(preds, columns=['movieId','score']).merge(movies[['movieId','title']], on='movieId').head(n)
    # Keep the example result: an expression inside `try` is never displayed.
    svd_recs = recommend_svd(1)
except Exception as e:
    svd_recs = None
    print('Surprise not installed or error occurred:', e)

# Last expression of the cell, so the notebook renders the example output
# (nothing is shown when the SVD step was skipped and this is None).
svd_recs

In [None]:
def hybrid_for_user(user_id, content_title, w_content=0.4, w_cf=0.6, k=10):
    """Blend content-based and collaborative scores into one ranked list.

    Content candidates come from `recommend_content(content_title)` and
    collaborative candidates from `recommend_cf(user_id)`. A movie found by only
    one of the two gets a score of 0 for the other. Movies `user_id` has already
    rated are excluded.

    Args:
        user_id: User to recommend for (must be known to `recommend_cf`).
        content_title: Title of the seed movie for the content-based part.
        w_content: Weight of the content similarity (cosine, in [0, 1]).
        w_cf: Weight of the collaborative score. The score is on the rating
            scale, so it is divided by the largest rating in `ratings` before
            weighting; this keeps both parts on a comparable 0-1 scale.
        k: Maximum number of rows to return.

    Returns:
        DataFrame with columns ['movieId', 'title', 'score_content',
        'score_cf', 'hybrid'], sorted by descending `hybrid`. `score_cf` is
        reported on the original rating scale.

    Raises:
        ValueError: Propagated from `recommend_content` / `recommend_cf` when
            the title or the user is unknown.
    """
    # Keep only ids and scores; titles are attached once at the end so the merge
    # cannot create title_x / title_y columns.
    content_recs = recommend_content(content_title, k=50).rename(
        columns={'score': 'score_content'}
    )[['movieId', 'score_content']]
    cf_recs = recommend_cf(user_id, top_k_users=3, n_recs=200).rename(
        columns={'score': 'score_cf'}
    )[['movieId', 'score_cf']]

    merged = content_recs.merge(cf_recs, on='movieId', how='outer')
    score_cols = ['score_content', 'score_cf']
    # A movie missing from one recommender contributes 0 from that side.
    merged[score_cols] = merged[score_cols].astype(float).fillna(0.0)

    # Never recommend something the user has already rated.
    seen = set(ratings.loc[ratings['userId'] == user_id, 'movieId'])
    merged = merged[~merged['movieId'].isin(seen)]

    # Bring the collaborative score onto the same 0-1 scale as cosine similarity.
    max_rating = ratings['rating'].max()
    cf_scaled = merged['score_cf'] / max_rating if max_rating > 0 else merged['score_cf']

    title_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    ranked = merged.assign(
        title=merged['movieId'].map(title_by_id),
        hybrid=w_content * merged['score_content'] + w_cf * cf_scaled,
    ).sort_values('hybrid', ascending=False)
    return ranked[['movieId', 'title', 'score_content', 'score_cf', 'hybrid']].head(k)

# Example: hybrid recommendations for user 1, seeded by "Inception (2010)"
hybrid_for_user(user_id=1, content_title="Inception (2010)", w_content=0.4, w_cf=0.6)

## Notes & Next steps
- This demo uses tiny data. On a real MovieLens dataset:
  - Use TF-IDF on plot summaries + metadata for content features.
  - Use Surprise or Implicit ALS for scalable matrix factorization.
  - Evaluate with Precision@K, Recall@K or MAP.
  - Deploy the hybrid system as a Streamlit or Flask app.