📌 Cell 1: Imports and Load Data

In [1]:
import pandas as pd
import os
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load processed dataset
df = pd.read_csv('../data/processed/movies.csv')
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,title,release_date,video_release_date,IMDb_URL,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,L.A. Confidential (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,0,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,Heavyweights (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Heavyweights%...,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%...,0,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,Jackie Brown (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,...,0,0,0,0,0,0,0,0,0,0


🧠 Cell 2: Define Genre Columns & Create combined_genres

In [2]:
genre_cols = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
              'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
              'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

df['combined_genres'] = df[genre_cols].apply(
    lambda row: ' '.join([genre for genre in genre_cols if row[genre] == 1]), axis=1
)

df[['title', 'combined_genres']].head()


Unnamed: 0,title,combined_genres
0,Kolya (1996),Comedy
1,L.A. Confidential (1997),Crime Film-Noir Mystery Thriller
2,Heavyweights (1994),Children Comedy
3,Legends of the Fall (1994),Drama Romance War Western
4,Jackie Brown (1997),Crime Drama


📊 Cell 3: Content-Based Filtering (TF-IDF + Cosine Similarity)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import joblib
import os

# 1. Build TF-IDF Matrix (limit features to reduce RAM)
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_genres'])

# 2. Fit KNN model (no full pairwise matrix in memory)
knn = NearestNeighbors(n_neighbors=30, metric='cosine', algorithm='brute')
knn.fit(tfidf_matrix)

# 3. Save models
os.makedirs('../models', exist_ok=True)
joblib.dump(tfidf, '../models/tfidf_vectorizer.pkl')
joblib.dump(knn, '../models/knn_model.pkl')

print("[✅] Saved TF-IDF vectorizer and KNN model")


[✅] Saved TF-IDF vectorizer and KNN model


📚 Cell 4: Example Content-Based Recommendations

In [6]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Load saved models
tfidf = joblib.load('../models/tfidf_vectorizer.pkl')
knn = joblib.load('../models/knn_model.pkl')

# Regenerate the TF-IDF matrix (since .kneighbors needs it)
tfidf_matrix = tfidf.fit_transform(df['combined_genres'])

# Create title index mapping
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

def get_content_recommendations(title, top_n=10):
    if title not in indices:
        return []
    idx = indices[title]
    distances, indices_list = knn.kneighbors(tfidf_matrix[idx], n_neighbors=top_n + 1)
    movie_indices = indices_list.flatten()[1:]  # exclude self
    return df[['title', 'combined_genres']].iloc[movie_indices].reset_index(drop=True)

# ✅ Example call
get_content_recommendations("Star Wars (1977)")


Unnamed: 0,title,combined_genres
0,Star Wars (1977),Action Adventure Romance Sci-Fi War
1,Return of the Jedi (1983),Action Adventure Romance Sci-Fi War
2,Star Wars (1977),Action Adventure Romance Sci-Fi War
3,Return of the Jedi (1983),Action Adventure Romance Sci-Fi War
4,Star Wars (1977),Action Adventure Romance Sci-Fi War
...,...,...
6407,Star Wars (1977),Action Adventure Romance Sci-Fi War
6408,Return of the Jedi (1983),Action Adventure Romance Sci-Fi War
6409,Star Wars (1977),Action Adventure Romance Sci-Fi War
6410,Star Wars (1977),Action Adventure Romance Sci-Fi War


🤝 Cell 5: Collaborative Filtering (Surprise SVD)

In [7]:
reader = Reader(rating_scale=(1, 5))
ratings_data = df[['user_id', 'movie_id', 'rating']]
data = Dataset.load_from_df(ratings_data, reader)

trainset, testset = train_test_split(data, test_size=0.2)

svd_model = SVD()
svd_model.fit(trainset)

predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)

joblib.dump(svd_model, '../models/svd_model.pkl')
print(f"[✅] Saved svd_model.pkl | RMSE: {rmse:.4f}")


RMSE: 0.9380
[✅] Saved svd_model.pkl | RMSE: 0.9380


👤 Cell 6: Example Collaborative Recommendations

In [8]:
movie_id_map = df[['movie_id', 'title']].drop_duplicates().set_index('movie_id')['title']

def get_collab_recommendations(user_id, top_n=10):
    seen = df[df['user_id'] == user_id]['movie_id'].tolist()
    unseen = [mid for mid in df['movie_id'].unique() if mid not in seen]

    preds = [svd_model.predict(user_id, mid) for mid in unseen]
    top_preds = sorted(preds, key=lambda x: x.est, reverse=True)[:top_n]
    movie_titles = movie_id_map.loc[[p.iid for p in top_preds]].values

    return pd.DataFrame({'title': movie_titles, 'predicted_rating': [p.est for p in top_preds]})

get_collab_recommendations(user_id=10)


Unnamed: 0,title,predicted_rating
0,Wallace & Gromit: The Best of Aardman Animatio...,4.846028
1,Some Folks Call It a Sling Blade (1993),4.765416
2,"Wrong Trousers, The (1993)",4.734447
3,Schindler's List (1993),4.722577
4,"Close Shave, A (1995)",4.721714
5,High Noon (1952),4.70095
6,Shall We Dance? (1996),4.695286
7,Henry V (1989),4.669549
8,To Kill a Mockingbird (1962),4.658337
9,Good Will Hunting (1997),4.638155
