In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
import random
random.seed(10)

In [30]:
movies = pd.read_csv('../ml-20m/movies.csv')
data = pd.read_csv('../samples/combined_movies_ratings_tags.csv')

data_subset = data.sample(frac=0.03, random_state=42)

data_subset =  data_subset.drop_duplicates(subset=['userId', 'movieId'])


In [15]:
data_subset =  data_subset.drop_duplicates(subset=['userId', 'movieId'])

user_item_matrix = data_subset.pivot(index='userId', columns='movieId', values='user_rating').fillna(0)

In [16]:
# Matrix Factorization using SVD
svd = TruncatedSVD(n_components=50, random_state=52)
latent_factors = svd.fit_transform(user_item_matrix)
Vt = svd.components_
#we are creating SVD model here. 

In [19]:
# Step 4: Reconstruct the User-Item Matrix (Predicted Ratings)

# Reconstruct the matrix using the latent factors
predicted_ratings = np.dot(latent_factors, Vt)

# Convert it back to a DataFrame with the same shape as the original user-item matrix
predicted_ratings_df = pd.DataFrame(predicted_ratings, index=user_item_matrix.index, columns=user_item_matrix.columns)

In [6]:
# TF-IDF Vectorizer for genres
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(data_subset['genres'])


In [9]:

# Compute similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
content_based_similarity_df = pd.DataFrame(cosine_sim, index=data_subset.index, columns=data_subset.index)

In [31]:
# Hybrid Recommendation Function
def hybrid_recommendation(user_id, top_n=10, alpha=0.5):
    # Get collaborative filtering predictions
    user_predictions = predicted_ratings_df.loc[user_id]
    
    # Sort movies by collaborative filtering score
    top_movies_cf = user_predictions.sort_values(ascending=False).index.tolist()
    
    # Get content-based scores for these movies
    movie_ids = [movie_id for movie_id in top_movies_cf if movie_id in content_based_similarity_df.index]
    movie_scores = {movie_id: np.mean(content_based_similarity_df[movie_id].loc[movie_ids]) for movie_id in movie_ids}
# Combine collaborative filtering and content-based scores
    combined_scores = {}
    for movie_id in movie_ids:
        cf_score = user_predictions[movie_id]
        content_score = movie_scores[movie_id]
        combined_scores[movie_id] = alpha * cf_score + (1 - alpha) * content_score
# Sort movies by combined score and return top N
    recommended_movie_ids = sorted(combined_scores, key=combined_scores.get, reverse=True)[:top_n]
    
    # Return movie titles
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)][['movieId', 'title']]
    
    return recommended_movies      

In [33]:
# Example: Get top 10 recommendations for user with ID 1
user_id = 1
top_n_recommendations = hybrid_recommendation(user_id, top_n=   10, alpha=0.5)

print(f"Top 10 Movie Recommendations for User {user_id}:")
for idx, movie in enumerate(top_n_recommendations.itertuples(), 1):
    print(f"{idx}. {movie.title}")

Top 10 Movie Recommendations for User 1:
1. Hunchback of Notre Dame, The (1996)
2. Sleeper (1973)
3. Jerry Maguire (1996)
4. Liar Liar (1997)
5. Con Air (1997)
6. Sliding Doors (1998)
7. One True Thing (1998)
8. Elizabeth (1998)
9. Last Picture Show, The (1971)
10. Footloose (1984)
