In [36]:
import pandas as pd

# File paths
movies_path = '/Users/madhaviraval/Downloads/movies.csv'
ratings_path = '/Users/madhaviraval/Downloads/ratings.csv'

# Load datasets
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)

# Display first few rows of each dataset
print("Movies Dataset:")
print(movies.head())

print("\nRatings Dataset:")
print(ratings.head())


Movies Dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings Dataset:
   userId  movieId  rating  timestamp
0       1        1       4  964982703
1       1        3       4  964981247
2       1        6       4  964982224
3       1       47       5  964983815
4       1       50       5  964982931


In [42]:
# Create user-item matrix
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
user_item_matrix_filled = user_item_matrix.fillna(0)


In [44]:
# Determine number of features
n_features = user_item_matrix_filled.shape[1]
print(f"Number of features (movies): {n_features}")


Number of features (movies): 7


In [46]:
# Adjust n_components
n_components = min(50, n_features)
print(f"Using n_components = {n_components}")


Using n_components = 7


In [48]:
# Perform SVD
svd = TruncatedSVD(n_components=n_components, random_state=42)
matrix_reduced = svd.fit_transform(user_item_matrix_filled)
print(f"Shape of Reduced Matrix: {matrix_reduced.shape}")


Shape of Reduced Matrix: (2, 2)


In [50]:
# Calculate similarity matrix
similarity_matrix = cosine_similarity(matrix_reduced)


In [52]:
# Function to recommend movies
def recommend_movies(movie_title, movies_df, similarity_matrix, n_recommendations=5):
    # Find movie index
    try:
        movie_idx = movies_df[movies_df['title'] == movie_title].index[0]
    except IndexError:
        return f"Movie '{movie_title}' not found in dataset."
    
    # Get similarity scores
    similarity_scores = list(enumerate(similarity_matrix[movie_idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Get top recommendations
    top_recommendations = [movies_df.iloc[i[0]]['title'] for i in similarity_scores[1:n_recommendations + 1]]
    return top_recommendations


In [54]:
# Example Usage
movie_name = "Toy Story (1995)"  # Replace with a movie title from your dataset
recommended_movies = recommend_movies(movie_name, movies, similarity_matrix)

print(f"Movies recommended for '{movie_name}':")
for idx, movie in enumerate(recommended_movies, 1):
    print(f"{idx}. {movie}")

Movies recommended for 'Toy Story (1995)':
1. Jumanji (1995)


In [68]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.dataset import Reader

In [70]:
# Step 1: Load datasets
movies_path = "/Users/madhaviraval/Downloads/movies.csv"
ratings_path = "/Users/madhaviraval/Downloads/ratings.csv"

movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)

print("Movies dataset loaded:")
print(movies.head())

print("Ratings dataset loaded:")
print(ratings.head())

Movies dataset loaded:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
Ratings dataset loaded:
   userId  movieId  rating  timestamp
0       1        1       4  964982703
1       1        3       4  964981247
2       1        6       4  964982224
3       1       47       5  964983815
4       1       50       5  964982931


In [72]:
# Step 2: Preprocess data
print("Preprocessing the datasets...")

# Check for missing values
print("Missing values in movies:", movies.isnull().sum().sum())
print("Missing values in ratings:", ratings.isnull().sum().sum())

# Merge movies and ratings
data = pd.merge(ratings, movies, on='movieId')
print("Merged dataset preview:")
print(data.head())

Preprocessing the datasets...
Missing values in movies: 0
Missing values in ratings: 0
Merged dataset preview:
   userId  movieId  rating  timestamp                    title  \
0       1        1       4  964982703         Toy Story (1995)   
1       1        3       4  964981247  Grumpier Old Men (1995)   
2       1        6       4  964982224              Heat (1995)   
3       2        1       5  964982703         Toy Story (1995)   
4       2       10       4  964981247         GoldenEye (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3  Adventure|Animation|Children|Comedy|Fantasy  
4                    Action|Adventure|Thriller  


In [74]:
# Step 3: Prepare data for collaborative filtering
print("Preparing data for collaborative filtering...")
reader = Reader(rating_scale=(data['rating'].min(), data['rating'].max()))
data_surprise = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

# Split data into train and test sets
trainset, testset = surprise_train_test_split(data_surprise, test_size=0.2, random_state=42)

Preparing data for collaborative filtering...


In [76]:
# Step 4: Train a recommendation system (SVD)
print("Training the recommendation system using SVD...")
model = SVD()
model.fit(trainset)

Training the recommendation system using SVD...


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x310512c90>

In [78]:
# Step 5: Evaluate the system
print("Evaluating the model...")
predictions = model.test(testset)
rmse = mean_squared_error([pred.r_ui for pred in predictions], [pred.est for pred in predictions], squared=False)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Evaluating the model...
Root Mean Squared Error (RMSE): 1.0




In [80]:
# Step 6: Make predictions for recommendations
def recommend_movies(user_id, num_recommendations=5):
    """Recommend top N movies for a given user."""
    all_movies = data['movieId'].unique()
    rated_movies = data[data['userId'] == user_id]['movieId'].unique()
    unrated_movies = [movie for movie in all_movies if movie not in rated_movies]

    predictions = [
        (movie, model.predict(user_id, movie).est) for movie in unrated_movies
    ]
    predictions.sort(key=lambda x: x[1], reverse=True)

    recommended_movies = pd.DataFrame(predictions[:num_recommendations], columns=['movieId', 'predicted_rating'])
    recommended_movies = recommended_movies.merge(movies, on='movieId')
    return recommended_movies[['title', 'genres', 'predicted_rating']]

In [82]:
# Example: Recommend movies for a user
user_id = 1
print(f"Top 5 movie recommendations for User {user_id}:")
recommendations = recommend_movies(user_id, num_recommendations=5)
print(recommendations)

Top 5 movie recommendations for User 1:
              title                     genres  predicted_rating
0  GoldenEye (1995)  Action|Adventure|Thriller          4.011588
