In [63]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import mean_squared_error
from math import sqrt

In [64]:
# Step 1: Download and Extract the Dataset
%cd /content/drive/MyDrive/moive
!unzip /content/drive/MyDrive/moive/ml-latest-small.zip

/content/drive/MyDrive/moive
Archive:  /content/drive/MyDrive/moive/ml-latest-small.zip
replace ml-latest-small/links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [65]:
# Step 2: Load the Data
# Load the movies data
movies = pd.read_csv('/content/drive/MyDrive/moive/ml-latest-small/movies.csv')

# Load the ratings data
ratings = pd.read_csv('/content/drive/MyDrive/moive/ml-latest-small/ratings.csv')

# Step 3: Preprocess the Data
# Convert the movie genres into a single string
movies['genres'] = movies['genres'].str.replace('|', ' ')

# Step 4: Compute Movie Similarities
# Create a TF-IDF vectorizer to compute similarity based on movie titles and genres
tfidf = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 2), stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['title'] + ' ' + movies['genres'])

# Calculate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

  movies['genres'] = movies['genres'].str.replace('|', ' ')


In [66]:
# Step 6: Model Prediction
def get_user_recommendations(user_id, cosine_sim_matrix, movies_data, ratings_data, top_n=3):
    # Get the movies rated by the given user
    user_ratings = ratings_data[ratings_data['userId'] == user_id]

    # Get the movieIds rated by the user
    user_rated_movie_ids = user_ratings['movieId'].tolist()

    # Get the indices of the movies rated by the user
    user_rated_movie_indices = movies_data[movies_data['movieId'].isin(user_rated_movie_ids)].index

    # Calculate the average rating given by the user
    user_mean_rating = user_ratings['rating'].mean()

    # Initialize an empty dictionary to store movie recommendations
    recommendations = {}

    # Iterate over the movies rated by the user
    for movie_index in user_rated_movie_indices:
        # Get the pairwise similarity scores for the current movie
        sim_scores = list(enumerate(cosine_sim_matrix[movie_index]))

        # Sort the movies based on the similarity scores in descending order
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Get the top-n most similar movies (excluding the rated movies)
        top_movies_indices = [i[0] for i in sim_scores if i[0] not in user_rated_movie_indices][:top_n]

        # Update the recommendations dictionary with the top movies and their similarity scores
        for index in top_movies_indices:
            if index in recommendations:
                recommendations[index]['similarity'] += sim_scores[index][1]
            else:
                recommendations[index] = {
                    'movieId': movies_data.loc[index]['movieId'],
                    'title': movies_data.loc[index]['title'],
                    'similarity': sim_scores[index][1]
                }

    # Convert the recommendations dictionary into a DataFrame
    recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index')
    recommendations_df['similarity'] /= top_n

    # Sort the recommendations based on the similarity scores in descending order
    recommendations_df = recommendations_df.sort_values('similarity', ascending=False).reset_index(drop=True)

    return recommendations_df.head(top_n)

In [67]:
# Step 7: Model Evaluation
# Calculate the RMSE
rmse = sqrt(mean_squared_error(ratings['rating'], [ratings['rating'].mean()] * len(ratings)))
print(f"RMSE for the Model: {rmse}")

# Calculate the Recall
relevant_movies = ratings[ratings['rating'] >= 4.0]
relevant_movie_ids = relevant_movies['movieId'].tolist()
recommended_movie_ids = user_recommendations['movieId'].tolist()
recall = len(set(relevant_movie_ids) & set(recommended_movie_ids)) / len(relevant_movie_ids)
print(f"Recall for the Model: {recall}")

RMSE for the Model: 1.0425240696180562
Recall for the Model: 4.1169205434335116e-05


In [68]:
# Example: Predicting Recommendations for User 1
user_id = 1
user_recommendations = get_user_recommendations(user_id, cosine_sim, movies, ratings, top_n=3)

# Print the top 3 recommended movies for the user
print(f"Top 3 Recommendations for User {user_id}:")
print(user_recommendations[['movieId', 'title', 'similarity']])

Top 3 Recommendations for User 1:
   movieId                 title  similarity
0       64  Two if by Sea (1996)    0.110481
1      252           I.Q. (1994)    0.108168
2      168   First Knight (1995)    0.101830
