In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
path='/content/gdrive/MyDrive/Datasets/'
movies_df= pd.read_csv(path+'movies.csv', dtype={'movieId': 'int32', 'title': 'str'})
ratings_df= pd.read_csv(path+'ratings.csv',usecols=['userId', 'movieId', 'rating'],
            dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [4]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
ratings_df.head()


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
df = pd.merge(ratings_df,movies_df,on='movieId')
df.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [7]:
df['rating'] = df['rating'].fillna(0)

In [11]:
# Create a TF-IDF matrix for movie genres
tfidf= TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['genres'])

# Calculate cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create an index mapping movie titles to movie indices
indices = pd.Series(movies_df.index, index=movies_df['title']).drop_duplicates()



In [12]:
def get_content_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = enumerate(cosine_sim[idx])
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    sim_index = [i[0] for i in sim_scores]
    recommended_movies = movies_df['title'].iloc[sim_index]

    # Print only the movie names, not their indices
    recommended_movie_names = recommended_movies.values.tolist()
    return recommended_movie_names




# Example usage
title='Tangled (2010)'
content_based_recommended_movies= get_content_recommendations(title)
print("Recommended Movies:")

for idx, movie in enumerate(content_based_recommended_movies, start=1):
        print(f"{idx}. {movie}")



Recommended Movies:
1. Beauty and the Beast (1991)
2. Fantasia 2000 (1999)
3. Dr. Seuss' The Lorax (2012)
4. Aladdin and the King of Thieves (1996)
5. Cinderella (1950)
6. Princess and the Frog, The (2009)
7. Cloudy with a Chance of Meatballs (2009)
8. Christmas Carol, A (2009)
9. Frosty the Snowman (1969)
10. Enchanted (2007)


In [13]:
# Define a threshold for positive interactions
threshold = 3.5

# Create ground truth labels (1 for positive interaction, 0 for negative interaction)
df['ground_truth'] = (df['rating'] >= threshold).astype(int)

# Create a dictionary of ground truth interactions for each user
ground_truth_dict = df.groupby('userId')['ground_truth'].apply(list).to_dict()


In [14]:
# Create an empty dictionary to store recommendations for each user
recommendations_dict = {}

for user_id, user_data in df.groupby('userId'):
    user_rated_movies = user_data['title'].tolist()
    recommendations = get_content_recommendations(user_rated_movies[0], cosine_sim=cosine_sim)
    recommendations_dict[user_id] = recommendations


In [15]:
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score
from sklearn.metrics import ndcg_score

# Initialize lists to store evaluation metrics
map_list = []
ndcg_list = []


for user_id, recommendations in recommendations_dict.items():
    # Get ground truth interactions for the user
    ground_truth = ground_truth_dict.get(user_id, [])

    # Create a binary list indicating recommended items (1 if recommended, 0 otherwise)
    recommended_binary = [1 if movie in recommendations else 0 for movie in df['title']]

    # Pad recommended_binary with zeros if it's shorter than ground_truth
    if len(recommended_binary) < len(ground_truth):
        recommended_binary += [0] * (len(ground_truth) - len(recommended_binary))


     # Calculate Mean Average Precision (MAP)
    average_precision = average_precision_score(ground_truth, recommended_binary[:len(ground_truth)])

    # Calculate Normalized Discounted Cumulative Gain (NDCG)
    ndcg = ndcg_score([ground_truth], [recommended_binary[:len(ground_truth)]])

    # Append metrics to lists
    map_list.append(average_precision)
    ndcg_list.append(ndcg)

# Calculate average metrics across all users

avg_map = sum(map_list) / len(map_list)
avg_ndcg = sum(ndcg_list) / len(ndcg_list)

print(f'Average Mean Average Precision (MAP): {avg_map}')
print(f'Average Normalized Discounted Cumulative Gain (NDCG): {avg_ndcg}')



Average Mean Average Precision (MAP): 0.6528757444560414
Average Normalized Discounted Cumulative Gain (NDCG): 0.8658187641198107
