In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Folder that holds the raw data files (u.genre, u.item, u.data) read by this notebook.
dataset_Diretory = "./Datasets/ml-100k"

# Genre
# read_csv is called without header=None, so the first line of u.genre is used
# as the column header instead of a data row. The movie cell prepends "unknown"
# by hand when it builds genre_columns — presumably to restore that first entry.
genre_path = f'{dataset_Diretory}/u.genre'
genre_df = pd.read_csv(genre_path, encoding='latin-1', sep='|')
genre_df.head(50)

In [None]:
# Movie
# Column layout passed to read_csv: 5 metadata columns followed by one column per genre.
movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
# "unknown" is prepended by hand, followed by every value in genre_df's first column.
genre_columns = ["unknown", *genre_df.iloc[:, 0].tolist()]

movies_df = pd.read_csv(
    f'{dataset_Diretory}/u.item',
    sep='|',
    encoding='latin-1',
    names=[*movie_columns, *genre_columns],
)

# --- Disabled experiments, kept for reference ---
# movies_df.drop(["video_release_date","release_date","imdb_url","release_date","movie_id","title"],axis=1,inplace=True)
# movies_df.head()
#
# # Strip parenthesised / bracketed text from titles
# def clean_title(title):
#     return re.sub(r"[\(\[].*?[\)\]]", "", title)
#
# movies_df['title'] = movies_df['title'].apply(clean_title)
# movies_df['title'] = movies_df['title'].str.strip()

movies_df.head()

In [None]:
# Compute the cosine similarity matrix
# similarity = cosine_similarity(movies_df)
# print(similarity)

In [None]:
# Ratings
# u.data is tab-separated with no header row; the timestamp column is read and
# then discarded, leaving user_id, movie_id and rating.
ratings_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_df = pd.read_csv(
    f'{dataset_Diretory}/u.data',
    sep='\t',
    names=ratings_columns,
).drop(columns="unix_timestamp")
ratings_df.head(2)

In [None]:
# Step 1: attach each rated movie's metadata and genre flags to its rating row
merged_df = ratings_df.merge(movies_df, on='movie_id')

# Step 2: the genre columns are everything after the first 5 columns of movies_df
genre_cols = movies_df.columns[5:]

# Weight every genre flag by the rating on the same row
weighted_genres = merged_df[genre_cols].multiply(merged_df['rating'], axis='index')
merged_df[genre_cols] = weighted_genres

# Step 3: total the weighted genre columns per user; user_id comes back as a column
per_user_totals = merged_df.groupby('user_id')[genre_cols].sum()
user_genre_scores = per_user_totals.reset_index()

# Resulting dataset
print(user_genre_scores.head())

In [None]:
# # NORMALIZING USER DATA 

# # Identify genre columns (exclude 'user_id')
# genre_cols = user_genre_scores.columns.drop('user_id')

# # Calculate the maximum value for each user's row
# max_values = user_genre_scores[genre_cols].max(axis=1)

# # Normalize genre columns by dividing by the row's max value (handle zeros)
# user_genre_scores[genre_cols] = user_genre_scores[genre_cols].div(max_values, axis=0).fillna(0)

# # Display the normalized data
# print(user_genre_scores.head())

In [None]:
def get_unrated_movies(user_id, movies_df, ratings_df):
    """Return the movie ids that ``user_id`` has no rating for.

    Parameters
    ----------
    user_id
        Value compared against ``ratings_df['user_id']``.
    movies_df : pandas.DataFrame
        Frame with a ``movie_id`` column listing every movie.
    ratings_df : pandas.DataFrame
        Frame with ``user_id`` and ``movie_id`` columns, one row per rating.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated movie ids that appear in ``movies_df`` but not
        among the rows of ``ratings_df`` belonging to ``user_id``.
    """
    belongs_to_user = ratings_df['user_id'] == user_id
    seen_ids = ratings_df.loc[belongs_to_user, 'movie_id'].unique()
    catalogue_ids = movies_df['movie_id'].unique()

    # setdiff1d yields the sorted unique values of the first array that are
    # missing from the second.
    return np.setdiff1d(catalogue_ids, seen_ids)

# Example: movies not rated by user_id = 1 (only the count is printed, not the ids)
unrated_movies = get_unrated_movies(1, movies_df, ratings_df)
print("Unrated Movie IDs:", unrated_movies.size)

In [None]:
# Movie side: drop the five metadata columns so only the genre columns remain,
# then expose them as a plain array (one row per row of movies_df).
non_genre_columns = ["movie_id", "title", "release_date", "video_release_date", "imdb_url"]
mod_mov = movies_df.drop(columns=non_genre_columns)
mov_matrix = mod_mov.to_numpy()

# User side: the same genre columns without the id column
# (one row per row of user_genre_scores).
mod_user = user_genre_scores.drop(columns="user_id")
user_matrix = mod_user.to_numpy()

# Spot check: score of the movie at row 5 against the user at row 1.
# These are row positions, not movie_id / user_id values.
score = mov_matrix[5] @ user_matrix[1]
score

In [None]:
# Recommend movies for one user: score every movie by the dot product of its
# genre flags with the user's rating-weighted genre profile, then keep the
# best-scoring movies that the user has not rated yet.
#
# user_matrix rows follow the row order of user_genre_scores, so a row position
# is not a user id (row 582 is NOT "user 1"). The real id is read from the frame
# so that the rated-movie lookup below targets the same user as the scores.
target_row = 582
target_user_id = user_genre_scores['user_id'].iloc[target_row]
scores = np.dot(mov_matrix, user_matrix[target_row])

# Leave out movies this user already rated; they are not useful recommendations.
unrated_ids = get_unrated_movies(target_user_id, movies_df, ratings_df)
is_candidate = movies_df['movie_id'].isin(unrated_ids).to_numpy()
candidate_positions = np.flatnonzero(is_candidate)

# Row positions (into movies_df / mov_matrix) of the top 10 candidates, best
# first. With fewer than 10 candidates this simply returns all of them.
best_first = np.argsort(scores[candidate_positions])[-10:][::-1]
top_10_indices = candidate_positions[best_first]

# Extract corresponding scores
top_scores = scores[top_10_indices]

# Get movie details with scores (copy so movies_df itself gains no extra column)
top_10_movies = movies_df.iloc[top_10_indices].copy()
top_10_movies['prediction_score'] = top_scores  # Add scores column

# Print results with scores
print(f"User ID: {target_user_id}")
print("Top 10 Recommended Movies with Scores:")
for _, movie in top_10_movies.iterrows():
    print(f"\nScore: {movie['prediction_score']:.4f}")
    print(f"Movie ID: {movie['movie_id']}")
    print(f"Title: {movie['title']}")
    print(f"Release Date: {movie['release_date']}")
    # Genre columns start after the 5 metadata columns; a flag of 1 marks membership.
    print("Genres:", ", ".join([col for col in movies_df.columns[5:] if movie[col] == 1]))
    print(f"IMDB URL: {movie['imdb_url']}")
    print("-" * 60)