### Import Necessary Packages

In [112]:
import numpy as np
import pandas as pd

#### Set dataset directory

In [113]:
dataset_Diretory="./Datasets/ml-100k"
dataset_with_credit="./Datasets"

### Get the Genres from the dataset

In [114]:
# Genres
genre_df = pd.read_csv(f'{dataset_Diretory}/u.genre', sep='|', encoding='latin-1')
genre_columns = ["unknown"] + list(genre_df[genre_df.columns[0]].values)
genre_df.head()

Unnamed: 0,unknown,0
0,Action,1
1,Adventure,2
2,Animation,3
3,Children's,4
4,Comedy,5


### Loading the Movies dataset

In [115]:
# Movie
movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies_df = pd.read_csv(f'{dataset_with_credit}/movies_with_credits.csv', sep=',',
                     encoding='latin-1')
movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
# movies_df = pd.read_csv(f'{dataset_Diretory}/u.item', sep='|', names=movie_columns+genre_columns,
#                      encoding='latin-1')

movies_df.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Aleksei Ananishnov,Gudrun Geyer,Asia Argento,Jonathan Rhys Meyers,Julie T. Wallace,Werner Herzog,Vittorio Mezzogiorno,Stefan Glowacz,Mathilda May,Gunilla Karlzen
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Loading ratings dataset

In [116]:
#Ratings
ratings_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_df = pd.read_csv(f'{dataset_Diretory}/u.data', sep='\t', names=ratings_columns)
ratings_df.drop( "unix_timestamp", inplace = True, axis = 1 ) 
ratings_df.head(2)

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3


### Remove test data from ratings dataframe


In [117]:
# Group by user_id and take 2 random samples per user
test_ratings=ratings_df.groupby('user_id', group_keys=False).apply(lambda x: x.sample(n=2, random_state=42))

# Drop those sampled rows from the original dataframe
remaining_df = ratings_df.drop(test_ratings.index)

# Copy the dataframe into ratings_df
ratings_df=remaining_df.copy()

  test_ratings=ratings_df.groupby('user_id', group_keys=False).apply(lambda x: x.sample(n=2, random_state=42))


### Making user profiles 
At this stage we are making the profiles to understand user's taste in movies.
In order to do that we are multiplying the movie features with the rating user has provided and 
summing up all the features for each user to get a score.

In [118]:
# Step 1: Merge the ratings and movies dataframes on 'movie_id'
merged_df = pd.merge(ratings_df, movies_df, on='movie_id')

# Step 2: Identify the genre & artists columns (columns from index 5 onwards in movies_df)
genre_and_artists_cols = movies_df.columns[5:]

# Multiply each genre column by the 'rating' to get weighted genres
merged_df[genre_and_artists_cols] = merged_df[genre_and_artists_cols].mul(merged_df['rating'], axis=0)

# Step 3: Group by 'user_id' and sum the genre columns
user_genre_scores = merged_df.groupby('user_id')[genre_and_artists_cols].sum().reset_index()

# Resulting dataset
user_genre_scores.head()


Unnamed: 0,user_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Aleksei Ananishnov,Gudrun Geyer,Asia Argento,Jonathan Rhys Meyers,Julie T. Wallace,Werner Herzog,Vittorio Mezzogiorno,Stefan Glowacz,Mathilda May,Gunilla Karlzen
0,1,4,246,123,40,55,316,82,24,417,...,0,0,0,0,0,0,0,0,0,0
1,2,0,38,13,4,12,61,29,0,126,...,0,0,0,0,0,0,0,0,0,0
2,3,0,39,14,0,0,26,30,5,57,...,0,0,0,0,0,0,0,0,0,0
3,4,0,26,9,0,0,20,15,5,27,...,0,0,0,0,0,0,0,0,0,0
4,5,4,172,107,53,71,243,35,0,72,...,0,0,0,0,0,0,0,0,0,0


In [120]:
def get_unrated_movies(user_id, movies_df, ratings_df):
    # Get all unique movie IDs from the movies dataset
    all_movies = movies_df['movie_id'].unique()
    
    # Get movies rated by the specific user
    user_rated = ratings_df.loc[ratings_df['user_id'] == user_id, 'movie_id'].unique()
    
    # Find movies not rated by the user using set difference
    unrated_movies = np.setdiff1d(all_movies, user_rated)
    
    return unrated_movies

# Example: Get movies not rated by user_id = 1
unrated_movies = get_unrated_movies(user_id=1, movies_df=movies_df, ratings_df=ratings_df)
print("Rated Movies:",len(movies_df)-len(unrated_movies))
print("Unrated Movies:", len(unrated_movies))

Rated Movies: 270
Unrated Movies: 1412


In [121]:
mod_mov=movies_df.drop(["video_release_date","release_date","imdb_url","release_date","movie_id","title"],axis=1,inplace=False)
mov_matrix=mod_mov.values

mod_user=user_genre_scores.drop(["user_id"],axis=1,inplace=False)
user_matrix=mod_user.values

score = np.dot(mov_matrix[5], user_matrix[1])
score

126

### Testing to come up with top 10 recommendation for a specific user

In [122]:
# Calculate scores for all movies against user 1
scores = np.dot(mov_matrix, user_matrix[1])
helf_len=int(len(mov_matrix)/2)
# Get indices of top 10 scores in descending order
top_10_indices = np.argsort(scores)[-10:][::-1]

# Extract corresponding scores
top_scores = scores[top_10_indices]

# Get movie details with scores
top_10_movies = movies_df.iloc[top_10_indices].copy()
top_10_movies['prediction_score'] = top_scores  # Add scores column

# Print results with scores
print("Top 10 Recommended Movies with Scores:")
for idx, movie in top_10_movies.iterrows():
    print(f"\nScore: {movie['prediction_score']:.4f}")
    print(f"Movie ID: {movie['movie_id']}")
    print(f"Title: {movie['title']}")
    print(f"Release Date: {movie['release_date']}")
    print("Genres:", ", ".join([col for col in movies_df.columns[5:] if movie[col] == 1]))
    print(f"IMDB URL: {movie['imdb_url']}")
    print("-" * 60)

Top 10 Recommended Movies with Scores:

Score: 285.0000
Movie ID: 172
Title: Empire Strikes Back, The (1980)
Release Date: 01-Jan-1980
Genres: Action, Adventure, Drama, Romance, Sci-Fi, War, Mark Hamill, Harrison Ford, Carrie Fisher, Irvin Kershner, Billy Dee Williams, Anthony Daniels
IMDB URL: http://us.imdb.com/M/title-exact?Empire%20Strikes%20Back,%20The%20(1980)
------------------------------------------------------------

Score: 279.0000
Movie ID: 855
Title: Diva (1981)
Release Date: 01-Jan-1981
Genres: Action, Drama, Mystery, Romance, Thriller, Roland Bertin, Richard Bohringer, Jean-Jacques Beineix, FrÃ©dÃ©ric AndrÃ©i, GÃ©rard Darmon, Chantal Deruaz
IMDB URL: http://us.imdb.com/M/title-exact?Diva%20(1981)
------------------------------------------------------------

Score: 265.0000
Movie ID: 313
Title: Titanic (1997)
Release Date: 01-Jan-1997
Genres: Action, Drama, Romance, Kathy Bates, Leonardo DiCaprio, James Cameron, Kate Winslet, Billy Zane, Frances Fisher
IMDB URL: http://us

### Test for accurcy

In [123]:
test_ratings.head()
total_test_cases=len(test_ratings)
hit=0
print(total_test_cases)
for index, row in test_ratings.iterrows():
    user_id=row['user_id']
    movie_id=row['movie_id']
    movie=movies_df[movies_df['movie_id'] == movie_id].iloc[0]
    movie_name=movie['title']
    user_rating_for_movie=row['rating']
    user_average_rating=round(ratings_df[ratings_df['user_id'] == user_id]['rating'].mean(),3)
    score_for_movie=np.dot(mov_matrix[movie_id-1], user_matrix[user_id-1])
    all_scores_for_user=np.dot(mov_matrix, user_matrix[user_id-1])
    average_score_for_user=np.mean(all_scores_for_user)
    user_has_rated="Below Average"
    score_has_been_given="Below Average"
    if user_rating_for_movie>user_average_rating:
        user_has_rated="Above Average"
    if score_for_movie>average_score_for_user:
        score_has_been_given="Above Average"
    if user_has_rated==score_has_been_given:
        hit=hit+1
    # print(f"user id : {user_id} \n movie_name: {movie_name} \n user has rated :{user_rating_for_movie} \n user's average rating :{user_average_rating} \n system score for movie :{score_for_movie} \n average score for user :{average_score_for_user} \n user_rated : {user_has_rated} \n score_given : {score_has_been_given}")
average_accuracy=round((hit/total_test_cases)*100,2)
print(f"Average accuracy is : {average_accuracy} % ")

1886
Average accuracy is : 54.67 % 
