## Imports

In [2]:
# basics
import pandas as pd
import numpy as np

# for part 3 to build the recommender
from sklearn.neighbors import NearestNeighbors

# Dummy dataset (supposed to include user id and movie id as well as ratings and genres)

In [3]:
dummy_df = pd.DataFrame({
    'userId': {0: 1, 1: 2, 2: 3, 3: 1, 4: 2, 5: 1},
    'movieId': {0: 101, 1: 102, 2: 103, 3: 104, 4: 101, 5: 103},
    'genres': {0: 'Action,Comedy,Romance', 1: 'Drama', 2: 'Action,Drama', 3: 'Comedy,Romance', 4: 'Action,Comedy,Romance', 5: 'Action,Drama'},
    'ratings': {0: 4.0, 1: 5.0, 2: 4.0, 3: 3.0, 4: 4.0, 5: 4.5},
    'Action': {0: 1, 1: 0, 2: 1, 3: 0, 4: 1, 5: 1},
    'Comedy': {0: 1, 1: 0, 2: 0, 3: 1, 4: 1, 5: 0},
    'Drama': {0: 0, 1: 1, 2: 1, 3: 0, 4: 0, 5: 1},
    'Romance': {0: 1, 1: 0, 2: 0, 3: 1, 4: 1, 5: 0}
})
dummy_df

Unnamed: 0,userId,movieId,genres,ratings,Action,Comedy,Drama,Romance
0,1,101,"Action,Comedy,Romance",4.0,1,1,0,1
1,2,102,Drama,5.0,0,0,1,0
2,3,103,"Action,Drama",4.0,1,0,1,0
3,1,104,"Comedy,Romance",3.0,0,1,0,1
4,2,101,"Action,Comedy,Romance",4.0,1,1,0,1
5,1,103,"Action,Drama",4.5,1,0,1,0


In [4]:
# computing average rating for each movie
average_ratings_df = dummy_df.groupby('movieId')['ratings'].mean().reset_index()
average_ratings_df.columns = ['movieId', 'average_rating']

average_ratings_df

Unnamed: 0,movieId,average_rating
0,101,4.0
1,102,5.0
2,103,4.25
3,104,3.0


## Creating dataframe that holds the info of the movies in the "hyperspace" of features
(contains no user info)

In [5]:
# merge the average rating with the genre binary features (Action, Comedy, etc.)
movie_features_df = dummy_df[['movieId', 'Action', 'Comedy', 'Drama', 'Romance']].drop_duplicates()
movie_features_df = movie_features_df.merge(average_ratings_df, on='movieId')

movie_features_df

Unnamed: 0,movieId,Action,Comedy,Drama,Romance,average_rating
0,101,1,1,0,1,4.0
1,102,0,0,1,0,5.0
2,103,1,0,1,0,4.25
3,104,0,1,0,1,3.0


## Creating a per user dataframe to hold info on what they have seen (which movies and how they rated them)

In [6]:
# getting the movies that user 1 (in this case) has rated
user_movies = dummy_df[dummy_df['userId'] == 1][['movieId', 'ratings']]
user_movies

Unnamed: 0,movieId,ratings
0,101,4.0
3,104,3.0
5,103,4.5


In [7]:
# getting movies the user has rated highly ( >= 4.0 )
high_rated_movies = user_movies[user_movies['ratings'] >= 4.0]['movieId'].tolist()
high_rated_movies

[101, 103]

In [8]:
# retrieving feature vectors of the high-rated movies
high_rated_vectors = movie_features_df[movie_features_df['movieId'].isin(high_rated_movies)].drop('movieId', axis=1)
high_rated_vectors

Unnamed: 0,Action,Comedy,Drama,Romance,average_rating
0,1,1,0,1,4.0
2,1,0,1,0,4.25


# Recommender

## Movie hyperspace object check

In [15]:
for i, movie_features in high_rated_vectors.iterrows():    # iterrows creates an iterator object to iterate through each row
                                                           # it produces an index and a row object
                                                           # movie_features is the vector to be fed on knn 
    print(f"{i},\n {movie_features}")

0,
 Action            1.0
Comedy            1.0
Drama             0.0
Romance           1.0
average_rating    4.0
Name: 0, dtype: float64
2,
 Action            1.00
Comedy            0.00
Drama             1.00
Romance           0.00
average_rating    4.25
Name: 2, dtype: float64


## Initializing KNN

In [16]:
# dummy knn
knn = NearestNeighbors(n_neighbors=2, algorithm='auto')
knn.fit(movie_features_df.drop('movieId', axis=1))

In [17]:
# finding nearest neighbors for each movie
# recommendations is a set so there are no duplicate movies
recommendations = set()

# loop 
for i, movie_features in high_rated_vectors.iterrows():    
    distances, indices = knn.kneighbors([movie_features])
    print(f"distance is {distances}, index is {indices}")
    similar_movies = movie_features_df.iloc[indices[0]]['movieId'].values
    recommendations.update(similar_movies)    # appending recommended movie to the set
    
# filtering out movies that the user has already rated
recommendations = recommendations - set(user_movies['movieId'].tolist())

list(recommendations)

distance is [[0.         1.41421356]], index is [[0 3]]
distance is [[0.   1.25]], index is [[2 1]]




[np.int64(102)]

## Packing into a function

In [19]:
def recommend_movies(user_id, df, movie_features_df, n_neighbors=2):
    # getting the movies the user has rated
    user_movies = df[df['userId'] == user_id][['movieId', 'ratings']]
    
    # Step 2: Get movies the user has rated highly (let's assume >= 4.0 is a high rating)
    high_rated_movies = user_movies[user_movies['ratings'] >= 4.0]['movieId'].tolist()
    
    # Step 3: Get the feature vectors of the high-rated movies
    high_rated_features = movie_features_df[movie_features_df['movieId'].isin(high_rated_movies)].drop('movieId', axis=1)
    
    # Step 4: Fit the KNN model on all movies
    knn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto')
    knn.fit(movie_features_df.drop('movieId', axis=1))
    
    # Step 5: Find the nearest neighbors for each high-rated movie
    recommendations = set()
    for i, movie_features in high_rated_features.iterrows():
        distances, indices = knn.kneighbors([movie_features])
        similar_movies = movie_features_df.iloc[indices[0]]['movieId'].values
        recommendations.update(similar_movies)
    
    # Step 6: Filter out movies that the user has already rated
    recommendations = recommendations - set(user_movies['movieId'].tolist())
    
    return list(recommendations)

## Recommendation

In [20]:
recommend_movies(2, dummy_df, movie_features_df, n_neighbors=2)



[np.int64(104), np.int64(103)]

# Further Ideas

We could further filter what is recommended by making sure that the recommendations are above the average of the user's average rating in these genres maybe, instead of the blanket 4.0 i have here.
For example if the user has a 4.2 average rating in Action movies and the recommendation movie is an Action movie with average 4.0, we don't recommend it after all.

(captain obvious) Other than that, we can build a more comprehensive feature space if we enrich our data and filter the recommendations further.