### <span style="color: pink;"> Importing Libraries </span>

In [1]:
import pandas as pd
import numpy as np
import cv2
import matplotlib
from matplotlib import pyplot as plt
import os
from scipy.sparse import csr_matrix
import scipy.sparse as sp
import itertools
from sklearn.neighbors import NearestNeighbors
%matplotlib inline

### <span style="color: pink;"> Importing Data </span>

In [2]:
# Folder holding the ml-25m CSV files. Set the ML25M_DIR environment variable
# to point at your own copy; the fallback is the folder the notebook was
# originally written against.
DATA_DIR = os.environ.get(
    "ML25M_DIR",
    r"C:/Users/Mandula Thrimanne/OneDrive/Documents/Personal/Medium Articles/ml-25m",
)

# movies: one row per movie (movieId, title, genres).
movies = pd.read_csv(f"{DATA_DIR}/movies.csv")
# ratings: one row per (userId, movieId) rating event.
ratings = pd.read_csv(f"{DATA_DIR}/ratings.csv")

In [3]:
# Preview the first five rows of the movie catalogue.

movies.head(5)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first five rows of the ratings table.

ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [5]:
# Number of distinct users that appear in the ratings table.
ratings.userId.nunique()

162541

In [6]:
# Number of distinct movies that appear in the ratings table.

ratings.movieId.nunique()

59047

### <span style="color: pink;"> Transform the dataframe into a pivot table </span>


In [None]:
import multiprocessing

def process_chunk(chunk):
    """Pivot one slice of the ratings table into a movie-by-user grid.

    Rows are ``movieId`` values, columns are ``userId`` values, and each cell
    holds the mean ``rating`` for that pair (NaN where the pair has no rating).
    """
    pivot_spec = {
        'index': 'movieId',
        'columns': 'userId',
        'values': 'rating',
        'aggfunc': 'mean',
    }
    return pd.pivot_table(chunk, **pivot_spec)

def parallel_pivot(df, num_processes):
    """Build the movieId x userId rating pivot table using worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table with ``movieId``, ``userId`` and ``rating`` columns.
    num_processes : int
        Upper bound on the number of worker processes (and chunks) to use.

    Returns
    -------
    pandas.DataFrame
        One row per movieId and one column per userId, holding the mean
        rating for each pair (NaN where the pair has no rating). Every
        movieId appears exactly once, in ascending order.

    Raises
    ------
    ValueError
        If ``num_processes`` is smaller than 1.
    """
    if num_processes < 1:
        raise ValueError("num_processes must be at least 1")

    # Partition by movieId, not by row position: every rating of a movie must
    # land in the same chunk, otherwise concatenating the per-chunk pivots
    # yields several partial rows for the same movie.
    movie_ids = np.sort(df['movieId'].unique())
    id_groups = [ids for ids in np.array_split(movie_ids, num_processes) if len(ids)]

    if len(id_groups) <= 1:
        # Nothing to parallelise; avoid the cost of starting a pool.
        return process_chunk(df)

    chunks = [df[df['movieId'].isin(ids)] for ids in id_groups]

    # The context manager tears the pool down even if a worker raises.
    with multiprocessing.Pool(processes=len(chunks)) as pool:
        pivot_tables = pool.map(process_chunk, chunks)

    # The id groups are ascending and disjoint, so stacking the pivots gives a
    # sorted, duplicate-free movieId index; sort=True orders the userId columns
    # when the chunks do not all contain the same users.
    return pd.concat(pivot_tables, sort=True)

# Build the movie x user rating matrix that the following cells filter and
# feed to the KNN model.
# NOTE(review): on Windows, worker processes are spawned and may be unable to
# import functions defined inside a notebook -- confirm this cell completes
# there, or move process_chunk into an importable .py module.
if __name__ == '__main__':
    # `ratings` is already loaded by the data-import cell, so the CSV is not
    # read a second time here.

    # Define the number of processes to use
    num_processes = 4

    # Perform parallel pivot table processing
    parallel_result = parallel_pivot(ratings, num_processes)

    # Later cells refer to the pivot table as `final_dataset`.
    final_dataset = parallel_result


In [None]:
# No code here on purpose: the bare `end` that used to be in this cell raised
# NameError and stopped "Run All" before the filtering and KNN cells could run.

### <span style="color: pink;"> Removing noise from the data </span>

In [None]:
# Number of ratings each movie received, indexed by movieId.
no_user_voted = ratings.groupby('movieId')['rating'].count()
# Number of ratings each user submitted, indexed by userId.
no_movies_voted = ratings.groupby('userId')['rating'].count()

In [None]:
# Keep only the rows (movies) that received more than 10 ratings.
popular_movie_ids = no_user_voted.loc[no_user_voted.gt(10)].index
final_dataset = final_dataset.loc[popular_movie_ids, :]

In [None]:
# Keep only the columns (users) who rated more than 50 movies, then show the result.
active_user_ids = no_movies_voted.loc[no_movies_voted.gt(50)].index
final_dataset = final_dataset.loc[:, active_user_ids]
final_dataset

### <span style="color: pink;"> Converting the ratings df to a sparse matrix </span>

In [None]:
# Always start from the movie-indexed pivot, so re-running this cell after
# `movieId` has already been moved into a column does not feed that column
# into the rating matrix.
ratings_matrix = (
    final_dataset.set_index('movieId')
    if 'movieId' in final_dataset.columns
    else final_dataset
)

# Unrated (movie, user) pairs are NaN in the pivot table. Store them as 0 so
# they become implicit zeros in the sparse matrix instead of explicit NaN
# entries, which the KNN model cannot be fitted on.
csr_data = csr_matrix(ratings_matrix.fillna(0).values)

# Expose movieId as a regular column: row i of `csr_data` is the movie in
# row i of `final_dataset`.
final_dataset = ratings_matrix.reset_index()

### <span style="color: pink;"> Building the movie recommendation system using KNN </span>

In [None]:
# Brute-force cosine-distance nearest-neighbour model over the movie rows,
# using all available cores (n_jobs=-1).
knn_params = dict(metric='cosine', algorithm='brute', n_neighbors=11, n_jobs=-1)
knn = NearestNeighbors(**knn_params)
knn.fit(csr_data)

In [None]:
def get_movie_recommendation(movie_name, n_movies_to_recommend=10):
    """Recommend the movies whose rating patterns are closest to a given title.

    Relies on the notebook globals ``movies``, ``final_dataset``, ``csr_data``
    and ``knn``.

    Parameters
    ----------
    movie_name : str
        Text to look for in ``movies['title']``. It is matched as a literal,
        case-sensitive substring. The first matching title that is present in
        ``final_dataset`` is used as the query movie.
    n_movies_to_recommend : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with ``Title`` and ``Distance`` columns, indexed from 1 and
        ordered from the largest distance to the smallest (the closest match
        is the last row). A message string is returned instead when no title
        matches, or when none of the matching movies is in ``final_dataset``.
    """
    # regex=False: titles contain regex metacharacters such as "(1995)", so a
    # pattern match would miss a query that spells out the full title.
    title_hits = movies['title'].str.contains(movie_name, regex=False, na=False)
    candidate_ids = movies.loc[title_hits, 'movieId']
    if candidate_ids.empty:
        return "No movies found. Please check your input"

    # A matching movie may be absent from final_dataset (e.g. removed by the
    # noise filters); skip those instead of failing on an empty lookup.
    usable_ids = candidate_ids[candidate_ids.isin(final_dataset['movieId'])]
    if usable_ids.empty:
        return "Matching movies were found, but none has enough ratings to recommend from"
    query_id = usable_ids.iloc[0]

    # Positional row of the query movie; rows of final_dataset line up with
    # the rows of csr_data.
    query_row = int(np.flatnonzero((final_dataset['movieId'] == query_id).to_numpy())[0])

    # Ask for one extra neighbour because the query movie is normally its own
    # nearest neighbour; never ask for more rows than the matrix has.
    n_neighbors = min(n_movies_to_recommend + 1, csr_data.shape[0])
    distances, indices = knn.kneighbors(csr_data[query_row], n_neighbors=n_neighbors)

    # Drop the query movie by identity (not by position), keep the closest
    # n_movies_to_recommend, then list them farthest-first.
    neighbours = sorted(
        (
            (row, dist)
            for row, dist in zip(indices.ravel().tolist(), distances.ravel().tolist())
            if row != query_row
        ),
        key=lambda pair: pair[1],
    )[:n_movies_to_recommend]
    neighbours.reverse()

    title_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    recommend_frame = [
        {'Title': title_by_id.loc[final_dataset['movieId'].iloc[row]], 'Distance': dist}
        for row, dist in neighbours
    ]
    return pd.DataFrame(
        recommend_frame,
        columns=['Title', 'Distance'],
        index=range(1, len(recommend_frame) + 1),
    )

In [None]:
# Example query: movies rated similarly to "Good Will Hunting".
get_movie_recommendation(movie_name="Good Will Hunting")