In [87]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [9]:
def read_files(movies_path,ratings_path):
    movies_df = pd.read_csv(movies_path,usecols=['movieId','title'],dtype={'movieId': 'int32', 'title': 'str'})
    rating_df=pd.read_csv(ratings_path,usecols=['userId', 'movieId', 'rating'],
        dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})


    return movies_df,rating_df

movies_df,ratings_df = read_files(movies_path="./Data/movies.csv",ratings_path="./Data/ratings.csv")


In [15]:
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [14]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [34]:
def merge_dataframes(first_data_frame, second_data_frame,on_key):
    merge_dataframe = pd.merge(second_data_frame,first_data_frame,on = on_key)
    return merge_dataframe
merged_dataframe = merge_dataframes(movies_df,ratings_df,on_key='movieId')
merged_dataframe.head() 

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,1,3,4.0,Grumpier Old Men (1995)
2,1,6,4.0,Heat (1995)
3,1,47,5.0,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,"Usual Suspects, The (1995)"


In [25]:
null_counts = merged_dataframes.isnull().sum()
print(null_counts)

userId     0
movieId    0
rating     0
title      0
dtype: int64


In [36]:
def dataframe_count_of_ratings(df,first_key,second_key ):
    movie_ratingCount = (df.
         groupby(by = [first_key])[second_key].
        count().reset_index()
    )
    movie_ratingCount = movie_ratingCount.rename(columns = {'rating': 'Number_of_Ratings'})
    return movie_ratingCount
ratings_count_df = dataframe_count_of_ratings(merged_dataframe,first_key="title",second_key="rating")
ratings_count_df

Unnamed: 0,title,Number_of_Ratings
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [37]:
final_dataframe = merge_dataframes(merged_dataframe ,ratings_count_df,on_key='title')
final_dataframe

Unnamed: 0,title,Number_of_Ratings,userId,movieId,rating
0,'71 (2014),1,610,117867,4.0
1,'Hellboy': The Seeds of Creation (2004),1,332,97757,4.0
2,'Round Midnight (1986),2,332,26564,3.5
3,'Round Midnight (1986),2,377,26564,3.5
4,'Salem's Lot (2004),1,345,27751,5.0
...,...,...,...,...,...
100831,¡Three Amigos! (1986),26,555,2478,3.0
100832,¡Three Amigos! (1986),26,561,2478,4.0
100833,¡Three Amigos! (1986),26,597,2478,3.0
100834,¡Three Amigos! (1986),26,599,2478,2.5


In [39]:
def drop_threshold_popularity(dataframe,column_name ,threshold = 200):
    dataframe = dataframe[dataframe[column_name]>= threshold]
    return dataframe
final_dataframe = drop_threshold_popularity(final_dataframe,"Number_of_Ratings" ,threshold = 200)
final_dataframe 
final_dataframe.shape
 

(4538, 5)

In [43]:
def pivot_table(df,index, columns ,values):
    df_pivot = df.pivot_table(index = index,columns= columns, values= values)
    df_pivot = df_pivot.fillna(0)

    return df_pivot

df_pivot = pivot_table(final_dataframe,index = 'title',columns='userId',values='rating')
df_pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
American Beauty (1999),5.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,0.0,1.0,...,0.0,0.0,5.0,0.0,0.0,4.5,3.0,5.0,0.0,3.5
Apollo 13 (1995),0.0,0.0,0.0,0.0,3.0,4.0,4.5,4.0,0.0,0.0,...,0.0,4.0,0.0,0.0,5.0,0.0,5.0,2.0,3.0,0.0
Braveheart (1995),4.0,0.0,0.0,0.0,4.0,5.0,0.0,3.0,0.0,0.0,...,0.0,5.0,1.0,3.0,3.0,3.5,5.0,4.0,3.0,4.5
Fight Club (1999),5.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.5,...,5.0,0.0,4.0,0.0,0.0,5.0,0.0,5.0,0.0,5.0
Forrest Gump (1994),4.0,0.0,0.0,0.0,0.0,5.0,5.0,3.0,0.0,3.5,...,0.0,3.0,3.0,0.0,3.0,4.0,0.0,3.0,4.0,3.0


In [44]:
def distances_model_nneigbours(df):
    df_csr_matrix = csr_matrix(df.values)
    model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
    model_knn.fit(df_csr_matrix)
    return model_knn

model = distances_model_nneigbours(df_pivot)

In [54]:
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,1,3,4.0,Grumpier Old Men (1995)
2,1,6,4.0,Heat (1995)
3,1,47,5.0,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,"Usual Suspects, The (1995)"


In [58]:
def operating_model(movie_vector, n_neighbours):
    distances, indices = model.kneighbors(movie_vector, n_neighbors = n_neighbours )
    return distances, indices

number_of_movies = df_pivot.shape[0]
movieId =  np.random.choice(number_of_movies)
movie_vector = df_pivot.iloc[movieId,:].values.reshape(1, -1)
distances, indices = operating_model(movie_vector, n_neighbours =4 )

In [68]:
def final_output(distances, inidces,movieId):
    print(f"The recommendation for the movie{df_pivot.index[movieId]}")
    for i in range(1, len(distances)):
        print(f" The: {i} recommendation is:{df_pivot.index[indices[i]]} with distance of :{distances[i]}")
distances = distances.flatten()
indices = indices.flatten()
final_output(distances,indices ,movieId )


The recommendation for the movieShawshank Redemption, The (1994)
 The: 1 recommendation is:Forrest Gump (1994) with distance of :0.28700655698776245
 The: 2 recommendation is:Pulp Fiction (1994) with distance of :0.29763418436050415
 The: 3 recommendation is:Silence of the Lambs, The (1991) with distance of :0.35293471813201904


# Content based Filtering

In [84]:
def read_content(credits_path , movies_path):
    movies_df = pd.read_csv(movies_path)
    credits_df = pd.read_csv(credits_path).rename(columns = {"movie_id":"id"})
    movies_df_merge = movies_df.merge(credits_df, on='id').drop(columns=['title_x', 'title_y','homepage',  'status','production_countries'])
    return movies_df_merge
merged_df = read_content(credits_path = "./Data/tmdb_5000_credits.csv",movies_path ="./Data/tmdb_5000_movies.csv")

In [85]:
merged_df.head()

Unnamed: 0,budget,genres,id,keywords,original_language,original_title,overview,popularity,production_companies,release_date,revenue,runtime,spoken_languages,tagline,vote_average,vote_count,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Enter the World of Pandora.,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]","At the end of the world, the adventure begins.",6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",A Plan No One Escapes,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",The Legend Ends,7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]","Lost in our world, found in another.",6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [92]:
def transforming_overview_movie(df, column_name):
    df[column_name] = df[column_name].fillna('')
    tf_vectorizer = TfidfVectorizer(
        min_df=3, # Ignore terms that appear in fewer than 3 documents 
        max_features=None, 
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 3), #Include unigrams and 3-grams
        stop_words = 'english'
        )
    
    tfv_matrix = tf_vectorizer.fit_transform(df[column_name])
    similarity_matrix = sigmoid_kernel(tfv_matrix, tfv_matrix)
    return tfv_matrix,similarity_matrix
tfv_matrix,similarity_matrix = transforming_overview_movie(merged_df, column_name = 'overview')




In [94]:
similarity_matrix.shape

(4803, 4803)

In [126]:
def content_based_recommender(df,movie_name,similarity_matrix,num_recommended):

    index_values = df.loc[df['original_title'] == movie_name].index.values[0]
    similarity_score = list(enumerate(similarity_matrix[index_values]))
    # Sort the movies 
    sig_scores = sorted(similarity_score, key=lambda x: x[1], reverse=True)
    # Scores of the 10 most similar movies
    sig_scores = sig_scores[1:num_recommended]
    # Movie indices
    movie_indices = [i[0] for i in sig_scores]
    # Top num_recommended most similar movies
     
    df_recom =df['original_title'].iloc[movie_indices]
    return df_recom

df_recom =  content_based_recommender(merged_df,"Titanic",similarity_matrix,num_recommended= 9)


In [127]:
df_recom

1269                                  Raise the Titanic
2289                                         The Switch
296                                         End of Days
2287                         I Can Do Bad All By Myself
2143                                         Ghost Ship
4287                                            Niagara
171     Master and Commander: The Far Side of the World
4035                                              Stung
Name: original_title, dtype: object

In [99]:
df

Series([], Name: original_title, dtype: object)