In [1]:
import pandas as pd
import numpy as np
import io
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
from sklearn.feature_extraction.text import * 
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Content-Based Filtering

In [2]:
# Load the TMDB movie table that the content-based cells below work on.
tmdb = pd.read_csv('tmdb_final_data.csv')
# Preview the first rows to check which columns were read.
tmdb.head()

Unnamed: 0.1,Unnamed: 0,id,imdb_id,movieId,original_title,title,cast,director,production_companies,overview,budget_adj,revenue_adj,popularity,vote_count,vote_average
0,0,262500,tt2908446,130490,Insurgent,Insurgent (2015),ShaileneWoodley TheoJames KateWinslet AnselElg...,RobertSchwentke,Summit Entertainment|Mandeville Films|Red Wago...,Beatrice Prior must confront her inner demons ...,101200000.0,271619025.4,13.112507,2480,6.3
1,1,76757,tt1617661,113345,Jupiter Ascending,Jupiter Ascending (2015),MilaKunis ChanningTatum SeanBean EddieRedmayne...,LanaWachowski|LillyWachowski,Village Roadshow Pictures|Dune Entertainment|A...,In a universe where human genetic material is ...,161919900.0,169268630.7,6.189369,1937,5.2
2,2,260346,tt2446042,120635,Taken 3,Taken 3 (2015),LiamNeeson ForestWhitaker MaggieGrace FamkeJan...,OlivierMegaton,Twentieth Century Fox Film Corporation|M6 Film...,Ex-government operative Bryan Mills finds his ...,44159980.0,299709578.2,5.749758,1578,6.1
3,3,150689,tt1661199,130073,Cinderella,Cinderella (2015),LilyJames CateBlanchett RichardMadden HelenaBo...,KennethBranagh,Walt Disney Pictures|Genre Films|Beagle Pug Fi...,"When her father unexpectedly passes away, youn...",87399960.0,498963025.2,5.556818,1495,6.8
4,4,216015,tt2322441,125916,Fifty Shades of Grey,Fifty Shades of Grey (2015),DakotaJohnson JamieDornan JenniferEhle EloiseM...,SamTaylor-Johnson,Focus Features|Trigger Street Productions|Mich...,When college senior Anastasia Steele steps in ...,36799980.0,524079119.0,4.710402,1865,5.3


In [3]:
# Remove the leftover CSV index column. Reassigning (instead of inplace=True)
# and ignoring an already-missing column keeps this cell safe to re-run.
tmdb = tmdb.drop(columns=['Unnamed: 0'], errors='ignore')
tmdb.head()

Unnamed: 0,id,imdb_id,movieId,original_title,title,cast,director,production_companies,overview,budget_adj,revenue_adj,popularity,vote_count,vote_average
0,262500,tt2908446,130490,Insurgent,Insurgent (2015),ShaileneWoodley TheoJames KateWinslet AnselElg...,RobertSchwentke,Summit Entertainment|Mandeville Films|Red Wago...,Beatrice Prior must confront her inner demons ...,101200000.0,271619025.4,13.112507,2480,6.3
1,76757,tt1617661,113345,Jupiter Ascending,Jupiter Ascending (2015),MilaKunis ChanningTatum SeanBean EddieRedmayne...,LanaWachowski|LillyWachowski,Village Roadshow Pictures|Dune Entertainment|A...,In a universe where human genetic material is ...,161919900.0,169268630.7,6.189369,1937,5.2
2,260346,tt2446042,120635,Taken 3,Taken 3 (2015),LiamNeeson ForestWhitaker MaggieGrace FamkeJan...,OlivierMegaton,Twentieth Century Fox Film Corporation|M6 Film...,Ex-government operative Bryan Mills finds his ...,44159980.0,299709578.2,5.749758,1578,6.1
3,150689,tt1661199,130073,Cinderella,Cinderella (2015),LilyJames CateBlanchett RichardMadden HelenaBo...,KennethBranagh,Walt Disney Pictures|Genre Films|Beagle Pug Fi...,"When her father unexpectedly passes away, youn...",87399960.0,498963025.2,5.556818,1495,6.8
4,216015,tt2322441,125916,Fifty Shades of Grey,Fifty Shades of Grey (2015),DakotaJohnson JamieDornan JenniferEhle EloiseM...,SamTaylor-Johnson,Focus Features|Trigger Street Productions|Mich...,When college senior Anastasia Steele steps in ...,36799980.0,524079119.0,4.710402,1865,5.3


In [4]:
# Keep only the numeric feature columns by dropping identifiers and free text.
tmdb_useable = tmdb.drop(['id', 'imdb_id', 'movieId','original_title', 'title', 'cast', 'director', 'production_companies', 'overview'], axis=1)
# Min-max scale every remaining column.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(tmdb_useable)
# Build a fresh float frame instead of writing floats into a copy through
# .loc[:, :]: that assignment has to upcast integer columns such as vote_count,
# which newer pandas versions deprecate as an incompatible-dtype assignment.
tmdb_scaled = pd.DataFrame(
    scaled_values,
    columns=tmdb_useable.columns,
    index=tmdb_useable.index,
)

In [5]:
# Inspect the scaled feature table.
tmdb_scaled

Unnamed: 0,budget_adj,revenue_adj,popularity,vote_count,vote_average
0,0.238118,0.096076,0.525566,0.253152,0.685714
1,0.380988,0.059873,0.248074,0.197499,0.528571
2,0.103906,0.106012,0.230453,0.160705,0.657143
3,0.205647,0.176491,0.222720,0.152198,0.757143
4,0.086588,0.185375,0.188794,0.190120,0.542857
...,...,...,...,...,...
8364,0.000000,0.000000,0.003223,0.000102,0.842857
8365,0.000000,0.000000,0.002620,0.001025,0.600000
8366,0.000000,0.000000,0.002603,0.000102,0.714286
8367,0.000000,0.000000,0.002570,0.001230,0.557143


## TF-IDF for Overview

In [6]:
# Fit TF-IDF on the overview text of every movie (English stop words removed).
overview = tmdb['overview']
descriptions = list(overview)
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(descriptions)
# Pairwise cosine similarity between all overview vectors; rows and columns
# follow the row order of tmdb.
overview_cos_sim = cosine_similarity(tfidf)
overview_cos_sim

array([[1.        , 0.04065231, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.04065231, 1.        , 0.        , ..., 0.        , 0.04133514,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.01740296,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.04133514, 0.01740296, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [7]:
def overview_similarity(movie):
    """Return the overview cosine-similarity row for ``movie``.

    Parameters
    ----------
    movie : str
        Exact value of the ``title`` column in ``tmdb``,
        e.g. ``"Interstellar (2014)"``.

    Returns
    -------
    numpy.ndarray
        The row of ``overview_cos_sim`` belonging to the first row of ``tmdb``
        with this title; entry ``j`` is the similarity to the movie at row
        position ``j`` of ``tmdb``.

    Raises
    ------
    IndexError
        If no row of ``tmdb`` has this title. The exception type is the one the
        previous implementation raised; it now carries an explicit message.
    """
    # overview_cos_sim is addressed by row position, so look up positions rather
    # than index labels (the two only coincide for a default RangeIndex).
    positions = np.flatnonzero((tmdb['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Movie title not found in tmdb: {movie!r}")
    return overview_cos_sim[int(positions[0])]

In [8]:
# Smoke test: fetch the overview-similarity row for one known title.
overview_cosine = overview_similarity("Interstellar (2014)")
overview_cosine

array([0.        , 0.0242673 , 0.        , ..., 0.        , 0.01542867,
       0.        ])

## TF-IDF for Cast

In [9]:
# Build TF-IDF vectors from each movie's cast string.
# NOTE: despite their names, `cast_and_directors` holds only the 'cast' column
# and `Countvectorizer` is the fitted TF-IDF matrix, not a CountVectorizer.
cast_and_directors = tmdb['cast'].tolist()
vect = TfidfVectorizer(min_df=1, stop_words="english")
Countvectorizer = vect.fit_transform(cast_and_directors)
# TfidfVectorizer L2-normalises rows by default, so the matrix times its
# transpose is the pairwise cosine similarity. `@` is matrix multiplication for
# both scipy sparse matrices and sparse arrays, whereas `*` is only matrix
# multiplication for the older spmatrix API.
cast_pairwise_similarity = Countvectorizer @ Countvectorizer.T
# `.A` was removed from sparse matrices in SciPy 1.14; toarray() is the
# supported way to get the dense view for display.
cast_pairwise_similarity.toarray()

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [10]:
def cast_similarity(movie):
    """Return the cast cosine-similarity row for ``movie``.

    Parameters
    ----------
    movie : str
        Exact value of the ``title`` column in ``tmdb``.

    Returns
    -------
    numpy.ndarray or str
        A 1-D dense array holding the row of ``cast_pairwise_similarity`` for the
        first row of ``tmdb`` with this title. For an unknown title the original
        error-message string is returned (kept for backward compatibility), so
        callers must check the type before using the result numerically.
    """
    # Row positions, not index labels: the similarity matrix is positional.
    positions = np.flatnonzero((tmdb['title'] == movie).to_numpy())
    if positions.size == 0:
        return "That's not a valid movie name, could you please try again?"

    # Slice the single row first and densify only that row. The previous
    # `.A[index]` converted the whole N x N matrix to dense on every call, and
    # `.A` no longer exists on sparse matrices from SciPy 1.14 on.
    row = cast_pairwise_similarity[int(positions[0])]
    if hasattr(row, 'toarray'):
        row = row.toarray()
    return np.asarray(row).ravel()

In [11]:
# Smoke test: fetch the cast-similarity row for one known title.
cast_sim= cast_similarity("Interstellar (2014)")
cast_sim

array([0., 0., 0., ..., 0., 0., 0.])

## Recommendation Function

In [12]:
def content_recommend(movie):
    """Rank every other movie in ``tmdb`` by content similarity to ``movie``.

    The scaled numeric features in ``tmdb_scaled`` are extended with two columns
    (overview and cast similarity of every movie to ``movie``), and a brute-force
    cosine nearest-neighbour search over that table orders the whole catalogue.
    The full ranking is returned so it can be averaged with the collaborative
    filter's ranking.

    Parameters
    ----------
    movie : str
        Exact value of the ``title`` column in ``tmdb``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tmdbId`` (the ``id`` column of ``tmdb``), ``title`` and
        ``ranking`` (1 = most similar); one row per movie except the query
        itself, ordered from most to least similar.

    Raises
    ------
    IndexError
        If ``movie`` is not a title in ``tmdb``.
    """
    # Resolve the query row once, by position, and fail early with a clear
    # message instead of relying on the helpers to fail.
    positions = np.flatnonzero((tmdb['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Movie title not found in tmdb: {movie!r}")
    position = int(positions[0])

    recommend_table = tmdb_scaled.assign(
        overview_cosine=overview_similarity(movie),
        cast_cosine=cast_similarity(movie),
    )

    # Ask for every row so the result is a complete ranking of the catalogue.
    model = NearestNeighbors(n_neighbors=len(recommend_table), algorithm='brute', metric='cosine')
    model.fit(recommend_table)

    # Query with a one-row DataFrame so the feature names match the fitted table.
    neighbours = model.kneighbors(recommend_table.iloc[[position]], return_distance=False)[0]

    # Remove the query movie explicitly rather than assuming it is ranked first
    # (a tie at distance 0 could otherwise drop a different movie).
    neighbours = neighbours[neighbours != position]

    df = (
        tmdb.iloc[neighbours][['id', 'title']]
        .rename(columns={'id': 'tmdbId'})
        .reset_index(drop=True)
    )
    df['ranking'] = np.arange(1, len(df) + 1)
    return df

In [13]:
# Check: build the full content-based ranking for one known title.
content = content_recommend("Toy Story (1995)")

In [14]:
# Inspect the content-based ranking computed above.
content

Unnamed: 0,tmdbId,title,ranking
0,863,Toy Story 2 (1999),1
1,213121,Toy Story of Terror (2013),2
2,130925,Toy Story Toons: Partysaurus Rex (2012),3
3,256835,Toy Story That Time Forgot (2014),4
4,60164,"Speed Of Thought, The (2011)",5
...,...,...,...
8363,1452,Superman Returns (2006),8364
8364,46528,"Warrior's Way, The (2010)",8365
8365,116977,Foodfight! (2012),8366
8366,22293,Manos: The Hands of Fate (1966),8367


# Collaborative-Based Filtering

In [15]:
# Load the movie table (movieId, title, genres) used by the collaborative filter.
movies = pd.read_csv('movies.csv') 

In [16]:
# Load the user ratings (userId, movieId, rating) from the pre-filtered file.
ratings = pd.read_csv('thres500_ratings.csv')

In [24]:
# Count rating rows per user, then list users from fewest to most ratings.
cnt = ratings.groupby(['userId']).count()
cnt.sort_values(by= ['rating'])

Unnamed: 0_level_0,Unnamed: 0,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
59931,227,227,227
77478,259,259,259
24443,275,275,275
93657,294,294,294
8655,300,300,300
...,...,...,...
125794,2461,2461,2461
34576,2499,2499,2499
74142,2588,2588,2588
118205,2768,2768,2768


In [17]:
# Load the id-link table; later cells merge it on movieId and use its imdbId
# and tmdbId columns.
ids = pd.read_csv('link.csv')

In [18]:
# Index the movie table by movieId so titles can be looked up by id.
# Reassigning behind a column check (instead of inplace=True) keeps the cell
# re-runnable: a second in-place run raises KeyError because 'movieId' is no
# longer a column.
if 'movieId' in movies.columns:
    movies = movies.set_index('movieId')

In [19]:
# Inspect the movie table, now indexed by movieId.
movies

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
131254,Kein Bund für's Leben (2007),Comedy
131256,"Feuer, Eis & Dosenbier (2002)",Comedy
131258,The Pirates (2014),Adventure
131260,Rentun Ruusu (2001),(no genres listed)


In [20]:
# Join titles/genres with the id-link table on movieId.
merged_data = movies.merge(ids, on = 'movieId')

In [158]:
# Build the movieId-indexed lookup table (title, genres, tmdbId) directly from
# its inputs. The previous form transformed `merged_data` in place, so a second
# run failed because 'movieId' and 'imdbId' were already gone.
merged_data = (
    movies.merge(ids, on='movieId')
    .set_index('movieId')
    .drop(columns=['imdbId'])
)
merged_data

Unnamed: 0_level_0,title,genres,tmdbId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,862.0
2,Jumanji (1995),Adventure|Children|Fantasy,8844.0
3,Grumpier Old Men (1995),Comedy|Romance,15602.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,31357.0
5,Father of the Bride Part II (1995),Comedy,11862.0
...,...,...,...
131254,Kein Bund für's Leben (2007),Comedy,4436.0
131256,"Feuer, Eis & Dosenbier (2002)",Comedy,9274.0
131258,The Pirates (2014),Adventure,285213.0
131260,Rentun Ruusu (2001),(no genres listed),32099.0


In [159]:
# Inspect the raw ratings table (still contains the leftover CSV index column).
ratings

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating
0,2537,24,1,4.0
1,2538,24,5,2.0
2,2539,24,6,4.0
3,2540,24,7,3.0
4,2541,24,10,3.0
...,...,...,...,...
5203245,19998292,138474,5349,3.0
5203246,19998293,138474,5378,4.0
5203247,19998294,138474,5401,1.0
5203248,19998295,138474,5449,4.0


In [160]:
# Drop the leftover CSV index column by name. Dropping `ratings.columns[0]` in
# place removed a different column on every re-run (userId on the second run),
# silently corrupting the table.
ratings = ratings.drop(columns=['Unnamed: 0'], errors='ignore')

In [161]:
# Inspect the ratings table after removing the index column.
ratings

Unnamed: 0,userId,movieId,rating
0,24,1,4.0
1,24,5,2.0
2,24,6,4.0
3,24,7,3.0
4,24,10,3.0
...,...,...,...
5203245,138474,5349,3.0
5203246,138474,5378,4.0
5203247,138474,5401,1.0
5203248,138474,5449,4.0


In [162]:
# Pivot ratings into a user x movie matrix (rows: userId, columns: movieId).
# Unrated cells are NaN; pivot_table averages duplicate (user, movie) pairs by default.
mtx = ratings.pivot_table(index='userId', columns='movieId', values='rating')
mtx

movieId,1,2,3,4,5,6,7,9,10,11,...,106916,106920,108932,109374,109487,110102,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24,4.0,,,,2.0,4.0,3.0,,3.0,,...,,,,,,,,,,
54,4.0,3.0,,,3.0,3.0,,,4.0,5.0,...,,,,,,,,,,
58,5.0,,,,,4.5,,,,4.5,...,,,,,,,,,,
91,4.0,3.5,3.0,,,,2.5,,4.0,4.0,...,,,,,,,,,,
104,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138397,,,5.0,,,5.0,,,4.0,4.0,...,,,,,,,,,,
138406,4.0,3.0,,,,,,,3.0,,...,,,,,,,,,,
138411,5.0,,,,,5.0,,,3.5,2.5,...,,,,,,,,,,
138437,4.0,,,,,4.0,,,,,...,,,,,,,,,,


In [163]:
# Per-movie mean rating: axis=0 averages over users, ignoring missing ratings.
mean = np.nanmean(mtx, axis = 0)
# Centre every rating on its movie's mean; NaN cells stay NaN.
mtx1 = mtx - mean

In [164]:
# Fill unrated cells with 0, i.e. treat them as equal to the movie's mean
# after centring.
mtx_table = mtx1.fillna(0)

In [165]:
# Inspect the mean-centred, zero-filled rating matrix.
mtx_table

movieId,1,2,3,4,5,6,7,9,10,11,...,106916,106920,108932,109374,109487,110102,111362,111759,112556,112852
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
24,0.143248,0.000000,0.000000,0.0,-0.6811,0.210514,-0.036751,0.000000,-0.287054,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
54,0.143248,0.051482,0.000000,0.0,0.3189,-0.789486,0.000000,0.000000,0.712946,1.570728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
58,1.143248,0.000000,0.000000,0.0,0.0000,0.710514,0.000000,0.000000,0.000000,1.070728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,0.143248,0.551482,0.133621,0.0,0.0000,0.000000,-0.536751,0.000000,0.712946,0.570728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,0.000000,0.000000,0.000000,0.0,0.0000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138397,0.000000,0.000000,2.133621,0.0,0.0000,1.210514,0.000000,0.000000,0.712946,0.570728,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138406,0.143248,0.051482,0.000000,0.0,0.0000,0.000000,0.000000,0.000000,-0.287054,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138411,1.143248,0.000000,0.000000,0.0,0.0000,1.210514,0.000000,0.000000,0.212946,-0.929272,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138437,0.143248,0.000000,0.000000,0.0,0.0000,0.210514,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [166]:
# Map every rated movieId to its title with one vectorised label lookup instead
# of a Python-level `.loc` row lookup per id. Requires `movies` to be indexed by
# movieId; an id missing from `movies` still raises KeyError.
id_movie = movies.loc[ratings['movieId'].unique(), 'title'].to_dict()

In [167]:
# Reverse lookup: title -> movieId. If two ids share a title, the id that
# appears later in id_movie wins.
movie_id = {movie:id_ for id_, movie in id_movie.items()}

In [168]:
def collab_rec(movie):
    """Rank all other rated movies by item-item similarity to ``movie``.

    Each movie is represented by its column of ``mtx_table`` (one entry per
    user); movies are ordered by cosine similarity between their column and the
    query movie's column.

    Parameters
    ----------
    movie : str
        Title present as a key of ``movie_id``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``merged_data`` for every other movie in ``mtx_table``,
        ordered from most to least similar, with an added ``ranking`` column
        (1 = most similar).

    Raises
    ------
    KeyError
        If ``movie`` is not a key of ``movie_id``, or if a ranked movieId is
        missing from ``merged_data``.
    """
    try:
        movie_indx = movie_id[movie]
    except KeyError:
        raise KeyError(f"No ratings found for movie title: {movie!r}") from None

    # Drop the query column once and reuse the result; each drop copies the table.
    others = mtx_table.drop(columns=[movie_indx])
    target = mtx_table[movie_indx].to_numpy().reshape(1, -1)

    # One similarity per remaining movie, shape (n_movies - 1, 1).
    cos_sim = cosine_similarity(others.T, target)

    ranked = pd.Series(cos_sim.ravel(), index=others.columns, name='cos_sim').sort_values(ascending=False)
    recommendations = ranked.index

    # Copy before adding a column so the assignment never targets a view of
    # merged_data.
    df = merged_data.loc[recommendations].copy()
    df['ranking'] = np.arange(1, len(df) + 1)
    return df

In [176]:
# Check: build the full collaborative ranking for one known title and show it.
collab = collab_rec("Toy Story (1995)")
collab

Unnamed: 0_level_0,title,genres,tmdbId,ranking
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,863.0,1
2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,9487.0,2
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,812.0,3
6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,12.0,4
4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,585.0,5
...,...,...,...,...
2361,Pink Flamingos (1972),Comedy,692.0,3109
47044,Miami Vice (2006),Action|Crime|Drama|Thriller,82.0,3110
60037,"Happening, The (2008)",Drama|Sci-Fi|Thriller,8645.0,3111
2862,Caligula (1979),Drama,9453.0,3112


# Hybrid Function

In [178]:
# Join the two ranked lists on tmdbId (merge defaults to an inner join, so only
# movies present in both remain). ranking_x is the collaborative rank and
# ranking_y the content-based rank.
merged = collab.merge(content,left_on='tmdbId', right_on='tmdbId')
merged

Unnamed: 0,title_x,genres,tmdbId,ranking_x,title_y,ranking_y
0,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,863,1,Toy Story 2 (1999),1
1,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,9487,2,"Bug's Life, A (1998)",72
2,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,812,3,Aladdin (1992),319
3,Finding Nemo (2003),Adventure|Animation|Children|Comedy,12,4,Finding Nemo (2003),523
4,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,585,5,"Monsters, Inc. (2001)",702
...,...,...,...,...,...,...
2711,Pink Flamingos (1972),Comedy,692,3109,Pink Flamingos (1972),5944
2712,Miami Vice (2006),Action|Crime|Drama|Thriller,82,3110,Miami Vice (2006),404
2713,"Happening, The (2008)",Drama|Sci-Fi|Thriller,8645,3111,"Happening, The (2008)",1717
2714,Caligula (1979),Drama,9453,3112,Caligula (1979),2373


In [170]:
def rec_movies(movie, k = 5):
    """Return the top ``k`` hybrid recommendations for ``movie``.

    The collaborative and content-based rankings are inner-joined on ``tmdbId``
    and each movie is scored by the mean of its two ranks (lower is better).

    Parameters
    ----------
    movie : str
        Title accepted by both ``collab_rec`` and ``content_recommend``.
    k : int, default 5
        Number of titles to return. Previously this argument was ignored and
        five titles were always returned.

    Returns
    -------
    pandas.Series
        Up to ``k`` titles (the ``title_x`` column of the merged table), best
        first.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    """
    if k < 0:
        raise ValueError("k must be non-negative")

    collab = collab_rec(movie)
    content = content_recommend(movie)

    # Both frames carry 'title' and 'ranking', so the merge suffixes them with
    # _x (collaborative) and _y (content-based).
    merged = collab.merge(content, on='tmdbId')

    merged['avg_rank'] = (merged['ranking_x'] + merged['ranking_y'])/2
    merged = merged.sort_values(by = ['avg_rank'], ascending = True)

    return merged['title_x'].head(k)

In [179]:
# Example: hybrid recommendations for one title.
rec_movies('Monsters, Inc. (2001)')

2                   Toy Story 2 (1999)
0                  Finding Nemo (2003)
3              Incredibles, The (2004)
40                           Up (2009)
46    Emperor's New Groove, The (2000)
Name: title_x, dtype: object