## Import Libraries

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import ast
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

## Load the data

In [2]:
credits = pd.read_csv("credits.csv")

In [3]:
movies = pd.read_csv("movies_metadata.csv",low_memory = False)

In [4]:
ratings = pd.read_csv("ratings.csv")

In [5]:
ratings_small = pd.read_csv("ratings_small.csv")

In [6]:
links = pd.read_csv("links.csv")

In [7]:
links_small = pd.read_csv("links_small.csv")

In [8]:
keywords = pd.read_csv("keywords.csv")

## Dataset Info

### Credit Dataset

In [9]:
credits.shape

(45476, 3)

In [10]:
credits.columns

Index(['cast', 'crew', 'id'], dtype='object')



*   cast: Information about casting. Name of actor, gender and it's character name in movie
*   crew: Information about crew members. Like who directed the movie, editor of the movie and so on.
*   id: It's movie ID given by TMDb








In [11]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [12]:
credits.head(5)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [13]:
movies.shape

(45466, 24)

In [14]:
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [15]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [16]:
movies.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [17]:
ratings.shape

(12473155, 4)

In [18]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [19]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12473155 entries, 0 to 12473154
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 380.7 MB


In [20]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [21]:
ratings_small.shape

(100004, 4)

In [22]:
ratings_small.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [23]:
ratings_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [24]:
ratings_small.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [25]:
links.shape

(45843, 3)

In [26]:
links.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [27]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45843 entries, 0 to 45842
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  45843 non-null  int64  
 1   imdbId   45843 non-null  int64  
 2   tmdbId   45624 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.0 MB


In [28]:
links.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [29]:
links_small.shape

(9125, 3)

In [30]:
links_small.shape

(9125, 3)

In [31]:
links_small.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [32]:
links_small.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9125 entries, 0 to 9124
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9125 non-null   int64  
 1   imdbId   9125 non-null   int64  
 2   tmdbId   9112 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 214.0 KB


In [33]:
links_small.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [34]:
keywords.shape

(46419, 2)

In [35]:
keywords.columns

Index(['id', 'keywords'], dtype='object')

In [36]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [37]:
keywords.head(5)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [38]:
movies['id'].head(10)

0      862
1     8844
2    15602
3    31357
4    11862
5      949
6    11860
7    45325
8     9091
9      710
Name: id, dtype: object

In [39]:
movies.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [40]:
movies= movies.drop_duplicates()

In [41]:
movies.shape

(45449, 24)

In [42]:
m1= movies[movies['id'].str.contains('^\d{4}-\d{2}-\d{2}$') == True]['id'].tolist()

In [43]:
movies =movies[~movies['id'].isin(m1)]

In [44]:
movies.shape

(45446, 24)

In [45]:
movies= movies.astype({'id': 'int64'})

In [46]:
m_data = movies.merge(credits,on='id')

In [47]:
m_data.shape

(45502, 26)

In [48]:
m_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew'],
      dtype='object')

In [49]:
m_data.columns[m_data.nunique()==1]


Index([], dtype='object')

In [50]:
m_data['original_language'].value_counts()

en    32296
fr     2441
it     1529
ja     1356
de     1081
      ...  
ay        1
rw        1
cy        1
tg        1
si        1
Name: original_language, Length: 89, dtype: int64

In [51]:
m_data = m_data.merge(keywords,on='id')

In [52]:
m_data.shape

(46548, 27)

In [53]:
mdata_ohe = m_data[['id','title','overview','genres','keywords','cast','crew']]

In [54]:
mdata_ohe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46548 entries, 0 to 46547
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46548 non-null  int64 
 1   title     46544 non-null  object
 2   overview  45553 non-null  object
 3   genres    46548 non-null  object
 4   keywords  46548 non-null  object
 5   cast      46548 non-null  object
 6   crew      46548 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.8+ MB


In [55]:
mdata_ohe.isnull().sum()

id            0
title         4
overview    995
genres        0
keywords      0
cast          0
crew          0
dtype: int64

In [56]:
mdata_ohe.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mdata_ohe.dropna(inplace=True)


In [57]:
mdata_ohe.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [58]:
mdata_ohe.shape

(45549, 7)

In [59]:
mdata_ohe.duplicated().sum()

1067

In [60]:
mdata_ohe= mdata_ohe.drop_duplicates()

In [61]:
mdata_ohe

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
...,...,...,...,...,...,...,...
46543,439050,Subdue,Rising and falling between a man and woman.,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...","[{'id': 10703, 'name': 'tragic love'}]","[{'cast_id': 0, 'character': '', 'credit_id': ...","[{'credit_id': '5894a97d925141426c00818c', 'de..."
46544,111109,Century of Birthing,An artist struggles to finish his work while a...,"[{'id': 18, 'name': 'Drama'}]","[{'id': 2679, 'name': 'artist'}, {'id': 14531,...","[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de..."
46545,67758,Betrayal,"When one of her hits goes wrong, a professiona...","[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",[],"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de..."
46546,227506,Satan Triumphant,"In a small town live two brothers, one a minis...",[],[],"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de..."


In [62]:
mdata_ohe.iloc[0].genres

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [63]:
import ast


In [64]:
def convert(obj):
    l=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=3:
            l.append(i['name'])
            counter+=1
        else:
            break
    return l

In [65]:
mdata_ohe['genres']=mdata_ohe['genres'].apply(convert)

In [66]:
mdata_ohe['genres'].head(10)

0      [Animation, Comedy, Family]
1     [Adventure, Fantasy, Family]
2                [Romance, Comedy]
3         [Comedy, Drama, Romance]
4                         [Comedy]
5           [Action, Crime, Drama]
6                [Comedy, Romance]
7       [Action, Adventure, Drama]
8    [Action, Adventure, Thriller]
9    [Adventure, Action, Thriller]
Name: genres, dtype: object

In [67]:
mdata_ohe['keywords']=mdata_ohe['keywords'].apply(convert)

In [68]:
mdata_ohe['keywords'].head(10)

0                                 [jealousy, toy, boy]
1    [board game, disappearance, based on children'...
2         [fishing, best friend, duringcreditsstinger]
3    [based on novel, interracial relationship, sin...
4                   [baby, midlife crisis, confidence]
5                           [robbery, detective, bank]
6     [paris, brother brother relationship, chauffeur]
7                                                   []
8                      [terrorist, hostage, explosive]
9             [cuba, falsely accused, secret identity]
Name: keywords, dtype: object

In [69]:
mdata_ohe['cast']=mdata_ohe['cast'].apply(convert)

In [70]:
def fetchd(obj):
    l=[]

    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            l.append(i['name'])

        else:
            break
    return l

In [71]:
mdata_ohe['crew']=mdata_ohe['crew'].apply(fetchd)

In [72]:
mdata_ohe.head(5)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]","[jealousy, toy, boy]","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter]
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[]
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]","[fishing, best friend, duringcreditsstinger]","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker]
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],"[baby, midlife crisis, confidence]","[Steve Martin, Diane Keaton, Martin Short]",[]


In [73]:
mdata_ohe['overview']=mdata_ohe['overview'].apply(lambda x:x.split())

In [74]:
mdata_ohe

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,...","[Animation, Comedy, Family]","[jealousy, toy, boy]","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter]
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a...","[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[]
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ...","[Romance, Comedy]","[fishing, best friend, duringcreditsstinger]","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker]
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr...",[Comedy],"[baby, midlife crisis, confidence]","[Steve Martin, Diane Keaton, Martin Short]",[]
...,...,...,...,...,...,...,...
46543,439050,Subdue,"[Rising, and, falling, between, a, man, and, w...","[Drama, Family]",[tragic love],"[Leila Hatami, Kourosh Tahami, Elham Korda]",[Hamid Nematollah]
46544,111109,Century of Birthing,"[An, artist, struggles, to, finish, his, work,...",[Drama],"[artist, play, pinoy]","[Angel Aquino, Perry Dizon, Hazel Orencio]",[Lav Diaz]
46545,67758,Betrayal,"[When, one, of, her, hits, goes, wrong,, a, pr...","[Action, Drama, Thriller]",[],"[Erika Eleniak, Adam Baldwin, Julie du Page]",[Mark L. Lester]
46546,227506,Satan Triumphant,"[In, a, small, town, live, two, brothers,, one...",[],[],"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...",[Yakov Protazanov]


In [75]:
mdata_ohe['genres']=mdata_ohe['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [76]:
mdata_ohe

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,...","[Animation, Comedy, Family]","[jealousy, toy, boy]","[Tom Hanks, Tim Allen, Don Rickles]",[John Lasseter]
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a...","[Adventure, Fantasy, Family]","[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",[]
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ...","[Romance, Comedy]","[fishing, best friend, duringcreditsstinger]","[Walter Matthau, Jack Lemmon, Ann-Margret]",[Howard Deutch]
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ...","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]",[Forest Whitaker]
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr...",[Comedy],"[baby, midlife crisis, confidence]","[Steve Martin, Diane Keaton, Martin Short]",[]
...,...,...,...,...,...,...,...
46543,439050,Subdue,"[Rising, and, falling, between, a, man, and, w...","[Drama, Family]",[tragic love],"[Leila Hatami, Kourosh Tahami, Elham Korda]",[Hamid Nematollah]
46544,111109,Century of Birthing,"[An, artist, struggles, to, finish, his, work,...",[Drama],"[artist, play, pinoy]","[Angel Aquino, Perry Dizon, Hazel Orencio]",[Lav Diaz]
46545,67758,Betrayal,"[When, one, of, her, hits, goes, wrong,, a, pr...","[Action, Drama, Thriller]",[],"[Erika Eleniak, Adam Baldwin, Julie du Page]",[Mark L. Lester]
46546,227506,Satan Triumphant,"[In, a, small, town, live, two, brothers,, one...",[],[],"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...",[Yakov Protazanov]


In [77]:
mdata_ohe['keywords']=mdata_ohe['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
mdata_ohe['crew']=mdata_ohe['crew'].apply(lambda x:[i.replace(" ","") for i in x])
mdata_ohe['cast']=mdata_ohe['cast'].apply(lambda x:[i.replace(" ","") for i in x])

In [78]:
mdata_ohe

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,...","[Animation, Comedy, Family]","[jealousy, toy, boy]","[TomHanks, TimAllen, DonRickles]",[JohnLasseter]
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a...","[Adventure, Fantasy, Family]","[boardgame, disappearance, basedonchildren'sbook]","[RobinWilliams, JonathanHyde, KirstenDunst]",[]
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ...","[Romance, Comedy]","[fishing, bestfriend, duringcreditsstinger]","[WalterMatthau, JackLemmon, Ann-Margret]",[HowardDeutch]
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ...","[Comedy, Drama, Romance]","[basedonnovel, interracialrelationship, single...","[WhitneyHouston, AngelaBassett, LorettaDevine]",[ForestWhitaker]
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr...",[Comedy],"[baby, midlifecrisis, confidence]","[SteveMartin, DianeKeaton, MartinShort]",[]
...,...,...,...,...,...,...,...
46543,439050,Subdue,"[Rising, and, falling, between, a, man, and, w...","[Drama, Family]",[tragiclove],"[LeilaHatami, KouroshTahami, ElhamKorda]",[HamidNematollah]
46544,111109,Century of Birthing,"[An, artist, struggles, to, finish, his, work,...",[Drama],"[artist, play, pinoy]","[AngelAquino, PerryDizon, HazelOrencio]",[LavDiaz]
46545,67758,Betrayal,"[When, one, of, her, hits, goes, wrong,, a, pr...","[Action, Drama, Thriller]",[],"[ErikaEleniak, AdamBaldwin, JulieduPage]",[MarkL.Lester]
46546,227506,Satan Triumphant,"[In, a, small, town, live, two, brothers,, one...",[],[],"[IwanMosschuchin, NathalieLissenko, PavelPavlov]",[YakovProtazanov]


In [79]:
mdata_ohe['tags'] = mdata_ohe['genres'] + mdata_ohe['keywords'] + mdata_ohe['crew'] + mdata_ohe['cast'] + mdata_ohe['overview']

In [80]:
mdata_ohe

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,tags
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,...","[Animation, Comedy, Family]","[jealousy, toy, boy]","[TomHanks, TimAllen, DonRickles]",[JohnLasseter],"[Animation, Comedy, Family, jealousy, toy, boy..."
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a...","[Adventure, Fantasy, Family]","[boardgame, disappearance, basedonchildren'sbook]","[RobinWilliams, JonathanHyde, KirstenDunst]",[],"[Adventure, Fantasy, Family, boardgame, disapp..."
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ...","[Romance, Comedy]","[fishing, bestfriend, duringcreditsstinger]","[WalterMatthau, JackLemmon, Ann-Margret]",[HowardDeutch],"[Romance, Comedy, fishing, bestfriend, duringc..."
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ...","[Comedy, Drama, Romance]","[basedonnovel, interracialrelationship, single...","[WhitneyHouston, AngelaBassett, LorettaDevine]",[ForestWhitaker],"[Comedy, Drama, Romance, basedonnovel, interra..."
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr...",[Comedy],"[baby, midlifecrisis, confidence]","[SteveMartin, DianeKeaton, MartinShort]",[],"[Comedy, baby, midlifecrisis, confidence, Stev..."
...,...,...,...,...,...,...,...,...
46543,439050,Subdue,"[Rising, and, falling, between, a, man, and, w...","[Drama, Family]",[tragiclove],"[LeilaHatami, KouroshTahami, ElhamKorda]",[HamidNematollah],"[Drama, Family, tragiclove, HamidNematollah, L..."
46544,111109,Century of Birthing,"[An, artist, struggles, to, finish, his, work,...",[Drama],"[artist, play, pinoy]","[AngelAquino, PerryDizon, HazelOrencio]",[LavDiaz],"[Drama, artist, play, pinoy, LavDiaz, AngelAqu..."
46545,67758,Betrayal,"[When, one, of, her, hits, goes, wrong,, a, pr...","[Action, Drama, Thriller]",[],"[ErikaEleniak, AdamBaldwin, JulieduPage]",[MarkL.Lester],"[Action, Drama, Thriller, MarkL.Lester, ErikaE..."
46546,227506,Satan Triumphant,"[In, a, small, town, live, two, brothers,, one...",[],[],"[IwanMosschuchin, NathalieLissenko, PavelPavlov]",[YakovProtazanov],"[YakovProtazanov, IwanMosschuchin, NathalieLis..."


In [81]:
new_data=mdata_ohe[['id','title','tags']]

In [82]:
new_data.columns

Index(['id', 'title', 'tags'], dtype='object')

In [83]:
new_data['tags']=new_data['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['tags']=new_data['tags'].apply(lambda x:" ".join(x))


In [84]:
new_data

Unnamed: 0,id,title,tags
0,862,Toy Story,Animation Comedy Family jealousy toy boy JohnL...
1,8844,Jumanji,Adventure Fantasy Family boardgame disappearan...
2,15602,Grumpier Old Men,Romance Comedy fishing bestfriend duringcredit...
3,31357,Waiting to Exhale,Comedy Drama Romance basedonnovel interracialr...
4,11862,Father of the Bride Part II,Comedy baby midlifecrisis confidence SteveMart...
...,...,...,...
46543,439050,Subdue,Drama Family tragiclove HamidNematollah LeilaH...
46544,111109,Century of Birthing,Drama artist play pinoy LavDiaz AngelAquino Pe...
46545,67758,Betrayal,Action Drama Thriller MarkL.Lester ErikaElenia...
46546,227506,Satan Triumphant,YakovProtazanov IwanMosschuchin NathalieLissen...


In [85]:
new_data['tags']=new_data['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['tags']=new_data['tags'].apply(lambda x:x.lower())


In [86]:
new_data.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,animation comedy family jealousy toy boy johnl...
1,8844,Jumanji,adventure fantasy family boardgame disappearan...
2,15602,Grumpier Old Men,romance comedy fishing bestfriend duringcredit...
3,31357,Waiting to Exhale,comedy drama romance basedonnovel interracialr...
4,11862,Father of the Bride Part II,comedy baby midlifecrisis confidence stevemart...


# Popularity Based Recommendation System

In [87]:
ratings_small = ratings_small.rename(columns = {'movieId':'id'})

In [88]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [89]:
p_data = movies.merge(ratings_small, on='id')

In [90]:
p_data= p_data.merge(credits,on='id')

In [91]:
p_data.shape

(45006, 29)

In [92]:
p_data = p_data[['id','title','genres','overview','release_date','rating','cast','crew']]

In [93]:
p_data.shape

(45006, 8)

In [94]:
num_rating=p_data.groupby('title').count()['rating'].reset_index()

In [95]:
num_rating.columns


Index(['title', 'rating'], dtype='object')

In [96]:
num_rating = num_rating.rename(columns = {'rating':'num_rating'})
num_rating

Unnamed: 0,title,num_rating
0,!Women Art Revolution,2
1,'Gator Bait,1
2,'Twas the Night Before Christmas,2
3,...And God Created Woman,1
4,00 Schneider - Jagd auf Nihil Baxter,2
...,...,...
2789,xXx,28
2790,¡Three Amigos!,1
2791,À nos amours,14
2792,Ödipussi,1


In [97]:
avg_rating=p_data.groupby('title').mean()['rating'].reset_index()

  avg_rating=p_data.groupby('title').mean()['rating'].reset_index()


In [98]:
avg_rating = avg_rating.rename(columns = {'rating':'avg_rating'})
avg_rating

Unnamed: 0,title,avg_rating
0,!Women Art Revolution,3.250000
1,'Gator Bait,0.500000
2,'Twas the Night Before Christmas,4.000000
3,...And God Created Woman,4.000000
4,00 Schneider - Jagd auf Nihil Baxter,4.000000
...,...,...
2789,xXx,3.517857
2790,¡Three Amigos!,4.000000
2791,À nos amours,3.285714
2792,Ödipussi,4.500000


In [99]:
popular_data = num_rating.merge(avg_rating, on='title')

In [100]:
popular_data

Unnamed: 0,title,num_rating,avg_rating
0,!Women Art Revolution,2,3.250000
1,'Gator Bait,1,0.500000
2,'Twas the Night Before Christmas,2,4.000000
3,...And God Created Woman,1,4.000000
4,00 Schneider - Jagd auf Nihil Baxter,2,4.000000
...,...,...,...
2789,xXx,28,3.517857
2790,¡Three Amigos!,1,4.000000
2791,À nos amours,14,3.285714
2792,Ödipussi,1,4.500000


In [101]:
popular_data = popular_data[popular_data['num_rating']>=100].sort_values('avg_rating',ascending=True).head(50)

In [102]:
popular_data

Unnamed: 0,title,num_rating,avg_rating
1252,Lost in Translation,129,2.782946
253,"Bang, Boom, Bang",175,2.871429
2723,Who Killed Bambi?,113,3.044248
2594,Tough Enough,106,3.084906
60,A Clockwork Orange,102,3.102941
1932,Syriana,158,3.110759
2686,Wag the Dog,129,3.112403
2565,Titanic,209,3.227273
570,Dave Chappelle's Block Party,110,3.309091
320,Big Fish,126,3.325397


In [103]:
p_data1 = mdata_ohe[['id','title','genres','overview','cast','crew']]

In [104]:
pop_data = popular_data.merge(p_data1,on='title')

In [105]:
pop_data= pop_data.head(50)

Top 50 movies

In [106]:
pop_data

Unnamed: 0,title,num_rating,avg_rating,id,genres,overview,cast,crew
0,Lost in Translation,129,2.782946,153,[Drama],"[Two, lost, souls, visiting, Tokyo, --, the, y...","[BillMurray, ScarlettJohansson, AnnaFaris]",[]
1,"Bang, Boom, Bang",175,2.871429,344,"[Crime, Action, Comedy]","[Bank, robber, Kelle, Grabowski, escapes, from...","[OliverKorittke, MarkusKnüfken, RalfRichter]",[PeterThorwarth]
2,Who Killed Bambi?,113,3.044248,1917,[Thriller],"[Isabelle,, a, beautiful, nursing, student,, i...","[SophieQuinton, LaurentLucas, CatherineJacob]",[]
3,Tough Enough,106,3.084906,434,"[Drama, Thriller]","[From, the, youth, directed, novel, of, the, s...","[DavidKross, JennyElvers, ErhanEmre]",[]
4,Tough Enough,106,3.084906,38556,"[Action, Drama, Romance]","[An, aspiring, country/western, singer,, whose...","[DennisQuaid, CarleneWatkins, StanShaw]",[RichardFleischer]
5,A Clockwork Orange,102,3.102941,185,"[ScienceFiction, Drama]","[Demonic, gang-leader, Alex, goes, on, the, sp...","[MalcolmMcDowell, PatrickMagee, AdrienneCorri]",[StanleyKubrick]
6,Syriana,158,3.110759,231,"[Drama, Thriller]","[The, Middle, Eastern, oil, industry, is, the,...","[GeorgeClooney, MattDamon, JeffreyWright]",[]
7,Wag the Dog,129,3.112403,586,"[Comedy, Drama]","[During, the, final, weeks, of, a, presidentia...","[DustinHoffman, RobertDeNiro, AnneHeche]",[BarryLevinson]
8,Titanic,209,3.227273,597,"[Drama, Romance, Thriller]","[84, years, later,, a, 101-year-old, woman, na...","[KateWinslet, LeonardoDiCaprio, FrancesFisher]",[]
9,Titanic,209,3.227273,16535,"[Drama, Action, Romance]","[Unhappily, married,, Julia, Sturges, decides,...","[CliftonWebb, BarbaraStanwyck, RobertWagner]",[]


# Content Based Recommendation System

In [107]:
new_data = new_data.iloc[:20000,:]

In [108]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [109]:
vector = cv.fit_transform(new_data['tags']).toarray()

In [110]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [111]:
vector.shape

(20000, 5000)

In [112]:
from sklearn.metrics.pairwise import cosine_similarity

In [113]:
similarity = cosine_similarity(vector)

In [114]:
similarity

array([[1.        , 0.04445542, 0.05057217, ..., 0.        , 0.03143473,
        0.        ],
       [0.04445542, 1.        , 0.07756315, ..., 0.        , 0.03214122,
        0.        ],
       [0.05057217, 0.07756315, 1.        , ..., 0.        , 0.03656362,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.38138504,
        0.        ],
       [0.03143473, 0.03214122, 0.03656362, ..., 0.38138504, 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [115]:
def recommend(movie):
    index = new_data[new_data['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:11]:
        print(new_data.iloc[i[0]].title)


In [116]:
new_data.head(5)

Unnamed: 0,id,title,tags
0,862,Toy Story,animation comedy family jealousy toy boy johnl...
1,8844,Jumanji,adventure fantasy family boardgame disappearan...
2,15602,Grumpier Old Men,romance comedy fishing bestfriend duringcredit...
3,31357,Waiting to Exhale,comedy drama romance basedonnovel interracialr...
4,11862,Father of the Bride Part II,comedy baby midlifecrisis confidence stevemart...


In [117]:
recommend('Waiting to Exhale')

Worth Winning
Snow days
Jasminum
If You Love
Chilly Scenes of Winter
A Good Marriage
Next Stop Wonderland
Little Black Book
About Last Night...
French Cancan


In [118]:
recommend('Toy Story')

Toy Story 2
Toy Story 3
The 40 Year Old Virgin
The Champ
Heartbeeps
Mr. Bug Goes to Town
Over the Hedge
Happiness Is a Warm Blanket, Charlie Brown
The Gang's All Here
Burke & Hare


# Collaborative Filtering Based Recommendation System

In [119]:
ratings_small.columns

Index(['userId', 'id', 'rating', 'timestamp'], dtype='object')

In [120]:
pd_data = movies.merge(ratings_small, on='id')

In [121]:
pd_data.shape

(44994, 27)

In [122]:
pd_data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'userId', 'rating', 'timestamp'],
      dtype='object')

In [123]:
pd_data = pd_data[['id','title','userId','genres','overview','release_date','rating']]

In [124]:
pd_data.head(5)

Unnamed: 0,id,title,userId,genres,overview,release_date,rating
0,949,Heat,23,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,3.5
1,949,Heat,102,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,4.0
2,949,Heat,232,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,2.0
3,949,Heat,242,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,5.0
4,949,Heat,263,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,3.0


In [125]:
x = pd_data.groupby('userId').count()['rating']>100

In [126]:
users = x[x].index
users

Int64Index([  4,  15,  17,  19,  23,  30,  48,  56,  57,  73,
            ...
            607, 608, 615, 624, 641, 648, 654, 659, 664, 665],
           dtype='int64', name='userId', length=121)

In [127]:
filter_data=pd_data[pd_data['userId'].isin(users)]

In [128]:
filter_data

Unnamed: 0,id,title,userId,genres,overview,release_date,rating
0,949,Heat,23,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,3.5
1,949,Heat,102,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,4.0
2,949,Heat,232,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,2.0
3,949,Heat,242,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,5.0
5,949,Heat,311,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...","Obsessive master thief, Neil McCauley leads a ...",1995-12-15,3.0
...,...,...,...,...,...,...,...
44986,3104,Frankenstein Created Woman,585,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",A deformed tormented girl drowns herself after...,1967-03-15,4.0
44987,3104,Frankenstein Created Woman,624,"[{'id': 27, 'name': 'Horror'}, {'id': 878, 'na...",A deformed tormented girl drowns herself after...,1967-03-15,4.0
44989,64197,Travelling with Pets,73,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",Plucked from an orphanage as a literal love sl...,2007-06-25,4.0
44991,64197,Travelling with Pets,648,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",Plucked from an orphanage as a literal love sl...,2007-06-25,3.5


In [129]:
y=filter_data.groupby('title').count()['rating']>=25
famous_movies=y[y].index

In [130]:
famous_movies

Index(['20,000 Leagues Under the Sea', '2001: A Space Odyssey',
       '28 Weeks Later', '300', '48 Hrs.', '5 Card Stud', '88 Minutes',
       'A Bridge Too Far', 'A Brief History of Time', 'A Clockwork Orange',
       ...
       'When Saturday Comes', 'While You Were Sleeping', 'Who Killed Bambi?',
       'Will Penny', 'Windows on Monday', 'X-Men Origins: Wolverine',
       'Y Tu Mamá También', 'Yesterday', 'Young and Innocent', 'Zatoichi'],
      dtype='object', name='title', length=311)

In [131]:
final_data=filter_data[filter_data['title'].isin(famous_movies)]

In [132]:
final_data

Unnamed: 0,id,title,userId,genres,overview,release_date,rating
20,1408,Cutthroat Island,19,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,3.0
21,1408,Cutthroat Island,23,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,3.5
22,1408,Cutthroat Island,57,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,4.0
24,1408,Cutthroat Island,73,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,3.0
25,1408,Cutthroat Island,111,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,4.5
...,...,...,...,...,...,...,...
44876,2791,The Chronicles of Riddick: Dark Fury,607,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...","After their narrow escape at the end of ""Pitch...",2004-06-15,3.5
44878,2791,The Chronicles of Riddick: Dark Fury,615,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...","After their narrow escape at the end of ""Pitch...",2004-06-15,4.5
44879,2791,The Chronicles of Riddick: Dark Fury,624,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...","After their narrow escape at the end of ""Pitch...",2004-06-15,5.0
44881,2791,The Chronicles of Riddick: Dark Fury,654,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...","After their narrow escape at the end of ""Pitch...",2004-06-15,4.5


In [133]:
final_data = final_data.drop_duplicates()

In [134]:
final_data

Unnamed: 0,id,title,userId,genres,overview,release_date,rating
20,1408,Cutthroat Island,19,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,3.0
21,1408,Cutthroat Island,23,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,3.5
22,1408,Cutthroat Island,57,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,4.0
24,1408,Cutthroat Island,73,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,3.0
25,1408,Cutthroat Island,111,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...","Morgan Adams and her slave, William Shaw, are ...",1995-12-22,4.5
...,...,...,...,...,...,...,...
44876,2791,The Chronicles of Riddick: Dark Fury,607,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...","After their narrow escape at the end of ""Pitch...",2004-06-15,3.5
44878,2791,The Chronicles of Riddick: Dark Fury,615,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...","After their narrow escape at the end of ""Pitch...",2004-06-15,4.5
44879,2791,The Chronicles of Riddick: Dark Fury,624,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...","After their narrow escape at the end of ""Pitch...",2004-06-15,5.0
44881,2791,The Chronicles of Riddick: Dark Fury,654,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...","After their narrow escape at the end of ""Pitch...",2004-06-15,4.5


In [135]:
pt=final_data.pivot_table(index='title',columns='userId',values='rating')

In [136]:
pt

userId,4,15,17,19,23,30,48,56,57,73,...,607,608,615,624,641,648,654,659,664,665
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"20,000 Leagues Under the Sea",3.0,,,,,4.0,,,,3.0,...,,4.0,,,,,,,,
2001: A Space Odyssey,,2.0,,,4.0,,,,,,...,3.5,,,,4.0,,4.5,3.0,,3.0
28 Weeks Later,,1.5,,,1.0,3.0,,,,0.5,...,,,,1.5,,,3.0,,,
300,,,,3.0,,4.0,,,,,...,2.5,,,,,,,,,
48 Hrs.,,3.0,,3.0,3.5,5.0,,,,3.5,...,4.0,,,3.0,4.0,,5.0,5.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X-Men Origins: Wolverine,5.0,,,,,,,,,3.5,...,,,,,,,,,,4.0
Y Tu Mamá También,,0.5,,2.0,3.5,,,,,3.0,...,4.0,,3.5,4.0,3.0,,4.0,,,
Yesterday,,1.5,,,3.0,,3.0,,,2.0,...,,,3.5,,,,4.0,,3.5,
Young and Innocent,,1.0,4.5,,4.0,5.0,3.5,,,4.0,...,4.0,4.0,4.0,4.0,,,5.0,,4.5,


In [137]:
pt.fillna(0,inplace=True)

In [138]:
pt

userId,4,15,17,19,23,30,48,56,57,73,...,607,608,615,624,641,648,654,659,664,665
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"20,000 Leagues Under the Sea",3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey,0.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,3.5,0.0,0.0,0.0,4.0,0.0,4.5,3.0,0.0,3.0
28 Weeks Later,0.0,1.5,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,1.5,0.0,0.0,3.0,0.0,0.0,0.0
300,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48 Hrs.,0.0,3.0,0.0,3.0,3.5,5.0,0.0,0.0,0.0,3.5,...,4.0,0.0,0.0,3.0,4.0,0.0,5.0,5.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X-Men Origins: Wolverine,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
Y Tu Mamá También,0.0,0.5,0.0,2.0,3.5,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,3.5,4.0,3.0,0.0,4.0,0.0,0.0,0.0
Yesterday,0.0,1.5,0.0,0.0,3.0,0.0,3.0,0.0,0.0,2.0,...,0.0,0.0,3.5,0.0,0.0,0.0,4.0,0.0,3.5,0.0
Young and Innocent,0.0,1.0,4.5,0.0,4.0,5.0,3.5,0.0,0.0,4.0,...,4.0,4.0,4.0,4.0,0.0,0.0,5.0,0.0,4.5,0.0


In [139]:
from sklearn.metrics.pairwise import cosine_similarity

In [140]:
similarity_score=cosine_similarity(pt)

In [141]:
similarity_score.shape

(311, 311)

In [142]:
similarity_score

array([[1.        , 0.25638371, 0.39593965, ..., 0.40458132, 0.52626798,
        0.34487554],
       [0.25638371, 1.        , 0.37061598, ..., 0.31115918, 0.43967961,
        0.23810045],
       [0.39593965, 0.37061598, 1.        , ..., 0.37761967, 0.37063312,
        0.21093064],
       ...,
       [0.40458132, 0.31115918, 0.37761967, ..., 1.        , 0.45938889,
        0.26042189],
       [0.52626798, 0.43967961, 0.37063312, ..., 0.45938889, 1.        ,
        0.40500839],
       [0.34487554, 0.23810045, 0.21093064, ..., 0.26042189, 0.40500839,
        1.        ]])

In [143]:
mdata_ohe

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,tags
0,862,Toy Story,"[Led, by, Woody,, Andy's, toys, live, happily,...","[Animation, Comedy, Family]","[jealousy, toy, boy]","[TomHanks, TimAllen, DonRickles]",[JohnLasseter],"[Animation, Comedy, Family, jealousy, toy, boy..."
1,8844,Jumanji,"[When, siblings, Judy, and, Peter, discover, a...","[Adventure, Fantasy, Family]","[boardgame, disappearance, basedonchildren'sbook]","[RobinWilliams, JonathanHyde, KirstenDunst]",[],"[Adventure, Fantasy, Family, boardgame, disapp..."
2,15602,Grumpier Old Men,"[A, family, wedding, reignites, the, ancient, ...","[Romance, Comedy]","[fishing, bestfriend, duringcreditsstinger]","[WalterMatthau, JackLemmon, Ann-Margret]",[HowardDeutch],"[Romance, Comedy, fishing, bestfriend, duringc..."
3,31357,Waiting to Exhale,"[Cheated, on,, mistreated, and, stepped, on,, ...","[Comedy, Drama, Romance]","[basedonnovel, interracialrelationship, single...","[WhitneyHouston, AngelaBassett, LorettaDevine]",[ForestWhitaker],"[Comedy, Drama, Romance, basedonnovel, interra..."
4,11862,Father of the Bride Part II,"[Just, when, George, Banks, has, recovered, fr...",[Comedy],"[baby, midlifecrisis, confidence]","[SteveMartin, DianeKeaton, MartinShort]",[],"[Comedy, baby, midlifecrisis, confidence, Stev..."
...,...,...,...,...,...,...,...,...
46543,439050,Subdue,"[Rising, and, falling, between, a, man, and, w...","[Drama, Family]",[tragiclove],"[LeilaHatami, KouroshTahami, ElhamKorda]",[HamidNematollah],"[Drama, Family, tragiclove, HamidNematollah, L..."
46544,111109,Century of Birthing,"[An, artist, struggles, to, finish, his, work,...",[Drama],"[artist, play, pinoy]","[AngelAquino, PerryDizon, HazelOrencio]",[LavDiaz],"[Drama, artist, play, pinoy, LavDiaz, AngelAqu..."
46545,67758,Betrayal,"[When, one, of, her, hits, goes, wrong,, a, pr...","[Action, Drama, Thriller]",[],"[ErikaEleniak, AdamBaldwin, JulieduPage]",[MarkL.Lester],"[Action, Drama, Thriller, MarkL.Lester, ErikaE..."
46546,227506,Satan Triumphant,"[In, a, small, town, live, two, brothers,, one...",[],[],"[IwanMosschuchin, NathalieLissenko, PavelPavlov]",[YakovProtazanov],"[YakovProtazanov, IwanMosschuchin, NathalieLis..."


In [144]:
def recommendor(movie):
    # index fetch
    index = np.where(pt.index==movie)[0][0]
    similar_items = sorted(list(enumerate(similarity_score[index])),key=lambda x:x[1],reverse=True)[1:11]

    data = []
    for i in similar_items:
        item = []
        temp_df = mdata_ohe[mdata_ohe['title'] == pt.index[i[0]]]
        item.extend(list(temp_df.drop_duplicates('title')['title'].values))
        item.extend(list(temp_df.drop_duplicates('title')['cast'].values))
        item.extend(list(temp_df.drop_duplicates('title')['genres'].values))


        data.append(item)

    return data

In [145]:
pt.head(10)

userId,4,15,17,19,23,30,48,56,57,73,...,607,608,615,624,641,648,654,659,664,665
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"20,000 Leagues Under the Sea",3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey,0.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,3.5,0.0,0.0,0.0,4.0,0.0,4.5,3.0,0.0,3.0
28 Weeks Later,0.0,1.5,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.5,...,0.0,0.0,0.0,1.5,0.0,0.0,3.0,0.0,0.0,0.0
300,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48 Hrs.,0.0,3.0,0.0,3.0,3.5,5.0,0.0,0.0,0.0,3.5,...,4.0,0.0,0.0,3.0,4.0,0.0,5.0,5.0,0.0,0.0
5 Card Stud,0.0,5.0,4.5,0.0,4.0,3.0,4.5,4.0,0.0,5.0,...,4.5,0.0,3.5,4.0,0.0,1.5,5.0,0.0,4.0,5.0
88 Minutes,0.0,2.0,0.0,0.0,3.5,4.0,0.0,2.0,0.0,3.5,...,0.0,0.0,0.0,3.0,0.0,0.0,4.0,0.0,0.0,0.0
A Bridge Too Far,0.0,4.0,0.0,0.0,5.0,4.0,4.0,4.0,0.0,4.0,...,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Brief History of Time,0.0,1.0,0.0,0.0,0.0,5.0,0.0,4.0,5.0,4.0,...,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
A Clockwork Orange,3.0,2.0,3.0,0.0,3.5,0.0,0.0,0.0,0.0,2.5,...,3.0,0.0,0.0,2.5,4.0,0.0,0.0,0.0,0.0,0.0


## Recommends 10 similar movies

In [146]:
recommendor('A Brief History of Time')

[['Street Kings',
  ['KeanuReeves', 'ForestWhitaker', 'ChrisEvans'],
  ['Action', 'Crime', 'Drama']],
 ['The Breakfast Club',
  ['EmilioEstevez', 'AnthonyMichaelHall', 'JuddNelson'],
  ['Comedy', 'Drama']],
 ['Cold Mountain', ['JudeLaw', 'NicoleKidman', 'RenéeZellweger'], ['Drama']],
 ['Carry On Screaming',
  ['HarryH.Corbett', 'KennethWilliams', 'JimDale'],
  ['Comedy']],
 ['Notes on a Scandal',
  ['JudiDench', 'CateBlanchett', 'BillNighy'],
  ['Drama', 'Romance']],
 ['Lonely Hearts',
  ['JohnTravolta', 'JamesGandolfini', 'SalmaHayek'],
  ['Drama', 'Thriller', 'Crime']],
 ['Nostalgia',
  ['OlegYankovskiy', 'ErlandJosephson', 'DomizianaGiordano'],
  ['Drama', 'Romance']],
 ['Say Anything...',
  ['JohnCusack', 'IoneSkye', 'JohnMahoney'],
  ['Comedy', 'Drama', 'Romance']],
 ['The Chronicles of Riddick: Dark Fury',
  ['VinDiesel', 'RhianaGriffith', 'KeithDavid'],
  ['Action', 'Animation', 'ScienceFiction']],
 ['Once Were Warriors',
  ['RenaOwen', 'TemueraMorrison', 'MamaengaroaKerr-Bell']

In [147]:
recommendor('88 Minutes')

[['Mothra vs. Godzilla',
  ['AkiraTakarada', 'YurikoHoshi', 'HiroshiKoizumi'],
  ['Fantasy', 'ScienceFiction', 'Action']],
 ['28 Weeks Later',
  ['ImogenPoots', 'RobertCarlyle', 'RoseByrne'],
  ['Horror', 'Thriller', 'ScienceFiction']],
 ['Houseboat',
  ['CaryGrant', 'SophiaLoren', 'MarthaHyer'],
  ['Comedy', 'Drama', 'Family']],
 ['Bell, Book and Candle',
  ['JamesStewart', 'KimNovak', 'JackLemmon'],
  ['Fantasy', 'Comedy', 'Romance']],
 ['5 Card Stud',
  ['DeanMartin', 'RobertMitchum', 'IngerStevens'],
  ['Action', 'Western', 'Thriller']],
 ['Nosferatu',
  ['MaxSchreck', 'GustavvonWangenheim', 'GretaSchröder'],
  ['Fantasy', 'Horror']],
 ['The Searchers', ['JohnWayne', 'JeffreyHunter', 'VeraMiles'], ['Western']],
 ['All the Way Boys',
  ['AlexanderAllerson', 'BudSpencer', 'TerenceHill'],
  ['Adventure', 'Action', 'Comedy']],
 ['The Passion of Joan of Arc',
  ['MariaFalconetti', 'EugeneSilvain', 'AndréBerley'],
  ['Drama', 'History']],
 ['Married to the Mob',
  ['MichellePfeiffer', 'M