In [None]:
"""
                               MOVIE RECOMMENDATION SYSTEM USING CONTENT BASED FILTERING.
                        
    CONTENT BASED FILTERING:Content Based recommender system tries to guess the features or behaviour of a user given the
                  item's features that he/she reacted.The main features of content based filtering is it does not require
                     other user's data during recommendations to one user.
                        
                                USER<-->seen by users<-->similar movie<-->USER 
                                
                                                 OUR WORKING MODEL:
                                      DATA<-->PRE-PROCESSING<-->TRAIN-MODEL<-->RESULTS 
                                      
            FEATURES ABOUT DATASETS:::
              => Here basically we use two data sets one is movie and another is credit about each movie.
              MOVIE DATA-SETS====================================================================================
             1. budget -> Budget of movie                   
             2. genres -> Types of movie like action,drama or adventure etc.               
             3. homepage  -> social site of movies.               
             4. id        -> Each movie having its unique TMDB id.                
             5. keywords  -> Description of movie like Alien,Science-fiction etc.
             6. original_language -> Original Language of movie without doubed
             7. original_title  -> Original title of movie without doubed
             8. overview     -> Summary of movies
             9.popularity    -> Count the popularity of particular movies from different-different platform
             10. production_companies -> Name of production companies
             11. production_countries -> Name of production countries
             12. release_date     -> Release date of movie
             13. revenue        -> How much revenue earned by particular movie
             14. runtime        -> Total runtime of movie.
             15. spoken_languages -> Which language is used in movie.
             16. status       -> whether the movie is released or not.
             17. tagline      -> Tagline of the movie.
             18. title        -> Title of the movie.
             19. vote_average -> Average vote of particular movie.
             20. vote_count   -> Total vote count of particular
              RATING DATA-SETS========================================================
             1. movie_id ->  Unique TMDB id's of particular movie.   
             2. title   ->    Title of movie
             3. cast   ->   All cast member of movie.  
             4. crew    ->   All crew member of movie.                               
"""

In [335]:
# importing some useful python module

import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [238]:
#assigning data sets:

movies=pd.read_csv("tmdb_5000_movies.csv")
credit=pd.read_csv("tmdb_5000_credits.csv")

In [239]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [240]:
credit.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [241]:
#checking the shape of movie-datasets:

movies.shape

(4803, 20)

In [242]:
#checking the shape of credit-datasets:

credit.shape

(4803, 4)

In [243]:
#checking the data-type of movie-datasets:

movies.dtypes

budget                    int64
genres                   object
homepage                 object
id                        int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
dtype: object

In [244]:
#checking the data-type of credit data-sets:

credit.dtypes

movie_id     int64
title       object
cast        object
crew        object
dtype: object

In [245]:
#here we have to two different data-sets are available so it may be hectic for implementation so we need to merge it.

movies=movies.merge(credit,on='title')

In [246]:
#now again checking the shape of merged data-sets:

movies.shape

(4809, 23)

In [247]:
#Based on overviewing the attributes of data-sets, we have to remove some useless column
#so we let put only some useful column

movies=movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [248]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [249]:
#now we have to pre-processing the data means we have to remove some row which is either null or having some duplicate data

movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [250]:
#Ae we see overview column have 3 null value so we have to drop it

movies.dropna(inplace=True)

In [251]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [252]:
#for duplicate value:

movies.duplicated().sum()

0

In [253]:
#Now in genres column, there are several dictionary format data which is not in appropriate format, so first of all
#we need to keep it as appropriate format

#also we have to change string of list to list and for that python have a module 'ast' which we are going to use here

def convert(value):
    l=[]
    for i in ast.literal_eval(value):
        l.append(i['name'])
    return l    

In [254]:
movies['genres']=movies['genres'].apply(convert)

In [255]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [256]:
#same method we are going to use for keywords column

movies['keywords']=movies['keywords'].apply(convert)

In [257]:
# In cast column we need actual name of actor not his name associated with the movie name like we need Sanajay dutt
#  not sanju baba,or like johhny depp not Angelica Teach

def convert2(value):
    l=[]
    c=0
    for i in ast.literal_eval(value):
        if(c!=3):
            l.append(i['name'])
            c=c+1
        else:
            break
    return l       

In [258]:
movies['cast']=movies['cast'].apply(convert2)

In [259]:
movies.head(10)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
5,559,Spider-Man 3,The seemingly invincible Spider-Man goes up ag...,"[Fantasy, Action, Adventure]","[dual identity, amnesia, sandstorm, love of on...","[Tobey Maguire, Kirsten Dunst, James Franco]","[{""credit_id"": ""52fe4252c3a36847f80151a5"", ""de..."
6,38757,Tangled,When the kingdom's most wanted-and most charmi...,"[Animation, Family]","[hostage, magic, horse, fairy tale, musical, p...","[Zachary Levi, Mandy Moore, Donna Murphy]","[{""credit_id"": ""52fe46db9251416c91062101"", ""de..."
7,99861,Avengers: Age of Ultron,When Tony Stark tries to jumpstart a dormant p...,"[Action, Adventure, Science Fiction]","[marvel comic, sequel, superhero, based on com...","[Robert Downey Jr., Chris Hemsworth, Mark Ruff...","[{""credit_id"": ""55d5f7d4c3a3683e7e0016eb"", ""de..."
8,767,Harry Potter and the Half-Blood Prince,"As Harry begins his sixth year at Hogwarts, he...","[Adventure, Fantasy, Family]","[witch, magic, broom, school of witchcraft, wi...","[Daniel Radcliffe, Rupert Grint, Emma Watson]","[{""credit_id"": ""52fe4273c3a36847f801fab1"", ""de..."
9,209112,Batman v Superman: Dawn of Justice,Fearing the actions of a god-like Super Hero l...,"[Action, Adventure, Fantasy]","[dc comics, vigilante, superhero, based on com...","[Ben Affleck, Henry Cavill, Gal Gadot]","[{""credit_id"": ""553bf23692514135c8002886"", ""de..."


In [260]:
#in crew column we need only director name so we are going to filter only director name

def fetch_director(value):
    l=[]
    for i in ast.literal_eval(value):
        if(i['job']=='Director'):
            l.append(i['name'])
            break
    return l        

In [261]:
movies['crew']=movies['crew'].apply(fetch_director)

In [262]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [263]:
# For overview column we also convert it to list format so that we can concatenate it with all other column

movies['overview']=movies['overview'].apply(lambda x:x.split())

In [264]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]",[Andrew Stanton]


In [265]:
"""
Now the problem is that in several column like cast there is a name with space like sam worthington, we know that
same worthington has a single name of any actor but the thing is here sam is considered as one entity and worthington is
considered as another entity. and it may be possible that the same name with 'sam' is also available in another column
so, we need to replace the space between name to avoid this anomalies.
"""

"\nNow the problem is that in several column like cast there is a name with space like sam worthington, we know that\nsame worthington has a single name of any actor but the thing is here sam is considered as one entity and worthington is\nconsidered as another entity. and it may be possible that the same name with 'sam' is also available in another column\nso, we need to replace the space between name to avoid this anomalies.\n"

In [266]:
# so we are going to replace all the spaces between name among all column one by one.
# and for that we are using list comprehension to remove all the spaces between two words.

movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])

In [267]:
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])

In [268]:
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])

In [269]:
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [270]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton]


In [271]:
#now we have to create a tags and in this tag we have to concatenate all the rest column other than movies_id and title

movies['tag']=movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']

In [272]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tag
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [319]:
# now we are going to split data-sets and make a new data-sets for easy implementation

new_df=movies[['movie_id','title','tag']]

In [320]:
new_df.head()

Unnamed: 0,movie_id,title,tag
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [321]:
#Now we have to again convert the list of tag to string format

new_df['tag']=new_df['tag'].apply(lambda x:" ".join(x))  

In [322]:
new_df.head()

Unnamed: 0,movie_id,title,tag
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [323]:
#Now we are going to change all string of tag column to lowercase character to avoid error

new_df['tag']=new_df['tag'].apply(lambda x: x.lower())

In [324]:
new_df.head()

Unnamed: 0,movie_id,title,tag
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


In [325]:
new_df['tag'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [326]:
new_df['tag'][1]

"captain barbossa, long believed to be dead, has come back to life and is headed to the edge of the earth with will turner and elizabeth swann. but nothing is quite as it seems. adventure fantasy action ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger johnnydepp orlandobloom keiraknightley goreverbinski"

In [327]:
"""Now we have to think about the similarity between both tag and for lot of data it is not possible check the similarity
  between them. For that here we are going to use vectorization method that means in the tag column, we have to convert the
  text to vectors form using text vectorization. Because when anybody try to search a movie based on that movie our model
  predicts the similar movie and that similar movie must be the closest vector of all 5000 vector across the space, that's
why we need vectorization here.There are many method of text-vectorization but here we are going to use 'Bags of word' method.
"""

"Now we have to think about the similarity between both tag and for lot of data it is not possible check the similarity\n  between them. For that here we are going to use vectorization method that means in the tag column, we have to convert the\n  text to vectors form using text vectorization. Because when anybody try to search a movie based on that movie our model\n  predicts the similar movie and that similar movie must be the closest vector of all 5000 vector across the space, that's\nwhy we need vectorization here.There are many method of text-vectorization but here we are going to use 'Bags of word' method.\n"

In [328]:
#Also we are not going to consider stop-words i.e a,an,the for... etc

cv=CountVectorizer(max_features=5000,stop_words='english')

In [329]:
#Now with the help of cv object to create numpy array

cv.fit_transform(new_df['tag']).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [330]:
#checking shape

cv.fit_transform(new_df['tag']).toarray().shape

(4806, 5000)

In [331]:
vector=cv.fit_transform(new_df['tag']).toarray()

In [332]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [333]:
#checking first movie i.e avatar

vector[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [334]:
#Now we can see the 5000 most frequent common words using get_feature_names method

cv.get_feature_names()

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1930s',
 '1940s',
 '1950',
 '1950s',
 '1960s',
 '1970s',
 '1980',
 '1980s',
 '1985',
 '1990s',
 '1999',
 '19th',
 '19thcentury',
 '20',
 '200',
 '2009',
 '20th',
 '24',
 '25',
 '30',
 '300',
 '3d',
 '40',
 '50',
 '500',
 '60',
 '60s',
 '70',
 '70s',
 'aaron',
 'aaroneckhart',
 'abandoned',
 'abducted',
 'abigailbreslin',
 'abilities',
 'ability',
 'able',
 'aboard',
 'abuse',
 'abusive',
 'academy',
 'accept',
 'accepted',
 'accepts',
 'access',
 'accident',
 'accidental',
 'accidentally',
 'accompanied',
 'accomplish',
 'account',
 'accountant',
 'accused',
 'ace',
 'achieve',
 'act',
 'acting',
 'action',
 'actionhero',
 'actions',
 'activist',
 'activities',
 'activity',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adam',
 'adams',
 'adamsandler',
 'adamshankman',
 'adaptation',
 'adapted',
 'addict',
 'addicted',
 'addiction',
 'adolescence',
 'adolescent'

In [336]:
#Applying model by using of cosine similarity:

similarity = cosine_similarity(vector)

In [337]:
print(similarity)

[[1.         0.08964215 0.05976143 ... 0.02519763 0.02817181 0.        ]
 [0.08964215 1.         0.0625     ... 0.02635231 0.         0.        ]
 [0.05976143 0.0625     1.         ... 0.02635231 0.         0.        ]
 ...
 [0.02519763 0.02635231 0.02635231 ... 1.         0.0745356  0.04836508]
 [0.02817181 0.         0.         ... 0.0745356  1.         0.05407381]
 [0.         0.         0.         ... 0.04836508 0.05407381 1.        ]]


In [353]:
#we can see in out-put the first value is 1 this because of similarity of first moviw with first movie

#Now, applying function for recommendation:

#enumerate function means it keeps the record of the distance between one movie with all other its nearest movie.

def recommend(movie):
    movie_index=new_df[new_df['title'] == movie].index[0]
    distance=similarity[movie_index]
    movie_list=sorted(list(enumerate(distance)),reverse=True,key=lambda x: x[1])[1:11]
    print("Recommended movies are:======================================")
    for i in movie_list:
        print(new_df.iloc[i[0]].title)

In [354]:
recommend('Spectre')

Quantum of Solace
Never Say Never Again
Skyfall
From Russia with Love
Thunderball
Diamonds Are Forever
Safe Haven
Die Another Day
Dr. No
Licence to Kill


In [355]:
recommend('Avatar')

Titan A.E.
Small Soldiers
Independence Day
Ender's Game
Aliens vs Predator: Requiem
Battle: Los Angeles
Lifeforce
Falcon Rising
Krull
Edge of Tomorrow
