## Importing necessary libraries

In [148]:
import numpy as np
import pandas as pd
import ast
import re
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

### Reading Data

In [149]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [150]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [151]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Checking for null values

In [152]:
credits.isna().sum()

movie_id    0
title       0
cast        0
crew        0
dtype: int64

In [153]:
movies.isna().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

**Merging the 2 datasets based on title. After merging we select the columns that are important for us to recommend movies to the user.**

In [154]:
movies = movies.merge(credits, on='title')

In [155]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'release_date', 'cast', 'crew']]

In [156]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",2009-12-10,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",2007-05-19,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",2015-10-26,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",2012-07-16,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",2012-03-07,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### Removing null values and exploring the data

In [157]:
movies.shape

(4809, 8)

In [158]:
movies.isna().sum()

movie_id        0
title           0
overview        3
genres          0
keywords        0
release_date    1
cast            0
crew            0
dtype: int64

In [159]:
movies.duplicated().sum()

0

In [160]:
movies.dropna(inplace=True)

In [161]:
movies.isna().sum()

movie_id        0
title           0
overview        0
genres          0
keywords        0
release_date    0
cast            0
crew            0
dtype: int64

In [162]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [163]:
movies['keywords'][0]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

**Here ast.literal_eval is used to convert string into list.**

In [164]:
ast.literal_eval(movies['genres'][0])

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

**Helper function to extract genres and keywords and return them in the form of list**

In [165]:
def extract(col):
    lst = []
    for i in ast.literal_eval(col):
        lst.append(i['name'])
    return lst

# extract(movies['genres'][0])

In [166]:
movies['genres'] = movies['genres'].apply(extract)

In [167]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",2009-12-10,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",2007-05-19,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",2015-10-26,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",2012-07-16,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",2012-03-07,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [168]:
movies['keywords'] = movies['keywords'].apply(extract)

In [169]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",2009-12-10,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",2007-05-19,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",2015-10-26,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",2012-07-16,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",2012-03-07,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [170]:
ast.literal_eval(movies['cast'][0])[0]

{'cast_id': 242,
 'character': 'Jake Sully',
 'credit_id': '5602a8a7c3a3685532001c9a',
 'gender': 2,
 'id': 65731,
 'name': 'Sam Worthington',
 'order': 0}

**Helper function to extract maximum 5 cast members.**

In [171]:
def extract_cast(cast_col):
    count = 0
    cast = []
    for member in ast.literal_eval(cast_col):
        cast.append(member['name'])
        count+=1
        if count==5:
            break
    return cast

In [172]:
print(extract_cast(credits['cast'][1453]))

['Angus Macfadyen', 'Mario Yedidia', 'Marley Shelton', 'Chao Li Chi']


In [173]:
movies['cast'] = movies['cast'].apply(extract_cast)

In [174]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",2009-12-10,"[Sam Worthington, Zoe Saldana, Sigourney Weave...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",2007-05-19,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",2015-10-26,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",2012-07-16,"[Christian Bale, Michael Caine, Gary Oldman, A...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",2012-03-07,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [175]:
movies.isna().sum()

movie_id        0
title           0
overview        0
genres          0
keywords        0
release_date    0
cast            0
crew            0
dtype: int64

**Helper function to extract the directory from the list of crew members.**

In [176]:
def extract_director(crew_col):
    director = []
    for crew in ast.literal_eval(crew_col):
        if crew['job'] == 'Director':
            director.append(crew['name'])
            break
    return director

In [177]:
extract_director(movies['crew'][0])

['James Cameron']

In [178]:
movies['crew'] = movies['crew'].apply(extract_director)

In [179]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",2009-12-10,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",2007-05-19,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",2015-10-26,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",2012-07-16,"[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",2012-03-07,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [180]:
movies.isna().sum()

movie_id        0
title           0
overview        0
genres          0
keywords        0
release_date    0
cast            0
crew            0
dtype: int64

In [181]:
int(movies['release_date'][0][:4])

2009

In [182]:
movies['release_date'].value_counts()

2006-01-01    10
2002-01-01     8
2013-07-18     7
2014-12-25     7
1999-10-22     7
              ..
2008-01-17     1
2014-09-03     1
1977-06-24     1
2007-01-31     1
2012-05-03     1
Name: release_date, Length: 3278, dtype: int64

In [183]:
year_regex = r'\d{4}'
re.findall(year_regex, '1992-09-04')

['1992']

In [184]:
movies['release_date']

0       2009-12-10
1       2007-05-19
2       2015-10-26
3       2012-07-16
4       2012-03-07
           ...    
4804    1992-09-04
4805    2011-12-26
4806    2013-10-13
4807    2012-05-03
4808    2005-08-05
Name: release_date, Length: 4805, dtype: object

**Helper function to categotise movies into old and new base on their release date. Condition that I followed here is if the movie was released before 2000 it's an old movie else it's a new movie.**

In [185]:
def extract_year(date):
    #print(date)
    d = re.findall(r'(\d{4})',date)
    if int(d[0])>2000:
        return 'new'
    else:
        return 'old'

movies['release_date'] = movies['release_date'].apply(extract_year)

In [186]:
movies['release_date'][0]

'new'

In [187]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",new,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",new,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",new,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",new,"[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",new,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [188]:
movies['release_date'].isna().sum()

0

In [189]:
movies.isna().sum()

movie_id        0
title           0
overview        0
genres          0
keywords        0
release_date    0
cast            0
crew            0
dtype: int64

In [190]:
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [191]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",new,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",new,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",new,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",new,"[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",new,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [192]:
movies['release_date'] = movies['release_date'].apply(lambda x: x.split())

In [193]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",[new],"[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",[new],"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",[new],"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",[new],"[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",[new],"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


**Removing white space between 2 words as it will be easier for our model to make better recommendation.**

In [194]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [195]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",[new],"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[new],"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",[new],"[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...",[new],"[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...",[new],"[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]


**Creating a new column tags which will be a combination of the overview, genres, keywords, cast, crew and release date column.**

In [196]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew'] + movies['release_date']

In [197]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",[new],"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[new],"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",[new],"[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...",[new],"[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...",[new],"[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


**Removing white space between genres. This will help in recommending based on single or multiple genres.**

In [198]:
movies['genres_new'] = movies['genres'].apply(lambda x: ''.join(x))

In [199]:
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,release_date,cast,crew,tags,genres_new
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...",[new],"[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin...",ActionAdventureFantasyScienceFiction
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...",[new],"[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d...",AdventureFantasyAction
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...",[new],"[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send...",ActionAdventureCrime
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...",[new],"[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan],"[Following, the, death, of, District, Attorney...",ActionCrimeDramaThriller
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...",[new],"[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili...",ActionAdventureScienceFiction


**Creating a new dataframe.**

In [200]:
movie_df = movies[['movie_id', 'title', 'tags', 'genres', 'genres_new']]

In [201]:
movie_df.head()

Unnamed: 0,movie_id,title,tags,genres,genres_new
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]",ActionAdventureFantasyScienceFiction
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]",AdventureFantasyAction
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]",ActionAdventureCrime
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]",ActionCrimeDramaThriller
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]",ActionAdventureScienceFiction


**Converting tags list into a string. I also converted the tags string and genres column into lowercase.**

In [202]:
movie_df['tags'] = movie_df['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df['tags'] = movie_df['tags'].apply(lambda x: " ".join(x))


In [203]:
movie_df.head()

Unnamed: 0,movie_id,title,tags,genres,genres_new
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, ScienceFiction]",ActionAdventureFantasyScienceFiction
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]",AdventureFantasyAction
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]",ActionAdventureCrime
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]",ActionCrimeDramaThriller
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]",ActionAdventureScienceFiction


In [204]:
movie_df['tags'] = movie_df['tags'].apply(lambda x: x.lower())
movie_df['genres_new'] = movie_df['genres_new'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df['tags'] = movie_df['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df['genres_new'] = movie_df['genres_new'].apply(lambda x: x.lower())


In [205]:
movie_df.head()

Unnamed: 0,movie_id,title,tags,genres,genres_new
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...","[Action, Adventure, Fantasy, ScienceFiction]",actionadventurefantasysciencefiction
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]",adventurefantasyaction
2,206647,Spectre,a cryptic message from bond’s past sends him o...,"[Action, Adventure, Crime]",actionadventurecrime
3,49026,The Dark Knight Rises,following the death of district attorney harve...,"[Action, Crime, Drama, Thriller]",actioncrimedramathriller
4,49529,John Carter,"john carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]",actionadventuresciencefiction


**Applying lemmatization using WordNerLemmatizer on tags.**

In [206]:
lemma = WordNetLemmatizer()

In [207]:
def lemmatizer(text):
    words = []
    for i in text.split():
        words.append(lemma.lemmatize(i))
    return " ".join(words)

In [208]:
movie_df['tags'] = movie_df['tags'].apply(lemmatizer)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_df['tags'] = movie_df['tags'].apply(lemmatizer)


In [209]:
movie_df.head()

Unnamed: 0,movie_id,title,tags,genres,genres_new
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...","[Action, Adventure, Fantasy, ScienceFiction]",actionadventurefantasysciencefiction
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]",adventurefantasyaction
2,206647,Spectre,a cryptic message from bond’s past sends him o...,"[Action, Adventure, Crime]",actionadventurecrime
3,49026,The Dark Knight Rises,following the death of district attorney harve...,"[Action, Crime, Drama, Thriller]",actioncrimedramathriller
4,49529,John Carter,"john carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]",actionadventuresciencefiction


## Approach 1 - Bag of Words

In [210]:
countVec = CountVectorizer(max_features=6000, stop_words='english') #max_features - most frequently occuring words
vectors = countVec.fit_transform(movie_df['tags']).toarray()

In [211]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [212]:
similarity = cosine_similarity(vectors)

In [213]:
cosine_similarity(vectors).shape

(4805, 4805)

In [214]:
similarity[0] # array of distance of first movie with every other movie in the dataset

array([1.        , 0.10028244, 0.10461316, ..., 0.06406221, 0.06897007,
       0.04496938])

In [215]:
def recommend(movie):
    movie_index = movie_df[movie_df['title']==movie].index[0]
    distance_vector = similarity[movie_index]
    movies_list = sorted(list(enumerate(distance_vector)), reverse=True, key=lambda x:x[1])[1:6]
    
    for movie in movies_list:
        print(movie_df.iloc[movie[0]].title)

In [216]:
recommend("Avatar")

Aliens
Battle: Los Angeles
Independence Day
Falcon Rising
Aliens vs Predator: Requiem


## Approach 2 - TF-IDF Vectorizer

In [217]:
tfidf = TfidfVectorizer(max_features = 6000, stop_words = 'english')
vector_tfidf = tfidf.fit_transform(movie_df['tags']).toarray()

In [218]:
vector_tfidf[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [219]:
similarity_tfidf = cosine_similarity(vector_tfidf)

In [220]:
similarity_tfidf[0]

array([1.        , 0.01974221, 0.02819824, ..., 0.019514  , 0.00740556,
       0.00234212])

In [221]:
def recommend_tfidf(movie):
    movie_index = movie_df[movie_df['title']==movie].index[0]
    distance_vector = similarity_tfidf[movie_index]
    movies_list = sorted(list(enumerate(distance_vector)), reverse=True, key=lambda x:x[1])[1:6]
    
    for movie in movies_list:
        print(movie_df.iloc[movie[0]].title)

In [222]:
recommend_tfidf("Independence Day: Resurgence")

Independence Day
Space Battleship Yamato
Meet Dave
2012
U.F.O.


In [231]:
def recommend_by_genre(genre, c):
    count=0
    for i, j, k in zip(movie_df['genres_new'], movie_df['title'], movie_df['movie_id']):
        if genre in i:
            print(k, j)
            count+=1
        if count==c:
            break
recommend_by_genre('actionthriller', 10)

10764 Quantum of Solace
44912 Green Lantern
14869 G.I. Joe: The Rise of Cobra
296 Terminator 3: Rise of the Machines
27205 Inception
87101 Terminator Genisys
6479 I Am Legend
2080 X-Men Origins: Wolverine
605 The Matrix Revolutions
604 The Matrix Reloaded


In [259]:
movie_df.head()

Unnamed: 0,movie_id,title,tags,genres,genres_new
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di...","[Action, Adventure, Fantasy, ScienceFiction]",actionadventurefantasysciencefiction
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]",adventurefantasyaction
2,206647,Spectre,a cryptic message from bond’s past sends him o...,"[Action, Adventure, Crime]",actionadventurecrime
3,49026,The Dark Knight Rises,following the death of district attorney harve...,"[Action, Crime, Drama, Thriller]",actioncrimedramathriller
4,49529,John Carter,"john carter is a war-weary, former military ca...","[Action, Adventure, ScienceFiction]",actionadventuresciencefiction


In [266]:
movie_df[movie_df['movie_id']==36586]

Unnamed: 0,movie_id,title,tags,genres,genres_new
864,36586,Blade II,a rare mutation ha occurred within the vampire...,"[Fantasy, Horror, Action, Thriller]",fantasyhorroractionthriller


In [267]:
def filter_movies(genres_to_filter, count=10):
    # Filter the DataFrame
    filtered_df = movie_df[movie_df['genres_new'].apply(lambda x: all(genre in x for genre in genres_to_filter))]
    
    
    # If the filtered DataFrame has less than 'count' movies, return them all

    if len(filtered_df) < count:
        return filtered_df
    
    # Otherwise, return a random sample of 'count' movies
    return filtered_df.sample(n=count, random_state=np.random.RandomState())['title']

# Test the function
filtered_df = filter_movies(['action', 'thriller'])
# print(filtered_df['title'])
print(list(filtered_df['title']))
print((filtered_df['movie_id']))

['Sky Captain and the World of Tomorrow', 'The Negotiator', 'Mission: Impossible III', 'Drive', 'Planet of the Apes', 'Live and Let Die', 'Die Hard', 'The Jackal', 'The Marine', 'Die Hard: With a Vengeance']
607      5137
950      9631
139       956
2564    64690
278       869
3348      253
1708      562
731      4824
2584     8975
355      1572
Name: movie_id, dtype: int64


## Approach 3 -Bag of Words with n_grams

In [224]:
countVec_ngrams = CountVectorizer(max_features=6000, stop_words='english', ngram_range = (1,4)) 
vectors_ngrams = countVec_ngrams.fit_transform(movie_df['tags']).toarray()
similarity_ngrams = cosine_similarity(vectors_ngrams)

In [225]:
def recommend_ngrams(movie):
    movie_index = movie_df[movie_df['title']==movie].index[0]
    distance_vector = similarity_ngrams[movie_index]
    movies_list = sorted(list(enumerate(distance_vector)), reverse=True, key=lambda x:x[1])[1:6]
    
    for movie in movies_list:
        print(movie_df.iloc[movie[0]].title)

In [226]:
recommend_ngrams("John Carter")

Independence Day: Resurgence
Damnation Alley
Captain America: The First Avenger
Battleship
Jurassic World


Available genres: {'Action','Adventure','Animation','Comedy','Crime','Documentary','Drama','Family','Fantasy','Foreign',
 'History','Horror','Music','Mystery','Romance','ScienceFiction','TVMovie','Thriller','War','Western'}

In [227]:
pickle.dump(movie_df, open("movies_list.pkl", "wb"))

In [228]:
pickle.dump(movie_df.to_dict(), open("movies_dict.pkl", "wb"))

In [229]:
pickle.dump(similarity_tfidf, open("similarity_tfidf.pkl", "wb"))

In [230]:
['Action','Adventure','Animation','Comedy','Crime','Documentary','Drama','Family','Fantasy','Foreign', 'History','Horror','Music','Mystery','Romance','ScienceFiction','TVMovie','Thriller','War','Western']

['Action',
 'Adventure',
 'Animation',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Foreign',
 'History',
 'Horror',
 'Music',
 'Mystery',
 'Romance',
 'ScienceFiction',
 'TVMovie',
 'Thriller',
 'War',
 'Western']