In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
from time import *
from random import randint
import time
import json
import matplotlib.pyplot as plt
import config_file
from ast import literal_eval
from functools import partial, reduce
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

stemmer = SnowballStemmer('english')

api_key = config_file.tmdb_api_key

## 1. Metadata

In [2]:
metadata = pd.read_csv('data/movie_metadata.csv')
metadata.head()

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/g4EccIilvUXEGDLcGozUbodgtvn.jpg,,0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 28, '...",https://hombalefilms.com/movies/kantara/,858485,tt15327088,kn,ಕಾಂತಾರ,...,2022-09-30,0,150,"[{'english_name': 'Kannada', 'iso_639_1': 'kn'...",Released,A Legend,Kantara,False,8.4,10
1,False,/kXfqcdQKsToO0OUXHcrrNCHDBzO.jpg,,25000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,278,tt0111161,en,The Shawshank Redemption,...,1994-09-23,28341469,142,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Fear can hold you prisoner. Hope can set you f...,The Shawshank Redemption,False,8.7,22556
2,False,/qFLQEng8NADXWkxB5vE9dO7YyNf.jpg,"{'id': 473983, 'name': 'Hababam Sınıfı [Seri]'...",0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,83651,tt0252487,tr,Hababam Sınıfı,...,1975-04-01,0,87,"[{'english_name': 'Turkish', 'iso_639_1': 'tr'...",Released,,The Chaos Class,False,7.96,100
3,False,/rSPw7tgCH9c6NqICZef4kZjFOQ5.jpg,"{'id': 230, 'name': 'The Godfather Collection'...",6000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",http://www.thegodfather.com/,238,tt0068646,en,The Godfather,...,1972-03-14,245066411,175,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,An offer you can't refuse.,The Godfather,False,8.716,16826
4,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,"{'id': 263, 'name': 'The Dark Knight Collectio...",185000000,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",https://www.warnerbros.com/movies/dark-knight/,155,tt0468569,en,The Dark Knight,...,2008-07-14,1004558444,152,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Welcome to a world without rules.,The Dark Knight,False,8.502,28486


In [3]:
metadata.dtypes

adult                       bool
backdrop_path             object
belongs_to_collection     object
budget                     int64
genres                    object
homepage                  object
id                         int64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                    int64
runtime                    int64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                       bool
vote_average             float64
vote_count                 int64
dtype: object

In [4]:
# Keep only the columns the recommender uses. The explicit .copy() makes this an
# independent frame rather than a slice of the original, so the column
# assignments and in-place drops in the following cells do not raise
# SettingWithCopyWarning.
metadata = metadata[['id', 'imdb_id', 'title', 'release_date', 'vote_average', 'genres', 'overview', 'tagline']].copy()

In [5]:
metadata.notnull().sum()

id              4790
imdb_id         4790
title           4790
release_date    4790
vote_average    4790
genres          4790
overview        4789
tagline         4431
dtype: int64

In [6]:
metadata['release_date'].head()

0    2022-09-30
1    1994-09-23
2    1975-04-01
3    1972-03-14
4    2008-07-14
Name: release_date, dtype: object

In [7]:
def _release_year(release_date):
    """Return the first four characters of a release-date string as an int."""
    return int(release_date[:4])


# Derive a numeric `year` column from the 'YYYY-MM-DD' release_date strings.
metadata['year'] = metadata['release_date'].map(_release_year)

In [8]:
# release_date has been reduced to `year`, so the raw column is no longer needed.
metadata = metadata.drop(columns=['release_date'])

In [9]:
def _doubled_genre_names(raw_genres):
    """Parse a stringified list of genre dicts into lower-cased names.

    The names are repeated twice so that genres carry extra weight in the
    text "soup" built later. Anything that does not parse to a list yields [].
    """
    parsed = literal_eval(raw_genres)
    if not isinstance(parsed, list):
        return []
    names = [genre['name'].lower() for genre in parsed]
    return names * 2


metadata['genres'] = metadata['genres'].apply(_doubled_genre_names)

In [10]:
# Normalise the free-text columns: missing values become '' and text is lower-cased.
for text_column in ('overview', 'tagline'):
    metadata[text_column] = metadata[text_column].fillna('').str.lower()

In [11]:
stop_words = set(stopwords.words('english'))

# Translation table that deletes ASCII punctuation; built once instead of once per token.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _clean_tokens(text):
    """Split `text` into stemmed tokens with punctuation and stop words removed.

    A token is dropped when it is a stop word either before or after punctuation
    stripping (so both "don't" and "you." are filtered), or when nothing is left
    once punctuation is removed (e.g. a lone "-"). Splitting on any whitespace
    avoids the empty tokens produced by consecutive spaces.

    Args:
        text: lower-cased string (may be empty).

    Returns:
        list of stemmed token strings; [] for empty input.
    """
    tokens = []
    for raw_token in text.split():
        stripped = raw_token.translate(_PUNCTUATION_TABLE)
        if not stripped or raw_token in stop_words or stripped in stop_words:
            continue
        tokens.append(stemmer.stem(stripped))
    return tokens


# Same cleaning for both free-text columns.
for _text_column in ('overview', 'tagline'):
    metadata[_text_column] = metadata[_text_column].apply(_clean_tokens)

In [12]:
metadata.head()

Unnamed: 0,id,imdb_id,title,vote_average,genres,overview,tagline,year
0,858485,tt15327088,Kantara,8.4,"[mystery, action, thriller, mystery, action, t...","[shiva, tribal, vagabond, live, mother, hamlet...",[legend],2022
1,278,tt0111161,The Shawshank Redemption,8.7,"[drama, crime, drama, crime]","[frame, 1940s, doubl, murder, wife, lover, ups...","[fear, hold, prison, hope, set, free]",1994
2,83651,tt0252487,The Chaos Class,7.96,"[comedy, drama, comedy, drama]","[lazi, uneduc, student, most, come, money, sha...",[],1975
3,238,tt0068646,The Godfather,8.716,"[drama, crime, drama, crime]","[span, year, 1945, 1955, chronicl, fiction, it...","[offer, cant, refus]",1972
4,155,tt0468569,The Dark Knight,8.502,"[drama, action, crime, thriller, drama, action...","[batman, rais, stake, war, crime, help, lt, ji...","[welcom, world, without, rule]",2008


In [13]:
metadata.dtypes

id                int64
imdb_id          object
title            object
vote_average    float64
genres           object
overview         object
tagline          object
year              int64
dtype: object

## 2. Credits

In [14]:
credits = pd.read_csv('data/movie_credits.csv')
credits.head()

Unnamed: 0,id,cast,crew
0,858485,"[{'adult': False, 'gender': 2, 'id': 1752058, ...","[{'adult': False, 'gender': 0, 'id': 1189437, ..."
1,278,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno..."
2,83651,"[{'adult': False, 'gender': 2, 'id': 556048, '...","[{'adult': False, 'gender': 2, 'id': 109083, '..."
3,238,"[{'adult': False, 'gender': 2, 'id': 3084, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno..."
4,155,"[{'adult': False, 'gender': 2, 'id': 3894, 'kn...","[{'adult': False, 'gender': 1, 'id': 1113, 'kn..."


In [15]:
def _first_three_actors(raw_cast):
    """Return the first three cast names as lower-cased, space-free tokens, repeated twice.

    Removing spaces turns each full name into a single token; the repetition
    gives actors extra weight in the soup.
    """
    leading_cast = literal_eval(raw_cast)[:3]
    name_tokens = [member['name'].lower().replace(' ', '') for member in leading_cast]
    return name_tokens * 2


credits['actors'] = credits['cast'].apply(_first_three_actors)

In [16]:
def _director_tokens(raw_crew):
    """Return the names of crew members whose job is 'Director' as space-free tokens, repeated twice."""
    name_tokens = []
    for member in literal_eval(raw_crew):
        if member['job'] == 'Director':
            name_tokens.append(member['name'].lower().replace(' ', ''))
    return name_tokens * 2


credits['director'] = credits['crew'].apply(_director_tokens)

In [17]:
# The raw cast/crew strings have been distilled into `actors` and `director`.
credits = credits.drop(columns=['cast', 'crew'])
credits.head()

Unnamed: 0,id,actors,director
0,858485,"[rishabshetty, sapthamigowda, kishore, rishabs...","[rishabshetty, rishabshetty]"
1,278,"[timrobbins, morganfreeman, bobgunton, timrobb...","[frankdarabont, frankdarabont]"
2,83651,"[münirözkul, adilenaşit, kemalsunal, münirözku...","[ertemeğilmez, ertemeğilmez]"
3,238,"[marlonbrando, alpacino, jamescaan, marlonbran...","[francisfordcoppola, francisfordcoppola]"
4,155,"[christianbale, heathledger, michaelcaine, chr...","[christophernolan, christophernolan]"


In [18]:
credits.dtypes

id           int64
actors      object
director    object
dtype: object

## 3. Keywords

In [19]:
keywords = pd.read_csv('data/movie_keywords.csv')
keywords.head()

Unnamed: 0,id,keywords
0,858485,[]
1,278,"[{'id': 378, 'name': 'prison'}, {'id': 417, 'n..."
2,83651,[]
3,238,"[{'id': 131, 'name': 'italy'}, {'id': 697, 'na..."
4,155,"[{'id': 853, 'name': 'crime fighter'}, {'id': ..."


In [20]:
# Turn each stringified list of keyword dicts into a real Python list.
parsed_keywords = keywords['keywords'].map(literal_eval)
keywords['keywords'] = parsed_keywords

In [21]:
# Flatten every movie's keyword dicts into one list of names, then count occurrences.
keyword_names = []
for keyword_dicts in keywords['keywords']:
    keyword_names.extend(entry['name'] for entry in keyword_dicts)
all_keywords = pd.Series(keyword_names).value_counts()

In [22]:
# Keep only keywords that occur more than once across the data set.
appears_more_than_once = all_keywords.gt(1)
all_keywords = all_keywords.loc[appears_more_than_once]

In [23]:
all_keywords

based on novel or book    633
duringcreditsstinger      344
murder                    332
new york city             253
based on true story       221
                         ... 
payback                     2
erased memory               2
swimsuit                    2
contraception               2
dowager                     2
Length: 5707, dtype: int64

In [24]:
# Names that survived the frequency filter.
frequent_keywords = set(all_keywords.index)


def _frequent_names(keyword_dicts):
    """Return the names from a list of keyword dicts that are in the filtered keyword set."""
    return [entry['name'] for entry in keyword_dicts if entry['name'] in frequent_keywords]


keywords['keywords'] = keywords['keywords'].apply(_frequent_names)

In [25]:
def _keyword_token(keyword):
    """Stem a keyword phrase and remove its spaces so it becomes a single token."""
    return stemmer.stem(keyword).replace(' ', '')


def _stemmed_keywords(names):
    """Apply _keyword_token to each name that is not itself a stop word."""
    return [_keyword_token(name) for name in names if name not in stop_words]


keywords['keywords'] = keywords['keywords'].apply(_stemmed_keywords)

In [26]:
keywords

Unnamed: 0,id,keywords
0,858485,[]
1,278,"[prison, corrupt, policebrut, basedonnovelorbo..."
2,83651,[]
3,238,"[itali, lossoflovedon, loveatfirstsight, based..."
4,155,"[crimefight, secretident, antihero, scarecrow,..."
...,...,...
4783,876671,"[hungari, basedontruestori, histor, politicall..."
4784,415861,"[journalist, conspiracytheori, crash]"
4785,556270,[sequel]
4786,470121,[]


## 4. Merging dataframes on id

In [27]:
# Outer-join metadata, credits and keywords on `id`. The key is named explicitly
# so the join cannot silently start matching on additional columns if two of the
# frames ever come to share another column name. Rows whose id is missing from
# one of the frames are kept, with NaN in the columns that frame would supply.
dfs = [metadata, credits, keywords]
outer_merge = partial(pd.merge, how='outer', on='id')
combined = reduce(outer_merge, dfs)

In [28]:
combined.dtypes

id                int64
imdb_id          object
title            object
vote_average    float64
genres           object
overview         object
tagline          object
year            float64
actors           object
director         object
keywords         object
dtype: object

In [29]:
# Keep only rows that have at least five non-null values.
combined = combined.dropna(thresh=5)

In [30]:
#combined['vote_average'] = combined['vote_average'].apply(lambda x: [str(round(x))])

In [31]:
# The outer merge turned `year` into float64 (see the dtypes above); cast it back to an integer.
combined = combined.astype({'year': int})

In [32]:
combined.dtypes

id                int64
imdb_id          object
title            object
vote_average    float64
genres           object
overview         object
tagline          object
year              int32
actors           object
director         object
keywords         object
dtype: object

In [33]:
combined.head()

Unnamed: 0,id,imdb_id,title,vote_average,genres,overview,tagline,year,actors,director,keywords
0,858485,tt15327088,Kantara,8.4,"[mystery, action, thriller, mystery, action, t...","[shiva, tribal, vagabond, live, mother, hamlet...",[legend],2022,"[rishabshetty, sapthamigowda, kishore, rishabs...","[rishabshetty, rishabshetty]",[]
1,278,tt0111161,The Shawshank Redemption,8.7,"[drama, crime, drama, crime]","[frame, 1940s, doubl, murder, wife, lover, ups...","[fear, hold, prison, hope, set, free]",1994,"[timrobbins, morganfreeman, bobgunton, timrobb...","[frankdarabont, frankdarabont]","[prison, corrupt, policebrut, basedonnovelorbo..."
2,83651,tt0252487,The Chaos Class,7.96,"[comedy, drama, comedy, drama]","[lazi, uneduc, student, most, come, money, sha...",[],1975,"[münirözkul, adilenaşit, kemalsunal, münirözku...","[ertemeğilmez, ertemeğilmez]",[]
3,238,tt0068646,The Godfather,8.716,"[drama, crime, drama, crime]","[span, year, 1945, 1955, chronicl, fiction, it...","[offer, cant, refus]",1972,"[marlonbrando, alpacino, jamescaan, marlonbran...","[francisfordcoppola, francisfordcoppola]","[itali, lossoflovedon, loveatfirstsight, based..."
4,155,tt0468569,The Dark Knight,8.502,"[drama, action, crime, thriller, drama, action...","[batman, rais, stake, war, crime, help, lt, ji...","[welcom, world, without, rule]",2008,"[christianbale, heathledger, michaelcaine, chr...","[christophernolan, christophernolan]","[crimefight, secretident, antihero, scarecrow,..."


In [34]:
# The rounded rating is included twice as a token; every other feature is already a token list.
rating_tokens = combined['vote_average'].apply(lambda score: [str(round(score))] * 2)
token_columns = ['genres', 'overview', 'tagline', 'keywords', 'actors', 'director']

# Concatenate the per-row lists left to right into one "soup" of tokens.
soup_parts = [rating_tokens] + [combined[column] for column in token_columns]
combined['soup'] = reduce(lambda joined, part: joined + part, soup_parts)

In [35]:
# Join each row's token list into one space-separated document.
# Rows whose soup is not a list (NaN) get an empty document rather than being
# left as NaN: the vectoriser cell casts this column to str, which would turn
# NaN into the literal token "nan" and make those rows look similar to each other.
combined['soup'] = combined['soup'].apply(
    lambda tokens: ' '.join(tokens) if isinstance(tokens, list) else ''
)

In [36]:
print(combined['soup'][1])

9 9 drama crime drama crime frame 1940s doubl murder wife lover upstand banker andi dufresn begin new life shawshank prison put account skill work amor warden long stretch prison dufresn come admir inmat  includ older prison name red  integr unquench sens hope fear hold prison hope set free prison corrupt policebrut basedonnovelorbook prisoncel delinqu prisonescap wrongfulimprison framedformurd 1940s timrobbins morganfreeman bobgunton timrobbins morganfreeman bobgunton frankdarabont frankdarabont


In [37]:
# Unigram + bigram TF-IDF over the soup documents.
# min_df is documented as an int >= 1 (or a float in [0.0, 1.0]); min_df=0 is
# outside that range and is rejected by parameter validation in recent
# scikit-learn releases. min_df=1 selects the same vocabulary, since every term
# that appears at all has a document frequency of at least 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(combined['soup'].values.astype(str))

In [38]:
tfidf_matrix.shape

(4790, 206702)

In [39]:
# Pairwise cosine similarity between every pair of soup vectors
# (with a single argument the matrix is compared against itself).
cosine_sim = cosine_similarity(tfidf_matrix)

In [40]:
# Renumber rows 0..n-1 so positions line up with the rows of cosine_sim.
# The previous index is kept as an `index` column.
combined = combined.reset_index()

# Columns shown to the user for each recommendation.
titles = combined.loc[:, ['title', 'year', 'vote_average']]

# Title -> row position lookup.
indices = pd.Series(data=combined.index, index=combined['title'])

In [41]:
def get_recommendations(title, top_n=30):
    """Return the movies most similar to `title`, best match first.

    Uses the module-level `indices` (title -> row position), `cosine_sim`
    (pairwise similarity matrix) and `titles` (display columns).

    Args:
        title: exact movie title to look up in `indices`.
        top_n: maximum number of recommendations to return (default 30).

    Returns:
        Rows of `titles` for the most similar movies, ordered by descending
        similarity. The queried movie itself is never included.

    Raises:
        KeyError: if `title` is not present in `indices`.
    """
    match = indices[title]
    # Several movies can share a title, in which case the lookup returns a
    # Series of positions; use the first one instead of failing in the sort below.
    if isinstance(match, pd.Series):
        match = match.iloc[0]
    movie_idx = int(match)

    # Exclude the queried movie by position. Dropping "the first sorted entry"
    # is wrong when another movie ties with it at the top score.
    scored = [
        (position, score)
        for position, score in enumerate(cosine_sim[movie_idx])
        if position != movie_idx
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in scored[:top_n]]
    return titles.iloc[movie_indices]

In [42]:
get_recommendations('Mean Girls').head(15)

Unnamed: 0,title,year,vote_average
3461,Freaky Friday,2003,6.412
3496,Doc Hollywood,1991,6.328
4392,The Hot Chick,2002,6.339
4102,Ghosts of Girlfriends Past,2009,5.9
2629,Just Like Heaven,2005,6.912
3884,Mr. Popper's Penguins,2011,5.917
3054,The Spiderwick Chronicles,2008,6.598
2899,The Parent Trap,1998,7.142
4635,Herbie Fully Loaded,2005,5.664
4457,Just My Luck,2006,6.094


In [43]:
from flask import Flask, request, jsonify
from flask_cors import CORS

In [44]:
# Persist the movie table and the similarity matrix (wrapped in a DataFrame) as pickles.
similarity_frame = pd.DataFrame(cosine_sim)
for frame, destination in (
    (combined, 'data/movies.pkl'),
    (similarity_frame, 'data/similarity.pkl'),
):
    frame.to_pickle(destination)

In [49]:
# Read the pickled movie table back in. Note that `sim` holds the movies frame
# written to movies.pkl, not the similarity matrix.
movies_pickle_path = 'data/movies.pkl'
sim = pd.read_pickle(movies_pickle_path)

In [50]:
sim.head()

Unnamed: 0,index,id,imdb_id,title,vote_average,genres,overview,tagline,year,actors,director,keywords,soup
0,0,858485,tt15327088,Kantara,8.4,"[mystery, action, thriller, mystery, action, t...","[shiva, tribal, vagabond, live, mother, hamlet...",[legend],2022,"[rishabshetty, sapthamigowda, kishore, rishabs...","[rishabshetty, rishabshetty]",[],8 8 mystery action thriller mystery action thr...
1,1,278,tt0111161,The Shawshank Redemption,8.7,"[drama, crime, drama, crime]","[frame, 1940s, doubl, murder, wife, lover, ups...","[fear, hold, prison, hope, set, free]",1994,"[timrobbins, morganfreeman, bobgunton, timrobb...","[frankdarabont, frankdarabont]","[prison, corrupt, policebrut, basedonnovelorbo...",9 9 drama crime drama crime frame 1940s doubl ...
2,2,83651,tt0252487,The Chaos Class,7.96,"[comedy, drama, comedy, drama]","[lazi, uneduc, student, most, come, money, sha...",[],1975,"[münirözkul, adilenaşit, kemalsunal, münirözku...","[ertemeğilmez, ertemeğilmez]",[],8 8 comedy drama comedy drama lazi uneduc stud...
3,3,238,tt0068646,The Godfather,8.716,"[drama, crime, drama, crime]","[span, year, 1945, 1955, chronicl, fiction, it...","[offer, cant, refus]",1972,"[marlonbrando, alpacino, jamescaan, marlonbran...","[francisfordcoppola, francisfordcoppola]","[itali, lossoflovedon, loveatfirstsight, based...",9 9 drama crime drama crime span year 1945 195...
4,4,155,tt0468569,The Dark Knight,8.502,"[drama, action, crime, thriller, drama, action...","[batman, rais, stake, war, crime, help, lt, ji...","[welcom, world, without, rule]",2008,"[christianbale, heathledger, michaelcaine, chr...","[christophernolan, christophernolan]","[crimefight, secretident, antihero, scarecrow,...",9 9 drama action crime thriller drama action c...


In [5]:
# Fetch one movie's details from the TMDB API and build the URL of its poster.
# The key is taken from `api_key` (read from config_file in the first cell)
# instead of being embedded in the URL.
# NOTE(review): the key that used to be hard-coded here has been exposed in the
# notebook and should be rotated.
movie_id = 155
url = f"https://api.themoviedb.org/3/movie/{movie_id}"
response = requests.get(
    url,
    params={"api_key": api_key, "language": "en-US"},
    timeout=10,  # do not hang the kernel if the API is unreachable
)
response.raise_for_status()  # surface HTTP errors instead of parsing an error payload
data = response.json()

# poster_path already starts with '/', so the base URL must not end with one.
# A missing or null poster_path yields full_path = None instead of a TypeError.
poster_path = data.get('poster_path')
full_path = ("https://image.tmdb.org/t/p/w500" + poster_path) if poster_path else None

In [6]:
data

{'adult': False,
 'backdrop_path': '/cfT29Im5VDvjE0RpyKOSdCKZal7.jpg',
 'belongs_to_collection': {'id': 263,
  'name': 'The Dark Knight Collection',
  'poster_path': '/hGvOMQBD88jAV0olS2DT1AxreHn.jpg',
  'backdrop_path': '/xfKot7lqaiW4XpL5TtDlVBA9ei9.jpg'},
 'budget': 185000000,
 'genres': [{'id': 18, 'name': 'Drama'},
  {'id': 28, 'name': 'Action'},
  {'id': 80, 'name': 'Crime'},
  {'id': 53, 'name': 'Thriller'}],
 'homepage': 'https://www.warnerbros.com/movies/dark-knight/',
 'id': 155,
 'imdb_id': 'tt0468569',
 'original_language': 'en',
 'original_title': 'The Dark Knight',
 'overview': 'Batman raises the stakes in his war on crime. With the help of Lt. Jim Gordon and District Attorney Harvey Dent, Batman sets out to dismantle the remaining criminal organizations that plague the streets. The partnership proves to be effective, but they soon find themselves prey to a reign of chaos unleashed by a rising criminal mastermind known to the terrified citizens of Gotham as the Joker.',
 '

In [48]:
print(pd.__version__)

1.3.5
