In [16]:
import sys
import os
import pandas as pd
import numpy as np
import requests
from tmdbv3api import Movie
from tqdm import tqdm
import json
import re

In [17]:
# Load the movie metadata table from the local CSV file.
df = pd.read_csv('movie_data.csv')

In [18]:
# (rows, columns) of the freshly loaded frame.
df.shape

(200, 23)

In [19]:
# Preview the first rows; several columns (genres, spoken_languages,
# keywords, ...) hold JSON-encoded text rather than plain values.
df.head()

Unnamed: 0,budget,genres,homepage,id,imdb_id,origin_country,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords
0,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10751, ""...",,3924,tt0029927,['US'],en,Blondie,Blondie and Dagwood are about to celebrate the...,3.072,...,1938-11-30,0,70,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,The favorite comic strip of millions at last o...,Blondie,7.1,8,"{""keywords"": [{""id"": 190801, ""name"": ""blondie""}]}"
1,0,"[{""id"": 12, ""name"": ""Adventure""}]",,6124,,['DE'],de,Der Mann ohne Namen,,1.625,...,1921-01-01,0,420,[],Released,,"Peter Voss, Thief of Millions",0.0,0,"{""keywords"": []}"
2,0,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""n...",,8773,tt0055747,"['FR', 'IT', 'JP', 'PL']",fr,L'Amour à vingt ans,Love at Twenty unites five directors from five...,3.545,...,1962-06-22,0,120,"[{""english_name"": ""French"", ""iso_639_1"": ""fr"",...",Released,The Intimate Secrets of Young Lovers,Love at Twenty,6.7,48,"{""keywords"": []}"
3,0,[],http://www.nwdfilms.com,25449,,['US'],en,New World Disorder 9: Never Enough,Gee Atherton ripping the Worlds course the day...,3.503,...,2008-12-08,0,69,"[{""english_name"": ""English"", ""iso_639_1"": ""en""...",Released,,New World Disorder 9: Never Enough,6.0,3,"{""keywords"": [{""id"": 6075, ""name"": ""sports""}, ..."
4,0,"[{""id"": 10751, ""name"": ""Family""}]",,31975,tt1656746,['US'],en,Sesame Street: Elmo Loves You!,"Elmo is making a very, very super special surp...",0.002,...,2010-01-05,0,46,[],Released,,Sesame Street: Elmo Loves You!,0.0,0,"{""keywords"": []}"


In [20]:
def process_dicts(entry):
    """Flatten a JSON-encoded list of ``{"name": ...}`` records into one string.

    Parameters
    ----------
    entry : str, bytes, list, tuple, None or float
        Usually the raw JSON text of a list of objects, e.g.
        ``'[{"id": 35, "name": "Comedy"}]'``. An already-decoded list/tuple of
        dicts is accepted as-is. Anything else (``None``, float ``NaN`` as
        produced by pandas for an empty CSV field, ...) is treated as missing.

    Returns
    -------
    str
        The ``name`` values joined by single spaces, in their original order.
        An empty string for an empty list, a blank string, or a missing value.

    Raises
    ------
    json.JSONDecodeError
        If ``entry`` is non-blank text that is not valid JSON.
    KeyError
        If a decoded record has no ``"name"`` key.
    """
    if isinstance(entry, (str, bytes, bytearray)):
        # A blank field carries no records; json.loads would reject it.
        if not entry.strip():
            return ""
        entry = json.loads(entry)
    elif not isinstance(entry, (list, tuple)):
        # Non-text scalars (None, NaN) cannot be decoded; report "no names"
        # so that a later missing-value pass can pick up the empty string.
        return ""
    return " ".join(record['name'] for record in entry)

In [21]:
# List the available column names.
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'imdb_id', 'origin_country',
       'original_language', 'original_title', 'overview', 'popularity',
       'poster_path', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'title', 'vote_average', 'vote_count', 'keywords'],
      dtype='object')

In [22]:
# Keyword payloads are JSON objects shaped like {"keywords": [{"name": ...}, ...]}:
# decode, unwrap the inner list and join the names in one pass over the column.
# NOTE: this cell is not re-runnable on the same frame -- once the columns hold
# plain text, json.loads will reject them.
df['keywords'] = df['keywords'].apply(
    lambda raw: " ".join(kw['name'] for kw in json.loads(raw)['keywords'])
)

# The remaining JSON columns are bare lists of records, which is exactly
# what process_dicts expects.
for json_col in ('genres', 'production_countries',
                 'production_companies', 'spoken_languages'):
    df[json_col] = df[json_col].apply(process_dicts)

In [23]:
# Re-inspect the first rows after flattening the JSON columns to plain text.
df.head()

Unnamed: 0,budget,genres,homepage,id,imdb_id,origin_country,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords
0,0,Comedy Family,,3924,tt0029927,['US'],en,Blondie,Blondie and Dagwood are about to celebrate the...,3.072,...,1938-11-30,0,70,English,Released,The favorite comic strip of millions at last o...,Blondie,7.1,8,blondie
1,0,Adventure,,6124,,['DE'],de,Der Mann ohne Namen,,1.625,...,1921-01-01,0,420,,Released,,"Peter Voss, Thief of Millions",0.0,0,
2,0,Drama Romance,,8773,tt0055747,"['FR', 'IT', 'JP', 'PL']",fr,L'Amour à vingt ans,Love at Twenty unites five directors from five...,3.545,...,1962-06-22,0,120,Français Deutsch Italiano 日本語 Polski,Released,The Intimate Secrets of Young Lovers,Love at Twenty,6.7,48,
3,0,,http://www.nwdfilms.com,25449,,['US'],en,New World Disorder 9: Never Enough,Gee Atherton ripping the Worlds course the day...,3.503,...,2008-12-08,0,69,English,Released,,New World Disorder 9: Never Enough,6.0,3,sports mountain biking
4,0,Family,,31975,tt1656746,['US'],en,Sesame Street: Elmo Loves You!,"Elmo is making a very, very super special surp...",0.002,...,2010-01-05,0,46,,Released,,Sesame Street: Elmo Loves You!,0.0,0,


In [24]:
# Placeholder strings that should be treated as missing values. The empty
# string matters most here: flattening an empty JSON list yields ''.
missing = {'', 'NaN', 'nan'}
for col in df.columns:
    # Use the canonical np.nan spelling: the np.NAN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    df[col] = df[col].apply(lambda x: np.nan if x in missing else x)

In [26]:
# Count missing values per column now that placeholders have become NaN.
df.isna().sum()

budget                    0
genres                    1
homepage                121
id                        0
imdb_id                   3
origin_country            0
original_language         0
original_title            0
overview                  2
popularity                0
poster_path               2
production_companies      6
production_countries      2
release_date              0
revenue                   0
runtime                   0
spoken_languages          2
status                    0
tagline                  38
title                     0
vote_average              0
vote_count                0
keywords                  3
dtype: int64

In [27]:
# Preview again after placeholder strings were replaced by NaN.
df.head()

Unnamed: 0,budget,genres,homepage,id,imdb_id,origin_country,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords
0,0,Comedy Family,,3924,tt0029927,['US'],en,Blondie,Blondie and Dagwood are about to celebrate the...,3.072,...,1938-11-30,0,70,English,Released,The favorite comic strip of millions at last o...,Blondie,7.1,8,blondie
1,0,Adventure,,6124,,['DE'],de,Der Mann ohne Namen,,1.625,...,1921-01-01,0,420,,Released,,"Peter Voss, Thief of Millions",0.0,0,
2,0,Drama Romance,,8773,tt0055747,"['FR', 'IT', 'JP', 'PL']",fr,L'Amour à vingt ans,Love at Twenty unites five directors from five...,3.545,...,1962-06-22,0,120,Français Deutsch Italiano 日本語 Polski,Released,The Intimate Secrets of Young Lovers,Love at Twenty,6.7,48,
3,0,,http://www.nwdfilms.com,25449,,['US'],en,New World Disorder 9: Never Enough,Gee Atherton ripping the Worlds course the day...,3.503,...,2008-12-08,0,69,English,Released,,New World Disorder 9: Never Enough,6.0,3,sports mountain biking
4,0,Family,,31975,tt1656746,['US'],en,Sesame Street: Elmo Loves You!,"Elmo is making a very, very super special surp...",0.002,...,2010-01-05,0,46,,Released,,Sesame Street: Elmo Loves You!,0.0,0,


In [28]:
# Column names before any columns are dropped.
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'imdb_id', 'origin_country',
       'original_language', 'original_title', 'overview', 'popularity',
       'poster_path', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'title', 'vote_average', 'vote_count', 'keywords'],
      dtype='object')

In [34]:
# Revenue values for the rows that actually report one (0 looks like a
# "not recorded" marker -- TODO confirm against the data source).
df.loc[df['revenue'] != 0, 'revenue']

7        4257354
8       12136938
11     775398007
12     940335536
13     677387716
         ...    
195     15119639
196      2500000
197    245066411
198     25000000
199    102600000
Name: revenue, Length: 152, dtype: int64

In [37]:
# Discard two columns that are not used further: homepage is missing for most
# rows, and budget is 0 in every previewed row (presumably unrecorded -- verify).
df = df.drop(['budget', 'homepage'], axis='columns')

In [39]:
# Preview after dropping the budget and homepage columns.
df.head()

Unnamed: 0,genres,id,imdb_id,origin_country,original_language,original_title,overview,popularity,poster_path,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords
0,Comedy Family,3924,tt0029927,['US'],en,Blondie,Blondie and Dagwood are about to celebrate the...,3.072,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,Columbia Pictures,...,1938-11-30,0,70,English,Released,The favorite comic strip of millions at last o...,Blondie,7.1,8,blondie
1,Adventure,6124,,['DE'],de,Der Mann ohne Namen,,1.625,/6xUbUCvndklbGVYiljHr34NTxSl.jpg,,...,1921-01-01,0,420,,Released,,"Peter Voss, Thief of Millions",0.0,0,
2,Drama Romance,8773,tt0055747,"['FR', 'IT', 'JP', 'PL']",fr,L'Amour à vingt ans,Love at Twenty unites five directors from five...,3.545,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,Ulysse Productions Unitec Films Cinesecolo TOH...,...,1962-06-22,0,120,Français Deutsch Italiano 日本語 Polski,Released,The Intimate Secrets of Young Lovers,Love at Twenty,6.7,48,
3,,25449,,['US'],en,New World Disorder 9: Never Enough,Gee Atherton ripping the Worlds course the day...,3.503,/itQjoIO2e3padeLliHROJVAb7Av.jpg,,...,2008-12-08,0,69,English,Released,,New World Disorder 9: Never Enough,6.0,3,sports mountain biking
4,Family,31975,tt1656746,['US'],en,Sesame Street: Elmo Loves You!,"Elmo is making a very, very super special surp...",0.002,/qKWcCmvGr4g0dgXvhqAc4BAMCtk.jpg,,...,2010-01-05,0,46,,Released,,Sesame Street: Elmo Loves You!,0.0,0,


In [40]:
# Summarise dtypes and non-null counts per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   genres                199 non-null    object 
 1   id                    200 non-null    int64  
 2   imdb_id               197 non-null    object 
 3   origin_country        200 non-null    object 
 4   original_language     200 non-null    object 
 5   original_title        200 non-null    object 
 6   overview              198 non-null    object 
 7   popularity            200 non-null    float64
 8   poster_path           198 non-null    object 
 9   production_companies  194 non-null    object 
 10  production_countries  198 non-null    object 
 11  release_date          200 non-null    object 
 12  revenue               200 non-null    int64  
 13  runtime               200 non-null    int64  
 14  spoken_languages      198 non-null    object 
 15  status                2

In [41]:
# Parse the release_date strings into datetime64 values so the .dt accessor
# can be used on the column afterwards.
df = df.assign(release_date=pd.to_datetime(df['release_date']))

In [42]:
# Preview after converting release_date to datetime.
df.head()

Unnamed: 0,genres,id,imdb_id,origin_country,original_language,original_title,overview,popularity,poster_path,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords
0,Comedy Family,3924,tt0029927,['US'],en,Blondie,Blondie and Dagwood are about to celebrate the...,3.072,/zBiHKhXklvTFwj4M1uEUcQGAVJ.jpg,Columbia Pictures,...,1938-11-30,0,70,English,Released,The favorite comic strip of millions at last o...,Blondie,7.1,8,blondie
1,Adventure,6124,,['DE'],de,Der Mann ohne Namen,,1.625,/6xUbUCvndklbGVYiljHr34NTxSl.jpg,,...,1921-01-01,0,420,,Released,,"Peter Voss, Thief of Millions",0.0,0,
2,Drama Romance,8773,tt0055747,"['FR', 'IT', 'JP', 'PL']",fr,L'Amour à vingt ans,Love at Twenty unites five directors from five...,3.545,/aup2QCYCsyEeQfpboXy0f4uj8aE.jpg,Ulysse Productions Unitec Films Cinesecolo TOH...,...,1962-06-22,0,120,Français Deutsch Italiano 日本語 Polski,Released,The Intimate Secrets of Young Lovers,Love at Twenty,6.7,48,
3,,25449,,['US'],en,New World Disorder 9: Never Enough,Gee Atherton ripping the Worlds course the day...,3.503,/itQjoIO2e3padeLliHROJVAb7Av.jpg,,...,2008-12-08,0,69,English,Released,,New World Disorder 9: Never Enough,6.0,3,sports mountain biking
4,Family,31975,tt1656746,['US'],en,Sesame Street: Elmo Loves You!,"Elmo is making a very, very super special surp...",0.002,/qKWcCmvGr4g0dgXvhqAc4BAMCtk.jpg,,...,2010-01-05,0,46,,Released,,Sesame Street: Elmo Loves You!,0.0,0,


In [43]:
# Derive the release year as a new trailing column. The nullable 'Int64'
# dtype keeps the values integral even if a date were missing.
df = df.assign(year=df['release_date'].dt.year.astype('Int64'))

In [45]:
# Re-check missing values per column before dropping incomplete rows.
df.isna().sum()

genres                   1
id                       0
imdb_id                  3
origin_country           0
original_language        0
original_title           0
overview                 2
popularity               0
poster_path              2
production_companies     6
production_countries     2
release_date             0
revenue                  0
runtime                  0
spoken_languages         2
status                   0
tagline                 38
title                    0
vote_average             0
vote_count               0
keywords                 3
year                     0
dtype: int64

In [46]:
# Keep only rows with no missing value in ANY column. This is deliberately
# strict: a row lacking just a tagline or keywords is discarded as well.
df = df.dropna(axis=0, how='any')

In [47]:
# Keep only movies with more than 50 votes (strictly greater than).
df = df.loc[df['vote_count'] > 50]

In [48]:
# Display the filtered frame: complete rows with vote_count > 50.
# The original row labels are kept, so the index is no longer contiguous.
df

Unnamed: 0,genres,id,imdb_id,origin_country,original_language,original_title,overview,popularity,poster_path,production_companies,...,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,keywords,year
7,Comedy,5,tt0113101,['US'],en,Four Rooms,It's Ted the Bellhop's first night on the job....,13.956,/75aHn1NOYXh4M7L5shoeQ6NGykP.jpg,Miramax A Band Apart,...,4257354,98,English,Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,5.853,2638,hotel new year's eve witch bet sperm hotel roo...,1995
8,Action Crime Thriller,6,tt0107286,['US'],en,Judgment Night,"Four young friends, while taking a shortcut en...",12.859,/3rvvpS9YPM5HB2f4HYiNiJVtdam.jpg,Largo Entertainment JVC Universal Pictures,...,12136938,109,English,Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,6.477,331,"drug dealer chicago, illinois escape one night...",1993
11,Adventure Action Science Fiction,11,tt0076759,['US'],en,Star Wars,Princess Leia is captured and held hostage by ...,97.319,/6FfCtAuVAW8XJjZ7eWeLibRLWTw.jpg,Lucasfilm Ltd. 20th Century Fox,...,775398007,121,English,Released,"A long time ago in a galaxy far, far away...",Star Wars,8.204,20596,empire galaxy rebellion android hermit smuggli...,1977
12,Animation Family,12,tt0266543,['US'],en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",99.507,/eHuGQ10FUzK1mdOY69wF5pGgEf5.jpg,Pixar,...,940335536,100,English,Released,There are 3.7 trillion fish in the ocean. They...,Finding Nemo,7.820,19223,"sydney, australia parent child relationship an...",2003
13,Comedy Drama Romance,13,tt0109830,['US'],en,Forrest Gump,A man with a low IQ has accomplished great thi...,91.712,/arw2vcBveWOVZr6pxd9XTd1TdQa.jpg,Paramount Pictures The Steve Tisch Company Wen...,...,677387716,142,English,Released,The world will never be the same once you've s...,Forrest Gump,8.470,27465,vietnam war vietnam veteran mentally disabled ...,1994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Comedy Drama Romance,236,tt0110598,['AU'],en,Muriel's Wedding,A young social outcast in Australia steals mon...,7.524,/zJyTr8Fo412a2OIfJGXTRAm4IwX.jpg,CiBy 2000 Film Victoria House & Moorhouse Film...,...,15119639,106,English 日本語,Released,Success is the best revenge.,Muriel's Wedding,6.800,445,daughter individual friendship dream love of o...,1994
196,Drama Thriller Crime Romance,237,tt0289635,"['US', 'GB']",en,Young Adam,A young drifter working on a river barge disru...,15.417,/rU5GJ5r4tQbSx9C2ENKbPkD6nr9.jpg,Recorded Picture Company,...,2500000,93,English,Released,Everyone has a past. Everyone has a secret.,Young Adam,5.800,167,dying and death adultery individual court case...,2003
197,Drama Crime,238,tt0068646,['US'],en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",179.773,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,Paramount Pictures Alfran Productions,...,245066411,175,English Italiano Latin,Released,An offer you can't refuse.,The Godfather,8.700,20661,based on novel or book loss of loved one love ...,1972
198,Comedy Romance Crime,239,tt0053291,['US'],en,Some Like It Hot,Two musicians witness a mob hit and struggle t...,25.033,/hVIKyTK13AvOGv7ICmJjK44DTzp.jpg,The Mirisch Company,...,25000000,122,English,Released,The movie too HOT for words!,Some Like It Hot,8.100,3451,"chicago, illinois florida transvestism musicia...",1959


In [50]:
# Load the cast/crew credits table from its own CSV file.
cre = pd.read_csv('movie_credits.csv')

In [51]:
# Preview the credits; cast and crew hold JSON-encoded lists of records.
cre.head()

Unnamed: 0,movie_id,movie_title,cast,crew
0,3924,Blondie,"[{""adult"": false, ""gender"": 1, ""id"": 34178, ""k...","[{""adult"": false, ""gender"": 0, ""id"": 34170, ""k..."
1,6124,"Peter Voss, Thief of Millions","[{""adult"": false, ""gender"": 2, ""id"": 48038, ""k...","[{""adult"": false, ""gender"": 2, ""id"": 48035, ""k..."
2,8773,Love at Twenty,"[{""adult"": false, ""gender"": 2, ""id"": 1653, ""kn...","[{""adult"": false, ""gender"": 2, ""id"": 1003232, ..."
3,25449,New World Disorder 9: Never Enough,"[{""adult"": false, ""gender"": 2, ""id"": 84130, ""k...","[{""adult"": false, ""gender"": 0, ""id"": 112786, ""..."
4,31975,Sesame Street: Elmo Loves You!,[],[]
