In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from time import sleep
from random import randint
from datetime import datetime
from pathlib import Path
from imdb import Cinemagoer

In [2]:
# loading the data from The Numbers
the_numbers_df = pd.read_csv('The_Numbers_data_cleaned.csv')

In [3]:
the_numbers_df

Unnamed: 0,index,release_date,movie,budget,domestic_gross,worldwide_gross
0,1,4/23/2019,Avengers: Endgame,400000000,858373000,2797800564
1,2,5/20/2011,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802
2,3,4/22/2015,Avengers: Age of Ultron,365000000,459005868,1395316979
3,4,12/16/2015,Star Wars Episode VII: The Force Awakens,306000000,936662225,2064615817
4,5,4/25/2018,Avengers: Infinity War,300000000,678815482,2048359754
...,...,...,...,...,...,...
6222,6223,Unknown,Red 11,7000,0,0
6223,6224,4/2/1999,Following,6000,48482,240495
6224,6225,7/13/2005,Return to the Land of Wonders,5000,1338,1338
6225,6226,9/29/2015,A Plague So Pleasant,1400,0,0


In [4]:
the_numbers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6227 entries, 0 to 6226
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   index            6227 non-null   int64 
 1   release_date     6227 non-null   object
 2   movie            6227 non-null   object
 3   budget           6227 non-null   object
 4   domestic_gross   6227 non-null   object
 5   worldwide_gross  6227 non-null   object
dtypes: int64(1), object(5)
memory usage: 292.0+ KB


In [5]:
# changing release_date to datetime
the_numbers_df['release_date'] = pd.to_datetime(the_numbers_df['release_date'], errors ='coerce')

In [6]:
# converting columns to integer
the_numbers_df['budget'] = the_numbers_df['budget'].str.replace(',','').astype('int64')
the_numbers_df['domestic_gross'] = the_numbers_df['domestic_gross'].str.replace(',','').astype('int64')
the_numbers_df['worldwide_gross'] = the_numbers_df['worldwide_gross'].str.replace(',','').astype('int64')

In [7]:
the_numbers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6227 entries, 0 to 6226
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   index            6227 non-null   int64         
 1   release_date     6112 non-null   datetime64[ns]
 2   movie            6227 non-null   object        
 3   budget           6227 non-null   int64         
 4   domestic_gross   6227 non-null   int64         
 5   worldwide_gross  6227 non-null   int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 292.0+ KB


In [8]:
the_numbers_df.head()

Unnamed: 0,index,release_date,movie,budget,domestic_gross,worldwide_gross
0,1,2019-04-23,Avengers: Endgame,400000000,858373000,2797800564
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802
2,3,2015-04-22,Avengers: Age of Ultron,365000000,459005868,1395316979
3,4,2015-12-16,Star Wars Episode VII: The Force Awakens,306000000,936662225,2064615817
4,5,2018-04-25,Avengers: Infinity War,300000000,678815482,2048359754


In [9]:
# dropping nulls
the_numbers_df.dropna(inplace=True)

In [10]:
# filtering
# 1. release_date after 1999
# 2. release_date before 2020 (pre-pandemic)
# 3. worldwide_gross >= $500,000
# 4. budget >= $5,000,000 #(the_numbers_df['budget']>=5000000)
budget_df = the_numbers_df[(the_numbers_df['release_date']>='2000-01-01') & (the_numbers_df['release_date']<='2019-12-31') & (the_numbers_df['worldwide_gross']>=500000)]

In [11]:
budget_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3811 entries, 0 to 6219
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   index            3811 non-null   int64         
 1   release_date     3811 non-null   datetime64[ns]
 2   movie            3811 non-null   object        
 3   budget           3811 non-null   int64         
 4   domestic_gross   3811 non-null   int64         
 5   worldwide_gross  3811 non-null   int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 208.4+ KB


In [12]:
# resetting index
budget_df.reset_index(inplace=True)

In [13]:
# dropping index columns
budget_df.drop(['index', 'level_0'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [14]:
budget_df

Unnamed: 0,release_date,movie,budget,domestic_gross,worldwide_gross
0,2019-04-23,Avengers: Endgame,400000000,858373000,2797800564
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802
2,2015-04-22,Avengers: Age of Ultron,365000000,459005868,1395316979
3,2015-12-16,Star Wars Episode VII: The Force Awakens,306000000,936662225,2064615817
4,2018-04-25,Avengers: Infinity War,300000000,678815482,2048359754
...,...,...,...,...,...
3806,2017-07-07,A Ghost Story,100000,1594798,2769782
3807,2004-05-07,Super Size Me,65000,11529368,22233808
3808,2001-03-16,Gabriela,50000,2335352,2335352
3809,2001-03-09,Dayereh,10000,673780,683509


In [96]:
#imdb_rating = []
#main_cast_1 = []
#main_cast_2 = []
#main_cast_3 = []

In [242]:
# crawling imdb (1st attempt)
print(datetime.now())
for i in range(0,len(budget_df)):
    # searching the movie on imdb
    search_url = 'https://www.imdb.com/search/title/?title=' +str(budget_df['movie'][i])+ '&title_type=feature&release_date=' +str(budget_df['release_date'][i].year)+'-01-01,' +str(budget_df['release_date'][i].year)+ '-12-31'
    response_search = requests.get(search_url)
    soup_search = BeautifulSoup(response_search.content, 'html.parser')
    
    try:
        # obtaining the movie page
        movie_url = 'http://imdb.com'+soup_search.select('h3 a')[0].get('href')
        response = requests.get(movie_url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # adding rating to list
        try:
            rating = soup.select('span.AggregateRatingButton__RatingScore-sc-1ll29m0-1.iTLWoV')[0].get_text()
            imdb_rating.append(rating)
        except:
            imdb_rating.append(np.nan)

        # adding cast 1 to list
        try:
            cast1 = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(1) > a')[0].get_text()
            main_cast_1.append(cast1)
        except:
            main_cast_1.append(np.nan)

        # adding cast 2 to list
        try:
            cast2 = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(2) > a')[0].get_text()
            main_cast_2.append(cast2)
        except:
            main_cast_2.append(np.nan)

        # adding cast 3 to list
        try:
            cast3 = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(3) > a')[0].get_text()
            main_cast_3.append(cast3)
        except:
            main_cast_3.append(np.nan)
            
    except:
        imdb_rating.append(np.nan)
        main_cast_1.append(np.nan)
        main_cast_2.append(np.nan)
        main_cast_3.append(np.nan)
    
    # adding sleep time
    wait_time = 1
    sleep(wait_time)

print(datetime.now())

2022-03-04 09:00:52.530667
2022-03-04 09:32:20.828181


In [257]:
# creating a dataframe for crawled data
imdb_df = pd.DataFrame(columns=['title','imdb_rating','cast_1','cast_2','cast_3'])

In [258]:
# filling crawled data into the new dataframe
imdb_df['title'] = budget_df['movie']
imdb_df['imdb_rating'] = imdb_rating
imdb_df['cast_1'] = main_cast_1
imdb_df['cast_2'] = main_cast_2
imdb_df['cast_3'] = main_cast_3

In [260]:
imdb_df

Unnamed: 0,title,imdb_rating,cast_1,cast_2,cast_3
0,Avengers: Endgame,8.4,Robert Downey Jr.,Chris Evans,Mark Ruffalo
1,Pirates of the Caribbean: On Stranger Tides,6.6,Johnny Depp,Penélope Cruz,Ian McShane
2,Avengers: Age of Ultron,7.3,Robert Downey Jr.,Chris Evans,Mark Ruffalo
3,Star Wars Episode VII: The Force Awakens,7.9,Daisy Ridley,John Boyega,Oscar Isaac
4,Avengers: Infinity War,8.5,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo
...,...,...,...,...,...
3806,A Ghost Story,6.8,Casey Affleck,Rooney Mara,McColm Cephas Jr.
3807,Super Size Me,,,,
3808,Gabriela,4.8,Jaime Gomez,Seidy Lopez,Zach Galligan
3809,Dayereh,,,,


In [261]:
# saving the initial imdb_df to a csv 
filepath = Path('imdb_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
imdb_df.to_csv(filepath, index=False)

In [15]:
# loading the initial imdb_df 
imdb_df = pd.read_csv('imdb_df.csv')

In [80]:
# creating a dataframe for movies without data (1st attempt)
imdb_missing_df = imdb_df[imdb_df['imdb_rating'].isna()]

In [324]:
# creating a movie_url column with null values
movie_url = []
for i in range(0,len(imdb_missing_df)):
    movie_url.append(np.nan)
imdb_missing_df['movie_url'] = movie_url

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df['movie_url'] = movie_url


In [214]:
# extracting imdb_id for movies
ia = Cinemagoer()
imdb_id = []
for i in range(0, len(imdb_missing_df)):
    try:
        movie = ia.search_movie(imdb_missing_df['title'][i])[0]
        imdb_id.append('tt'+str(ia.get_imdbID(movie)))
    except:
        imdb_id.append(np.nan)

In [219]:
# updating movie_url based on imdb_id
for i in range(0, len(imdb_missing_df)):
    if type(imdb_missing_df['movie_url'][i]) == float:
        imdb_missing_df['movie_url'][i] = 'https://www.imdb.com/title/'+str(imdb_missing_df['imdb_id'][i])+'/'
    else:
        imdb_missing_df['movie_url'][i] = imdb_missing_df['movie_url'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df['movie_url'][i] = imdb_missing_df['movie_url'][i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df['movie_url'][i] = 'https://www.imdb.com/title/'+str(imdb_missing_df['imdb_id'][i])+'/'


In [330]:
# updating imdb_id after manually checking and correcting the movie_url
for i in range(0, len(imdb_missing_df)):
    imdb_missing_df['imdb_id'][i] = imdb_missing_df['movie_url'][i].partition('/title/')[-1].partition('/')[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df['imdb_id'][i] = imdb_missing_df['movie_url'][i].partition('/title/')[-1].partition('/')[0]


In [386]:
# crawling imdb for missing data (2nd attempt)
print(datetime.now())
for i in range(0, len(imdb_missing_df)):
    response = requests.get(imdb_missing_df['movie_url'][i])
    soup = BeautifulSoup(response.content, 'html.parser')

    # adding rating to list
    try:
        imdb_missing_df['imdb_rating'][i] = soup.select('span.AggregateRatingButton__RatingScore-sc-1ll29m0-1.iTLWoV')[0].get_text()
    except:
        imdb_missing_df['imdb_rating'][i] = np.nan

    # adding cast 1 to list
    try:
        imdb_missing_df['cast_1'][i] = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(1) > a')[0].get_text()
    except:
        imdb_missing_df['cast_1'][i] = np.nan

    # adding cast 2 to list
    try:
        imdb_missing_df['cast_2'][i] = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(2) > a')[0].get_text()
    except:
        imdb_missing_df['cast_2'][i] = np.nan

    # adding cast 3 to list
    try:
        imdb_missing_df['cast_3'][i] = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(3) > a')[0].get_text()
    except:
        imdb_missing_df['cast_3'][i] = np.nan

    # adding sleep time
    wait_time = 1
    sleep(wait_time)

print(datetime.now())

2022-03-06 15:42:15.498751


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df['imdb_rating'][i] = soup.select('span.AggregateRatingButton__RatingScore-sc-1ll29m0-1.iTLWoV')[0].get_text()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df['cast_1'][i] = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(1) > a')[0].get_text()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df['cast_2'][i] = soup.select('div.PrincipalCredit

2022-03-06 16:41:10.595930


In [550]:
# creating a dataframe for movies without data (2nd attempt)
imdb_missing_df2 = imdb_missing_df[imdb_missing_df['cast_1'].isna()]
imdb_missing_df2

Unnamed: 0,title,imdb_rating,cast_1,cast_2,cast_3,movie_url,imdb_id
35,Michael Jackson's This Is It,7.3,,,,https://www.imdb.com/title/tt1477715/,tt1477715
137,Shi Yue Wei Cheng,8.4,,,,https://www.imdb.com/title/tt1403130/,tt4559518
200,U2 3D,8.4,,,,https://www.imdb.com/title/tt0892375/,tt0892375
227,Justin Bieber: Never Say Never,1.7,,,,https://www.imdb.com/title/tt1702443/,tt1702443
240,Katy Perry: Part of Me,5.9,,,,https://www.imdb.com/title/tt2215719/,tt2215719
282,Hillsong: Let Hope Rise,6.4,,,,https://www.imdb.com/title/tt3850544/,tt3850544
468,Meet the Mormons,6.8,,,,https://www.imdb.com/title/tt4003774/,tt4003774
478,In the Shadow of the Moon,8.0,,,,https://www.imdb.com/title/tt0925248/,tt0925248
504,The Gatekeepers,7.6,,,,https://www.imdb.com/title/tt2309788/,tt2309788
528,Shine a Light,7.1,,,,https://www.imdb.com/title/tt0893382/,tt0893382


In [563]:
# crawling imdb for missing data (3rd attempt)
print(datetime.now())
for i in range(0, len(imdb_missing_df2)):
    response = requests.get(imdb_missing_df2['movie_url'].iloc[i])
    soup = BeautifulSoup(response.content, 'html.parser')

    # adding cast 1 to list
    try:
        imdb_missing_df2['cast_1'].iloc[i] = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm > ul > li.ipc-metadata-list__item.ipc-metadata-list-item--link > div > ul > li:nth-child(1) > a')[0].get_text()
    except:
        imdb_missing_df2['cast_1'].iloc[i] = np.nan

    # adding cast 2 to list
    try:
        imdb_missing_df2['cast_2'].iloc[i] = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm > ul > li.ipc-metadata-list__item.ipc-metadata-list-item--link > div > ul > li:nth-child(2) > a')[0].get_text()
    except:
        imdb_missing_df2['cast_2'].iloc[i] = np.nan

    # adding cast 3 to list
    try:
        imdb_missing_df2['cast_3'].iloc[i] = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm > ul > li.ipc-metadata-list__item.ipc-metadata-list-item--link > div > ul > li:nth-child(3) > a')[0].get_text()
    except:
        imdb_missing_df2['cast_3'].iloc[i] = np.nan

    # adding sleep time
    wait_time = 1
    sleep(wait_time)

print(datetime.now())

2022-03-06 18:09:25.094437
2022-03-06 18:10:38.709188


In [567]:
# substituting missing values in imdb_missing_df from imdb_missing_df2
index = imdb_missing_df[imdb_missing_df['cast_1'].isna()].index
for i in range(0, len(imdb_missing_df2)):    
    imdb_missing_df[index[i]:index[i]+1]['imdb_rating'] = imdb_missing_df2['imdb_rating'].iloc[i]
    imdb_missing_df[index[i]:index[i]+1]['cast_1'] = imdb_missing_df2['cast_1'].iloc[i]
    imdb_missing_df[index[i]:index[i]+1]['cast_2'] = imdb_missing_df2['cast_2'].iloc[i]
    imdb_missing_df[index[i]:index[i]+1]['cast_3'] = imdb_missing_df2['cast_3'].iloc[i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df[index[i]:index[i]+1]['imdb_rating'] = imdb_missing_df2['imdb_rating'].iloc[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df[index[i]:index[i]+1]['cast_1'] = imdb_missing_df2['cast_1'].iloc[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_missing_df[index[i]:in

In [570]:
# saving the imdb_missing_df
filepath = Path('imdb_missing_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
imdb_missing_df.to_csv(filepath, index=False)

In [580]:
# substituting missing values in imdb_df from imdb_missing_df
index = imdb_df[imdb_df['imdb_rating'].isna()].index
for i in range(0, len(imdb_missing_df)):    
    imdb_df[index[i]:index[i]+1]['imdb_rating'] = imdb_missing_df['imdb_rating'].iloc[i]
    imdb_df[index[i]:index[i]+1]['cast_1'] = imdb_missing_df['cast_1'].iloc[i]
    imdb_df[index[i]:index[i]+1]['cast_2'] = imdb_missing_df['cast_2'].iloc[i]
    imdb_df[index[i]:index[i]+1]['cast_3'] = imdb_missing_df['cast_3'].iloc[i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df[index[i]:index[i]+1]['imdb_rating'] = imdb_missing_df['imdb_rating'].iloc[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df[index[i]:index[i]+1]['cast_1'] = imdb_missing_df['cast_1'].iloc[i]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_df[index[i]:index[i]+1]['cast_2'] = imdb

In [589]:
# adding the movie_url and imdb_id to the imdb_df and saving it to a new dataframe
imdb_full_df = imdb_df.merge(imdb_missing_df, how='left', on=['title', 'imdb_rating', 'cast_1', 'cast_2', 'cast_3'])

In [646]:
# checking null values for imdb_full_df
imdb_full_df.isna().sum()

title             0
imdb_rating       0
cast_1            0
cast_2            0
cast_3            5
movie_url      3220
imdb_id        3220
dtype: int64

In [670]:
# fetching all imdb_id
for i in range(0, len(imdb_full_df)):
    if type(imdb_full_df['imdb_id'][i]) == float:
        try:
            movie = ia.search_movie(imdb_full_df['title'][i])[0]
            imdb_full_df['imdb_id'][i] = 'tt'+str(ia.get_imdbID(movie))
        except:
            imdb_full_df['imdb_id'][i] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_full_df['imdb_id'][i] = 'tt'+str(ia.get_imdbID(movie))
2022-03-06 19:21:18,854 CRITICAL [imdbpy] C:\Users\Bosco\anaconda3\envs\DA_ENV\lib\site-packages\imdb\_exceptions.py:32: IMDbParserError exception raised; args: ('invalid title: """"',); kwds: {}
NoneType: None
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_full_df['imdb_id'][i] = np.nan
2022-03-06 19:35:08,352 CRITICAL [imdbpy] C:\Users\Bosco\anaconda3\envs\DA_ENV\lib\site-packages\imdb\_exceptions.py:32: IMDbParserError exception raised; args: ('invalid title: """"',); kwds: {}
NoneType: None
2022-03-06 19:38:55,688 CRITICAL [imdbpy] C:\Users\Bosco\anaconda3

In [710]:
# filling movie_url from imdb_db
for i in range(0, len(imdb_full_df)):
    if type(imdb_full_df['movie_url'][i]) == float:
        imdb_full_df['movie_url'][i] = 'https://www.imdb.com/title/'+imdb_full_df['imdb_id'][i]+'/'
    else:
        imdb_full_df['movie_url'][i] = imdb_full_df['movie_url'][i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_full_df['movie_url'][i] = 'https://www.imdb.com/title/'+imdb_full_df['imdb_id'][i]+'/'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  imdb_full_df['movie_url'][i] = imdb_full_df['movie_url'][i]


In [2339]:
# saving the imdb_full_df
filepath = Path('imdb_full_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
imdb_full_df.to_csv(filepath, index=False)

In [74]:
# loading the imdb_full_df
imdb_full_df = pd.read_csv('imdb_full_df.csv')

In [2165]:
# creating lists for extra movie info
imdb_rating_check = []
runtime = []
director = []
genre = []

In [2290]:
# fetching extra movie info
print(datetime.now())
for i in range(0, len(imdb_raw_df)):
    try:
        movie = ia.get_movie(imdb_raw_df['imdb_id'][i].partition('tt')[-1])
        imdb_rating_check.append(movie.get('rating'))
        runtime.append(movie.get('runtimes'))
        genre.append(movie.get('genres'))
        director.append(str(movie.get('director')[0]))
    except:
        imdb_rating_check.append(np.nan)
        runtime.append(np.nan)
        genre.append(np.nan)
        director.append(np.nan)
print(datetime.now())

2022-03-07 21:23:42.392624
2022-03-07 22:17:20.403529


In [2335]:
# creating a dataframe for extra data
imdb_extra_df = pd.DataFrame(columns=['rating_check', 'runtime', 'genre'])
imdb_extra_df['rating_check'] = imdb_rating_check
imdb_extra_df['runtime'] = runtime
imdb_extra_df['genre'] = genre
imdb_extra_df['director'] = director

In [80]:
imdb_extra_df

Unnamed: 0,rating_check,runtime,genre,director
0,8.4,['181'],"['Action', 'Adventure', 'Drama', 'Sci-Fi']",Anthony Russo
1,6.6,['137'],"['Action', 'Adventure', 'Fantasy']",Rob Marshall
2,7.3,['141'],"['Action', 'Adventure', 'Sci-Fi']",Joss Whedon
3,7.9,['138'],"['Action', 'Adventure', 'Sci-Fi']",J.J. Abrams
4,8.5,['149'],"['Action', 'Adventure', 'Sci-Fi']",Anthony Russo
...,...,...,...,...
3309,7.5,['89'],"['Comedy', 'Horror']",Eli Craig
3310,6.1,['90'],"['Crime', 'Thriller', 'War']",Brian De Palma
3311,7.4,['105'],"['Comedy', 'Drama', 'Music']",William H. Macy
3312,7.2,['102'],"['Crime', 'Drama', 'Thriller']",Jaume Balagueró


In [49]:
# saving the imdb_extra_df
filepath = Path('imdb_extra_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
imdb_extra_df.to_csv(filepath, index=False)

In [17]:
# loading the imdb_extra_df 
imdb_extra_df = pd.read_csv('imdb_extra_df.csv')

In [2137]:
# merging the budget_df and imdb_full_df
imdb_raw_df = pd.concat([budget_df, imdb_full_df], axis=1)
imdb_raw_df

Unnamed: 0,release_date,movie,budget,domestic_gross,worldwide_gross,title,imdb_rating,cast_1,cast_2,cast_3,movie_url,imdb_id,director
0,2019-04-23,Avengers: Endgame,400000000,858373000,2797800564,Avengers: Endgame,8.4,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt4154796/,tt4154796,Anthony Russo
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802,Pirates of the Caribbean: On Stranger Tides,6.6,Johnny Depp,Penélope Cruz,Ian McShane,https://www.imdb.com/title/tt1298650/,tt1298650,Rob Marshall
2,2015-04-22,Avengers: Age of Ultron,365000000,459005868,1395316979,Avengers: Age of Ultron,7.3,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt2395427/,tt2395427,Joss Whedon
3,2015-12-16,Star Wars Episode VII: The Force Awakens,306000000,936662225,2064615817,Star Wars Episode VII: The Force Awakens,7.9,Daisy Ridley,John Boyega,Oscar Isaac,https://www.imdb.com/title/tt2488496/,tt2488496,J.J. Abrams
4,2018-04-25,Avengers: Infinity War,300000000,678815482,2048359754,Avengers: Infinity War,8.5,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,https://www.imdb.com/title/tt4154756/,tt4154756,Anthony Russo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3806,2017-07-07,A Ghost Story,100000,1594798,2769782,A Ghost Story,6.8,Casey Affleck,Rooney Mara,McColm Cephas Jr.,https://www.imdb.com/title/tt6265828/,tt6265828,David Lowery
3807,2004-05-07,Super Size Me,65000,11529368,22233808,Super Size Me,7.2,Morgan Spurlock,Daryl Isaacs,Chemeeka Walker,https://www.imdb.com/title/tt0390521/,tt0390521,Morgan Spurlock
3808,2001-03-16,Gabriela,50000,2335352,2335352,Gabriela,4.8,Jaime Gomez,Seidy Lopez,Zach Galligan,https://www.imdb.com/title/tt0085575/,tt0085575,Bruno Barreto
3809,2001-03-09,Dayereh,10000,673780,683509,Dayereh,7.4,Maryiam Palvin Almani,Nargess Mamizadeh,Mojgan Faramarzi,https://www.imdb.com/title/tt0255094/,tt0255094,Jafar Panahi


In [2138]:
# dropping movies with duplicate title
imdb_raw_df.drop_duplicates(subset=['movie'],inplace=True)
imdb_raw_df

Unnamed: 0,release_date,movie,budget,domestic_gross,worldwide_gross,title,imdb_rating,cast_1,cast_2,cast_3,movie_url,imdb_id,director
0,2019-04-23,Avengers: Endgame,400000000,858373000,2797800564,Avengers: Endgame,8.4,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt4154796/,tt4154796,Anthony Russo
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802,Pirates of the Caribbean: On Stranger Tides,6.6,Johnny Depp,Penélope Cruz,Ian McShane,https://www.imdb.com/title/tt1298650/,tt1298650,Rob Marshall
2,2015-04-22,Avengers: Age of Ultron,365000000,459005868,1395316979,Avengers: Age of Ultron,7.3,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt2395427/,tt2395427,Joss Whedon
3,2015-12-16,Star Wars Episode VII: The Force Awakens,306000000,936662225,2064615817,Star Wars Episode VII: The Force Awakens,7.9,Daisy Ridley,John Boyega,Oscar Isaac,https://www.imdb.com/title/tt2488496/,tt2488496,J.J. Abrams
4,2018-04-25,Avengers: Infinity War,300000000,678815482,2048359754,Avengers: Infinity War,8.5,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,https://www.imdb.com/title/tt4154756/,tt4154756,Anthony Russo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3806,2017-07-07,A Ghost Story,100000,1594798,2769782,A Ghost Story,6.8,Casey Affleck,Rooney Mara,McColm Cephas Jr.,https://www.imdb.com/title/tt6265828/,tt6265828,David Lowery
3807,2004-05-07,Super Size Me,65000,11529368,22233808,Super Size Me,7.2,Morgan Spurlock,Daryl Isaacs,Chemeeka Walker,https://www.imdb.com/title/tt0390521/,tt0390521,Morgan Spurlock
3808,2001-03-16,Gabriela,50000,2335352,2335352,Gabriela,4.8,Jaime Gomez,Seidy Lopez,Zach Galligan,https://www.imdb.com/title/tt0085575/,tt0085575,Bruno Barreto
3809,2001-03-09,Dayereh,10000,673780,683509,Dayereh,7.4,Maryiam Palvin Almani,Nargess Mamizadeh,Mojgan Faramarzi,https://www.imdb.com/title/tt0255094/,tt0255094,Jafar Panahi


In [2139]:
# filtering movies with budget >= $5,000,000
imdb_raw_df = imdb_raw_df[imdb_raw_df['budget']>=5000000]

In [2140]:
# resetting index
imdb_raw_df.reset_index(inplace=True)

In [2143]:
# dropping duplicate column
imdb_raw_df.drop(columns=['movie'], inplace=True)
imdb_raw_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,release_date,budget,domestic_gross,worldwide_gross,title,imdb_rating,cast_1,cast_2,cast_3,movie_url,imdb_id,director
0,2019-04-23,400000000,858373000,2797800564,Avengers: Endgame,8.4,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt4154796/,tt4154796,Anthony Russo
1,2011-05-20,379000000,241071802,1045713802,Pirates of the Caribbean: On Stranger Tides,6.6,Johnny Depp,Penélope Cruz,Ian McShane,https://www.imdb.com/title/tt1298650/,tt1298650,Rob Marshall
2,2015-04-22,365000000,459005868,1395316979,Avengers: Age of Ultron,7.3,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt2395427/,tt2395427,Joss Whedon
3,2015-12-16,306000000,936662225,2064615817,Star Wars Episode VII: The Force Awakens,7.9,Daisy Ridley,John Boyega,Oscar Isaac,https://www.imdb.com/title/tt2488496/,tt2488496,J.J. Abrams
4,2018-04-25,300000000,678815482,2048359754,Avengers: Infinity War,8.5,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,https://www.imdb.com/title/tt4154756/,tt4154756,Anthony Russo
...,...,...,...,...,...,...,...,...,...,...,...,...
3309,2011-09-30,5000000,223838,5476793,Tucker & Dale vs. Evil,7.5,Tyler Labine,Alan Tudyk,Katrina Bowden,https://www.imdb.com/title/tt1465522/,tt1465522,Eli Craig
3310,2007-11-16,5000000,65388,861325,Redacted,6.1,Patrick Carroll,Rob Devaney,Izzy Diaz,https://www.imdb.com/title/tt0937237/,tt0937237,Brian De Palma
3311,2014-10-17,5000000,56001,584499,Rudderless,7.4,Billy Crudup,Anton Yelchin,Felicity Huffman,https://www.imdb.com/title/tt1798243/,tt1798243,William H. Macy
3312,2012-05-17,5000000,0,9109597,Mientras duermes,7.2,Luis Tosar,Marta Etura,Alberto San Juan,https://www.imdb.com/title/tt1437358/,tt1437358,Jaume Balagueró


In [2338]:
# saving the imdb_raw_df
filepath = Path('imdb_raw_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
imdb_raw_df.to_csv(filepath, index=False)

In [26]:
# loading the imdb_raw_df 
imdb_raw_df = pd.read_csv('imdb_raw_df.csv')

In [76]:
# merging the imdb_raw_df and imdb_extra_df
imdb_full_info_df = pd.concat([imdb_raw_df, imdb_extra_df], axis=1)
imdb_full_info_df

Unnamed: 0,release_date,budget,domestic_gross,worldwide_gross,title,imdb_rating,cast_1,cast_2,cast_3,movie_url,imdb_id,director,rating_check,runtime,genre,director.1
0,2019-04-23,400000000,858373000,2797800564,Avengers: Endgame,8.4,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt4154796/,tt4154796,Anthony Russo,8.4,['181'],"['Action', 'Adventure', 'Drama', 'Sci-Fi']",Anthony Russo
1,2011-05-20,379000000,241071802,1045713802,Pirates of the Caribbean: On Stranger Tides,6.6,Johnny Depp,Penélope Cruz,Ian McShane,https://www.imdb.com/title/tt1298650/,tt1298650,Rob Marshall,6.6,['137'],"['Action', 'Adventure', 'Fantasy']",Rob Marshall
2,2015-04-22,365000000,459005868,1395316979,Avengers: Age of Ultron,7.3,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt2395427/,tt2395427,Joss Whedon,7.3,['141'],"['Action', 'Adventure', 'Sci-Fi']",Joss Whedon
3,2015-12-16,306000000,936662225,2064615817,Star Wars Episode VII: The Force Awakens,7.9,Daisy Ridley,John Boyega,Oscar Isaac,https://www.imdb.com/title/tt2488496/,tt2488496,J.J. Abrams,7.9,['138'],"['Action', 'Adventure', 'Sci-Fi']",J.J. Abrams
4,2018-04-25,300000000,678815482,2048359754,Avengers: Infinity War,8.5,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,https://www.imdb.com/title/tt4154756/,tt4154756,Anthony Russo,8.5,['149'],"['Action', 'Adventure', 'Sci-Fi']",Anthony Russo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3309,2011-09-30,5000000,223838,5476793,Tucker & Dale vs. Evil,7.5,Tyler Labine,Alan Tudyk,Katrina Bowden,https://www.imdb.com/title/tt1465522/,tt1465522,Eli Craig,7.5,['89'],"['Comedy', 'Horror']",Eli Craig
3310,2007-11-16,5000000,65388,861325,Redacted,6.1,Patrick Carroll,Rob Devaney,Izzy Diaz,https://www.imdb.com/title/tt0937237/,tt0937237,Brian De Palma,6.1,['90'],"['Crime', 'Thriller', 'War']",Brian De Palma
3311,2014-10-17,5000000,56001,584499,Rudderless,7.4,Billy Crudup,Anton Yelchin,Felicity Huffman,https://www.imdb.com/title/tt1798243/,tt1798243,William H. Macy,7.4,['105'],"['Comedy', 'Drama', 'Music']",William H. Macy
3312,2012-05-17,5000000,0,9109597,Mientras duermes,7.2,Luis Tosar,Marta Etura,Alberto San Juan,https://www.imdb.com/title/tt1437358/,tt1437358,Jaume Balagueró,7.2,['102'],"['Crime', 'Drama', 'Thriller']",Jaume Balagueró


In [77]:
imdb_full_info_df.isna().sum()

release_date        0
budget              0
domestic_gross      0
worldwide_gross     0
title               0
imdb_rating         0
cast_1              0
cast_2              0
cast_3              4
movie_url           0
imdb_id             0
director           92
rating_check        0
runtime             0
genre               0
director            0
dtype: int64

In [78]:
# saving the imdb_full_info_df
filepath = Path('imdb_full_info_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
imdb_full_info_df.to_csv(filepath, index=False)

In [85]:
# loading the Merged_df_corrected_scores_and_language
merged_df = pd.read_csv('Merged_df_corrected_scores_and_language.csv', sep=';')

In [91]:
merged_df.drop(columns=['title_x', 'producer','director_x', 'genre', 'release_date_x', 'budget_x', 'domestic_gross_x', 'worldwide_gross_x',
       'imdb_rating', 'cast_1', 'cast_2', 'cast_3', 'movie_url', 'imdb_id','director_y'], inplace=True)

In [120]:
# merging the 2 dataframes
final_v0_df = pd.concat([imdb_full_info_df, merged_df], axis=1)

In [121]:
final_v0_df.drop(columns=['movie_url', 'imdb_id', 'rating_check'], axis=1, inplace=True)

In [124]:
final_v0_df.isna().sum()

release_date           0
budget                 0
domestic_gross         0
worldwide_gross        0
title                  0
imdb_rating            0
cast_1                 0
cast_2                 0
cast_3                 4
director              92
runtime                0
genre                  0
director               0
tomatometer_score     37
audience_score_rt     10
original_language     21
MPA_Ratings          120
dtype: int64

17

In [2377]:
combined_df.drop(columns=['title_x', 'release_date_x', 'budget_x', 'domestic_gross_x', 'worldwide_gross_x', 'director'], inplace=True)

In [2378]:
combined_df

Unnamed: 0,tomatometer_score,audience_score_rt,original_language,MPA_Ratings,release_date,budget,domestic_gross,worldwide_gross,title,imdb_rating,cast_1,cast_2,cast_3,movie_url,imdb_id
0,94.0,90.0,English,PG-13,2019-04-23,400000000,858373000,2797800564,Avengers: Endgame,8.4,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt4154796/,tt4154796
1,33.0,54.0,English,PG-13,2011-05-20,379000000,241071802,1045713802,Pirates of the Caribbean: On Stranger Tides,6.6,Johnny Depp,Penélope Cruz,Ian McShane,https://www.imdb.com/title/tt1298650/,tt1298650
2,76.0,83.0,English,PG-13,2015-04-22,365000000,459005868,1395316979,Avengers: Age of Ultron,7.3,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt2395427/,tt2395427
3,93.0,85.0,English,PG-13,2015-12-16,306000000,936662225,2064615817,Star Wars Episode VII: The Force Awakens,7.9,Daisy Ridley,John Boyega,Oscar Isaac,https://www.imdb.com/title/tt2488496/,tt2488496
4,85.0,91.0,English,PG-13,2018-04-25,300000000,678815482,2048359754,Avengers: Infinity War,8.5,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,https://www.imdb.com/title/tt4154756/,tt4154756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3309,85.0,85.0,English,R,2011-09-30,5000000,223838,5476793,Tucker & Dale vs. Evil,7.5,Tyler Labine,Alan Tudyk,Katrina Bowden,https://www.imdb.com/title/tt1465522/,tt1465522
3310,45.0,45.0,English,R,2007-11-16,5000000,65388,861325,Redacted,6.1,Patrick Carroll,Rob Devaney,Izzy Diaz,https://www.imdb.com/title/tt0937237/,tt0937237
3311,64.0,83.0,English,R,2014-10-17,5000000,56001,584499,Rudderless,7.4,Billy Crudup,Anton Yelchin,Felicity Huffman,https://www.imdb.com/title/tt1798243/,tt1798243
3312,91.0,78.0,Jaume Balagueró,,2012-05-17,5000000,0,9109597,Mientras duermes,7.2,Luis Tosar,Marta Etura,Alberto San Juan,https://www.imdb.com/title/tt1437358/,tt1437358


In [2379]:
# merging the combined_df and imdb_extra_df
full_info_df = pd.concat([combined_df, imdb_extra_df], axis=1)
full_info_df

Unnamed: 0,tomatometer_score,audience_score_rt,original_language,MPA_Ratings,release_date,budget,domestic_gross,worldwide_gross,title,imdb_rating,cast_1,cast_2,cast_3,movie_url,imdb_id,rating_check,runtime,genre,director
0,94.0,90.0,English,PG-13,2019-04-23,400000000,858373000,2797800564,Avengers: Endgame,8.4,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt4154796/,tt4154796,8.4,[181],"[Action, Adventure, Drama, Sci-Fi]",Anthony Russo
1,33.0,54.0,English,PG-13,2011-05-20,379000000,241071802,1045713802,Pirates of the Caribbean: On Stranger Tides,6.6,Johnny Depp,Penélope Cruz,Ian McShane,https://www.imdb.com/title/tt1298650/,tt1298650,6.6,[137],"[Action, Adventure, Fantasy]",Rob Marshall
2,76.0,83.0,English,PG-13,2015-04-22,365000000,459005868,1395316979,Avengers: Age of Ultron,7.3,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt2395427/,tt2395427,7.3,[141],"[Action, Adventure, Sci-Fi]",Joss Whedon
3,93.0,85.0,English,PG-13,2015-12-16,306000000,936662225,2064615817,Star Wars Episode VII: The Force Awakens,7.9,Daisy Ridley,John Boyega,Oscar Isaac,https://www.imdb.com/title/tt2488496/,tt2488496,7.9,[138],"[Action, Adventure, Sci-Fi]",J.J. Abrams
4,85.0,91.0,English,PG-13,2018-04-25,300000000,678815482,2048359754,Avengers: Infinity War,8.5,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,https://www.imdb.com/title/tt4154756/,tt4154756,8.5,[149],"[Action, Adventure, Sci-Fi]",Anthony Russo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3309,85.0,85.0,English,R,2011-09-30,5000000,223838,5476793,Tucker & Dale vs. Evil,7.5,Tyler Labine,Alan Tudyk,Katrina Bowden,https://www.imdb.com/title/tt1465522/,tt1465522,7.5,[89],"[Comedy, Horror]",Eli Craig
3310,45.0,45.0,English,R,2007-11-16,5000000,65388,861325,Redacted,6.1,Patrick Carroll,Rob Devaney,Izzy Diaz,https://www.imdb.com/title/tt0937237/,tt0937237,6.1,[90],"[Crime, Thriller, War]",Brian De Palma
3311,64.0,83.0,English,R,2014-10-17,5000000,56001,584499,Rudderless,7.4,Billy Crudup,Anton Yelchin,Felicity Huffman,https://www.imdb.com/title/tt1798243/,tt1798243,7.4,[105],"[Comedy, Drama, Music]",William H. Macy
3312,91.0,78.0,Jaume Balagueró,,2012-05-17,5000000,0,9109597,Mientras duermes,7.2,Luis Tosar,Marta Etura,Alberto San Juan,https://www.imdb.com/title/tt1437358/,tt1437358,7.2,[102],"[Crime, Drama, Thriller]",Jaume Balagueró


In [2382]:
# saving the full_info_df
filepath = Path('full_info_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
full_info_df.to_csv(filepath, index=False)

In [126]:
# saving the final_v0_df
filepath = Path('final_v0_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
final_v0_df.to_csv(filepath, index=False)

In [683]:
movie.infoset2keys

{'main': ['localized title',
  'cast',
  'genres',
  'runtimes',
  'countries',
  'country codes',
  'language codes',
  'color info',
  'aspect ratio',
  'sound mix',
  'box office',
  'certificates',
  'original air date',
  'rating',
  'votes',
  'cover url',
  'imdbID',
  'plot outline',
  'languages',
  'title',
  'year',
  'kind',
  'original title',
  'director',
  'writer',
  'producer',
  'composer',
  'cinematographer',
  'editor',
  'editorial department',
  'casting director',
  'production design',
  'art direction',
  'set decoration',
  'costume designer',
  'make up',
  'production manager',
  'assistant director',
  'art department',
  'sound crew',
  'special effects',
  'visual effects',
  'stunt performer',
  'camera and electrical department',
  'animation department',
  'casting department',
  'costume department',
  'location management',
  'music department',
  'script department',
  'transportation department',
  'miscellaneous crew',
  'thanks',
  'akas',
  't

In [2147]:
merged_df = pd.read_csv('Merged_df.csv')

In [2149]:
Merged_df_corrected_scores = pd.read_csv('Merged_df_corrected_scores.csv', sep=';')

In [2153]:
merged_df['tomatometer_score'] = Merged_df_corrected_scores['tomatometer_score']
merged_df['audience_score_rt'] = Merged_df_corrected_scores['audience_score_rt']
merged_df['MPA_Ratings'] = Merged_df_corrected_scores['MPA_Ratings']

In [2375]:
merged_df.drop(columns=['producer', 'director_x', 'genre', 'movie', 'imdb_rating', 'cast_1', 'cast_2', 'cast_3', 'movie_url', 'imdb_id', 'director_y'], inplace=True)

KeyError: "['producer' 'director_x' 'genre' 'movie' 'imdb_rating' 'cast_1' 'cast_2'\n 'cast_3' 'movie_url' 'imdb_id' 'director_y'] not found in axis"

In [2374]:
combined_df = pd.concat([merged_df, imdb_raw_df], axis=1)
combined_df

Unnamed: 0,title_x,tomatometer_score,audience_score_rt,original_language,MPA_Ratings,release_date_x,budget_x,domestic_gross_x,worldwide_gross_x,release_date,...,domestic_gross,worldwide_gross,title,imdb_rating,cast_1,cast_2,cast_3,movie_url,imdb_id,director
0,Avengers: Endgame,94.0,90.0,English,PG-13,2019-04-23,400000000,858373000,2797800564,2019-04-23,...,858373000,2797800564,Avengers: Endgame,8.4,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt4154796/,tt4154796,Anthony Russo
1,Pirates of the Caribbean: On Stranger Tides,33.0,54.0,English,PG-13,2011-05-20,379000000,241071802,1045713802,2011-05-20,...,241071802,1045713802,Pirates of the Caribbean: On Stranger Tides,6.6,Johnny Depp,Penélope Cruz,Ian McShane,https://www.imdb.com/title/tt1298650/,tt1298650,Rob Marshall
2,Avengers: Age of Ultron,76.0,83.0,English,PG-13,2015-04-22,365000000,459005868,1395316979,2015-04-22,...,459005868,1395316979,Avengers: Age of Ultron,7.3,Robert Downey Jr.,Chris Evans,Mark Ruffalo,https://www.imdb.com/title/tt2395427/,tt2395427,Joss Whedon
3,Star Wars Episode VII: The Force Awakens,93.0,85.0,English,PG-13,2015-12-16,306000000,936662225,2064615817,2015-12-16,...,936662225,2064615817,Star Wars Episode VII: The Force Awakens,7.9,Daisy Ridley,John Boyega,Oscar Isaac,https://www.imdb.com/title/tt2488496/,tt2488496,J.J. Abrams
4,Avengers: Infinity War,85.0,91.0,English,PG-13,2018-04-25,300000000,678815482,2048359754,2018-04-25,...,678815482,2048359754,Avengers: Infinity War,8.5,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo,https://www.imdb.com/title/tt4154756/,tt4154756,Anthony Russo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3309,Tucker & Dale vs. Evil,85.0,85.0,English,R,2011-09-30,5000000,223838,5476793,2011-09-30,...,223838,5476793,Tucker & Dale vs. Evil,7.5,Tyler Labine,Alan Tudyk,Katrina Bowden,https://www.imdb.com/title/tt1465522/,tt1465522,Eli Craig
3310,Redacted,45.0,45.0,English,R,2007-11-16,5000000,65388,861325,2007-11-16,...,65388,861325,Redacted,6.1,Patrick Carroll,Rob Devaney,Izzy Diaz,https://www.imdb.com/title/tt0937237/,tt0937237,Brian De Palma
3311,Rudderless,64.0,83.0,English,R,2014-10-17,5000000,56001,584499,2014-10-17,...,56001,584499,Rudderless,7.4,Billy Crudup,Anton Yelchin,Felicity Huffman,https://www.imdb.com/title/tt1798243/,tt1798243,William H. Macy
3312,Mientras duermes,91.0,78.0,Jaume Balagueró,,2012-05-17,5000000,0,9109597,2012-05-17,...,0,9109597,Mientras duermes,7.2,Luis Tosar,Marta Etura,Alberto San Juan,https://www.imdb.com/title/tt1437358/,tt1437358,Jaume Balagueró


In [2160]:
# saving the combined_df
filepath = Path('combined_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
combined_df.to_csv(filepath, index=False)

In [2161]:
combined_df.isna().sum()

title_x                0
tomatometer_score     32
audience_score_rt     15
original_language     26
MPA_Ratings          155
release_date_x         0
budget_x               0
domestic_gross_x       0
worldwide_gross_x      0
release_date           0
budget                 0
domestic_gross         0
worldwide_gross        0
title                  0
imdb_rating            0
cast_1                 0
cast_2                 0
cast_3                 4
movie_url              0
imdb_id                0
director              92
dtype: int64