In [81]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from time import sleep
from random import randint
from datetime import datetime

In [82]:
# loading the data from The Numbers
the_numbers_df = pd.read_csv('The_Numbers_data_cleaned.csv')

In [83]:
the_numbers_df

Unnamed: 0,index,release_date,movie,budget,domestic_gross,worldwide_gross
0,1,4/23/2019,Avengers: Endgame,400000000,858373000,2797800564
1,2,5/20/2011,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802
2,3,4/22/2015,Avengers: Age of Ultron,365000000,459005868,1395316979
3,4,12/16/2015,Star Wars Episode VII: The Force Awakens,306000000,936662225,2064615817
4,5,4/25/2018,Avengers: Infinity War,300000000,678815482,2048359754
...,...,...,...,...,...,...
6222,6223,Unknown,Red 11,7000,0,0
6223,6224,4/2/1999,Following,6000,48482,240495
6224,6225,7/13/2005,Return to the Land of Wonders,5000,1338,1338
6225,6226,9/29/2015,A Plague So Pleasant,1400,0,0


In [84]:
the_numbers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6227 entries, 0 to 6226
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   index            6227 non-null   int64 
 1   release_date     6227 non-null   object
 2   movie            6227 non-null   object
 3   budget           6227 non-null   object
 4   domestic_gross   6227 non-null   object
 5   worldwide_gross  6227 non-null   object
dtypes: int64(1), object(5)
memory usage: 292.0+ KB


In [85]:
# changing release_date to datetime
the_numbers_df['release_date'] = pd.to_datetime(the_numbers_df['release_date'], errors ='coerce')

In [86]:
# converting columns to integer
the_numbers_df['budget'] = the_numbers_df['budget'].str.replace(',','').astype('int64')
the_numbers_df['domestic_gross'] = the_numbers_df['domestic_gross'].str.replace(',','').astype('int64')
the_numbers_df['worldwide_gross'] = the_numbers_df['worldwide_gross'].str.replace(',','').astype('int64')

In [87]:
the_numbers_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6227 entries, 0 to 6226
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   index            6227 non-null   int64         
 1   release_date     6112 non-null   datetime64[ns]
 2   movie            6227 non-null   object        
 3   budget           6227 non-null   int64         
 4   domestic_gross   6227 non-null   int64         
 5   worldwide_gross  6227 non-null   int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 292.0+ KB


In [88]:
the_numbers_df.head()

Unnamed: 0,index,release_date,movie,budget,domestic_gross,worldwide_gross
0,1,2019-04-23,Avengers: Endgame,400000000,858373000,2797800564
1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802
2,3,2015-04-22,Avengers: Age of Ultron,365000000,459005868,1395316979
3,4,2015-12-16,Star Wars Episode VII: The Force Awakens,306000000,936662225,2064615817
4,5,2018-04-25,Avengers: Infinity War,300000000,678815482,2048359754


In [89]:
# dropping nulls
the_numbers_df.dropna(inplace=True)

In [90]:
# filtering
# 1. release_date after 1999
# 2. release_date before 2020 (pre-pandemic)
# 3. worldwide_gross > $500,000
budget_df = the_numbers_df[(the_numbers_df['release_date']>='2000-01-01') & (the_numbers_df['release_date']<='2019-12-31') & (the_numbers_df['worldwide_gross']>=500000)]

In [91]:
budget_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3811 entries, 0 to 6219
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   index            3811 non-null   int64         
 1   release_date     3811 non-null   datetime64[ns]
 2   movie            3811 non-null   object        
 3   budget           3811 non-null   int64         
 4   domestic_gross   3811 non-null   int64         
 5   worldwide_gross  3811 non-null   int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 208.4+ KB


In [92]:
budget_df.reset_index(inplace=True)

In [93]:
budget_df

Unnamed: 0,level_0,index,release_date,movie,budget,domestic_gross,worldwide_gross
0,0,1,2019-04-23,Avengers: Endgame,400000000,858373000,2797800564
1,1,2,2011-05-20,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802
2,2,3,2015-04-22,Avengers: Age of Ultron,365000000,459005868,1395316979
3,3,4,2015-12-16,Star Wars Episode VII: The Force Awakens,306000000,936662225,2064615817
4,4,5,2018-04-25,Avengers: Infinity War,300000000,678815482,2048359754
...,...,...,...,...,...,...,...
3806,6119,6120,2017-07-07,A Ghost Story,100000,1594798,2769782
3807,6147,6148,2004-05-07,Super Size Me,65000,11529368,22233808
3808,6155,6156,2001-03-16,Gabriela,50000,2335352,2335352
3809,6210,6211,2001-03-09,Dayereh,10000,673780,683509


In [94]:
# dropping index columns
budget_df.drop(['index', 'level_0'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [95]:
budget_df

Unnamed: 0,release_date,movie,budget,domestic_gross,worldwide_gross
0,2019-04-23,Avengers: Endgame,400000000,858373000,2797800564
1,2011-05-20,Pirates of the Caribbean: On Stranger Tides,379000000,241071802,1045713802
2,2015-04-22,Avengers: Age of Ultron,365000000,459005868,1395316979
3,2015-12-16,Star Wars Episode VII: The Force Awakens,306000000,936662225,2064615817
4,2018-04-25,Avengers: Infinity War,300000000,678815482,2048359754
...,...,...,...,...,...
3806,2017-07-07,A Ghost Story,100000,1594798,2769782
3807,2004-05-07,Super Size Me,65000,11529368,22233808
3808,2001-03-16,Gabriela,50000,2335352,2335352
3809,2001-03-09,Dayereh,10000,673780,683509


In [96]:
#imdb_rating = []
#main_cast_1 = []
#main_cast_2 = []
#main_cast_3 = []

In [164]:
movie_url = 'https://www.imdb.com/title/tt1967614'
movie_url

'https://www.imdb.com/title/tt1967614'

In [168]:
rating = soup.select('span.AggregateRatingButton__RatingScore-sc-1ll29m0-1.iTLWoV')[0].get_text()
rating

'5.8'

In [189]:
imdb_df['cast_3'][2499] = cast3

In [169]:
cast1 = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(1) > a')[0].get_text()
cast1

'Robert De Niro'

In [170]:
cast2 = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(2) > a')[0].get_text()
cast2

'Leslie Mann'

In [171]:
cast3 = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(3) > a')[0].get_text()
cast3

'Danny DeVito'

In [165]:
response = requests.get(movie_url)
#__next > main > div > section.ipc-page-background.ipc-page-background--base.MainDetailPageLayout__StyledPageBackground-sc-13rp3wh-0.hsughJ > section > div:nth-child(4) > section > section > div.Hero__MediaContentContainer__Video-sc-kvkd64-2.gVRpZB > div.Hero__ContentContainer-sc-kvkd64-10.frcskz > div.Hero__MetaContainer__Video-sc-kvkd64-4.jMerKX > div.RatingBar__RatingContainer-sc-85l9wd-0.hNqCJh.Hero__HideableRatingBar-sc-kvkd64-12.fWiTEj > div > div:nth-child(1) > a > div > div > div.AggregateRatingButton__ContentWrap-sc-1ll29m0-0.hmJkIS > div.AggregateRatingButton__Rating-sc-1ll29m0-2.bmbYRW > span.AggregateRatingButton__RatingScore-sc-1ll29m0-1.iTLWoV

In [166]:
soup = BeautifulSoup(response.content, 'html.parser')

In [142]:
print(datetime.now())
for i in range(1500,2500):
    # searching the movie on imdb
    search_url = 'https://www.imdb.com/search/title/?title=' +str(budget_df['movie'][i])+ '&title_type=feature&release_date=' +str(budget_df['release_date'][i].year)+'-01-01,' +str(budget_df['release_date'][i].year)+ '-12-31'
    response_search = requests.get(search_url)
    soup_search = BeautifulSoup(response_search.content, 'html.parser')
    
    try:
        # obtaining the movie page
        movie_url = 'http://imdb.com'+soup_search.select('h3 a')[0].get('href')
        response = requests.get(movie_url)
        soup = BeautifulSoup(response.content, 'html.parser')

        # adding rating to list
        try:
            rating = soup.select('span.AggregateRatingButton__RatingScore-sc-1ll29m0-1.iTLWoV')[0].get_text()
            imdb_rating.append(rating)
        except:
            imdb_rating.append(np.nan)

        # adding cast 1 to list
        try:
            cast1 = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(1) > a')[0].get_text()
            main_cast_1.append(cast1)
        except:
            main_cast_1.append(np.nan)

        # adding cast 2 to list
        try:
            cast2 = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(2) > a')[0].get_text()
            main_cast_2.append(cast2)
        except:
            main_cast_2.append(np.nan)

        # adding cast 3 to list
        try:
            cast3 = soup.select('div.PrincipalCredits__PrincipalCreditsPanelWideScreen-sc-hdn81t-0.hzbDAm> ul > li:nth-child(3) > div > ul > li:nth-child(3) > a')[0].get_text()
            main_cast_3.append(cast3)
        except:
            main_cast_3.append(np.nan)
            
    except:
        imdb_rating.append(np.nan)
        main_cast_1.append(np.nan)
        main_cast_2.append(np.nan)
        main_cast_3.append(np.nan)
    
    # adding sleep time
    wait_time = randint(1, 2)
    sleep(wait_time)
    
print(datetime.now())

2022-03-03 21:29:24.180706
2022-03-03 23:55:25.492099


In [143]:
budget_df[2498:2500]

Unnamed: 0,release_date,movie,budget,domestic_gross,worldwide_gross
2498,2008-07-18,Transsiberian,15000000,2203641,6379575
2499,2017-02-03,The Comedian,15000000,1658706,1658706


In [144]:
imdb_rating[2498:2500]

['6.6', '2.9']

In [145]:
main_cast_1[2498:2500]

['Woody Harrelson', nan]

In [146]:
main_cast_2[2498:2500]

['Emily Mortimer', nan]

In [147]:
main_cast_3[2498:2500]

['Ben Kingsley', nan]

In [179]:
# creating a dataframe for crawled data
imdb_df = pd.DataFrame(columns=['title','imdb_rating','cast_1','cast_2','cast_3'])

In [180]:
imdb_df['title'] = budget_df['movie'][0:2500]

In [181]:
imdb_df['imdb_rating'] = imdb_rating
imdb_df['cast_1'] = main_cast_1
imdb_df['cast_2'] = main_cast_2
imdb_df['cast_3'] = main_cast_3

In [190]:
imdb_df

Unnamed: 0,title,imdb_rating,cast_1,cast_2,cast_3
0,Avengers: Endgame,8.4,Robert Downey Jr.,Chris Evans,Mark Ruffalo
1,Pirates of the Caribbean: On Stranger Tides,6.6,Johnny Depp,Penélope Cruz,Ian McShane
2,Avengers: Age of Ultron,7.3,Robert Downey Jr.,Chris Evans,Mark Ruffalo
3,Star Wars Episode VII: The Force Awakens,7.9,Daisy Ridley,John Boyega,Oscar Isaac
4,Avengers: Infinity War,8.5,Robert Downey Jr.,Chris Hemsworth,Mark Ruffalo
...,...,...,...,...,...
2495,Never Let Me Go,7.1,Keira Knightley,Carey Mulligan,Andrew Garfield
2496,The Disappointments Room,3.9,Kate Beckinsale,Mel Raido,Duncan Joiner
2497,The Company,6.2,Neve Campbell,James Franco,Malcolm McDowell
2498,Transsiberian,6.6,Woody Harrelson,Emily Mortimer,Ben Kingsley


In [191]:
# saving the dataframe to a csv
from pathlib import Path  
filepath = Path('imdb_df.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)
imdb_df.to_csv(filepath, index=False)