In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import time
import seaborn as sns
import requests
import json
from imdb import IMDb

## Scrape Box Office Data Using BeautifulSoup

In [2]:
def get_opening(soup, year):
    table = soup.find_all('table')[0]
    movie_list = []
    for row in table.find_all('tr')[2:102]:
        cells = row.find_all('td')
        if len(cells) > 0:
            url = cells[1].find('a')['href']
            movie_with_year = cells[1].text+" ("+year+")"
            title = cells[1].text
            tot_gross = cells[6].text
            theaters = cells[7].text
            release_date = cells[11].text + "/" + year
            cell_dict = {'url':url, "movie_name":movie_with_year, "title" : title,
                         'gross':tot_gross, 'theaters': theaters, 'release_date': release_date}
            movie_list.append(cell_dict)
    return movie_list

In [3]:
years = []
for i in range(1977, 2020):
    years.append(str(i))

def scrape_boxoffice(years):
    movie_list = []
    for year in years:
        url = 'https://www.boxofficemojo.com/year/'+str(year)+'/?grossesOption=totalGrosses'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        if len(movie_list) == 0:
            movie_list = get_opening(soup, year)
        else:
            movie_list = movie_list + get_opening(soup, year)
        # pause for 5seconds to limit traffic
        time.sleep(5)
        print("Successfully scraped " + year + "'s list")
    return movie_list

# Uncomment below to scrape
movie_list = scrape_boxoffice(years)

Successfully scraped 1977's list
Successfully scraped 1978's list
Successfully scraped 1979's list
Successfully scraped 1980's list
Successfully scraped 1981's list
Successfully scraped 1982's list
Successfully scraped 1983's list
Successfully scraped 1984's list
Successfully scraped 1985's list
Successfully scraped 1986's list
Successfully scraped 1987's list
Successfully scraped 1988's list
Successfully scraped 1989's list
Successfully scraped 1990's list
Successfully scraped 1991's list
Successfully scraped 1992's list
Successfully scraped 1993's list
Successfully scraped 1994's list
Successfully scraped 1995's list
Successfully scraped 1996's list
Successfully scraped 1997's list
Successfully scraped 1998's list
Successfully scraped 1999's list
Successfully scraped 2000's list
Successfully scraped 2001's list
Successfully scraped 2002's list
Successfully scraped 2003's list
Successfully scraped 2004's list
Successfully scraped 2005's list
Successfully scraped 2006's list
Successful

In [4]:
# Convert the list of dictionaries to a pandas dataframe
mojo_df = pd.DataFrame(movie_list)
mojo_df.head()

Unnamed: 0,gross,movie_name,release_date,theaters,title,url
0,"$126,737,428",Smokey and the Bandit (1977),May 27/1977,-,Smokey and the Bandit,/release/rl477136385/?ref_=bo_yld_table_2
1,"$116,395,460",Close Encounters of the Third Kind (1977),Nov 16/1977,650,Close Encounters of the Third Kind,/release/rl340428289/?ref_=bo_yld_table_3
2,"$94,213,184",Saturday Night Fever (1977),Dec 16/1977,726,Saturday Night Fever,/release/rl2926544385/?ref_=bo_yld_table_4
3,"$50,750,000",A Bridge Too Far (1977),Jun 15/1977,-,A Bridge Too Far,/release/rl139036161/?ref_=bo_yld_table_5
4,"$47,346,365",The Deep (1977),Jun 17/1977,731,The Deep,/release/rl3561784833/?ref_=bo_yld_table_6


In [5]:
mojo_df["release_date"] = pd.to_datetime(mojo_df['release_date'],format='%b %d/%Y')

In [6]:
mojo_df.shape

(4170, 6)

## Scrape imdb_id for each movie we scraped from box-office-mojo

In [7]:
def get_ttid(soup):
    return soup.find_all('option')[0]['value'].split("/")[2]
    
def scrape_ttid(mojo_df):
    ttid_list = []
    for i in range(len(mojo_df)):
        url = 'http://www.boxofficemojo.com' + mojo_df.url.iloc[i]
        response = requests.get(url)
        # pause for 2seconds to limit traffic
        time.sleep(1)
        soup = BeautifulSoup(response.text, 'lxml')
        ttid = get_ttid(soup)
        ttid_list.append(ttid)
        if (i+1) % 50 == 0:
            print("Successfully scraped", (i+1), " ttid")
    return ttid_list

# Uncomment to scrape
ttid_list = scrape_ttid(mojo_df)

Successfully scraped 50  ttid
Successfully scraped 100  ttid
Successfully scraped 150  ttid
Successfully scraped 200  ttid
Successfully scraped 250  ttid
Successfully scraped 300  ttid
Successfully scraped 350  ttid
Successfully scraped 400  ttid
Successfully scraped 450  ttid
Successfully scraped 500  ttid
Successfully scraped 550  ttid
Successfully scraped 600  ttid
Successfully scraped 650  ttid
Successfully scraped 700  ttid
Successfully scraped 750  ttid
Successfully scraped 800  ttid
Successfully scraped 850  ttid
Successfully scraped 900  ttid
Successfully scraped 950  ttid
Successfully scraped 1000  ttid
Successfully scraped 1050  ttid
Successfully scraped 1100  ttid
Successfully scraped 1150  ttid
Successfully scraped 1200  ttid
Successfully scraped 1250  ttid
Successfully scraped 1300  ttid
Successfully scraped 1350  ttid
Successfully scraped 1400  ttid
Successfully scraped 1450  ttid
Successfully scraped 1500  ttid
Successfully scraped 1550  ttid
Successfully scraped 1600  t

In [8]:
tid_dict = [{"tconst": i} for i in ttid_list]
ttid_df = pd.DataFrame(tid_dict)
ttid_df.shape

(4170, 1)

In [9]:
ttid_df.head()

Unnamed: 0,tconst
0,tt0076729
1,tt0075860
2,tt0076666
3,tt0075784
4,tt0075925


In [10]:
mojo_ttid = mojo_df.merge(ttid_df, how='left', left_index=True, right_index=True)

In [11]:
mojo_ttid.head()

Unnamed: 0,gross,movie_name,release_date,theaters,title,url,tconst
0,"$126,737,428",Smokey and the Bandit (1977),1977-05-27,-,Smokey and the Bandit,/release/rl477136385/?ref_=bo_yld_table_2,tt0076729
1,"$116,395,460",Close Encounters of the Third Kind (1977),1977-11-16,650,Close Encounters of the Third Kind,/release/rl340428289/?ref_=bo_yld_table_3,tt0075860
2,"$94,213,184",Saturday Night Fever (1977),1977-12-16,726,Saturday Night Fever,/release/rl2926544385/?ref_=bo_yld_table_4,tt0076666
3,"$50,750,000",A Bridge Too Far (1977),1977-06-15,-,A Bridge Too Far,/release/rl139036161/?ref_=bo_yld_table_5,tt0075784
4,"$47,346,365",The Deep (1977),1977-06-17,731,The Deep,/release/rl3561784833/?ref_=bo_yld_table_6,tt0075925


In [13]:
# Save dataframe as csv
mojo_ttid.to_csv("./data/box_office_mojo.csv")

## Scrape imdb infromation for tconst

In [333]:
ttid_list = pd.read_csv("./data/box_office_mojo.csv")["tconst"]

In [380]:
def get_movie(soup, ttid):
    try :
        movie_dict = {}
        temp = soup.findAll("strong")[0]["title"].split(" ")
        user_rating = temp[0]
        no_user_ratings = temp[3]
    #     print(user_rating)
    #     print(no_user_ratings)

        temp = soup.findAll("div", {"class":"subtext"})[0]
        mmpa_rating = temp.contents[0].strip()
#         print(mmpa_rating)
        genres = [i.text for i in temp.findAll("a")[:-1]]
#         print(genres)

        temp = soup.findAll("div", {"class":"credit_summary_item"})
        directors = []
        cast = []
        directors_ids = []
        cast_ids = []
        if temp!= None and len(temp) > 0:
            for i in temp[0].findAll("a"):
                split = i["href"].split("/")
                if len(split) >= 3:
                    directors.append(i.text)
                    directors_ids.append(split[2])
        if temp!= None and len(temp) > 2:
            temp1 = temp[2].findAll("a")
            for i in temp1[:-1]:
                split = i["href"].split("/")
                if len(split) >= 3:
                    cast.append(i.text)
                    cast_ids.append(split[2])

        temp = soup.findAll("div", {"class":"metacriticScore"})
        meta_score = None
        if temp != None and len(temp) > 0:
            meta_score = temp[0].findAll("span")[0].text
#         print(metacriticScore)
    
        temp = soup.findAll("div", {"class":"titleReviewBarItem"})
        if meta_score != None:
            temp1 = temp[1].findAll("a")
            user_review = temp1[0].text.split(" ")[0]
            critic_review = temp1[1].text.split(" ")[0]
        else:
            temp1 = temp[0].findAll("a")
            user_review = temp1[0].text.split(" ")[0]
            critic_review = temp1[1].text.split(" ")[0]
#         print(user_review)
#         print(critic_review)

        popularity = None
        temp1 = soup.findAll("div", {"class":"popularityTrendDown"})
        if len(temp1) > 0:
            if meta_score != None :
                popularity = temp[2].findAll("span")[0].contents[0].strip()[:1]
            else:
                popularity = temp[1].findAll("span")[0].contents[0].strip()[:1]
                
    #     print(popularity)
    
        awards = 0
        temp = soup.findAll("div", {"id":"titleAwardsRanks"})
        if temp!= None and len(temp)> 0:
            temp1 = temp[0].findAll("span", {"class": "awards-blurb"})
            if temp != None and len(temp) > 0:
                for i in temp:
                    awards += sum([int(s) for s in i.text.split() if s.isdigit()])  

        movie_dict = {'tconst':ttid, 
                      "user_rating":user_rating, 
                      "number_user_ratings" : no_user_ratings,
                      'mmpa_rating':mmpa_rating, 
                      'genres': genres, 
                      'directors': directors,
                      "directors_ids" : directors_ids,
                      'cast': cast,
                      "cast_ids" : cast_ids,
                      'meta_score':meta_score, 
                      'user_reviews': user_review, 
                      'critic_reviews': critic_review,
                      'imdb_popularity': popularity,
                      'awards': awards
                     }
        return movie_dict
    
    except Exception as e: 
        print("An exception occurred "+ttid)
        print(e)
    
    
def scrape_movies(id_list):
    movies = []
    for i in range(len(id_list)):
        url = 'https://www.imdb.com/title/' + id_list[i]
        response = requests.get(url)
        # pause for 2seconds to limit traffic
        time.sleep(0.2)
        soup = BeautifulSoup(response.text, 'lxml')
        movie = get_movie(soup, id_list[i] )
        if movie != None:
            movies.append(movie)
        if (i+1) % 50 == 0:
            print("Successfully scraped", (i+1), " ttid")
    return movies

movies = scrape_movies(ttid_list)
# movies = scrape_movies(["tt10192656","tt0080762", "tt0096328", "tt0102370", "tt0102370", "tt0424942"
#                        , "tt1477715"])

Successfully scraped 50  ttid
Successfully scraped 100  ttid
Successfully scraped 150  ttid
Successfully scraped 200  ttid
An exception occurred tt10192656
list index out of range
Successfully scraped 250  ttid
Successfully scraped 300  ttid
Successfully scraped 350  ttid
Successfully scraped 400  ttid
Successfully scraped 450  ttid
Successfully scraped 500  ttid
Successfully scraped 550  ttid
Successfully scraped 600  ttid
Successfully scraped 650  ttid
Successfully scraped 700  ttid
Successfully scraped 750  ttid
Successfully scraped 800  ttid
Successfully scraped 850  ttid
Successfully scraped 900  ttid
Successfully scraped 950  ttid
Successfully scraped 1000  ttid
Successfully scraped 1050  ttid
Successfully scraped 1100  ttid
Successfully scraped 1150  ttid
Successfully scraped 1200  ttid
Successfully scraped 1250  ttid
Successfully scraped 1300  ttid
Successfully scraped 1350  ttid
Successfully scraped 1400  ttid
Successfully scraped 1450  ttid
Successfully scraped 1500  ttid
Suc

In [381]:
movies_list = []
for i in range(len(movies)):
    if movies[i] != None:
        movies_list.append(movies[i])
imdb_df = pd.DataFrame(movies_list)
imdb_df.shape

(4169, 14)

In [382]:
imdb_df.head()

Unnamed: 0,awards,cast,cast_ids,critic_reviews,directors,directors_ids,genres,imdb_popularity,meta_score,mmpa_rating,number_user_ratings,tconst,user_rating,user_reviews
0,2,"[Burt Reynolds, Sally Field, Jerry Reed]","[nm0000608, nm0000398, nm0715274]",78,[Hal Needham],[nm0624102],"[Action, Comedy]",3.0,50.0,PG,41965,tt0076729,7.0,191
1,53,"[Richard Dreyfuss, François Truffaut, Teri Garr]","[nm0000377, nm0000076, nm0000414]",220,[Steven Spielberg],[nm0000229],"[Drama, Sci-Fi]",,90.0,PG,171550,tt0075860,7.6,411
2,16,"[John Travolta, Karen Lynn Gorney, Barry Miller]","[nm0000237, nm0331186, nm0587944]",97,[John Badham],[nm0000824],"[Drama, Music]",1.0,77.0,R,65781,tt0076666,6.8,240
3,11,"[Sean Connery, Ryan O'Neal, Michael Caine]","[nm0000125, nm0641939, nm0000323]",55,[Richard Attenborough],[nm0000277],"[Drama, History, War]",2.0,,PG,48385,tt0075784,7.4,238
4,3,"[Jacqueline Bisset, Nick Nolte, Dick Anthony W...","[nm0000302, nm0000560, nm0930454]",33,[Peter Yates],[nm0946811],"[Adventure, Mystery, Thriller]",,41.0,PG,10324,tt0075925,6.2,69


In [384]:
# Save dataframe as csv
imdb_df.to_csv("./data/imdb.csv")
# Save dataframe as csv
imdb_df.to_pickle("./data/imdb.pkl")

## Scrape tmdb infromation for tconst

In [59]:
api_key = "4cf01ddda6758f12149edf1fad3e4a54"

In [13]:
get_movies(ttid_list):
    tmdb_data = []
    index = 0
    for i in ttid_list:
        response = requests.get('https://api.themoviedb.org/3/movie/'+i+'?api_key='+ api_key)
        tmdb_data.append(response.json())
        index +=1
        time.sleep(0.5)
        if (index+1) % 50 == 0:
                print("Successfully scraped", (index+1), " ttid")

tmdb_data = get_movies(ttid_list)

Successfully scraped 50  ttid
Successfully scraped 100  ttid
Successfully scraped 150  ttid
Successfully scraped 200  ttid
Successfully scraped 250  ttid
Successfully scraped 300  ttid
Successfully scraped 350  ttid
Successfully scraped 400  ttid
Successfully scraped 450  ttid
Successfully scraped 500  ttid
Successfully scraped 550  ttid
Successfully scraped 600  ttid
Successfully scraped 650  ttid
Successfully scraped 700  ttid
Successfully scraped 750  ttid
Successfully scraped 800  ttid
Successfully scraped 850  ttid
Successfully scraped 900  ttid
Successfully scraped 950  ttid
Successfully scraped 1000  ttid
Successfully scraped 1050  ttid
Successfully scraped 1100  ttid
Successfully scraped 1150  ttid
Successfully scraped 1200  ttid
Successfully scraped 1250  ttid
Successfully scraped 1300  ttid
Successfully scraped 1350  ttid
Successfully scraped 1400  ttid
Successfully scraped 1450  ttid
Successfully scraped 1500  ttid
Successfully scraped 1550  ttid
Successfully scraped 1600  t

In [None]:
tmdb_df = pd.DataFrame(tmdb_data)
tmdb_df.head()

In [15]:
# Save dataframe as csv
tmdb_df.to_csv("./data/tmdb.csv")

### Scrape data for actors and directors

In [None]:
# https://api.themoviedb.org/3/movie/tt0076729/?api_key=4cf01ddda6758f12149edf1fad3e4a54

In [None]:
# https://api.themoviedb.org/3/find/27436/?api_key=4cf01ddda6758f12149edf1fad3e4a54

In [281]:
cast = []
cast_names = []

for i,z in imdb_df[["directors","directors_ids"]].values:
    for j in range(len(z)):
        temp = {"name":i[j], "imdb_id":z[j]}
        cast_names.append(i[j])
        if temp not in cast:
            cast.append(temp)
        
for i,z in imdb_df[["cast", "cast_ids"]].values:
     for j in range(len(z)):
        temp = {"name":i[j], "imdb_id":z[j]}
        cast_names.append(i[j])
        if temp not in cast:
            cast.append(temp)
        
cast_df = pd.DataFrame(cast)
cast_df.head()

Unnamed: 0,imdb_id,name
0,nm0624102,Hal Needham
1,nm0000229,Steven Spielberg
2,nm0000824,John Badham
3,nm0000277,Richard Attenborough
4,nm0946811,Peter Yates


In [306]:
person_id=[]
with open("./data/person_ids.json", "r") as fp:
    line = fp.readline()
    while line:
        person_id.append(json.loads(line.strip()))
        line = fp.readline()
    fp.close()
    
persons = pd.DataFrame(person_id).rename(columns={"id":"tmdb_id"})
persons.head()
tmdb_ids = set(persons[persons["name"].isin(cast_names) == True]["tmdb_id"].values)

In [307]:
def getPerson(dir_list):
    tmdb_person = []
    index = 0
    for i in dir_list:
        response = requests.get('https://api.themoviedb.org/3/person/'+str(i)+'?api_key='+ api_key)
        tmdb_person.append(response.json())
        index +=1
        time.sleep(0.5)
        if (index+1) % 50 == 0:
                print("Successfully scraped", (index+1), " persons")
    return tmdb_person

tmdb_persons = getPerson(tmdb_ids)

Successfully scraped 50  persons
Successfully scraped 100  persons
Successfully scraped 150  persons
Successfully scraped 200  persons
Successfully scraped 250  persons
Successfully scraped 300  persons
Successfully scraped 350  persons
Successfully scraped 400  persons
Successfully scraped 450  persons
Successfully scraped 500  persons
Successfully scraped 550  persons
Successfully scraped 600  persons
Successfully scraped 650  persons
Successfully scraped 700  persons
Successfully scraped 750  persons
Successfully scraped 800  persons
Successfully scraped 850  persons
Successfully scraped 900  persons
Successfully scraped 950  persons
Successfully scraped 1000  persons
Successfully scraped 1050  persons
Successfully scraped 1100  persons
Successfully scraped 1150  persons
Successfully scraped 1200  persons
Successfully scraped 1250  persons
Successfully scraped 1300  persons
Successfully scraped 1350  persons
Successfully scraped 1400  persons
Successfully scraped 1450  persons
Succe

In [312]:
persons_df = pd.DataFrame(tmdb_persons)
persons_df.head()

Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
0,False,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...","George Walton Lucas Jr. (born May 14, 1944) is...",1944-05-14,,2,,1,nm0000184,Directing,George Lucas,"Modesto, California, USA",6.642,/8qxin8urtFE0NqaZNFWOuV537bH.jpg
1,False,"[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...","Mark Richard Hamill (born September 25, 1951) ...",1951-09-25,,2,,2,nm0000434,Acting,Mark Hamill,"Concord, California, USA",7.589,/fk8OfdReNltKZqOk2TZgkofCUFq.jpg
2,False,"[Гаррісон Форд, Харрисон Форд, هاريسون فورد, 해...",Legendary Hollywood Icon Harrison Ford was bor...,1942-07-13,,2,,3,nm0000148,Acting,Harrison Ford,"Chicago, Illinois, USA",10.845,/7CcoVFTogQgex2kJkXKMe8qHZrC.jpg
3,False,"[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",Carrie Frances Fisher (21 October 1956 - 27 De...,1956-10-21,2016-12-27,1,http://www.carriefisher.com/,4,nm0000402,Acting,Carrie Fisher,"Beverly Hills, Los Angeles, California, USA",3.878,/rfJtncHewKVnHjqpIZvjn24ESeC.jpg
4,False,[Peter Wilton Cushing],"Peter Wilton Cushing, OBE (26 May 1913 – 11 A...",1913-05-26,1994-08-11,2,,5,nm0001088,Acting,Peter Cushing,"Kenley, Surrey, England, UK",2.704,/l0grZXcjqctESg7h6Jdp33pA3QG.jpg


In [328]:
persons_df[persons_df.name == "Steve McQueen"][["name", ]]

Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
2274,False,[Steve McQueen (III)],"Steve McQueen was born on October 9, 1969 in L...",1969-10-09,,2,,72757,nm2588606,Directing,Steve McQueen,"London, England, UK",0.84,/6w2fWkuhYQuv18OQi8cmSIq8srN.jpg
3917,False,[],He was the ultra-cool male film star of the 19...,1930-03-24,1980-11-07,2,,13565,nm0000537,Acting,Steve McQueen,"Beech Grove, IN",3.314,/b8LEJ08B4DMX2gsi5UTsYNRNJee.jpg


In [325]:
# Save dataframe as csv
persons_df.to_pickle("./data/cast_crew.pkl")