# IMDB web scraper
### Scrape all movies released between 1970 - 2023

Feature Film, Released between 1970-01-01 and 2023-04-30 in the United States (sorted by Release Date, descending)

80,556 titles are returned and 100 titles are displayed on each page = 856 pages to scrape (loop)

https://www.imdb.com/search/title/?title_type=feature&release_date=1970-01-01%2C2023-04-30&countries=us&sort=release_date,desc&count=100&start=1

The start parameter refers to the movie # in the search results - increase it by 100 each loop.



From the terminal, cd into the correct directory
1. python3 -m venv ./venv
2. source ./venv/bin/activate
3. pip install pandas
4. pip install bs4
5. pip install requests
6. pip install jupyter notebook

Run using:
jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10


In [1]:
# import libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests
import random
import time


In [2]:
url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=1970-01-01%2C2023-04-30&countries=us&sort=release_date,desc&count=100&start='

headers ={
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'
}


In [3]:
# https://github.com/SkeyRahaman/Web-Scraping-for-IMDb-top-250-movies-details/blob/master/Web%20Scraping.ipynb
def director_and_actor(director_and_star):
    """Split a "Director(s): ... | Stars: ..." credit line into its two parts.

    Parameters
    ----------
    director_and_star : str
        Raw text of the credits paragraph of a search-result item. Newlines
        and the "|" separator are removed before parsing.

    Returns
    -------
    tuple of (str, str)
        ``(director, stars)``. The director part has its "Director:" /
        "Directors:" label removed and whitespace runs collapsed to single
        spaces; the stars part has any remaining colons removed and is
        stripped. A part is ``""`` when the text does not contain it, so a
        line with directors but no stars (or the reverse) still yields the
        part that is present instead of raising.
    """
    text = director_and_star.replace("\n", "").replace("|", "")

    # Split on the stars label. The plural form is tried first; the singular
    # "Star:" is accepted as well in case the credit line uses it.
    for label in ("Stars:", "Star:"):
        directors_part, found, stars_part = text.partition(label)
        if found:
            break
    # If neither label is present, partition leaves the whole text in
    # directors_part and stars_part empty.

    for label in ("Directors:", "Director:"):
        directors_part = directors_part.replace(label, "")

    # split()/join collapses any run of whitespace and trims both ends.
    director = " ".join(directors_part.split())
    stars = stars_part.replace(":", "").strip()
    return director, stars


# Scrape the paginated IMDb search results: one row per movie is collected
# into imdb_df and written to ./imdb_data.csv.
# Uses `url`, `headers` and `director_and_actor` defined in the cells above.
# Any field that cannot be found in the HTML is stored as an empty string.

wait_between_pages = 1   # seconds to pause before the next request
start = 1                # value appended to the `start=` URL parameter

_id = 1000000            # running id, incremented before each movie is stored

imdb_columns = ["_id", "movie_title", "year", "mpaa_rating", "runtime", "genre", "rating", "metascore", "summary", "director", "stars", "votes", "gross_revenue_us_canada", "imdb_url", "image_url"]

# Errors raised when an expected tag, list position or attribute is absent.
missing_field_errors = (AttributeError, IndexError, KeyError, TypeError)

# Rows are gathered in a list and turned into a DataFrame once at the end;
# concatenating onto the DataFrame for every movie copies it each time.
movie_rows = []

for page in range(1, 4):  # first 3 pages only; the notes put the full run at 856 pages = range(1, 857)

    scrape_url = url + str(start)
    print (scrape_url)

    # Announce the pause before taking it.
    print(f"Waiting {wait_between_pages} seconds to scrape \n {scrape_url}" )
    time.sleep(wait_between_pages)

    # Prepare the next iteration now so a failed request can simply `continue`.
    wait_between_pages = random.randint(3, 9)
    start = start + 100

    try:
        html_text = requests.get(scrape_url, headers=headers, timeout=30).text
    except requests.RequestException as err:
        # One unreachable page should not discard the rows collected so far.
        print(f"page #{page} could not be fetched, skipping: {err}")
        continue

    soup = BeautifulSoup(html_text, "html.parser")

    for movie_item in soup.find_all(class_='lister-item'):

        _id+=1

        movie_title = movie_item.find(class_='lister-item-header').a.text

        # The content container and its <p> children are used by several fields.
        content = movie_item.find(class_='lister-item-content')
        paragraphs = content.select('p') if content is not None else []

        # The year text can carry a disambiguator before the year, e.g.
        # "(I) (2023)"; keeping the first four digits handles any such prefix.
        try:
            year_text = movie_item.find(class_='lister-item-year').text
        except missing_field_errors:
            year_text = ""
        year = "".join(ch for ch in year_text if ch.isdigit())[:4]

        try:
            mpaa_rating = movie_item.find(class_='certificate').text
        except missing_field_errors:
            mpaa_rating = ""

        try:
            runtime = movie_item.find(class_='runtime').text.replace("min", "").strip()
        except missing_field_errors:
            runtime = ""

        try:
            genre = movie_item.find(class_='genre').text.strip()
        except missing_field_errors:
            genre = ""

        try:
            rating = movie_item.find(class_='ipl-rating-star__rating').text
        except missing_field_errors:
            rating = ""

        try:
            metascore = movie_item.find(class_='metascore').text
        except missing_field_errors:
            metascore = ""

        try:
            summary = paragraphs[1].text.strip()
        except missing_field_errors:
            summary = ""

        try:
            director, stars = director_and_actor(paragraphs[2].text.strip())
        except missing_field_errors:
            # Reset both values so the previous movie's credits are not reused.
            director = ""
            stars = ""

        try:
            votes = paragraphs[3].select('span')[1].text
        except missing_field_errors:
            votes = ""

        try:
            gross_revenue_us_canada = paragraphs[3].select('span')[4]['data-value']
            gross_revenue_us_canada = gross_revenue_us_canada.replace("$", "").replace(",", "").strip()
        except missing_field_errors:
            gross_revenue_us_canada = ""

        try:
            imdb_url = 'https://www.imdb.com' + content.find(class_='lister-item-header').a['href']
        except missing_field_errors:
            imdb_url = ""

        try:
            image_url = movie_item.find("div", class_='lister-item-image').a.img['loadlate']
        except missing_field_errors:
            image_url = ""

        movie_rows.append({
            "_id": _id,
            "movie_title": movie_title,
            "year": year,
            "mpaa_rating": mpaa_rating,
            "runtime": runtime,
            "genre": genre,
            "rating": rating,
            "metascore": metascore,
            "summary": summary,
            "director": director,
            "stars": stars,
            "votes": votes,
            "gross_revenue_us_canada": gross_revenue_us_canada,
            "imdb_url": imdb_url,
            "image_url": image_url,
        })

    print(f"page #{page} scraping is done.")


imdb_df = pd.DataFrame(movie_rows, columns=imdb_columns)

print(imdb_df.head())
imdb_df.to_csv("./imdb_data.csv")

https://www.imdb.com/search/title/?title_type=feature&release_date=1970-01-01%2C2023-04-30&countries=us&sort=release_date,desc&count=100&start=1
Waiting 1 seconds to scrape 
 https://www.imdb.com/search/title/?title_type=feature&release_date=1970-01-01%2C2023-04-30&countries=us&sort=release_date,desc&count=100&start=1
page #1 scraping is done.
https://www.imdb.com/search/title/?title_type=feature&release_date=1970-01-01%2C2023-04-30&countries=us&sort=release_date,desc&count=100&start=101
Waiting 6 seconds to scrape 
 https://www.imdb.com/search/title/?title_type=feature&release_date=1970-01-01%2C2023-04-30&countries=us&sort=release_date,desc&count=100&start=101
page #2 scraping is done.
https://www.imdb.com/search/title/?title_type=feature&release_date=1970-01-01%2C2023-04-30&countries=us&sort=release_date,desc&count=100&start=201
Waiting 5 seconds to scrape 
 https://www.imdb.com/search/title/?title_type=feature&release_date=1970-01-01%2C2023-04-30&countries=us&sort=release_date,desc&

Get movie details from the details pages.
        

In [4]:
        
#movie_detail_page = requests.get(scrape_url, headers=headers).text
#detail_soup = BeautifulSoup(movie_detail_page, "html.parser")         
#popularity_score = movie_item.find(class_='lister-item-header').a.text        
#hero-rating-bar__popularity__score

# Visit the details page of every movie listed in imdb_data.csv and save the
# extra fields to imdb_details.csv (one row per movie, blank where a field is
# not found). Uses `headers` defined in an earlier cell.

# Read the IMDb data from the CSV file
imdb_data = pd.read_csv('imdb_data.csv')

detail_columns = ['_id', 'popularity_score', 'release_date_us', 'estimated_budget', 'opening_weekend_domestic_date', 'opening_weekend_domestic_gross', 'worldwide_gross']

# Errors raised when an expected tag, list position or attribute is absent.
detail_lookup_errors = (AttributeError, IndexError, KeyError, TypeError)

# Rows are gathered in a list and converted to a DataFrame once at the end.
detail_rows = []

# Loop through each row in the IMDb data
for _index, row in imdb_data.iterrows():
    _id = row['_id']
    imdb_url = row['imdb_url']

    print(_id)

    if isinstance(imdb_url, str) and imdb_url:
        time.sleep(random.randint(3, 9))
        try:
            details_page_text = requests.get(imdb_url, headers=headers, timeout=30).text
        except requests.RequestException as err:
            # Keep going: this movie gets blank details instead of ending the run.
            print(f"{_id}: details page could not be fetched: {err}")
            details_page_text = ""
    else:
        # An empty imdb_url cell is read back from the CSV as NaN, not a URL.
        details_page_text = ""

    details_soup = BeautifulSoup(details_page_text, "html.parser")

    # popularity_score
    try:
        popularity_score = details_soup.find("div", attrs={"data-testid":"hero-rating-bar__popularity__score" }).text
    except detail_lookup_errors:
        popularity_score = ""

    # release date
    try:
        release_date_us = details_soup.find('li', attrs={'data-testid':'title-details-releasedate'}).find('li', class_='ipc-inline-list__item').text
        release_date_us = release_date_us.replace("(United States)", "").strip()
    except detail_lookup_errors:
        release_date_us = ""

    # estimated budget
    try:
        estimated_budget = details_soup.find('li', attrs={'data-testid':'title-boxoffice-budget'}).text
        estimated_budget = estimated_budget.replace("Budget","")
        estimated_budget = estimated_budget.replace("(estimated)","")
        estimated_budget = estimated_budget.replace("$","").replace(",","").strip()
    except detail_lookup_errors:
        estimated_budget = ""

    # opening weekend (us & canada): the gross and the date come from the same
    # list item, so its spans are looked up once.
    # NOTE(review): select() takes a CSS selector; the class_ keyword is
    # presumably ignored here, so indexes 1 and 2 count over all <span>
    # descendants - confirm before changing the selector or the indexes.
    try:
        opening_weekend_spans = details_soup.find('li', attrs={'data-testid':'title-boxoffice-openingweekenddomestic'}).select('span', class_='ipc-metadata-list-item__list-content-item')
    except detail_lookup_errors:
        opening_weekend_spans = []

    # opening weekend domestic gross (us & canada)
    try:
        opening_weekend_domestic_gross = opening_weekend_spans[1].text
        opening_weekend_domestic_gross = opening_weekend_domestic_gross.replace("$", "").replace(",", "").strip()
    except detail_lookup_errors:
        opening_weekend_domestic_gross = ""

    # opening weekend date (us & canada)
    try:
        opening_weekend_domestic_date = opening_weekend_spans[2].text
    except detail_lookup_errors:
        opening_weekend_domestic_date = ""

    # worldwide gross
    try:
        worldwide_gross = details_soup.find('li', attrs={'data-testid':'title-boxoffice-cumulativeworldwidegross'}).find('span', class_='ipc-metadata-list-item__list-content-item').text
        worldwide_gross = worldwide_gross.replace("$", "").replace(",", "").strip()
    except detail_lookup_errors:
        worldwide_gross = ""

    # Values are listed in the same order as detail_columns.
    detail_rows.append([_id, popularity_score, release_date_us, estimated_budget, opening_weekend_domestic_date, opening_weekend_domestic_gross, worldwide_gross])

# Create the DataFrame that stores the IMDb details
imdb_details = pd.DataFrame(detail_rows, columns=detail_columns)

# Write the IMDb details to a new CSV file
imdb_details.to_csv('imdb_details.csv', index=False)
    






        

1000001
1000002
1000003
1000004
1000005
1000006
1000007
1000008
1000009
1000010
1000011
1000012
1000013
1000014
1000015
1000016
1000017
1000018
1000019
1000020
1000021
1000022
1000023
1000024
1000025
1000026
1000027
1000028
1000029
1000030
1000031
1000032
1000033
1000034
1000035
1000036
1000037
1000038
1000039
1000040
1000041
1000042
1000043
1000044
1000045
1000046
1000047
1000048
1000049
1000050
1000051
1000052
1000053
1000054
1000055
1000056
1000057
1000058
1000059
1000060
1000061
1000062
1000063
1000064
1000065
1000066
1000067
1000068
1000069
1000070
1000071
1000072
1000073
1000074
1000075
1000076
1000077
1000078
1000079
1000080
1000081
1000082
1000083
1000084
1000085
1000086
1000087
1000088
1000089
1000090
1000091
1000092
1000093
1000094
1000095
1000096
1000097
1000098
1000099
1000100
1000101
1000102
1000103
1000104
1000105
1000106
1000107
1000108
1000109
1000110
1000111
1000112
1000113
1000114
1000115
1000116
1000117
1000118
1000119
1000120
1000121
1000122
1000123
1000124
1000125


scrape directors and stars