# Web-Scraping the World's most famous Movie Directors

This notebook was used to run a web scraper that extracts a list of:
- The world's most famous movie directors
- The world's most famous actors

This data will be used to feature-engineer two new columns in our dataset:
- whether the movie contains at least one top actor
- whether the movie contains at least one top director

In [None]:
# web-scraping packages
import requests
from bs4 import BeautifulSoup
import numpy as np


urls = []  # NOTE: never populated or read in this cell

# Query string appended to each list URL; the page number is concatenated
# onto the end to request one specific page of the list.
query = '?sort=list_order,asc&mode=detail&page='

# Lists of famous/popular/top100 Female directors
link1 = 'https://www.imdb.com/list/ls003532091/'
link2 = 'https://www.imdb.com/list/ls025705523/'
link3 = 'https://www.imdb.com/list/ls008961702/'
link4 = 'https://www.imdb.com/list/ls000660785/'
link5 = 'https://www.imdb.com/list/ls062233292/'

# Lists of famous/popular/top100 Male directors
link7 = 'https://www.imdb.com/list/ls056848274/'
link8 = 'https://www.imdb.com/list/ls073773341/'
link9 = 'https://www.imdb.com/list/ls000005319/'
link10 = 'https://www.imdb.com/list/ls008344500/'
link11 = 'https://www.imdb.com/list/ls050328773/'

links = [link1, link2, link3, link4, link5, link7, link8, link9, link10, link11]
directors = []

# Seconds to wait for a response before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
request_timeout = 30

# Scraping each link
for index, link in enumerate(links, start=1):
    # The total comes from the list itself so the progress message cannot
    # drift out of sync with the number of links actually scraped.
    print("checking link", index, "/" + str(len(links)))
    # scraping pages 1 to 8 of each link
    for page_number in range(1, 9):
        url1 = link + query + str(page_number)
        page1 = requests.get(url1, timeout=request_timeout)
        if not page1.ok:
            # Report failed requests instead of silently parsing an error
            # page (which presumably contains no name cards).
            print("skipping", url1, "- HTTP status", page1.status_code)
            continue
        soup1 = BeautifulSoup(page1.content, "html.parser")
        director_cards1 = soup1.find_all("h3", class_="lister-item-header")

        # scraping each name of each page
        for director in director_cards1:
            # Drops the first 5 characters of the header text, then trims
            # whitespace. Presumably this removes the rank prefix in front
            # of the name -- TODO confirm against the page markup.
            director_name = director.text[5:].strip()
            directors.append(director_name)
            print(director_name, 'added')


print(directors)


# saving final result to a CSV, one name per line
# NOTE: np.savetxt prepends its default comment marker '# ' to the header,
# so the first line of the file reads '# director_name'.
np.savetxt("ultimate_directors_list.csv", 
           directors,
           delimiter =", ", 
           fmt ='% s',
           header='director_name')

# Web-Scraping the World's most famous Movie Actors

The logic is the same as for the directors above, but applied to lists of female and male actors.

In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np


urls = []  # NOTE: never populated or read in this cell

# Query string appended to each URL; the page number is concatenated onto
# the end to request one specific page.
query = '?sort=list_order,asc&mode=detail&page='

# Actresses
link1 = 'https://www.imdb.com/list/ls004660971/'
link2 = 'https://www.imdb.com/list/ls057000570/'
link3 = 'https://www.imdb.com/list/ls022928836/'
link4 = 'https://www.imdb.com/list/ls063784435/'
link5 = 'https://www.imdb.com/list/ls000079132/'
link6 = 'https://www.imdb.com/list/ls070988001/'
# Male Actors
link7 = 'https://www.imdb.com/list/ls000519237/'
link8 = 'https://www.imdb.com/list/ls004602612/'
link9 = 'https://www.imdb.com/list/ls005155812/'
link10 = 'https://www.imdb.com/list/ls056919463/'
link11 = 'https://www.imdb.com/list/ls057537832/'

# both male and female
link12 = 'https://www.imdb.com/search/name/?gender=male,female/'
link13 = 'https://www.imdb.com/list/ls082599715/'
link14 = 'https://www.imdb.com/list/ls058011111/'

links = [link1, link2, link3, link4, link5, link6, link7, link8, link9, link10, link11, link12, link13, link14]

actors = []

# Seconds to wait for a response before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
request_timeout = 30

# Scraping each link
for index, link in enumerate(links, start=1):
    print("checking link", index, "/" + str(len(links)))
    # scraping pages 1 to 8 of each link
    for page_number in range(1, 9):
        if '?' in link:
            # The link already carries a query string (link12): appending
            # `query` verbatim would put a second '?' in the URL and fold
            # the sort parameter into the gender value. Join with '&'
            # instead, dropping the stray trailing '/'.
            # NOTE(review): whether the search endpoint honours `sort` and
            # `page` the way list pages do is not verified -- TODO confirm.
            url1 = link.rstrip('/') + '&' + query.lstrip('?') + str(page_number)
        else:
            url1 = link + query + str(page_number)
        page1 = requests.get(url1, timeout=request_timeout)
        if not page1.ok:
            # Report failed requests instead of silently parsing an error
            # page (which presumably contains no name cards).
            print("skipping", url1, "- HTTP status", page1.status_code)
            continue
        soup1 = BeautifulSoup(page1.content, "html.parser")
        actor_cards1 = soup1.find_all("h3", class_="lister-item-header")

        # scraping each name of each page
        for actor in actor_cards1:
            # Drops the first 5 characters of the header text, then trims
            # whitespace. Presumably this removes the rank prefix in front
            # of the name -- TODO confirm against the page markup.
            actor_name = actor.text[5:].strip()
            actors.append(actor_name)
            print(actor_name, 'added')


print(actors)


# saving final result to a CSV, one name per line
# NOTE: np.savetxt prepends its default comment marker '# ' to the header,
# so the first line of the file reads '# actor_name'.
np.savetxt("ultimate_actors_list.csv", 
           actors,
           delimiter =", ", 
           fmt ='% s',
           header='actor_name')