# Web scraping

In this part of the project, the goal is to retrieve the key information about the top 250 movies in a given genre, clean the data, and export it to a CSV file.

## 1 Imports

In [None]:
from bs4 import BeautifulSoup
from warnings import warn
from time import sleep
from random import randint
import requests
import numpy as np, pandas as pd

## 2 Scraping and data cleaning

1. Define the number of pages to be scraped
2. Create lists to populate Pandas dataframes later
3. Send a GET request for documentary movies
4. Begin movie extraction and data cleaning

In [None]:
# Scrape documentary movies from IMDb's title search.
# np.arange(1, 51, 50) yields just [1], so a single results page (start=1) is requested.
pages = np.arange(1, 51, 50)
headers = {
    'Accept-Language': 'en-US,en;q=0.8'
}

# Empty lists to store variables scraped; each movie appends exactly one
# entry to every list so they stay aligned for the DataFrame built later.
titles = []
years = []
ratings = []
genres = []
runtimes = []
imdb_ratings = []
metascores = []
directors = []
stars = []
votes = []


def _digits_to_int(text):
    """Return the digits found in *text* joined into an int, or None if there are none."""
    digits = ''.join(ch for ch in text if ch.isdigit())
    return int(digits) if digits else None


def _split_names(names):
    """Return a list for comma-separated names, otherwise the single stripped name."""
    names = names.strip()
    return names.split(', ') if ',' in names else names


def _parse_crew(crew_text):
    """Return (directors, stars) parsed from a 'Director: ... | Stars: ...' credit line.

    Each value is '' when its label is absent, a string for a single name and
    a list of strings for several names.
    """
    directors_, stars_ = '', ''
    for section in crew_text.replace('\n', '').split('|'):
        label, _, names = section.partition(':')
        label = label.strip()
        # Match on the label itself (singular or plural) rather than on the
        # position in the line, so a line with only stars is not read as directors.
        if label.startswith('Director'):
            directors_ = _split_names(names)
        elif label.startswith('Star'):
            stars_ = _split_names(names)
    return directors_, stars_


for page in pages:
    # timeout makes a stalled connection raise instead of blocking forever
    response = requests.get('https://m.imdb.com/search/title/?'
                            + 'title_type=movie&'
                            + 'genres=documentary&'
                            + f'start={page}&'
                            + 'ref_=adv_prv', headers=headers, timeout=30)

    # slow down requests so it doesn't overload IMDB's servers
    sleep(randint(8, 15))

    # Best-effort: report the failing URL and status, then still try to parse.
    if response.status_code != 200:
        warn(f'Request: {response.url}; Status code: {response.status_code}')

    page_html = BeautifulSoup(response.text, 'html.parser')
    movie_containers = page_html.find_all('div', class_='lister-item mode-advanced')

    # movies extraction
    for container in movie_containers:
        titles.append(container.h3.a.text)

        # Credits live in the <p> without a class; tolerate it being absent.
        crew_tag = container.find('p', class_='')
        directors_, stars_ = _parse_crew(crew_tag.text) if crew_tag is not None else ('', '')
        directors.append(directors_)
        stars.append(stars_)

        year_tag = container.h3.find('span', class_='lister-item-year text-muted unbold')
        years.append(_digits_to_int(year_tag.text) if year_tag is not None else None)

        certificate_tag = container.p.find('span', class_='certificate')
        ratings.append(certificate_tag.text if certificate_tag is not None else '')

        runtime_tag = container.p.find('span', class_='runtime')
        runtimes.append(_digits_to_int(runtime_tag.text) if runtime_tag is not None else None)

        genre_tag = container.p.find('span', class_='genre')
        genres.append(genre_tag.text.strip().split(', ') if genre_tag is not None else [])

        rating_tag = container.strong
        imdb_ratings.append(float(rating_tag.text) if rating_tag is not None else None)

        metascore_tag = container.find('span', class_='metascore')
        metascores.append(int(metascore_tag.text.strip()) if metascore_tag is not None else None)

        # The vote count is on the line right after the 'Votes:' label.
        vote = None
        votes_tag = container.find('p', class_='sort-num_votes-visible')
        if votes_tag is not None:
            vote_lines = votes_tag.text.split('\n')
            if 'Votes:' in vote_lines:
                value_index = vote_lines.index('Votes:') + 1
                if value_index < len(vote_lines):
                    vote = _digits_to_int(vote_lines[value_index])
        votes.append(vote)

## 3 Create dataframes and export to CSV

In [None]:
# Map each output column name to the list of scraped values that fills it;
# the insertion order here is the column order of the resulting frame.
columns = {}
columns['movie'] = titles
columns['year'] = years
columns['rating'] = ratings
columns['genre'] = genres
columns['runtime_min'] = runtimes
columns['imdb'] = imdb_ratings
columns['metascore'] = metascores
columns['votes'] = votes
columns['director(s)'] = directors
columns['stars'] = stars

documentary_df = pd.DataFrame(columns)

# Export the frame to CSV, leaving out the row index.
output_path = '../files/scraped_documentary_movies.csv'
documentary_df.to_csv(output_path, index=False)