# Classic Star Trek episode IMDb ratings 

In this notebook I retrieve the episode ratings for all of the classic* live-action Star Trek TV series from the [IMDb](https://www.imdb.com/) website, clean the data up (numeric ratings and vote counts, parsed air dates), and save it as pickled `pandas` DataFrames, plus a combined CSV, for ease of future use. 

I don't have any experience with web scraping - this project was partially an excuse to get some - so I mostly followed the simple tutorial [here](https://isabella-b.com/blog/scraping-episode-imdb-ratings-tutorial/). 

<sub>*defined as "not currently airing new episodes"</sub>

In [6]:
import pandas as pd
import pathlib

from requests import get
from bs4 import BeautifulSoup    

In [7]:
def _tag_text(tag, default=''):
    """Return the stripped text of a parsed HTML tag, or *default* if the tag is None."""
    return default if tag is None else tag.text.strip()


def _parse_votes(label):
    """Convert a vote-count label such as '(1,234)' to an int; an empty label counts as 0."""
    digits = label.strip().strip('()').replace(',', '')
    return int(digits) if digits else 0


def get_episodes_df(show_id, seasons, timeout=30):
    """
    Scrape IMDb episode listings for a show and return them as a dataframe.

    One page is requested per season from
    ``https://www.imdb.com/title/tt{show_id}/episodes?season={sn}`` and every
    ``<div class="info">`` element on it is treated as one episode.

    Parameters
    ----------
    show_id : str
        Numeric part of the IMDb title id, without the ``tt`` prefix
        (e.g. ``"0092455"``).
    seasons : iterable
        Season numbers to fetch, e.g. ``range(1, 8)``.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so that a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per episode with the columns ``season``, ``episode_number``
        (int), ``title``, ``airdate`` (datetime), ``rating`` (float),
        ``total_votes`` (int) and ``desc``. An episode whose listing has no
        rating element gets a NaN rating and 0 votes instead of aborting the
        whole scrape; a missing air date or description is stored as an
        empty string before the final type conversion.

    Raises
    ------
    requests.exceptions.RequestException
        If a request times out or the server answers with an HTTP error
        status (raised by ``response.raise_for_status()``).

    Notes
    -----
    If a page contains no ``<div class="info">`` elements, that season simply
    contributes no rows.

    TODO:
        1. Get show_id based on string e.g., "TNG" -> 0092455.
        2. How to get range of seasons? Breaking limit defaults to last season...
    """
    columns = ('season', 'episode_number', 'title', 'airdate',
               'rating', 'total_votes', 'desc')
    episodes = {name: [] for name in columns}

    # Iterate over seasons
    for sn in seasons:
        response = get(
            f'https://www.imdb.com/title/tt{show_id}/episodes?season={sn}',
            timeout=timeout,
        )
        # Fail loudly on 4xx/5xx rather than parsing an error page into zero rows.
        response.raise_for_status()

        # Parse content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # For each episode in each season
        for episode in page_html.find_all('div', class_='info'):
            # find() returns None when an element is absent, so every optional
            # field goes through _tag_text instead of calling .text directly.
            rating = _tag_text(episode.find('span', class_='ipl-rating-star__rating'))
            votes = _tag_text(episode.find('span', class_='ipl-rating-star__total-votes'))

            episodes['season'].append(sn)
            episodes['episode_number'].append(episode.meta['content'])
            episodes['title'].append(episode.a['title'])
            episodes['airdate'].append(_tag_text(episode.find('div', class_='airdate')))
            episodes['rating'].append(float(rating) if rating else float('nan'))
            episodes['total_votes'].append(_parse_votes(votes))
            episodes['desc'].append(_tag_text(episode.find('div', class_='item_description')))

    # Convert to dataframe
    df = pd.DataFrame(episodes)

    # Clean up: the explicit casts also give an empty result (no episodes
    # found) the right column dtypes.
    df['episode_number'] = df.episode_number.astype(int)
    df['total_votes'] = df.total_votes.astype(int)
    df['rating'] = df.rating.astype(float)
    # Airdate to datetime
    df['airdate'] = pd.to_datetime(df.airdate)

    return df

Now let's create and save episode info dataframes for each of the series, as well as one big dataframe which combines them all. 

In [8]:
# Series abbreviation -> (IMDb title id without the "tt" prefix, number of seasons)
shows_and_num_seasons = {
    "TOS": ("0060028", 3),
    "TNG": ("0092455", 7),
    "DS9": ("0106145", 7),
    "VOY": ("0112178", 7),
    "ENT": ("0244365", 4),
}

# Output directory for every saved file; created if it does not exist yet
dest = pathlib.Path('imdb-ratings/')
dest.mkdir(parents=True, exist_ok=True)

all_trek = []
for show, (imdb_id, n_seasons) in shows_and_num_seasons.items():
    # Seasons are numbered from 1 up to and including n_seasons
    df = get_episodes_df(imdb_id, range(1, n_seasons + 1))
    # The per-series pickle is written before the "series" column is added,
    # so only the combined dataframe carries that column
    df.to_pickle(dest / show)
    df["series"] = show
    all_trek.append(df)

# Stack the per-series frames into one dataframe with a fresh 0..N-1 index
trek = pd.concat(all_trek, ignore_index=True)
trek.to_pickle(dest / "TREK")
# Human-readable copy of the combined dataframe
trek.to_csv(dest / "TREK.csv")