In [8]:
import os
import requests
from pathlib import Path
import pandas as pd
import seaborn as sns

### IMDb Non-Commercial Datasets

This is a non-commercial dataset, can be used for personal and non-commercial purposes: found here https://developer.imdb.com/non-commercial-datasets/

The dataset files can be accessed and downloaded from https://datasets.imdbws.com/. The data is refreshed daily. `TODO:` run automated ETL for this

Dataset details: - Each dataset is contained in a gzipped, tab-separated-values (TSV) formatted file in the UTF-8 character set. The first line in each file contains headers that describe what is in each column. A '\N' is used to denote that a particular field is missing or null for that title/name. The available datasets are as follows:


In [3]:
# Define the target directory relative to the notebook location, and create if it doesn't exist
output_dir = Path('datasets/imdb_non_commercial')
output_dir.mkdir(parents=True, exist_ok=True)

In [6]:
# List of dataset URLs from the IMDB website
urls = [
    'https://datasets.imdbws.com/name.basics.tsv.gz',
    'https://datasets.imdbws.com/title.akas.tsv.gz',
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    'https://datasets.imdbws.com/title.crew.tsv.gz',
    'https://datasets.imdbws.com/title.episode.tsv.gz',
    'https://datasets.imdbws.com/title.principals.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz'
]

Download all datasets. Note these will be zipped

In [7]:
for url in urls:
    # Extract filename from URL
    filename = url.split('/')[-1]
    filepath = output_dir / filename
    
    print(f"Downloading {filename}...")
    
    # Download the file; raise an error for bad status codes
    response = requests.get(url, stream=True)
    response.raise_for_status()
    
    # Save to file
    with open(filepath, 'wb') as f:
        # 8kb in memory at a time (memory efficiency)
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    
    print(f"Saved to {filepath}")

print("\nAll datasets downloaded successfully!")

Downloading name.basics.tsv.gz...
Saved to datasets/imdb_non_commercial/name.basics.tsv.gz
Downloading title.akas.tsv.gz...
Saved to datasets/imdb_non_commercial/title.akas.tsv.gz
Downloading title.basics.tsv.gz...
Saved to datasets/imdb_non_commercial/title.basics.tsv.gz
Downloading title.crew.tsv.gz...
Saved to datasets/imdb_non_commercial/title.crew.tsv.gz
Downloading title.episode.tsv.gz...
Saved to datasets/imdb_non_commercial/title.episode.tsv.gz
Downloading title.principals.tsv.gz...
Saved to datasets/imdb_non_commercial/title.principals.tsv.gz
Downloading title.ratings.tsv.gz...
Saved to datasets/imdb_non_commercial/title.ratings.tsv.gz

All datasets downloaded successfully!


### Dataset exploration

In [13]:
def read_sample_df(data_path, sep='\t', compression='gzip', top_rows=10000):
    # read first rows for review
    df_sample= pd.read_csv(
        data_path,
        sep=sep,
        compression=compression,
        nrows=top_rows
    )
    return df_sample

In [14]:
name_basics_sample = read_sample_df('datasets/imdb_non_commercial/name.basics.tsv.gz')
title_akas_sample = read_sample_df('datasets/imdb_non_commercial/title.akas.tsv.gz')
title_basics_sample = read_sample_df('datasets/imdb_non_commercial/title.basics.tsv.gz')
title_crew_sample = read_sample_df('datasets/imdb_non_commercial/title.crew.tsv.gz')
title_episode_sample = read_sample_df('datasets/imdb_non_commercial/title.episode.tsv.gz')
title_principals_sample = read_sample_df('datasets/imdb_non_commercial/title.principals.tsv.gz')
title_ratings_sample = read_sample_df('datasets/imdb_non_commercial/title.ratings.tsv.gz')

In [16]:
name_basics_sample.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0083922,tt0050976"


In [18]:
title_akas_sample.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita,\N,\N,original,\N,1
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita,US,\N,imdbDisplay,\N,0
3,tt0000001,4,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
4,tt0000001,5,Καρμενσίτα,GR,\N,imdbDisplay,\N,0


In [19]:
title_basics_sample.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short


In [20]:
title_crew_sample.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,nm0721526
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [21]:
title_episode_sample.head()

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0031458,tt32857063,\N,\N
1,tt0041951,tt0041038,1,9
2,tt0042816,tt0989125,1,17
3,tt0042889,tt0989125,\N,\N
4,tt0043426,tt0040051,3,42


In [22]:
title_principals_sample.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0005690,producer,producer,\N
3,tt0000001,4,nm0374658,cinematographer,director of photography,\N
4,tt0000002,1,nm0721526,director,\N,\N


In [23]:
title_ratings_sample.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2182
1,tt0000002,5.5,302
2,tt0000003,6.4,2262
3,tt0000004,5.2,194
4,tt0000005,6.2,2999
