# 01. Data Loading

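This notebook loads the raw IMDb datasets. We load each compressed TSV file into a pandas DataFrame and inspect its basic information (columns, data types, memory usage) to confirm that everything was loaded correctly. Note that `DataFrame.info()` omits non-null counts once a frame exceeds pandas' `display.max_info_rows` limit, unless `show_counts=True` is passed.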

In [2]:
import pandas as pd
import os

# Folder that holds the raw IMDb archives (relative path)
DATA_DIR = '../data/raw'

# Report every gzip archive found in that folder, in alphabetical order
print("Available files:")
gz_archives = [entry for entry in sorted(os.listdir(DATA_DIR)) if entry.endswith('.gz')]
for archive in gz_archives:
    print(archive)

Available files:
name.basics.tsv.gz
title.akas.tsv.gz
title.basics.tsv.gz
title.crew.tsv.gz
title.episode.tsv.gz
title.principals.tsv.gz
title.ratings.tsv.gz


## Load Datasets

We will load the following datasets:
- `name.basics.tsv.gz`
- `title.akas.tsv.gz`
- `title.basics.tsv.gz`
- `title.crew.tsv.gz`
- `title.episode.tsv.gz`
- `title.principals.tsv.gz`
- `title.ratings.tsv.gz`

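Since these are TSV files, we use `sep='\t'`. IMDb marks missing values with the literal string `\N`; we stick to default loading for now, so those fields are read as the text `\N` (which is why columns such as `birthYear` come in as strings rather than numbers). Converting them to real nulls is handled in the next stage, during cleaning.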

In [3]:
def load_imdb_dataset(filename, folder=DATA_DIR, **read_csv_kwargs):
    """Load one gzip-compressed IMDb TSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Name of the archive inside ``folder`` (e.g. ``'title.ratings.tsv.gz'``).
    folder : str, optional
        Directory containing the archive. Defaults to ``DATA_DIR``.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv`` (for example
        ``nrows``, ``usecols``, ``dtype`` or ``na_values``). They take
        precedence over the defaults set in this function.

    Returns
    -------
    pandas.DataFrame
        The parsed table. A progress line is printed before loading and the
        resulting shape is printed afterwards.
    """
    path = os.path.join(folder, filename)
    print(f"Loading {filename}...")
    # low_memory=False turns off chunked type inference, which avoids the
    # DtypeWarning pandas emits for mixed-type columns; dtypes can be
    # optimized later if needed.
    options = {'sep': '\t', 'compression': 'gzip', 'low_memory': False}
    # Caller-supplied options win over the defaults above.
    options.update(read_csv_kwargs)
    df = pd.read_csv(path, **options)
    print(f"Loaded {filename}: {df.shape}")
    return df

In [4]:
# People table (name.basics): read the archive, then preview the first rows
name_basics = load_imdb_dataset(filename='name.basics.tsv.gz')
name_basics.head(n=5)

Loading name.basics.tsv.gz...
Loaded name.basics.tsv.gz: (15053659, 6)


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924,2014,"actress,miscellaneous,soundtrack","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934,2025,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [5]:
# Alternative titles table (title.akas): read the archive, then preview the first rows
title_akas = load_imdb_dataset(filename='title.akas.tsv.gz')
title_akas.head(n=5)

Loading title.akas.tsv.gz...
Loaded title.akas.tsv.gz: (54779348, 8)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Carmencita,\N,\N,original,\N,1
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita,US,\N,imdbDisplay,\N,0
3,tt0000001,4,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
4,tt0000001,5,Καρμενσίτα,GR,\N,imdbDisplay,\N,0


In [6]:
# Core title table (title.basics): read the archive, then preview the first rows
title_basics = load_imdb_dataset(filename='title.basics.tsv.gz')
title_basics.head(n=5)

Loading title.basics.tsv.gz...
Loaded title.basics.tsv.gz: (12233603, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short


In [7]:
# Directors/writers table (title.crew): read the archive, then preview the first rows
title_crew = load_imdb_dataset(filename='title.crew.tsv.gz')
title_crew.head(n=5)

Loading title.crew.tsv.gz...
Loaded title.crew.tsv.gz: (12233603, 3)


Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,nm0721526
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [8]:
# Episode table (title.episode): read the archive, then preview the first rows
title_episode = load_imdb_dataset(filename='title.episode.tsv.gz')
title_episode.head(n=5)

Loading title.episode.tsv.gz...
Loaded title.episode.tsv.gz: (9451839, 4)


Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0031458,tt32857063,\N,\N
1,tt0041951,tt0041038,1,9
2,tt0042816,tt0989125,1,17
3,tt0042889,tt0989125,\N,\N
4,tt0043426,tt0040051,3,42


In [9]:
# Principal cast/crew table (title.principals): read the archive, then preview the first rows
title_principals = load_imdb_dataset(filename='title.principals.tsv.gz')
title_principals.head(n=5)

Loading title.principals.tsv.gz...
Loaded title.principals.tsv.gz: (97373180, 6)


Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0005690,producer,producer,\N
3,tt0000001,4,nm0374658,cinematographer,director of photography,\N
4,tt0000002,1,nm0721526,director,\N,\N


In [10]:
# Ratings table (title.ratings): read the archive, then preview the first rows
title_ratings = load_imdb_dataset(filename='title.ratings.tsv.gz')
title_ratings.head(n=5)

Loading title.ratings.tsv.gz...
Loaded title.ratings.tsv.gz: (1627720, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2188
1,tt0000002,5.5,308
2,tt0000003,6.5,2289
3,tt0000004,5.1,196
4,tt0000005,6.2,3019


## Dataset Info Check

In [11]:
# Print a labelled DataFrame.info() summary for each loaded table, in load order.
# The pairs are iterated inline so no extra container keeps the large frames alive.
for header, frame in (
    ("--- Name Basics ---", name_basics),
    ("\n--- Title Akas ---", title_akas),
    ("\n--- Title Basics ---", title_basics),
    ("\n--- Title Crew ---", title_crew),
    ("\n--- Title Episode ---", title_episode),
    ("\n--- Title Principals ---", title_principals),
    ("\n--- Title Ratings ---", title_ratings),
):
    print(header)
    frame.info()

--- Name Basics ---
<class 'pandas.DataFrame'>
RangeIndex: 15053659 entries, 0 to 15053658
Data columns (total 6 columns):
 #   Column             Dtype
---  ------             -----
 0   nconst             str  
 1   primaryName        str  
 2   birthYear          str  
 3   deathYear          str  
 4   primaryProfession  str  
 5   knownForTitles     str  
dtypes: str(6)
memory usage: 689.1 MB

--- Title Akas ---
<class 'pandas.DataFrame'>
RangeIndex: 54779348 entries, 0 to 54779347
Data columns (total 8 columns):
 #   Column           Dtype
---  ------           -----
 0   titleId          str  
 1   ordering         int64
 2   title            str  
 3   region           str  
 4   language         str  
 5   types            str  
 6   attributes       str  
 7   isOriginalTitle  int64
dtypes: int64(2), str(6)
memory usage: 3.3 GB

--- Title Basics ---
<class 'pandas.DataFrame'>
RangeIndex: 12233603 entries, 0 to 12233602
Data columns (total 9 columns):
 #   Column          Dtyp