In [None]:
import csv
import gzip
import os
import shutil
from datetime import datetime

import pandas as pd
import requests
from IPython.display import Markdown

# Download, decompress and load each IMDb dump listed below.
# Every dataset ends up as a notebook-level DataFrame named after its key,
# with '.' replaced by '_' (e.g. 'title.basics' -> df_title_basics).

# define dataset URLs you want to load
dataset_urls = {
    'title.basics': 'https://datasets.imdbws.com/title.basics.tsv.gz',
    'title.ratings': 'https://datasets.imdbws.com/title.ratings.tsv.gz',
    'name.basics': 'https://datasets.imdbws.com/name.basics.tsv.gz',
    'title.principals': 'https://datasets.imdbws.com/title.principals.tsv.gz',
    'title.crew': 'https://datasets.imdbws.com/title.crew.tsv.gz',
    'title.episode': 'https://datasets.imdbws.com/title.episode.tsv.gz',
    'title.akas': 'https://datasets.imdbws.com/title.akas.tsv.gz',
}

data_dir = 'imdb_data'
os.makedirs(data_dir, exist_ok=True)

# (connect, read) timeouts in seconds, so a stalled connection raises
# instead of hanging the kernel forever.
REQUEST_TIMEOUT = (10, 300)

for name, url in dataset_urls.items():
    gz_path = os.path.join(data_dir, f'{name}.tsv.gz')
    tsv_path = os.path.join(data_dir, f'{name}.tsv')

    # download: stream the archive straight to disk. raise_for_status() stops
    # here on an HTTP error, so an error page is never saved as if it were
    # the archive. The `with` block releases the connection afterwards.
    print(f'Downloading {url} …')
    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as resp:
        resp.raise_for_status()
        with open(gz_path, 'wb') as f:
            shutil.copyfileobj(resp.raw, f)

    # decompress
    print(f'Decompressing {gz_path} → {tsv_path} …')
    with gzip.open(gz_path, 'rb') as f_in:
        with open(tsv_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    # the compressed archive is not needed once the .tsv has been written
    os.remove(gz_path)

    # load into pandas
    # dtype=str keeps every column as text, so the '\N' placeholders seen in
    # the data stay literal strings for the cells below to handle.
    # QUOTE_NONE: the dumps are plain tab-separated text, so a double quote
    # inside a title must be kept as data and not read as the start of a
    # quoted field (which can silently swallow the rows that follow).
    print(f'Loading {tsv_path} into DataFrame …')
    df = pd.read_csv(
        tsv_path,
        sep='\t',
        dtype=str,
        low_memory=False,
        quoting=csv.QUOTE_NONE,
    )
    print(name, 'loaded:', df.shape)
    # expose the frame as a notebook-level variable (df_title_basics, df_name_basics, ...)
    globals()[f'df_{name.replace(".", "_")}'] = df



Downloading https://datasets.imdbws.com/title.basics.tsv.gz …
Decompressing imdb_data\title.basics.tsv.gz → imdb_data\title.basics.tsv …
Decompressing imdb_data\title.basics.tsv.gz → imdb_data\title.basics.tsv …
Loading imdb_data\title.basics.tsv into DataFrame …
Loading imdb_data\title.basics.tsv into DataFrame …
title.basics loaded: (12112601, 9)
Downloading https://datasets.imdbws.com/title.ratings.tsv.gz …
title.basics loaded: (12112601, 9)
Downloading https://datasets.imdbws.com/title.ratings.tsv.gz …
Decompressing imdb_data\title.ratings.tsv.gz → imdb_data\title.ratings.tsv …
Decompressing imdb_data\title.ratings.tsv.gz → imdb_data\title.ratings.tsv …
Loading imdb_data\title.ratings.tsv into DataFrame …
Loading imdb_data\title.ratings.tsv into DataFrame …
title.ratings loaded: (1607373, 3)
Downloading https://datasets.imdbws.com/name.basics.tsv.gz …
title.ratings loaded: (1607373, 3)
Downloading https://datasets.imdbws.com/name.basics.tsv.gz …
Decompressing imdb_data\name.basics.

## **Question**

#### **How many total people are in the data set?**

In [None]:
# Each row of name.basics is treated as one person, so the row count is the answer
len(df_name_basics)


14909809

#### **What is the earliest year of birth?**

In [None]:
# Birth years are stored as text and '\N' stands for "unknown":
# drop those placeholders, cast the rest to integers and keep the minimum.
known_birth_years = df_name_basics['birthYear'].replace('\\N', pd.NA).dropna()
earliest_year = known_birth_years.astype(int).min()

# Everybody whose birth year equals that minimum; the comparison is done
# on text because the column still holds strings.
born_in_earliest_year = df_name_basics['birthYear'] == str(earliest_year)
df_earliest = df_name_basics[born_in_earliest_year]

display(df_earliest)
earliest_birth_label = df_earliest['birthYear'].values[0]
Markdown(f"##### _The earliest person born in the dataset was in year **{earliest_birth_label}**._")

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
737970,nm0784172,Lucio Anneo Seneca,4,65,writer,"tt0043802,tt0218822,tt0049203,tt0972562"


##### _The earliest person born in the dataset was in year **4**._

#### **How many years ago was this person born?**

In [None]:
# `datetime` is imported in the notebook's import cell at the top.
# The current year is read from the local clock at execution time, so this
# answer changes when the notebook is re-run in a later year.
current_year = datetime.now().year

# number of years between the earliest birth year found above and today
year_ago = current_year - earliest_year
Markdown(f"##### _It was **{year_ago}** years ago._")

##### _It was **2021** years ago._

#### **Using only the data in the data set, determine if this date of birth is correct.**

In [None]:
# Internal-consistency checks on the earliest-born person(s): is the recorded
# birth year believable given the death year and the listed professions?

# Upper bound, in years, for a humanly plausible lifespan.
MAX_PLAUSIBLE_LIFESPAN = 150

# Professions accepted for a person born in antiquity.
# '\N' is the dataset's placeholder for "no profession listed".
non_modern_professions = {'writer', 'miscellaneous', '\\N'}

# Work on a copy so the slice taken from df_name_basics is never modified
# (also avoids SettingWithCopyWarning).
df_earliest = df_earliest.copy()

# Convert both year columns to nullable integers. errors='coerce' turns any
# non-numeric text (including the '\N' placeholder) into <NA>, and the
# conversion is a no-op if the cell is run a second time.
for year_col in ('birthYear', 'deathYear'):
    df_earliest[year_col] = pd.to_numeric(
        df_earliest[year_col],
        errors='coerce'
    ).astype('Int64')

# Check if he is dead
df_earliest['is_dead'] = df_earliest['deathYear'].notna()
# Check if death year is after birth year
df_earliest['death_after_birth'] = df_earliest['deathYear'] > df_earliest['birthYear']
# Check if he lived a plausible life span
lifespan = df_earliest['deathYear'] - df_earliest['birthYear']
df_earliest['plausible_life'] = lifespan < MAX_PLAUSIBLE_LIFESPAN

# Split the comma-separated primaryProfession into a clean list.
# Empty tokens are dropped so that a missing value yields an empty list
# (treated as "nothing modern listed", the same way '\N' is) instead of
# [''], which would wrongly fail the check below.
df_earliest['profession_list'] = (
    df_earliest['primaryProfession']
    .fillna('')
    .str.split(',')
    .apply(lambda tokens: [p.strip() for p in tokens if p.strip()])
)

# profession_coherent = True if every listed profession is non-modern
# (vacuously True when nothing is listed)
df_earliest['profession_coherent'] = df_earliest['profession_list'].apply(
    non_modern_professions.issuperset
)

# Determine if the birthdate is credible with all conditions met
df_earliest['credible_birthdate'] = (
    df_earliest['is_dead'] &
    df_earliest['death_after_birth'] &
    df_earliest['plausible_life'] &
    df_earliest['profession_coherent']
)

result = df_earliest[[
    'primaryName',
    'birthYear',
    'deathYear',
    'is_dead',
    'death_after_birth',
    'plausible_life',
    'profession_coherent',
    'credible_birthdate'
]].iloc[0]

display(result)
Markdown(
    f"##### _Birth year **{df_earliest['birthYear'].values[0]}** is "
    f"**{'CREDIBLE' if df_earliest['credible_birthdate'].values[0] else 'IMPLAUSIBLE'}** here._"
)

primaryName            Lucio Anneo Seneca
birthYear                               4
deathYear                              65
is_dead                              True
death_after_birth                    True
plausible_life                       True
profession_coherent                  True
credible_birthdate                   True
Name: 737970, dtype: object

##### _Birth year **4** is **CREDIBLE** here._

The code below extracts every profession present in the dataset; the non-modern ones used above were picked from this list:

In [None]:
# Distinct professions in the dataset: primaryProfession is a comma-separated
# string, so split every cell, flatten the pieces into one long Series and
# trim whitespace before de-duplicating.
profession_tokens = df_name_basics["primaryProfession"].dropna().str.split(",").explode()
all_professions = profession_tokens.str.strip().unique()

# One markdown bullet per profession, in order of first appearance
profession_bullets = "\n".join(f"- **{p}**" for p in all_professions)

Markdown(f"##### _{len(all_professions)} professions:_\n\n" + profession_bullets)


##### _47 professions:_

- **actor**
- **miscellaneous**
- **producer**
- **actress**
- **soundtrack**
- **music_department**
- **writer**
- **director**
- **stunts**
- **make_up_department**
- **archive_footage**
- **composer**
- **assistant_director**
- **camera_department**
- **music_artist**
- **production_department**
- **editor**
- **cinematographer**
- **executive**
- **visual_effects**
- **costume_designer**
- **script_department**
- **art_director**
- **editorial_department**
- **costume_department**
- **animation_department**
- **art_department**
- **talent_agent**
- **archive_sound**
- **choreographer**
- **production_designer**
- **special_effects**
- **manager**
- **production_manager**
- **sound_department**
- **casting_department**
- **location_management**
- **casting_director**
- **set_decorator**
- **transportation_department**
- **\N**
- **legal**
- **publicist**
- **accountant**
- **podcaster**
- **assistant**
- **electrical_department**

***Reasoning explanations***

To determine whether the birth year is credible using only the dataset, we apply a series of internal consistency checks based on the information available for this person.

1. **Death must occur after birth**  
   The recorded death year must be strictly greater than the recorded birth year.  
   If someone is listed as dying before they were born, the birth year cannot be credible.

2. **The lifespan must be humanly plausible**  
   We check that the person did not live more than 150 years.  
   This threshold is not meant to be historically precise, but simply to exclude biologically impossible ages.  
   If the computed age at death exceeds this limit, the birth year is considered implausible.

3. **The professions must be compatible with an ancient birth year**  
   Since a previous question showed that the earliest birth year in the dataset is year 4,  
   any individual born at that time must have professions that could exist in antiquity.  
   Only the professions *writer* and *miscellaneous* satisfy this condition (a missing profession, recorded as `\N`, is also accepted).  
   If the person has any modern profession (e.g., actor, director, cinematographer, etc.),  
   then their recorded birth year cannot be credible, because these occupations depend on technologies that did not exist at that time.

4. **Final decision**  
   The birth year is marked as **credible** only if:  
   - the death year is valid and occurs after the birth year,  
   - the lifespan is plausible,  
   - and every listed profession is compatible with an ancient birth date.  
   If any of these conditions fails, the birth year is considered **implausible** based solely on the dataset.

#### **What is the most recent date of birth?**

In [None]:
# Most recent birth year: ignore the '\N' placeholders, cast what is left
# to integers and keep the maximum.
recorded_birth_years = df_name_basics['birthYear'].replace('\\N', pd.NA).dropna()
latest_year = recorded_birth_years.astype(int).max()

# People whose (textual) birth year matches that maximum
df_latest = df_name_basics.loc[df_name_basics['birthYear'] == str(latest_year)]

display(df_latest)
latest_birth_label = df_latest['birthYear'].values[0]
Markdown(f"##### _The most recent date of birthday in the dataset is in year **{latest_birth_label}**._")

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
5900298,nm1542939,Vadim Smirnov,2025,\N,"actor,stunts","tt0325005,tt0387358,tt1502417,tt0309853"
7063452,nm16784939,Kyrah Ivy Jackson,2025,\N,actress,\N
11510332,nm5642311,Chase Ramsey,2025,\N,"actor,director,writer","tt17505010,tt14715170,tt4236770,tt17062324"


##### _The most recent date of birthday in the dataset is in year **2025**._

#### **What percentage of the people do not have a listed date of birth?**

In [None]:
# A birth year counts as missing when it is the '\N' placeholder OR a real
# null (an empty field read as NaN). value_counts() ignores nulls, so the
# mask below is used to make sure both cases are counted.
birth_year = df_name_basics['birthYear']
is_missing = birth_year.isna() | birth_year.eq('\\N')
missing_births = int(is_missing.sum())

# Total number of people in the dataset
total_people = len(df_name_basics)

# Compute the percentage of missing birth years (0.0 for an empty dataset
# rather than a division by zero)
percentage_missing = (missing_births / total_people) * 100 if total_people else 0.0

Markdown(
    f"##### _Percentage of people without listed date of birth: **{percentage_missing:.2f}%**._"
)

##### _Percentage of people without listed date of birth: **95.57%**._

#### **What is the length of the longest "short" after 1900?**

In [None]:
# Convert startYear to numeric for filtering; non-numeric text such as '\N'
# becomes NaN and therefore never passes the "> 1900" test.
start_year = pd.to_numeric(df_title_basics['startYear'], errors='coerce')

# Select short films released after 1900
df_after_1900 = df_title_basics[
    (start_year > 1900) &
    (df_title_basics['titleType'] == 'short')
]

# Convert runtime to numeric (unknown runtimes become NaN and are ignored by max)
runtime = pd.to_numeric(df_after_1900['runtimeMinutes'], errors='coerce')

# Find the longest short film in minutes
longest_short = int(runtime.max())

# Convert length into hours and minutes
hours, minutes = divmod(longest_short, 60)

# Display the record(s) corresponding to that longest short
df_longest_short = df_after_1900[runtime == longest_short]
display(df_longest_short)

# Minutes are zero-padded so that e.g. 1265 minutes reads "21h05", not "21h5"
Markdown(
    f"##### _Length of the longest \"short\" after 1900: **{longest_short} minutes** "
    f"(= **{hours}h{minutes:02d}**)._"
)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8520566,tt35509411,short,Our First Day,Our First Day,0,2025,\N,1311,"Drama,Short"


##### _Length of the longest "short" after 1900: **1311 minutes** (= **21h51**)._

#### **What is the length of the shortest "movie" after 1900?**

In [None]:
# Numeric view of the release year; anything non-numeric (such as '\N') becomes NaN
start_year = pd.to_numeric(df_title_basics['startYear'], errors='coerce')

# Keep only titles of type 'movie' released after 1900
released_after_1900 = start_year > 1900
is_movie = df_title_basics['titleType'] == 'movie'
df_movies_after_1900 = df_title_basics[released_after_1900 & is_movie]

# Runtimes as numbers; the minimum skips the NaN produced by unknown runtimes
runtime_movies = pd.to_numeric(df_movies_after_1900['runtimeMinutes'], errors='coerce')
shortest_movie = int(runtime_movies.min())

# Show every movie that shares that minimum runtime
has_shortest_runtime = runtime_movies == shortest_movie
df_shortest_movie = df_movies_after_1900[has_shortest_runtime]
display(df_shortest_movie)

Markdown(f"##### _Length of the shortest \"movie\" after 1900: **{shortest_movie} minute(s)**._")

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
24728,tt0025166,movie,George White's Scandals,George White's Scandals,0,1934,\N,1,"Comedy,Musical,Romance"
450320,tt0469119,movie,Love Trap,Love Trap,0,2005,\N,1,Drama
784824,tt0810779,movie,Bound by Blood,Bound by Blood,0,2007,\N,1,Action
821097,tt0848384,movie,Nikkatsu on Parade,Nikkatsu on Parade,0,1930,\N,1,Documentary
2548473,tt12893768,movie,If I Die Tomorrow,If I Die Tomorrow,0,2020,\N,1,Documentary
6249529,tt26348770,movie,Dancing Boy,Dancing Boy,0,2023,\N,1,Documentary
7722600,tt32276067,movie,Honest Vikky (Life Coach),Honest Vikky (Life Coach),1,2024,\N,1,Adult
9390607,tt39051124,movie,The Challenger: Comic brought to Life,The Challenger: Comic brought to Life,0,2025,\N,1,Animation


##### _Length of the shortest "movie" after 1900: **1 minute(s)**._

#### **List of all of the genres represented.**

In [None]:
# Collect all distinct genres that appear in the dataset.
# Each 'genres' cell is a comma-separated string. The '\N' placeholder
# (title with no genre) and empty tokens are skipped because they are not genres.
all_genres = {
    genre
    for genres in df_title_basics['genres'].dropna()
    for genre in genres.split(',')
    if genre and genre != '\\N'
}

# Sort for display: iterating a set of strings directly gives an order that
# can change from one kernel session to the next.
Markdown(
    "##### _All genres:_\n\n"
    + "\n".join(f"- **{g}**" for g in sorted(all_genres))
)

##### _All genres:_

- **Talk-Show**
- **Documentary**
- **War**
- **Adult**
- **Music**
- **News**
- **Drama**
- **Thriller**
- **Film-Noir**
- **Family**
- **Mystery**
- **Sci-Fi**
- **History**
- **Comedy**
- **Musical**
- **\N**
- **Reality-TV**
- **Sport**
- **Short**
- **Action**
- **Game-Show**
- **Adventure**
- **Western**
- **Animation**
- **Biography**
- **Romance**
- **Fantasy**
- **Horror**
- **Crime**

#### **What is the highest rated comedy "movie" in the dataset?** 
#### **Note, if there is a tie, the tie shall be broken by the movie with the most votes.**


In [None]:
# Movies whose genre string mentions 'Comedy' (titles without a genre value are excluded)
is_movie = df_title_basics['titleType'] == 'movie'
is_comedy = df_title_basics['genres'].str.contains('Comedy', na=False)
df_comedy_movies = df_title_basics[is_movie & is_comedy]

# Attach rating and vote count; the inner join drops comedies that have no rating row
df_comedy_with_ratings = df_comedy_movies.merge(df_title_ratings, on='tconst', how='inner')

# Both rating columns were loaded as text: make them numeric so they sort by value
for rating_col in ('averageRating', 'numVotes'):
    df_comedy_with_ratings[rating_col] = pd.to_numeric(
        df_comedy_with_ratings[rating_col], errors='coerce'
    )

# Best rating first; ties on rating are broken by the larger number of votes
highest_rated_comedy = df_comedy_with_ratings.sort_values(
    ['averageRating', 'numVotes'],
    ascending=False
)

display(highest_rated_comedy)

# Announce the top-ranked row
top_comedy = highest_rated_comedy.iloc[0]
Markdown(
    f"##### _The highest rated comedy movie is **\"{top_comedy['primaryTitle']}\"** "
    f"with a rating of **{top_comedy['averageRating']}**, "
    f"and **{top_comedy['numVotes']} votes**._"
)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
78961,tt8458418,movie,O La La,O La La,0,2018,\N,125,Comedy,10.0,6
61952,tt27203772,movie,Rabb Di Awaaz,Rabb Di Awaaz,0,2023,\N,\N,"Comedy,Drama",9.9,22
64593,tt30788051,movie,The ReWrite.,The ReWrite.,0,2025,\N,94,"Comedy,Drama",9.9,21
62618,tt27949780,movie,Shubh Lagan Muhurt,Shubh Lagan Muhurt,0,2023,\N,131,Comedy,9.9,13
61163,tt25967770,movie,Zucchini,Zucchini,0,2025,\N,83,"Comedy,Romance",9.9,8
...,...,...,...,...,...,...,...,...,...,...,...
27218,tt0199358,movie,"Belli, carucci e pettinati","Belli, carucci e pettinati",0,2000,\N,\N,Comedy,1.0,13
59249,tt23012498,movie,Our President's Money,Our President's Money,0,2022,\N,\N,Comedy,1.0,12
62494,tt27819521,movie,Camp Break 2,Farar Az Kamp 2,0,2012,\N,\N,Comedy,1.0,11
63232,tt28689690,movie,The Rat Catcher,Moosh Gir,0,2017,\N,87,Comedy,1.0,9


##### _The highest rated comedy movie is **"O La La"** with a rating of **10.0**, and **6 votes**._

In [None]:
# Question heading generated from the data: it names the highest rated comedy found above.
# The title is looked up outside the f-string: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
best_comedy_title = highest_rated_comedy.iloc[0]['primaryTitle']
Markdown(f'### **Who was the director of the movie "{best_comedy_title}"?**')

### **Who was the director of the movie "O La La"?**

In [None]:
# The top-ranked comedy, looked up once and reused below
top_comedy = highest_rated_comedy.iloc[0]

# Its row in title.basics, matched on both the title and the identifier
same_title = df_title_basics['primaryTitle'] == top_comedy['primaryTitle']
same_id = df_title_basics['tconst'] == top_comedy['tconst']
df_best_rate_comedy = df_title_basics[same_title & same_id]

# Join with the crew table to reach the 'directors' column
df_best_rate_comedy_crew = df_best_rate_comedy.merge(df_title_crew, on='tconst', how='inner')

# 'directors' is a comma-separated list of person ids; resolve them to people
directors_ids = df_best_rate_comedy_crew['directors'].iloc[0].split(',')
directors = df_name_basics.loc[df_name_basics['nconst'].isin(directors_ids)]

display(directors)

director_names = directors['primaryName'].str.cat(sep=', ')
Markdown(f"##### _The director(s) of the movie is/are: **{director_names}**._")

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
13075729,nm7709412,Sripad Pai,1965,\N,"director,writer,assistant_director","tt32752066,tt5177914,tt8499736,tt32751718"


##### _The director(s) of the movie is/are: **Sripad Pai**._

In [None]:
# Question heading generated from the data. The title is looked up outside the
# f-string: reusing the f-string's own quote character inside a replacement
# field is a SyntaxError before Python 3.12.
best_comedy_title = highest_rated_comedy.iloc[0]['primaryTitle']
Markdown(f'### **List, if any, the alternate titles for the movie "{best_comedy_title}".**')

### **List, if any, the alternate titles for the movie "O La La".**

In [None]:
# All title.akas rows (alternate / localized titles) of the highest rated comedy
best_comedy = highest_rated_comedy.iloc[0]
df_best_comedy_akas = df_title_akas[
    df_title_akas['titleId'] == best_comedy['tconst']
]

display(df_best_comedy_akas)

original_title = best_comedy['primaryTitle']

# Distinct titles recorded for the movie. This must read the 'title' column:
# 'titleId' holds the movie identifier, which is the same on every row.
aka_titles = df_best_comedy_akas['title'].dropna().unique()

# Alternate titles = recorded titles that differ from the primary title
different_akas = [t for t in aka_titles if t != original_title]
aka_display = different_akas

# Number of distinct regions listed for the movie; '\N' means "no region"
# and is therefore not counted.
region_count = (
    df_best_comedy_akas['region']
    .replace('\\N', pd.NA)
    .dropna()
    .nunique()
)

Markdown(
    f'##### _All different titles of "{original_title}":_\n'
    + (
        "\n".join(f"- **{t}**" for t in aka_display)
        if aka_display
        else f"##### _There are no alternate titles for this movie. **The movie exist in {region_count} region(s) with the same name**._"
    )
)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
50818492,tt8458418,1,O La La,\N,\N,original,\N,1
50818493,tt8458418,2,O La La,IN,en,imdbDisplay,\N,0


##### _All different titles of "O La La":_
##### _There are no alternate titles for this movie. **The movie exist in 2 region(s) with the same name**._