## BIG DATA Project

- BEMMOUSSAT Marwan
- BOUKAOUI Mohamed
- EL AICHOUNI Yahya

In [3]:
import pandas as pd
import numpy as np

# Base URL for IMDB datasets
base_url = "https://datasets.imdbws.com/"

# Liste des fichiers nécessaires
datasets = {
    "name_basics": "name.basics.tsv.gz",
    #"title_akas": "title.akas.tsv.gz",
    "title_basics": "title.basics.tsv.gz",
    "title_crew": "title.crew.tsv.gz",
    "title_episode": "title.episode.tsv.gz",
    #"title_principals": "title.principals.tsv.gz",
    "title_ratings": "title.ratings.tsv.gz",
}

dataframes = {}

print("Start loading of dataset IMDB...")

for name, filename in datasets.items():
    url = base_url + filename
    try:

        print(f"Loading {filename}...")
        df = pd.read_csv(
            url,
            sep='\t',
            compression='gzip',
            low_memory=False,
            na_values='\\N'
        )
        dataframes[name] = df
        print(f"✅ {name} chargé. Dimensions : {df.shape}")
    except Exception as e:
        print(f"❌ Erreur lors du chargement de {filename} : {e}")





Start loading of dataset IMDB...
Loading name.basics.tsv.gz...
✅ name_basics chargé. Dimensions : (14948799, 6)
Loading title.basics.tsv.gz...
✅ title_basics chargé. Dimensions : (12146875, 9)
Loading title.crew.tsv.gz...
✅ title_crew chargé. Dimensions : (12146875, 3)
Loading title.episode.tsv.gz...
✅ title_episode chargé. Dimensions : (9370391, 4)
Loading title.ratings.tsv.gz...
✅ title_ratings chargé. Dimensions : (1608088, 3)


We rename the dataframes

In [4]:
df_name_basics = dataframes['name_basics']
df_title_basics = dataframes['title_basics']
df_title_crew = dataframes['title_crew']
df_title_episode = dataframes['title_episode']
df_title_ratings = dataframes['title_ratings']

We cannot load title_akas et title_principales with the link because of a problem with the RAM

In [5]:
df_name_basics.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14948799 entries, 0 to 14948798
Data columns (total 6 columns):
 #   Column             Dtype  
---  ------             -----  
 0   nconst             object 
 1   primaryName        object 
 2   birthYear          float64
 3   deathYear          float64
 4   primaryProfession  object 
 5   knownForTitles     object 
dtypes: float64(2), object(4)
memory usage: 684.3+ MB


In [6]:
df_name_basics

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164"
1,nm0000002,Lauren Bacall,1924.0,2014.0,"actress,miscellaneous,soundtrack","tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,1934.0,,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949.0,1982.0,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918.0,2007.0,"writer,director,actor","tt0050986,tt0069467,tt0083922,tt0050976"
...,...,...,...,...,...,...
14948794,nm9993714,Romeo del Rosario,,,"animation_department,art_department","tt11657662,tt14069590,tt2455546"
14948795,nm9993716,Essias Loberg,,,,
14948796,nm9993717,Harikrishnan Rajan,,,cinematographer,tt8736744
14948797,nm9993718,Aayush Nair,,,cinematographer,tt8736744


In [7]:
df_title_basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892.0,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892.0,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1,Short
...,...,...,...,...,...,...,...,...,...
12146870,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009.0,,,Drama
12146871,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010.0,,,Drama
12146872,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010.0,,,Drama
12146873,tt9916856,short,The Wind,The Wind,0,2015.0,,27,Short


In [8]:
df_title_crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,nm0721526
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,
...,...,...,...
12146870,tt9916848,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
12146871,tt9916850,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
12146872,tt9916852,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
12146873,tt9916856,nm10538645,nm6951431


In [9]:
df_title_episode

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0031458,tt32857063,,
1,tt0041951,tt0041038,1.0,9.0
2,tt0042816,tt0989125,1.0,17.0
3,tt0042889,tt0989125,,
4,tt0043426,tt0040051,3.0,42.0
...,...,...,...,...
9370386,tt9916846,tt1289683,3.0,18.0
9370387,tt9916848,tt1289683,3.0,17.0
9370388,tt9916850,tt1289683,3.0,19.0
9370389,tt9916852,tt1289683,3.0,20.0


In [10]:
df_title_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2188
1,tt0000002,5.5,307
2,tt0000003,6.4,2274
3,tt0000004,5.1,197
4,tt0000005,6.2,3012
...,...,...,...
1608083,tt9916846,5.3,7
1608084,tt9916848,5.2,7
1608085,tt9916850,6.0,7
1608086,tt9916852,5.7,7


2. How many total people in data set?

In [11]:
total_people = df_name_basics.shape[0]

print(f"total people in data set : {total_people:,}")


total people in data set : 14,948,799


3. What is the earliest year of birth?


In [12]:
earliest_birth_year = df_name_basics['birthYear'].min()

print(f"Earliest year of birth is : {earliest_birth_year}")

Earliest year of birth is : 4.0


4. How many years ago was this person born?

This person bornt 2021 years old

5. Using only the data in the data set, determine if this date of birth correct.

In [13]:

outlier_mask = df_name_basics['birthYear'] == 4.0

line_with_outlier = df_name_basics[outlier_mask]

print(" birthYear is equal to 4.0")
print(line_with_outlier)

 birthYear is equal to 4.0
           nconst         primaryName  birthYear  deathYear primaryProfession  \
737944  nm0784172  Lucio Anneo Seneca        4.0       65.0            writer   

                                 knownForTitles  
737944  tt0043802,tt0218822,tt0049203,tt0972562  


6. Explain the reasoning for the answer in a code comment or new markdown cell.

To prioi this date is correct, Lucio bornt in 4 and died in 65 which meant he lived 61 years old

7. What is the most recent date of birth?

In [14]:
recent_birth_year = df_name_basics['birthYear'].max()

print(f"Recent year of birth is : {recent_birth_year}")

Recent year of birth is : 2025.0


8. What percentage of the people do not have a listed date of birth?


In [15]:
missing_birth_years = df_name_basics['birthYear'].isna().sum()

percentage_missing = (missing_birth_years / total_people) * 100

print(f"Number of people do not have a listed date of birth : {missing_birth_years:,}")
print(f"percentage_missing : {percentage_missing:.2f}%")

Number of people do not have a listed date of birth : 14,287,661
percentage_missing : 95.58%


9. What is the length of the longest "short" after 1900?


In [16]:
df_title_basics.info()
df_title_basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12146875 entries, 0 to 12146874
Data columns (total 9 columns):
 #   Column          Dtype  
---  ------          -----  
 0   tconst          object 
 1   titleType       object 
 2   primaryTitle    object 
 3   originalTitle   object 
 4   isAdult         int64  
 5   startYear       float64
 6   endYear         float64
 7   runtimeMinutes  object 
 8   genres          object 
dtypes: float64(2), int64(1), object(6)
memory usage: 834.1+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892.0,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892.0,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,,1,Short


In [17]:
# We convert our column in integer to compare them
# create a copy
df_titles = df_title_basics.copy()

df_titles['runtimeMinutes_clean'] = pd.to_numeric(
    df_titles['runtimeMinutes'],
    errors='coerce'
)

df_titles['startYear'] = pd.to_numeric(df_titles['startYear'], errors='coerce')


In [18]:
shorts = df_titles[df_titles['titleType'] == 'short']

shorts_after_1900 = shorts[shorts['startYear'] > 1900]

longest_short_runtime = shorts_after_1900['runtimeMinutes_clean'].max()

print(f" The length of the longest 'short' after 1900 is : {longest_short_runtime} minutes.")

 The length of the longest 'short' after 1900 is : 1311.0 minutes.


10. What is the length of the shortest "movie" after 1900?

In [19]:
movies = df_titles[df_titles['titleType'] == 'movie']

movies_after_1900 = movies[movies['startYear'] > 1900]

longest_movie_runtime = movies_after_1900['runtimeMinutes_clean'].max()

print(f" The length of the longest 'movie' after 1900 is : {longest_movie_runtime} minutes.")

 The length of the longest 'movie' after 1900 is : 59460.0 minutes.


11. List of all of the genres represented

In [20]:
all_genres_series = df_titles['genres'].dropna().astype(str)

all_genres_list = ','.join(all_genres_series).split(',')

unique_genres = sorted(list(set(all_genres_list)))

print(f"Number of unique genre : {len(unique_genres)}")
print("\nList of uniques genres :")
print(unique_genres)

Number of unique genre : 28

List of uniques genres :
['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir', 'Game-Show', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'Reality-TV', 'Romance', 'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War', 'Western']


12. What is the highest rated comedy "movie" in the dataset? Note, if there is a tie, the tie shall be broken by the movie with the most votes .

In [21]:
# CLeaning of df_title_ratings :
df_title_ratings['averageRating'] = pd.to_numeric(
    df_title_ratings['averageRating'], errors='coerce'
)
df_title_ratings['numVotes'] = pd.to_numeric(
    df_title_ratings['numVotes'], errors='coerce'
)

df_merged = df_titles.merge(
    df_title_ratings,
    on='tconst',
    how='inner'
)

print(f"DataFrame fused. shape: {df_merged.shape}")

DataFrame fused. shape: (1608046, 12)


In [22]:
comedy_movies = df_merged[df_merged['titleType'] == 'movie'].copy()

comedy_movies = comedy_movies[
    comedy_movies['genres'].astype(str).str.contains('Comedy', na=False)
]

print(f"Number of 'movies' comedy found : {comedy_movies.shape[0]:,}")

Number of 'movies' comedy found : 80,879


In [23]:
best_comedy = comedy_movies.sort_values(
    by=['averageRating', 'numVotes'],
    ascending=[False, False]
).iloc[0]

best_comedy_title = best_comedy['primaryTitle']
best_comedy_rating = best_comedy['averageRating']
best_comedy_votes = best_comedy['numVotes']

print(f"Best comedy is : {best_comedy_title}")
print(f"Note : {best_comedy_rating}, Votes : {best_comedy_votes:,}")

Best comedy is : O La La
Note : 10.0, Votes : 6


13. Who was the director of the movie?

In [24]:
#  Extract the tconst of the highest rated movie
best_tconst = best_comedy['tconst']

#  Filter the Crew DataFrame to find the director(s) of this movie
# df_title_crew contains the 'directors' column (a comma-separated string of IDs)
director_row = df_title_crew[df_title_crew['tconst'] == best_tconst].iloc[0]

#  Retrieve the ID(s) of the director(s)
director_ids_str = director_row['directors']

#  Clean up and retrieve the name (assuming there is at least one director)
if pd.isna(director_ids_str):
    director_name = "N/A (Not found in df_title_crew)"
else:
    # Retrieve the first director ID (if multiple are listed)
    first_director_id = director_ids_str.split(',')[0]

    #  Search for the director's name in df_name_basics
    # nconst is the ID for the person
    director_name_row = df_name_basics[df_name_basics['nconst'] == first_director_id]

    if not director_name_row.empty:
        # Get the primaryName from the matching row
        director_name = director_name_row.iloc[0]['primaryName']
    else:
        director_name = "N/A (Director ID not found in df_name_basics)"


print(f"The director of the movie {best_comedy_title} is: {director_name}")

The director of the movie O La La is: Sripad Pai


14. List, if any, the alternate titles for the movie.

i cannot load the title_akas -, i can't ask this question