In [1]:
import json
from pathlib import Path

import pandas as pd
import plotly.express as px

# Make pandas' `.plot` accessor render figures with Plotly.
pd.options.plotting.backend = "plotly"

In [2]:
dataset_dir = Path("./dataset/")

# Scan the dataset root once and reuse the result for both printouts.
# Only sub-directories are treated as genres, so stray files in the root are
# not counted, and sorting makes the listing independent of the order in which
# the filesystem happens to enumerate entries.
genres = sorted(entry.name for entry in dataset_dir.glob("*") if entry.is_dir())

print(f"genres: {genres}")
print(f"genres count: {len(genres)}")

genres: ['action', 'adventure', 'animation', 'biography', 'comedy', 'crime', 'documentary', 'drama', 'family', 'fantasy', 'film noir', 'history', 'horror', 'music', 'musical', 'mystery', 'romance', 'sci-fi', 'short film', 'sport', 'superhero', 'thriller', 'war', 'western']
genres count: 24


In [3]:
# Collect one metadata record per `info.json` found two directory levels below
# the dataset root. Paths are sorted so the resulting row order does not depend
# on filesystem enumeration order, and each file is decoded explicitly as UTF-8
# instead of relying on the platform's default text encoding.
films_list = []

for film_info_path in sorted(dataset_dir.glob("./*/*/info.json")):
    with film_info_path.open(encoding="utf-8") as f:
        films_list.append(json.load(f))

In [4]:
# Build the working frame from the loaded records, keeping exactly these four
# fields in this column order.
film_columns = ["title_id", "genre", "name", "description"]
df = pd.DataFrame.from_records(
    films_list,
    columns=film_columns,
)

print(df.shape)
df.head()

(11902, 4)


Unnamed: 0,title_id,genre,name,description
0,tt0055928,action,Dr. No,A resourceful British government agent seeks a...
1,tt0061578,action,The Dirty Dozen,"During World War II, a rebellious U.S. Army Ma..."
2,tt0069897,action,Coffy,A sexy black nurse takes vigilante justice aga...
3,tt0071517,action,Foxy Brown,A voluptuous black vigilante takes a job as a ...
4,tt0076759,action,Star Wars,Luke Skywalker joins forces with a Jedi Knight...


In [5]:
# Headline statistics: number of rows, number of distinct title ids, and the
# average number of counted title ids per genre.
films_per_genre = df.groupby("genre")["title_id"].count()
mean_qty_genre = films_per_genre.mean()

total_films = len(df)
unique_films = len(df["title_id"].unique())

print(f"Total number of films: {total_films}")
print(f"Number of unique films: {unique_films}")
print(f"Mean quantity per genre: {mean_qty_genre:.2f}")

Total number of films: 11902
Number of unique films: 7146
Mean quantity per genre: 495.92


In [6]:
# Number of films in each genre, one bar per genre.
df_plot = df.groupby("genre", as_index=False).agg(films_count=("title_id", "count"))

# The counts are already aggregated, so a bar chart plots them directly; a
# histogram would aggregate them again and label the axis "sum of films_count".
fig = px.bar(
    df_plot,
    x="genre",
    y="films_count",
    text_auto=True,
    title="Number of films per genre",
)

fig.show()