In [None]:
# Imports
from pathlib import Path
import numpy as np

from src.data_loading import download_json_files, load_and_clean_json_files, save_dataframe
from src.data_analysis import top_genres, top_actors, genre_trends, actor_trends, actor_genre_summary
from src.data_visualization import (
    plot_top_genres_bar,
    plot_top_genres_pie,
    plot_top10_actors_bar,
    plot_genre_trends_by_year,
    plot_actor_trends_top3,
    plot_actor_genre_stacked,
    plot_actor_genre_heatmap
)

In [None]:
# Setup directories
# Input and output folders live one level above the notebook; create both
# (including missing parents) and do nothing if they already exist.
DATA_DIR, OUT_DIR = Path("../data"), Path("../output")
for directory in (DATA_DIR, OUT_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [None]:
# Download JSON movie datasets
# Source: raw files on the master branch of the prust/wikipedia-movie-data GitHub repository.
base_url = "https://raw.githubusercontent.com/prust/wikipedia-movie-data/refs/heads/master/"
# Decade start years 1900, 1910, ..., 2010 — np.arange excludes the stop value (2020).
# NOTE(review): if a 2020s file should be included, the stop must be 2030 — confirm intent.
decades = np.arange(1900, 2020, 10)

print("1) Downloading JSON files...")
# presumably fetches one file per decade into DATA_DIR and returns a movie count —
# verify against src.data_loading.download_json_files
total_movies = download_json_files(base_url, decades, DATA_DIR)
print(f"Total movies downloaded: {total_movies}")

In [None]:
# Load and clean data
# Builds the working DataFrame `df` from the files for `decades` in DATA_DIR
# (cleaning rules live in src.data_loading.load_and_clean_json_files).
print("\n2) Loading and cleaning JSON files...")
df = load_and_clean_json_files(decades, DATA_DIR)
print(f"DataFrame shape: {df.shape}")
# DataFrame.info() writes its report to stdout and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Save cleaned data
# The cleaned DataFrame is handed to the project helper together with its target
# path, OUT_DIR/movies_clean.csv (presumably written as CSV — see src.data_loading.save_dataframe).
clean_path = OUT_DIR / "movies_clean.csv"
save_dataframe(df, clean_path)

In [None]:
# Analysis: Top Genres and Actors
# Both rankings are reused by the plotting and export cells further down.
group_by_genres = top_genres(df)
group_by_actors = top_actors(df)

# Print each ranking under its heading.
for heading, ranking in (
    ("\nTop 10 Genres:", group_by_genres),
    ("\nTop 10 Actors:", group_by_actors),
):
    print(heading)
    print(ranking)

**Visualizations**


In [None]:
# Top 10 Genres
# Two views of the same genre ranking: a bar chart and a pie chart.
# OUT_DIR is passed to both helpers — presumably where the figures are saved;
# confirm in src.data_visualization.
plot_top_genres_bar(group_by_genres, OUT_DIR)
plot_top_genres_pie(group_by_genres, OUT_DIR)

**Key insights**

The most popular genre is Drama (12,831 films), followed by Comedy (9,693) and Silent (6,825).

This indicates that, over the period under review, film production was dominated by dramas and comedies, and that silent films accounted for a significant share of the total output.

Genres such as Western, Romance, and Crime were also quite popular, but lagged behind the leaders in terms of the number of films.

In [None]:
# Top 10 Actors
# Bar chart of the actor ranking computed by top_actors(df).
# OUT_DIR is passed to the helper — presumably where the figure is saved; confirm in src.data_visualization.
plot_top10_actors_bar(group_by_actors, OUT_DIR)

**Key insights**

The most prolific actors were Harold Lloyd (950 films), Hoot Gibson (785), and Charles Starrett (685).

All of these actors were active during the silent and early sound film eras, which is consistent with their large number of roles.

In [None]:
# Genre trends by year (selected genres)
# NOTE(review): this genre list is hard-coded rather than derived from group_by_genres
# (the heatmap cell derives its top-3 list from the ranking) — keep the two in sync if the data changes.
selected_genres = ["Drama", "Comedy", "Silent"]
# genre_trends receives the full DataFrame plus the genre list; its output feeds the plot helper directly.
df_genre_trends = genre_trends(df, selected_genres)
plot_genre_trends_by_year(df_genre_trends, OUT_DIR)

**Key insights**

The chart clearly illustrates a historical transition in film production from silent films to sound-based Drama and Comedy.

While technological innovation drastically reduced silent films, popular genres like Drama and Comedy continued to evolve, highlighting both industry adaptation and changing audience preferences.

In [None]:
# Actor trends for top-3 actors
# Take the first three entries of the 'actor' column of the ranking as a plain Python list.
# presumably group_by_actors is already sorted by film count, so these are the top three — verify in src.data_analysis.
top3_actors_list = group_by_actors['actor'].head(3).tolist()
df_actor_trends = actor_trends(df, top3_actors_list)
plot_actor_trends_top3(df_actor_trends, OUT_DIR)

**Key insights**

The three most prolific actors had different periods of activity.

* Harold Lloyd was actively filming in the 1910s, but by 1920 his activity had declined sharply.

* Hoot Gibson peaked in 1919–1920 and remained active through the 1920s and 1930s.

* Charles Starrett appeared later, in the 1930s, and remained active until the early 1950s.


This shows that each actor dominated a different historical period of cinema.

In [None]:
# Actor genre summary: top-3 genres vs other for top-10 actors
# The summary is built from the full DataFrame and both rankings; it is plotted here
# and written to CSV in the final export cell.
result_actor_genre = actor_genre_summary(df, group_by_actors, group_by_genres)
plot_actor_genre_stacked(result_actor_genre, OUT_DIR)

In [None]:
# Pivot table heatmap: actors vs genres
# Rows: the actors listed in group_by_actors. Columns: the first three genres of
# group_by_genres. Each cell counts the distinct titles for that actor/genre pair.
top3_genres_list = group_by_genres['genres'].head(3).tolist()
# Per film, keep only the genres that are in the top-3 list (possibly an empty list).
# assumes every 'genres' entry is an iterable of genre names — TODO confirm against the cleaning step
df['has_top3'] = df['genres'].apply(lambda g: [genre for genre in g if genre in top3_genres_list])
# One row per (film, cast member), then one row per (film, cast member, top-3 genre).
df_exploded_cast = df.explode('cast')
heatmap_data = (
    df_exploded_cast
    .explode('has_top3')
    .groupby(['cast', 'has_top3'])['title']
    .nunique()
    .unstack(fill_value=0)
)
# reindex (rather than .loc) keeps every listed actor: an actor with no film in the
# top-3 genres gets a row of zeros instead of raising KeyError. A plain list is passed
# so the existing index name is left untouched.
heatmap_data = heatmap_data.reindex(group_by_actors['actor'].tolist(), fill_value=0)

plot_actor_genre_heatmap(heatmap_data, OUT_DIR)

**Key insights**

Harold Lloyd and Bebe Daniels have the highest share of films in the top 3 genres (~19–20%).

Some actors, such as Charles Starrett and Johnny Mack Brown, appeared mainly in genres that are not in the top 3, so their share of the top 3 is less than 3%.

This indicates the specialization of actors: some focus on popular genres, while others focus on niche genres.

**General conclusions**

The popularity of genres and the specialization of actors do not always coincide: prolific acting careers do not necessarily focus on the top genres.

Actors with a high number of films in the top 3 genres, such as Harold Lloyd and Bebe Daniels, had a more “popular” role profile.

Other actors with a large number of films but a low percentage of top 3 genres may have specialized in specific niches (e.g., Western or Crime).

In [None]:
# Save CSV Outputs
# Export the two rankings and the actor/genre summary without the DataFrame index column.
group_by_genres.to_csv(OUT_DIR / "top10_genres.csv", index=False)
group_by_actors.to_csv(OUT_DIR / "top10_actors.csv", index=False)
result_actor_genre.to_csv(OUT_DIR / "top10_actors_top3_vs_other.csv", index=False)

# Report the directory actually used (OUT_DIR) instead of a hard-coded path that
# did not match it.
print(f"\nFinished analysis. Outputs (CSV + PNG) saved in {OUT_DIR}")