In [1]:
import pandas as pd
from collections import Counter

# Charger les données
df_romance = pd.read_csv("../data/processed/romance_books_clean.csv", 
                         encoding='utf-8', sep=';')

print(f"Dataset : {len(df_romance)} romances\n")

# ========================================
# ÉTAPE 1 : EXTRAIRE TOUS LES GENRES
# ========================================

all_genres = []

for genres_str in df_romance['genres'].dropna():
    # Séparer par ; et par ,
    genres_list = genres_str.replace(';', ',').split(',')
    # Nettoyer (enlever espaces, mettre en titre)
    genres_list = [g.strip() for g in genres_list if g.strip()]
    all_genres.extend(genres_list)

# Compter les occurrences
genre_counts = Counter(all_genres)

print("=== TOP 30 GENRES LES PLUS FRÉQUENTS ===")
for genre, count in genre_counts.most_common(30):
    print(f"{genre:30} : {count:4} occurrences ({count/len(df_romance)*100:.1f}%)")

# ========================================
# ÉTAPE 2 : IDENTIFIER LES SOUS-GENRES DE ROMANCE
# ========================================

print("\n\n=== SOUS-GENRES CONTENANT 'ROMANCE' ===")
romance_subgenres = [(g, c) for g, c in genre_counts.items() if 'Romance' in g]
romance_subgenres.sort(key=lambda x: x[1], reverse=True)

for genre, count in romance_subgenres:
    print(f"{genre:40} : {count:4} occurrences ({count/len(df_romance)*100:.1f}%)")

Dataset : 1566 romances

=== TOP 30 GENRES LES PLUS FRÉQUENTS ===
Romance                        : 2302 occurrences (147.0%)
Fiction                        : 1427 occurrences (91.1%)
Fantasy                        : 1099 occurrences (70.2%)
Historical                     :  985 occurrences (62.9%)
Literature                     :  691 occurrences (44.1%)
Adult                          :  523 occurrences (33.4%)
Contemporary                   :  522 occurrences (33.3%)
Historical Fiction             :  499 occurrences (31.9%)
Young Adult                    :  489 occurrences (31.2%)
Classics                       :  482 occurrences (30.8%)
Womens Fiction                 :  436 occurrences (27.8%)
Novels                         :  408 occurrences (26.1%)
Chick Lit                      :  370 occurrences (23.6%)
Mystery                        :  350 occurrences (22.3%)
Paranormal                     :  338 occurrences (21.6%)
Cultural                       :  336 occurrences (21.5%)
Seque