In [1]:
from pathlib import Path

import pandas as pd

# Input and output locations, relative to the notebook's working directory.
RAW_CSV = Path("../data/raw/Goodreads_books_with_genres.csv")
CLEAN_CSV = Path("../data/processed/romance_books_clean.csv")

# Load the raw export: semicolon-separated, Latin-1 encoded. Lines the parser
# cannot split into the expected number of fields are skipped, not fatal.
df = pd.read_csv(RAW_CSV,
                 encoding='latin-1',
                 sep=';',
                 on_bad_lines='skip')

# Keep only the books whose `genres` text mentions "Romance" (case-insensitive).
# Rows with missing genres are excluded (na=False). `.copy()` gives an
# independent frame so the column assignments below do not target a slice of `df`.
df_romance = df[df['genres'].str.contains('Romance', case=False, na=False)].copy()

print(f"Dataset romance : {len(df_romance)} livres")

# CLEANING

# 1. Convert average_rating to a number.
# The decimal separator may be a comma, so it is normalised to a dot first.
# The text step is applied only when the column is not already numeric:
#   - on a float column `.str` would raise AttributeError;
#   - on a mixed object column `.str.replace` returns NaN for non-string
#     elements, which would then be dropped as "missing" in step 3.
# Casting to str first makes every element go through the same path.
ratings = df_romance['average_rating']
if not pd.api.types.is_numeric_dtype(ratings):
    ratings = ratings.astype(str).str.replace(',', '.', regex=False)
# Anything that still is not a valid number becomes NaN (errors='coerce').
df_romance['average_rating'] = pd.to_numeric(ratings, errors='coerce')

# 2. Check missing values per column after the conversion.
print("\n=== VALEURS MANQUANTES APRÈS NETTOYAGE ===")
print(df_romance.isnull().sum())

# 3. Drop the rows whose rating is missing or could not be parsed.
df_romance = df_romance.dropna(subset=['average_rating'])

# 4. Basic statistics.
print("\n=== STATISTIQUES DES ROMANCES ===")
print(f"Nombre de romances propres : {len(df_romance)}")
print(f"Note moyenne : {df_romance['average_rating'].mean():.2f}")
print(f"Note min : {df_romance['average_rating'].min()}")
print(f"Note max : {df_romance['average_rating'].max()}")
print(f"Nombre moyen d'avis : {df_romance['ratings_count'].mean():.0f}")

# 5. Save the cleaned data.
# Create the output directory if needed so a fresh checkout does not fail here.
CLEAN_CSV.parent.mkdir(parents=True, exist_ok=True)
df_romance.to_csv(CLEAN_CSV, index=False, sep=';', encoding='utf-8')
print("\n✅ Données nettoyées sauvegardées !")

# NOTE(review): in the displayed rows, isbn13 appears as text such as
# "9,7816E+12" (scientific notation with a decimal comma), so that column
# looks unusable as an identifier in this export — confirm against the raw file.
df_romance.head()

Dataset romance : 1566 livres

=== VALEURS MANQUANTES APRÈS NETTOYAGE ===
Book Id               0
Title                 0
Author                0
average_rating        0
isbn                  0
isbn13                0
language_code         0
num_pages             0
ratings_count         0
text_reviews_count    0
publication_date      0
publisher             0
genres                0
dtype: int64

=== STATISTIQUES DES ROMANCES ===
Nombre de romances propres : 1566
Note moyenne : 3.90
Note min : 2.4
Note max : 4.55
Nombre moyen d'avis : 29295

✅ Données nettoyées sauvegardées !


Unnamed: 0,Book Id,Title,Author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,genres
33,57,A Changeling for All Seasons (Changeling Seaso...,Angela Knight/Sahara Kelly/Judy Mays/Marteeka ...,3.76,1595962808,"9,7816E+12",eng,304,167,4,11/1/2005,Changeling Press,"Romance;Fantasy,Paranormal;Anthologies;Adult F..."
35,59,The Changeling Sea,"Patricia A, McKillip",4.06,141312629,"9,78014E+12",eng,137,4454,302,4/14/2003,Firebird,"Fantasy;Young Adult;Romance;Fiction;Fantasy,Ma..."
38,66,The Changeling (Daughters of England #15),Philippa Carr,3.98,449146979,"9,78045E+12",eng,369,345,12,8/28/1990,Ivy Books,"Historical,Historical Fiction;Romance;Fiction;..."
89,151,Anna Karenina,Leo Tolstoy/Richard Pevear/Larissa Volokhonsky,4.05,143035002,"9,78014E+12",eng,838,16643,1851,5/31/2004,Penguin Classics,"Classics;Fiction;Romance;Cultural,Russia;Histo..."
90,152,Anna Karenina,Leo Tolstoy/David Magarshack/Priscilla Meyer,4.05,451528611,"9,78045E+12",eng,960,109420,5696,11/5/2002,Signet,"Classics;Fiction;Romance;Cultural,Russia;Histo..."
