# Dataset overview

This notebook explores the Goodreads dataset to understand its structure and data quality.

In [6]:
import pandas as pd

# Load the raw Goodreads export. The file is read as semicolon-separated text
# decoded as latin-1.
# NOTE(review): on_bad_lines='skip' silently drops every row pandas cannot
# parse, so the row count printed afterwards may be lower than the number of
# records in the file -- consider 'warn' to see how many rows are lost.
df = pd.read_csv(
    "../data/raw/Goodreads_books_with_genres.csv",
    encoding='latin-1',
    sep=';',
    on_bad_lines='skip',
)

# Headline figures: row and column counts taken from the frame's shape.
n_rows, n_cols = df.shape
print("=== INFORMATIONS GÉNÉRALES ===")
print(f"Nombre de lignes : {n_rows}")
print(f"Nombre de colonnes : {n_cols}")

# Column labels as a plain Python list.
column_names = df.columns.tolist()
print("\n=== COLONNES DISPONIBLES ===")
print(column_names)

# Rich rendering of the first rows; display() shows the table even though
# more statements follow it.
first_rows = df.head()
print("\n=== PREMIÈRES LIGNES ===")
display(first_rows)

# Dtype pandas inferred for each column.
print("\n=== TYPES DE DONNÉES ===")
print(df.dtypes)

# Number of missing values per column.
missing_per_column = df.isna().sum()
print("\n=== VALEURS MANQUANTES ===")
print(missing_per_column)

# Print each column label with its position, wrapped in quotes so that any
# leading/trailing whitespace hidden in a header becomes visible.
print("=== NOMS DES COLONNES ===")
for position, label in enumerate(df.columns):
    print(f"{position}: '{label}'")

print("\n=== PREMIÈRES LIGNES ===")
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; more code follows here, so display() is required for the preview to
# actually appear under the header.
display(df.head())

# Keep the books whose `genres` string contains "Romance". The match is a
# case-insensitive substring search, and rows with a missing `genres` value
# are treated as non-matching (na=False).
is_romance = df['genres'].str.contains('Romance', case=False, na=False)
df_romance = df[is_romance]

# Share of the loaded rows kept by the filter; guarded so an empty frame
# reports 0.0% instead of raising ZeroDivisionError.
total_books = len(df)
romance_share = len(df_romance) / total_books * 100 if total_books else 0.0

print("\n=== ROMANCES FILTRÉES ===")
print(f"Nombre de romances : {len(df_romance)}")
print(f"Pourcentage : {romance_share:.1f}%")

# Last expression of the cell, so the notebook renders it as a table.
df_romance.head()

=== INFORMATIONS GÉNÉRALES ===
Nombre de lignes : 11127
Nombre de colonnes : 13

=== COLONNES DISPONIBLES ===
['Book Id', 'Title', 'Author', 'average_rating', 'isbn', 'isbn13', 'language_code', 'num_pages', 'ratings_count', 'text_reviews_count', 'publication_date', 'publisher', 'genres']

=== PREMIÈRES LIGNES ===


Unnamed: 0,Book Id,Title,Author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,genres
0,1,Harry Potter and the Half-Blood Prince (Harry ...,"J,K, Rowling/Mary GrandPré",457,0439785960,"9,78044E+12",eng,652,2095690,27591,9/16/2006,"Scholastic Inc,","Fantasy;Young Adult;Fiction;Fantasy,Magic;Chil..."
1,2,Harry Potter and the Order of the Phoenix (Har...,"J,K, Rowling/Mary GrandPré",449,0439358078,"9,78044E+12",eng,870,2153167,29221,9/1/2004,"Scholastic Inc,","Fantasy;Young Adult;Fiction;Fantasy,Magic;Chil..."
2,4,Harry Potter and the Chamber of Secrets (Harry...,"J,K, Rowling",442,0439554896,"9,78044E+12",eng,352,6333,244,11/1/2003,Scholastic,"Fantasy;Fiction;Young Adult;Fantasy,Magic;Chil..."
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,"J,K, Rowling/Mary GrandPré",456,043965548X,"9,78044E+12",eng,435,2339585,36325,5/1/2004,"Scholastic Inc,","Fantasy;Fiction;Young Adult;Fantasy,Magic;Chil..."
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,"J,K, Rowling/Mary GrandPré",478,0439682584,"9,78044E+12",eng,2690,41428,164,9/13/2004,Scholastic,"Fantasy;Young Adult;Fiction;Fantasy,Magic;Adve..."



=== TYPES DE DONNÉES ===
Book Id                int64
Title                 object
Author                object
average_rating        object
isbn                  object
isbn13                object
language_code         object
num_pages              int64
ratings_count          int64
text_reviews_count     int64
publication_date      object
publisher             object
genres                object
dtype: object

=== VALEURS MANQUANTES ===
Book Id                0
Title                  0
Author                 0
average_rating         0
isbn                   0
isbn13                 0
language_code          0
num_pages              0
ratings_count          0
text_reviews_count     0
publication_date       0
publisher              0
genres                97
dtype: int64
=== NOMS DES COLONNES ===
0: 'Book Id'
1: 'Title'
2: 'Author'
3: 'average_rating'
4: 'isbn'
5: 'isbn13'
6: 'language_code'
7: 'num_pages'
8: 'ratings_count'
9: 'text_reviews_count'
10: 'publication_date'
11: 'publishe

Unnamed: 0,Book Id,Title,Author,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,genres
33,57,A Changeling for All Seasons (Changeling Seaso...,Angela Knight/Sahara Kelly/Judy Mays/Marteeka ...,376,1595962808,"9,7816E+12",eng,304,167,4,11/1/2005,Changeling Press,"Romance;Fantasy,Paranormal;Anthologies;Adult F..."
35,59,The Changeling Sea,"Patricia A, McKillip",406,141312629,"9,78014E+12",eng,137,4454,302,4/14/2003,Firebird,"Fantasy;Young Adult;Romance;Fiction;Fantasy,Ma..."
38,66,The Changeling (Daughters of England #15),Philippa Carr,398,449146979,"9,78045E+12",eng,369,345,12,8/28/1990,Ivy Books,"Historical,Historical Fiction;Romance;Fiction;..."
89,151,Anna Karenina,Leo Tolstoy/Richard Pevear/Larissa Volokhonsky,405,143035002,"9,78014E+12",eng,838,16643,1851,5/31/2004,Penguin Classics,"Classics;Fiction;Romance;Cultural,Russia;Histo..."
90,152,Anna Karenina,Leo Tolstoy/David Magarshack/Priscilla Meyer,405,451528611,"9,78045E+12",eng,960,109420,5696,11/5/2002,Signet,"Classics;Fiction;Romance;Cultural,Russia;Histo..."
