In [12]:
import pandas as pd

In [13]:
df_books = pd.read_csv('data/books.csv')
df_books.head()

Unnamed: 0,isbn13,isbn10,title,subtitle,authors,categories,thumbnail,description,published_year,average_rating,num_pages,ratings_count
0,9780002005883,2005883,Gilead,,Marilynne Robinson,Fiction,http://books.google.com/books/content?id=KQZCP...,A NOVEL THAT READERS and critics have been eag...,2004.0,3.85,247.0,361.0
1,9780002261982,2261987,Spider's Web,A Novel,Charles Osborne;Agatha Christie,Detective and mystery stories,http://books.google.com/books/content?id=gA5GP...,A new 'Christie for Christmas' -- a full-lengt...,2000.0,3.83,241.0,5164.0
2,9780006163831,6163831,The One Tree,,Stephen R. Donaldson,American fiction,http://books.google.com/books/content?id=OmQaw...,Volume Two of Stephen Donaldson's acclaimed se...,1982.0,3.97,479.0,172.0
3,9780006178736,6178731,Rage of angels,,Sidney Sheldon,Fiction,http://books.google.com/books/content?id=FKo2T...,"A memorable, mesmerizing heroine Jennifer -- b...",1993.0,3.93,512.0,29532.0
4,9780006280897,6280897,The Four Loves,,Clive Staples Lewis,Christian life,http://books.google.com/books/content?id=XhQ5X...,Lewis' work on the nature of love divides love...,2002.0,4.15,170.0,33684.0


In [14]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   isbn13          6810 non-null   int64  
 1   isbn10          6810 non-null   object 
 2   title           6810 non-null   object 
 3   subtitle        2381 non-null   object 
 4   authors         6738 non-null   object 
 5   categories      6711 non-null   object 
 6   thumbnail       6481 non-null   object 
 7   description     6548 non-null   object 
 8   published_year  6804 non-null   float64
 9   average_rating  6767 non-null   float64
 10  num_pages       6767 non-null   float64
 11  ratings_count   6767 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 638.6+ KB


### CHECK NULL VALUE

In [15]:
df_books.isna().sum() 

isbn13               0
isbn10               0
title                0
subtitle          4429
authors             72
categories          99
thumbnail          329
description        262
published_year       6
average_rating      43
num_pages           43
ratings_count       43
dtype: int64

### CHECK DUPLICATED DATA

In [16]:
print(df_books.duplicated().sum()) 

0


# Data Features

- **isbn13**: Código ISBN de 13 dígitos que identifica de manera única cada libro.
- **isbn10**: Código ISBN de 10 dígitos, versión más antigua del ISBN.
- **title**: El título del libro.
- **subtitle**: El subtítulo del libro (si lo tiene).
- **authors**: Nombre(s) del o los autores del libro.
- **categories**: Categorías o géneros a los que pertenece el libro.
- **thumbnail**: URL de la imagen miniatura del libro.
- **description**: Descripción o sinopsis del libro.
- **published_year**: El año en que el libro fue publicado.
- **average_rating**: La calificación promedio de los usuarios sobre el libro.
- **num_pages**: El número de páginas del libro.
- **ratings_count**: El número total de calificaciones recibidas por el libro.

# Data Preproccessing

#### Primero separaremos los campos que vamos a usar 

In [99]:
df_books = df_books[['title','subtitle','authors','categories','description','published_year','average_rating','ratings_count']]

#### Empezaremos con el campo subtitle ####

In [24]:
print(df_books.subtitle.isna().sum())

4429


In [30]:
df_books.subtitle[0:20]

0                                                NaN
1                                            A Novel
2                                                NaN
3                                                NaN
4                                                NaN
5                                                NaN
6                                                NaN
7     A History of the Indian Ocean and Its Invaders
8                                    Chaos and Order
9                                                NaN
10                                               NaN
11                                               NaN
12                                               NaN
13                                               NaN
14                                               NaN
15                                               NaN
16                                               NaN
17                                               NaN
18                                            

In [31]:
# Reemplazamos los nulos por "without subtitle"
df_books['subtitle'].fillna("without subtitle",inplace=True)

In [33]:
df_books['subtitle'].isna().sum()

np.int64(0)

#### Trataremos el campo authors

In [43]:
# vemos cuantos nulos tiene el campo authors
df_books.authors.isna().sum()

np.int64(72)

In [102]:
# analizamos los nulos dentro del campo authors, verificamos en internet y estos libros si tienen autor
df_books[df_books.authors.isna()].head()

Unnamed: 0,title,subtitle,authors,categories,description,published_year,average_rating,ratings_count


In [78]:
# creamos una lista con los titulos de los libros que les falta autor para buscarlos usando web scraping
df_title_null = df_books[df_books.authors.isna()]['title'].tolist() 
df_title_null

['Wodehouse on crime',
 'Burt Dow, Deep-Water Man',
 'The Epic of Gilgamesh',
 "One Flew Over the Cuckoo's Nest",
 'A Year Down Yonder',
 'Tales of the Norse Gods',
 'Sir Gawain and the Green Knight, Pearl, and Sir Orfeo',
 'Flap Your Wings',
 'Big Dog ... Little Dog',
 'Maccabbees',
 'Atonement',
 'Three Greek Plays',
 "McElligot's Pool",
 "Dr. Seuss's Sleep Book",
 "Eric Carle's Very Little Library: The very hungry caterpillar (1st board book ed., 1994)",
 'Amphigorey',
 'Blow Fly',
 "About the B'nai Bagels",
 'Wizard and Glass',
 'The Gunslinger',
 'Grade 4 Common Core Exemplar Collection',
 'The Drama of the Gifted Child',
 'Out of this World',
 'Sherlock Holmes the Complete Novels and Stories Book Discussion Kit',
 'Frankenstein: City of Night: A Novel',
 'Breakfast of Champions',
 'Summer for the Gods',
 'Gift from the Sea',
 'The Sibley Field Guide to Birds of Western North America',
 'The Little Butterfly',
 "Three Tales of My Father's Dragon",
 'Second Home',
 'The Legend of t

In [79]:
import requests

# Función para obtener el autor de un libro por su título
def get_author_by_title(title):
    url = "https://www.googleapis.com/books/v1/volumes"  # Endpoint de la API de Google Books
    params = {
        'q': title,  # Parámetro de búsqueda: título del libro
        'fields': 'items(volumeInfo(title,authors))',  # Campos que queremos obtener: título y autores
        'maxResults': 1  # Queremos solo el primer resultado
    }
    
    # Hacemos la solicitud GET a la API con el título y los parámetros
    response = requests.get(url, params=params)
    
    # Si la solicitud fue exitosa (código de estado 200)
    if response.status_code == 200:
        data = response.json()  # Convertimos la respuesta en formato JSON
        
        # Si encontramos resultados y hay autores en el primer resultado
        if 'items' in data and 'authors' in data['items'][0]['volumeInfo']:
            return data['items'][0]['volumeInfo']['authors'][0]  # Retornamos el primer autor
        else:
            return 'Author not found'  # Si no hay autores, devolvemos un mensaje
    else:
        return 'Request failed'  # Si la solicitud falló, devolvemos un mensaje de error

In [82]:
# creamos diccionario para almacenar los autores
authors_dict = {}

In [83]:
# Iteramos sobre la lista de títulos
for title in df_title_null:
    author = get_author_by_title(title)
    authors_dict[title] = author

In [86]:
# Reemplazar valores nulos en 'authors' usando el diccionario
df_books['authors'] = df_books.apply(
    lambda row: authors_dict.get(row['title'], row['authors']) if pd.isnull(row['authors']) else row['authors'], axis=1
)

# Verificar que los valores nulos han sido reemplazados
print(df_books['authors'].isnull().sum())


0


#### Categories

In [103]:
# analzamos los nulos
df_books[df_books.categories.isna()]

Unnamed: 0,title,subtitle,authors,categories,description,published_year,average_rating,ratings_count
16,Well of Darkness,without subtitle,Margaret Weis;Tracy Hickman,,Gareth is just a frightened young lad when he ...,2001.0,3.66,68.0
555,The Professor of Desire,without subtitle,Philip Roth,,"As a student in college, David Kepesh styles h...",1995.0,3.65,1950.0
644,The Pearl,The Red Pony ; Ill. by Wesley Dennis ; [both B...,John Steinbeck,,,1987.0,3.64,1431.0
646,Wodehouse on crime,without subtitle,Pelham Grenville Wodehouse,,,1981.0,4.39,3143.0
650,The New York Trilogy,Ghosts,Paul Auster,,,1987.0,3.79,11696.0
...,...,...,...,...,...,...,...,...
6722,Collected Works of Carson McCullers,without subtitle,Carson McCullers,,,2017.0,4.38,1015.0
6785,"J. D. Salinger, The Catcher in the Rye",Annotations and Study Aids,Jerome David Salinger;Rudolph F. Rau,,,1999.0,3.80,69.0
6786,In the Country of Last Things,Hauptbd.,Paul Auster,,,2001.0,3.91,6230.0
6787,About a Boy,without subtitle,Nick Hornby,,,2002.0,3.80,155.0


In [118]:
# filtramos los libros que les falta categories
df_categories_null = df_books[df_books.categories.isna()]['title'].tolist()

In [119]:
def get_categories_by_title(title):
    url = "https://www.googleapis.com/books/v1/volumes"
    params = {
        "q": f"intitle:{title}",
        "maxResults": 1
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        
        if "items" in data:
            # Extraer la categoria si existe
            categories = data['items'][0]['volumeInfo'].get('categories', None)
            return categories
        else:
            return None
    except (requests.RequestException, ValueError) as e:
        print(f"Error al obtener datos para el libro {title}: {e}")
        return None

In [120]:
categorie_dict={}

In [121]:
for title in df_categories_null:
    categorie = get_categories_by_title(title)
    categorie_dict[title] = categorie

In [129]:
# Llenar las descripciones nulas en el DataFrame original
df_books['categories'] = df_books.apply(
    lambda row: categorie_dict.get(row['title'], row['categories']) if pd.isnull(row['categories']) else row['categories'],
    axis=1
)

In [135]:
# la api de google no pudo llenar la categoria de estos libros
df_books.categories.isna().sum()

np.int64(19)

In [136]:
# Procedemos a reemplazarlo por "unknown"
df_books.categories.fillna('unknown',inplace=True)

#### Description

In [146]:
df_description_null=df_books[df_books.description.isna()]['title'].tolist()

In [141]:
def get_description_by_title(title):
    url = "https://www.googleapis.com/books/v1/volumes"
    params = {
        "q": f"intitle:{title}",
        "maxResults": 1
    }
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()
        
        if "items" in data:
            # Extraer la categoria si existe
            description = data['items'][0]['volumeInfo'].get('description', None)
            return description
        else:
            return None
    except (requests.RequestException, ValueError) as e:
        print(f"Error al obtener datos para el libro {title}: {e}")
        return None

In [147]:
description_dict = {}

In [152]:
for title in df_description_null:
    description = get_description_by_title(title)
    description_dict[title] = description

In [155]:
df_books['description'] = df_books.apply(
    lambda row: description_dict.get(row['title'], row['description']) if pd.isnull(row['description']) else row['description'],
    axis=1
)

In [160]:
# Quedaron nulos, los reemplazaremos por 'with description'
df_books['description'].isna().sum()

np.int64(87)

In [161]:
df_books['description'].fillna('with description',inplace=True)

#### publisher_year

In [170]:
df_year_null = df_books[df_books.published_year.isna()]['title'].tolist()

In [171]:
df_year_null

['Book Club',
 'The Civil War',
 'The Fellowship of the Ring',
 'Grade 4 Common Core Exemplar Collection',
 'Frankenstein: City of Night: A Novel',
 '20000 LEAGUES UNDER THE SEA']

In [173]:
df_books[df_books.published_year.isna()]

Unnamed: 0,title,subtitle,authors,categories,description,published_year,average_rating,ratings_count
1114,Book Club,Life of Pi (7 Copies).,Yann Martel,[Book clubs (Discussion groups)],"""Love curling up with a good book? If so, a bo...",,3.9,4242.0
1428,The Civil War,A Narrative,Shelby Foote,United States,"""Foote's comprehensive history of the Civil Wa...",,4.53,6406.0
1779,The Fellowship of the Ring,without subtitle,John Ronald Reuel Tolkien,Fiction,"Frodo Baggins, Sam, and a small band of compan...",,,
3062,Grade 4 Common Core Exemplar Collection,without subtitle,Author not found,English language,"""Common Core Classroom Collections include a m...",,4.13,1709.0
3554,Frankenstein: City of Night: A Novel,without subtitle,Dean Koontz,Horror tales,Detectives Carson O'Connor and Michael Maddiso...,,3.97,1041.0
4548,20000 LEAGUES UNDER THE SEA,without subtitle,JULIO VERNE,[Fiction],A group of men set sail to solve the mystery o...,,3.71,34.0


In [182]:
# Como son pocos libros busque en internet su año de publiacion
year_dict = {}
year = [2001, 1987, 1954, 2011, 2004, 1870]

In [183]:
for i, title in enumerate(df_year_null):  # enumerate() para obtener índice y título
    year_dict[title] = year[i]

In [185]:
# LLenaremos los valores nulos en df_books
df_books['published_year'] = df_books.apply(
    lambda row: year_dict.get(row['title'], row['published_year']) if pd.isnull(row['published_year']) else row['published_year'],
    axis=1
)

In [None]:
# Convertimos el tipo de dato float a datetime

In [191]:
# Convertimos los años a formato de fecha usando el 1 de enero como día predeterminado
df_books['published_year'] = pd.to_datetime(df_books['published_year'], format='%Y', errors='coerce')

In [194]:
df_books['year'] = df_books['published_year'].dt.year

#### average_rating  

In [202]:
# analizamos los datos en el campo averga_rating
df_books[df_books.average_rating.isna()]

Unnamed: 0,title,subtitle,authors,categories,description,published_year,average_rating,ratings_count,year


In [200]:
df_books.average_rating.mean()

np.float64(3.933283582089552)

In [201]:
df_books.average_rating.fillna(3.93, inplace=True)

#### ratings_count   

In [None]:
df_books[df_books.ratings_count.isna()]

In [207]:
df_books.ratings_count.mean()

np.float64(21069.09989655682)

In [208]:
df_books.ratings_count.fillna(21069.09,inplace=True)

# Exportamos en csv el dataframe final

In [210]:
df_final = df_books[['title','subtitle','authors','categories','description','average_rating','ratings_count','year']]

In [213]:
df_final.to_csv('books_clean.csv',index=False)