## Data Preparation

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load the raw books catalogue. read_csv already returns a DataFrame, so it is
# bound to `df` directly instead of being wrapped in pd.DataFrame() again.
df = pd.read_csv("Input/dataset-books.csv")
# Preview the first rows to check that the file was parsed as expected
df.head(3)

Unnamed: 0,authors,bestsellers-rank,categories,description,dimension-x,dimension-y,dimension-z,edition,edition-statement,for-ages,...,isbn10,isbn13,lang,publication-date,publication-place,rating-avg,rating-count,title,url,weight
0,[1],,"[334, 335, 341, 2622, 352, 2626, 353, 2627]",Renowned urban artist Shepard Fairey's new loo...,111.0,181.0,20.0,,Repr.,,...,141036141,9780141000000.0,en,2008-10-01,1.0,4.18,2918204.0,1984,/1984-George-Orwell/9780141036144,184.0
1,[2],67532.0,"[376, 378, 868, 873, 900, 2771, 3097, 2590]","These 100 skills, adapted for civilians from a...",140.0,210.0,18.0,,,,...,1501143905,9781501000000.0,en,2016-11-16,2.0,4.01,425.0,100 Deadly Skills: Survival Edition : The SEAL...,/100-Deadly-Skills-Survival-Edition-Clint-Emer...,367.0
2,"[3, 4]",150327.0,"[2978, 2980]",The world is full of pointless things. From ra...,193.0,197.0,21.0,,,,...,1444762052,9781445000000.0,en,2013-05-23,1.0,3.59,539.0,The 100 Most Pointless Things in the World : A...,/100-Most-Pointless-Things-World-Alexander-Arm...,220.0


### 1. Analizamos el dataset de libros

In [3]:
# Remove the columns that are not needed for the rest of the notebook
drop_cols = [
    'id', 'bestsellers-rank', 'dimension-x', 'edition', 'edition-statement',
    'for-ages', 'format', 'illustrations-note', 'imprint', 'index-date',
    'dimension-y', 'isbn13', 'publication-place', 'dimension-z', 'weight',
    'rating-count', 'url',
]
df = df.drop(columns=drop_cols)
df.head()

Unnamed: 0,authors,categories,description,isbn10,lang,publication-date,rating-avg,title
0,[1],"[334, 335, 341, 2622, 352, 2626, 353, 2627]",Renowned urban artist Shepard Fairey's new loo...,141036141,en,2008-10-01,4.18,1984
1,[2],"[376, 378, 868, 873, 900, 2771, 3097, 2590]","These 100 skills, adapted for civilians from a...",1501143905,en,2016-11-16,4.01,100 Deadly Skills: Survival Edition : The SEAL...
2,"[3, 4]","[2978, 2980]",The world is full of pointless things. From ra...,1444762052,en,2013-05-23,3.59,The 100 Most Pointless Things in the World : A...
3,[5],"[2948, 2953]",Delight children and adults alike by creating ...,1784943460,en,2017-11-14,3.66,50 Knitted Dolls
4,[6],"[2942, 2948, 2953]",Make your own collection of irresistibly cute ...,1782216952,en,2019-06-04,4.66,20 to Knit: Pocket Pets


In [4]:
# Inspect the data type pandas assigned to each remaining column
df.dtypes

authors              object
categories           object
description          object
isbn10               object
lang                 object
publication-date     object
rating-avg          float64
title                object
dtype: object

In [5]:
# Report the share of missing values per column as a rounded percentage
for column_name in df.columns:
    missing_share = df[column_name].isna().mean()
    print(f'{column_name} - {round(missing_share * 100)}%')

authors - 0.0%
categories - 0.0%
description - 7.0%
isbn10 - 0.0%
lang - 5.0%
publication-date - 0.0%
rating-avg - 64.0%
title - 0.0%


In [6]:
# Absolute number of missing values in each column
null_cols = df.isna().sum()
null_cols

authors                 0
categories              0
description          4022
isbn10                  0
lang                 2851
publication-date      139
rating-avg          35990
title                   0
dtype: int64

In [7]:
# Parse "publication-date" into datetimes and keep the year in its own column
df["publication-date"] = pd.to_datetime(df["publication-date"])
df["year"] = df["publication-date"].dt.year



In [8]:
# Discard the rows that have no year: the year is required for the later filtering
df = df.dropna(axis=0, subset=['year'])

In [9]:
# Cast "year" to a 64-bit integer column with one vectorised astype call
# instead of calling int() on every element through apply
df["year"] = df["year"].astype("int64")
df.head(3)

Unnamed: 0,authors,categories,description,isbn10,lang,publication-date,rating-avg,title,year
0,[1],"[334, 335, 341, 2622, 352, 2626, 353, 2627]",Renowned urban artist Shepard Fairey's new loo...,141036141,en,2008-10-01,4.18,1984,2008
1,[2],"[376, 378, 868, 873, 900, 2771, 3097, 2590]","These 100 skills, adapted for civilians from a...",1501143905,en,2016-11-16,4.01,100 Deadly Skills: Survival Edition : The SEAL...,2016
2,"[3, 4]","[2978, 2980]",The world is full of pointless things. From ra...,1444762052,en,2013-05-23,3.59,The 100 Most Pointless Things in the World : A...,2013


In [10]:
# Last step of this section: remove the publication date and the author columns
drop_cols = ['publication-date', 'authors']
df = df.drop(columns=drop_cols)

In [11]:
# Preview the frame after the column clean-up (head() shows 5 rows by default)
df.head()

Unnamed: 0,categories,description,isbn10,lang,rating-avg,title,year
0,"[334, 335, 341, 2622, 352, 2626, 353, 2627]",Renowned urban artist Shepard Fairey's new loo...,141036141,en,4.18,1984,2008
1,"[376, 378, 868, 873, 900, 2771, 3097, 2590]","These 100 skills, adapted for civilians from a...",1501143905,en,4.01,100 Deadly Skills: Survival Edition : The SEAL...,2016
2,"[2978, 2980]",The world is full of pointless things. From ra...,1444762052,en,3.59,The 100 Most Pointless Things in the World : A...,2013
3,"[2948, 2953]",Delight children and adults alike by creating ...,1784943460,en,3.66,50 Knitted Dolls,2017
4,"[2942, 2948, 2953]",Make your own collection of irresistibly cute ...,1782216952,en,4.66,20 to Knit: Pocket Pets,2019


### 2. Filtrado y limpieza por géneros

In [12]:
# Utilizamos la función explode para asignar una fila por género literario y quitamos los caracteres especiales

In [13]:
# Characters to strip from every category token once the list has been split
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–"]
# A single character class matching any of the characters above. re.escape keeps
# "[", "]", "\\", "^" and "-" literal inside the class.
spec_chars_pattern = "[" + re.escape("".join(spec_chars)) + "]"

# Split the comma-separated category string and give each category its own row
df = df.assign(categories=df["categories"].str.split(",")).explode("categories")

# Remove all special characters in one pass. regex=True is passed explicitly so the
# result does not depend on the library default. Then trim the whitespace that
# followed each comma so the ids are clean strings.
df["categories"] = (
    df["categories"]
    .str.replace(spec_chars_pattern, "", regex=True)
    .str.strip()
)

In [14]:
# Rename "categories" to "category_id" so it can later be joined with the categories dataset
df = df.rename({'categories': 'category_id'}, axis='columns')


In [15]:
# Preview the exploded frame: one row per (book, category id) pair
df.head()

Unnamed: 0,category_id,description,isbn10,lang,rating-avg,title,year
0,334,Renowned urban artist Shepard Fairey's new loo...,141036141,en,4.18,1984,2008
0,335,Renowned urban artist Shepard Fairey's new loo...,141036141,en,4.18,1984,2008
0,341,Renowned urban artist Shepard Fairey's new loo...,141036141,en,4.18,1984,2008
0,2622,Renowned urban artist Shepard Fairey's new loo...,141036141,en,4.18,1984,2008
0,352,Renowned urban artist Shepard Fairey's new loo...,141036141,en,4.18,1984,2008


### 3. Analizamos el dataset de categorías

In [16]:
# Load the categories dataset and preview its first rows
categories = pd.read_csv("Input/categories.csv")
categories.head()

Unnamed: 0,category_id,category_name
0,1998,.Net Programming
1,176,20th Century & Contemporary Classical Music
2,3291,20th Century & Contemporary Classical Music
3,2659,20th Century History: C 1900 To C 2000
4,2661,21st Century History: From C 2000 -


In [17]:
# Both category_id columns must be integers before they can be joined; here the
# column of the books dataset is converted. A single vectorised astype replaces
# the per-element int() call through apply.
df["category_id"] = df["category_id"].astype("int64")

In [18]:
# Inner-join books and categories on category_id so rows can be filtered by category name
result = df.merge(categories, how='inner', on='category_id')
result.head()

Unnamed: 0,category_id,description,isbn10,lang,rating-avg,title,year,category_name
0,334,Renowned urban artist Shepard Fairey's new loo...,141036141,en,4.18,1984,2008,Contemporary Fiction
1,334,"""Richardson's Clarissa explores the subtleties...",1641813911,en,,"Clarissa Harlowe : Or, The History of a Young ...",2020,Contemporary Fiction
2,334,World's Classics Deluxe Edition,1641813997,en,3.49,Nightmare Abbey,2020,Contemporary Fiction
3,334,World's Classics Deluxe Edition,164181599X,en,,The Mayor of Casterbridge : The Life and Death...,2020,Contemporary Fiction
4,334,"Elisabeth Matrei, fotógrafa de renombre y prot...",8498414261,es,3.89,Tres senderos hacia el lago / Three Paths to t...,2011,Contemporary Fiction


A continuación están los géneros que vamos a utilizar en este ejercicio:

* Classic Books & Novels
* poetry
* drama
* adventure
* romance
* Thrillers
* biographies and autobiographies
* comic
* science fiction
* life science
* for kids
* romance
* Crime
* Religious
* horror
* Historical Fiction
* Personal Development
* Graphic Novels: Manga
* Guidebooks

In [19]:
# Preview the merged frame before filtering by genre
result.head(n=3)

Unnamed: 0,category_id,description,isbn10,lang,rating-avg,title,year,category_name
0,334,Renowned urban artist Shepard Fairey's new loo...,141036141,en,4.18,1984,2008,Contemporary Fiction
1,334,"""Richardson's Clarissa explores the subtleties...",1641813911,en,,"Clarissa Harlowe : Or, The History of a Young ...",2020,Contemporary Fiction
2,334,World's Classics Deluxe Edition,1641813997,en,3.49,Nightmare Abbey,2020,Contemporary Fiction


In [24]:
# Genres kept for this exercise. Each entry is matched as a case-insensitive
# substring of the category name, so every genre only needs to appear once.
genres = [
    "Classic Books & Novels",
    "poetry",
    "drama",
    "adventure",
    "romance",
    "Thrillers",
    "biographies",
    "comic",
    "science fiction",
    "life science",
    "for kids",
    "Crime",
    "Religious",
    "horror",
    "Historical Fiction",
    "Personal Development",
    "Graphic Novels: Manga",
    "Guidebooks",
]

In [21]:
# Keep only the rows whose category name CONTAINS one of the selected genres
# (substring match, so we are not limited to exact category names).
# The alternation is built from the `genres` list so the list and the filter
# cannot drift apart. re.escape makes every genre match literally.
genre_pattern = "|".join(re.escape(genre) for genre in genres)

# Case-insensitive match. na=False treats a missing category name as "no match"
# instead of putting NaN into the boolean mask.
result = result[result['category_name'].str.contains(genre_pattern, regex=True, flags=re.IGNORECASE, na=False)]
result.head(5)

Unnamed: 0,category_id,description,isbn10,lang,rating-avg,title,year,category_name
844,335,Renowned urban artist Shepard Fairey's new loo...,141036141,en,4.18,1984,2008,Classic Books & Novels
845,335,"The Marquis de Sade, vilified by respectable s...",802130127,en,3.46,The 120 Days of Sodom and Other Writings,1994,Classic Books & Novels
846,335,The 120 Days of Sodom is the Marquis de Sade's...,99629607,en,3.11,The 120 Days Of Sodom : And Other Writings,1995,Classic Books & Novels
847,335,This book is the very simple story of the love...,751503843,en,4.2,84 Charing Cross Road,1982,Classic Books & Novels
848,335,"""Richardson's Clarissa explores the subtleties...",1641813911,en,,"Clarissa Harlowe : Or, The History of a Young ...",2020,Classic Books & Novels


In [22]:
# Check the (rows, columns) shape: the dataset is considerably smaller after the genre filter
result.shape

(19313, 8)

In [23]:
# Save the filtered dataset; this is the file the rest of the project works with.
# NOTE(review): index=False is not passed, so the row index is written as the first,
# unnamed column of the CSV - confirm that downstream readers expect it.
result.to_csv('Output/books_clean_dataset.csv')