# PANDAS

In [2]:
import pandas as pd
import numpy as np

# PANDAS SERIES

In [2]:
personas_ = pd.Series(["Pedro", "Paco", "Luis", "Rodolfo"],
          index=[1, 3, 7, 10]
         )
personas_

1       Pedro
3        Paco
7        Luis
10    Rodolfo
dtype: object

In [3]:
personas = pd.Series(["Pedro", "Paco", "Luis", "Rodolfo"])
personas

0      Pedro
1       Paco
2       Luis
3    Rodolfo
dtype: object

In [4]:
# Especificando un índice personalizado (etiqueta a cada valor)
pd.Series(range(1, 4), index=['a', 'b', 'c'])

a    1
b    2
c    3
dtype: int64

In [5]:
# Especificando un diccionario con etiquetas y valores
items = {'a': 1, 'b': 2, 'c': 3}
pd.Series(items)

a    1
b    2
c    3
dtype: int64

In [6]:
pd.Series([1, 2, 3], name='integers')

0    1
1    2
2    3
Name: integers, dtype: int64

In [7]:
data = {
    'Apple': 147000,
    'Samsung': 267937,
    'Google': 135301,
    'Microsoft': 163000,
    'Huawei': 197000,
    'Dell': 158000,
    'Facebook': 58604,
    'Foxconn': 878429,
    'Sony': 109700
}

In [8]:
employees = pd.Series(data, name='Tech Employees')
employees

Apple        147000
Samsung      267937
Google       135301
Microsoft    163000
Huawei       197000
Dell         158000
Facebook      58604
Foxconn      878429
Sony         109700
Name: Tech Employees, dtype: int64

In [9]:
employees.index

Index(['Apple', 'Samsung', 'Google', 'Microsoft', 'Huawei', 'Dell', 'Facebook',
       'Foxconn', 'Sony'],
      dtype='object')

In [10]:
employees.values

array([147000, 267937, 135301, 163000, 197000, 158000,  58604, 878429,
       109700])

In [11]:
# Tipo de la serie
employees.dtype

dtype('int64')

In [12]:
# Nombre de la serie
employees.name

'Tech Employees'

In [13]:
# Memoria ocupada por la serie
employees.nbytes

72

In [14]:
# Número de registros de la serie
employees.size

9

#### Seleccionar por indice

In [15]:
employees

Apple        147000
Samsung      267937
Google       135301
Microsoft    163000
Huawei       197000
Dell         158000
Facebook      58604
Foxconn      878429
Sony         109700
Name: Tech Employees, dtype: int64

In [16]:
employees[0]

  employees[0]


147000

In [17]:
employees[-1]

  employees[-1]


109700

In [18]:
employees[2:5]

Google       135301
Microsoft    163000
Huawei       197000
Name: Tech Employees, dtype: int64

In [19]:
employees[0:8:2]

Apple       147000
Google      135301
Huawei      197000
Facebook     58604
Name: Tech Employees, dtype: int64

In [20]:
employees.iloc[0:8:2]

Apple       147000
Google      135301
Huawei      197000
Facebook     58604
Name: Tech Employees, dtype: int64

#### Sleccionar por etiqueta

In [21]:
employees

Apple        147000
Samsung      267937
Google       135301
Microsoft    163000
Huawei       197000
Dell         158000
Facebook      58604
Foxconn      878429
Sony         109700
Name: Tech Employees, dtype: int64

In [22]:
employees["Apple"]

147000

In [23]:
employees["Apple":"Huawei"]

Apple        147000
Samsung      267937
Google       135301
Microsoft    163000
Huawei       197000
Name: Tech Employees, dtype: int64

#### head y tail

In [24]:
employees.head(3)

Apple      147000
Samsung    267937
Google     135301
Name: Tech Employees, dtype: int64

In [25]:
employees.tail(2)

Foxconn    878429
Sony       109700
Name: Tech Employees, dtype: int64

### OPERACIONES CON LAS SERIES

In [26]:
employees

Apple        147000
Samsung      267937
Google       135301
Microsoft    163000
Huawei       197000
Dell         158000
Facebook      58604
Foxconn      878429
Sony         109700
Name: Tech Employees, dtype: int64

In [27]:
employees > 200_000

Apple        False
Samsung       True
Google       False
Microsoft    False
Huawei       False
Dell         False
Facebook     False
Foxconn       True
Sony         False
Name: Tech Employees, dtype: bool

Mostrar las empresas con mas de 200,000 empleados

In [28]:
employees[employees > 200_000]

Samsung    267937
Foxconn    878429
Name: Tech Employees, dtype: int64

### ORDENACION POR VALORES

In [29]:
employees.sort_values()

Facebook      58604
Sony         109700
Google       135301
Apple        147000
Dell         158000
Microsoft    163000
Huawei       197000
Samsung      267937
Foxconn      878429
Name: Tech Employees, dtype: int64

In [30]:
employees.sort_values(ascending=True)

Facebook      58604
Sony         109700
Google       135301
Apple        147000
Dell         158000
Microsoft    163000
Huawei       197000
Samsung      267937
Foxconn      878429
Name: Tech Employees, dtype: int64

### ORDENACION POR INDICE

In [31]:
employees.sort_index()

Apple        147000
Dell         158000
Facebook      58604
Foxconn      878429
Google       135301
Huawei       197000
Microsoft    163000
Samsung      267937
Sony         109700
Name: Tech Employees, dtype: int64

### CONTANDO VALORES

In [32]:
marks = pd.Series([5, 5, 3, 6, 5, 2, 8, 3, 8, 7, 6]) 
marks

0     5
1     5
2     3
3     6
4     5
5     2
6     8
7     3
8     8
9     7
10    6
dtype: int64

In [33]:
marks.value_counts()

5    3
3    2
6    2
8    2
2    1
7    1
Name: count, dtype: int64

In [34]:
# OBTENER EL NÚMERO DE VALORES ÚNICOS DE LA SERIE
marks.nunique()

6

In [35]:
# OBTENER EL NÚMERO DE VALORES NO NULO DE LA SERIE
marks.count()

11

### OPERACIONES ARITMÉTICAS

In [36]:
employees

Apple        147000
Samsung      267937
Google       135301
Microsoft    163000
Huawei       197000
Dell         158000
Facebook      58604
Foxconn      878429
Sony         109700
Name: Tech Employees, dtype: int64

In [37]:
data = {
    "Apple": 274515,
    "Samsung": 200734,
    "Google": 182527,
    "Microsoft": 143015,
    "Huawei": 129184,
    "Dell": 92224,
    "Facebook": 85965,
    "Foxconn": 181945,
    "Sony": 84893
}
revenues = pd.Series(data, name='Tech Revenues')
revenues

Apple        274515
Samsung      200734
Google       182527
Microsoft    143015
Huawei       129184
Dell          92224
Facebook      85965
Foxconn      181945
Sony          84893
Name: Tech Revenues, dtype: int64

In [38]:
# CALCULAR LA RATIO DE INGRESOS POR TRABAJADOR
# Tener en cuenta que las operaciones se realizan entre registros que tienen el mismo índice (etiqueta)
revenues / employees

Apple        1.867449
Samsung      0.749184
Google       1.349044
Microsoft    0.877393
Huawei       0.655756
Dell         0.583696
Facebook     1.466879
Foxconn      0.207125
Sony         0.773865
dtype: float64

In [39]:
# El método `mean()` calcula la media (promedio) de todas las columnas numéricas en el DataFrame.
employees.mean()

234996.77777777778

In [40]:
# El método `std()` calcula la desviación estándar de todas las columnas numéricas en el DataFrame.
employees.std()

248027.7840619765

### VALOR MÍNIMO/MÁXIMO DE UNA SERIE

In [41]:
employees.min()

58604

In [42]:
employees.max()

878429

### ETIQUETAS VALOR MÍNIMO/MÁXIMO DE UNA SERIE

In [43]:
employees.idxmin()

'Facebook'

In [44]:
employees.idxmax()

'Foxconn'

### OBTENER TRES VALORES MENORES DE LA SERIE

In [45]:
employees.nsmallest(3)

Facebook     58604
Sony        109700
Google      135301
Name: Tech Employees, dtype: int64

In [46]:
employees.nlargest(3)

Foxconn    878429
Samsung    267937
Huawei     197000
Name: Tech Employees, dtype: int64

### EXPORTAR SERIES A OTRO FORMATO
Suele ser bastante habitual intercambiar datos en distintos formatos (y aplicaciones). Para ello, pandas nos permite exportar una serie a multitud de formatos.

In [47]:
employees

Apple        147000
Samsung      267937
Google       135301
Microsoft    163000
Huawei       197000
Dell         158000
Facebook      58604
Foxconn      878429
Sony         109700
Name: Tech Employees, dtype: int64

In [48]:
employees.to_list()

[147000, 267937, 135301, 163000, 197000, 158000, 58604, 878429, 109700]

In [49]:
employees.to_dict()

{'Apple': 147000,
 'Samsung': 267937,
 'Google': 135301,
 'Microsoft': 163000,
 'Huawei': 197000,
 'Dell': 158000,
 'Facebook': 58604,
 'Foxconn': 878429,
 'Sony': 109700}

In [50]:
employees.to_json()

'{"Apple":147000,"Samsung":267937,"Google":135301,"Microsoft":163000,"Huawei":197000,"Dell":158000,"Facebook":58604,"Foxconn":878429,"Sony":109700}'

In [51]:
employees.to_csv()

',Tech Employees\nApple,147000\nSamsung,267937\nGoogle,135301\nMicrosoft,163000\nHuawei,197000\nDell,158000\nFacebook,58604\nFoxconn,878429\nSony,109700\n'

### EXPORTAR SERIES A DATAFRAME

In [52]:
employees.to_frame()

Unnamed: 0,Tech Employees
Apple,147000
Samsung,267937
Google,135301
Microsoft,163000
Huawei,197000
Dell,158000
Facebook,58604
Foxconn,878429
Sony,109700


In [53]:
dict = {1: "Pedro", 4: "Paco", 8: "Luis", 60: "Rodolfo"}
pd.Series(dict)

1       Pedro
4        Paco
8        Luis
60    Rodolfo
dtype: object

In [54]:
personas

0      Pedro
1       Paco
2       Luis
3    Rodolfo
dtype: object

# DATA FRAME
### [kaggle.com](https://www.kaggle.com/)
Link para obtener datasets muchos!

Un DataFrame es una estructura de datos bidimensional etiquetada con columnas de tipos potencialmente diferentes. Puedes considerarlo como una hoja de cálculo o una tabla SQL. Es el objeto pandas más utilizado. Puede crear un DataFrame a partir de una variedad de fuentes de datos, como un archivo CSV, un archivo Excel, una consulta SQL o un diccionario Python. Una vez que tenga un DataFrame, puede realizar varias operaciones en él, como filtrar, seleccionar, agrupar y agregar los datos

In [55]:
dict_0 = {"Jugador": ["Pedro", "Paco", "Luis", "Rodolfo"],
 "Altura:": [1.79, 1.89, 1.80, 1.79],
 "Goles": [23, 43, 23, 67]
}

In [56]:
df_personas = pd.DataFrame(dict_0, index=[1, 3, 7, 10])
df_personas

Unnamed: 0,Jugador,Altura:,Goles
1,Pedro,1.79,23
3,Paco,1.89,43
7,Luis,1.8,23
10,Rodolfo,1.79,67


Mostrar columnas

In [57]:
df_personas.columns

Index(['Jugador', 'Altura:', 'Goles'], dtype='object')

Mostrar indices

In [58]:
df_personas.index

Index([1, 3, 7, 10], dtype='int64')

# LEER ARCHIVOS CSV
- [pandas.read_csv link](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)
- [pandas.read_json link](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html)

In [59]:
df_books = pd.read_csv("sample_data/bestsellers with categories.csv", sep=",", header=0, names=["Titles", "Author", "User Rating", "Reviews", "Price", "Year", "Genre"])
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [60]:
df_books.columns

Index(['Titles', 'Author', 'User Rating', 'Reviews', 'Price', 'Year', 'Genre'], dtype='object')

In [61]:
pd.read_json("sample_data/HPCharactersDataRaw.json")

Unnamed: 0,Name,Link,Descr,Gender,Species/Race,Blood,School,Profession
0,Mrs. Abbott,https://www.hp-lexicon.org/character/abbott-fa...,"Mrs. Abbott was the mother of Hannah Abbott, a...",Female,Witch,Muggle-born,Unknown,Unknown
1,Hannah Abbott,https://www.hp-lexicon.org/character/abbott-fa...,Hannah Abbott is a Hufflepuff student in Harry...,Female,Witch,Half-blood,Hogwarts - Hufflepuff,Landlady of the Leaky Cauldron
2,Abel Treetops,https://www.hp-lexicon.org/character/abel-tree...,Abel Treetops was a wizard from Cincinnati who...,Male,Wizard,Unknown,Unknown,Unknown
3,Euan Abercrombie,https://www.hp-lexicon.org/character/abercromb...,Euan Abercrombie was a small boy with prominen...,Male,Wizard,Unknown,Hogwarts - Gryffindor,Unknown
4,Aberforth Dumbledore,https://www.hp-lexicon.org/character/dumbledor...,"Aberforth Dumbledore was a tall, thin, grumpy-...",Male,Wizard,Half-blood,Hogwarts - Student,Barman
...,...,...,...,...,...,...,...,...
1935,Georgi Zdravko,https://www.hp-lexicon.org/character/georgi-zd...,Georgi Zdravko played Keeper for the Bulgarian...,Male,Wizard,Unknown,Unknown,Quidditch player (Seeker)
1936,Zograf,https://www.hp-lexicon.org/character/zograf/,Zograf played Keeper for the Bulgarian Nationa...,,Wizard,Unknown,Unknown,Quidditch player (Keeper)
1937,Zonko,https://www.hp-lexicon.org/character/zonko/,Founder(?) of Zonko’s Joke Shop. Possibly a re...,,Unknown,Unknown,Unknown,Unknown
1938,Valentina Vázquez,https://www.hp-lexicon.org/character/valentina...,Valentina Vázquez was President of the Argenti...,Female,Witch,Unknown,Unknown,President of the Argentinian Council of Magic


In [62]:
pd.read_csv("sample_data/spotify_most_streamed_artists_of_all_time.csv")

Unnamed: 0,Artist,Streams,Daily,As lead,Solo,As feature
0,Drake,85041.3,50.775,57252.6,32681.6,27788.7
1,Bad Bunny,67533.0,44.820,40969.6,23073.0,26563.4
2,Taylor Swift,57859.0,85.793,55566.7,50425.7,2292.4
3,The Weeknd,53665.2,44.437,42673.3,31164.2,10991.9
4,Ed Sheeran,47907.7,17.506,42767.9,33917.0,5139.8
...,...,...,...,...,...,...
2995,Vicente Garcia,729.9,0.397,542.2,434.8,187.6
2996,Yasin,729.6,0.573,645.9,542.0,83.6
2997,Vedo,729.4,0.284,718.2,607.6,11.1
2998,Kings of Convenience,728.6,0.252,728.6,618.9,


In [63]:
# SLICING DE UN DATAFRAME
df_books[0:4]

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction


In [64]:
# FILTRAR POR EL EJE DE LAS COLUMNAS
df_books[["Author", "User Rating"]]

Unnamed: 0,Author,User Rating
0,JJ Smith,4.7
1,Stephen King,4.6
2,Jordan B. Peterson,4.7
3,George Orwell,4.7
4,National Geographic Kids,4.8
...,...,...
545,Jeff Kinney,4.9
546,Jen Sincero,4.7
547,Jen Sincero,4.7
548,Jen Sincero,4.7


In [65]:
# FILTRAR POR LAS COLUMNAS Y RENGLONES
# FILTRAR POR EL EJE DE LAS COLUMNAS
df_books[["Author", "User Rating", "Reviews"]][0:4]

Unnamed: 0,Author,User Rating,Reviews
0,JJ Smith,4.7,17350
1,Stephen King,4.6,2052
2,Jordan B. Peterson,4.7,18979
3,George Orwell,4.7,21424


### LOC

In [66]:
# LOC SE REFIERE TANTO A LA ETIQUETA DE LA COLUMNA COMO A LA DE LA FILA.
df_books.loc[0:4, ["Author", "Price"]]

Unnamed: 0,Author,Price
0,JJ Smith,8
1,Stephen King,22
2,Jordan B. Peterson,15
3,George Orwell,6
4,National Geographic Kids,12


In [67]:
df_books.loc[0:4]

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction


In [68]:
df_books.loc[:5, ["Titles", "Author", "User Rating", "Price"]]

Unnamed: 0,Titles,Author,User Rating,Price
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,8
1,11/22/63: A Novel,Stephen King,4.6,22
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,15
3,1984 (Signet Classics),George Orwell,4.7,6
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,12
5,A Dance with Dragons (A Song of Ice and Fire),George R. R. Martin,4.4,11


In [69]:
# OPERACIONES CON DATAFRAME CON LOC
df_books.loc[:5, ["Price"]] * 2

Unnamed: 0,Price
0,16
1,44
2,30
3,12
4,24
5,22


In [70]:
df_books.loc[:5, ["Author"]] == "JJ Smith"

Unnamed: 0,Author
0,True
1,False
2,False
3,False
4,False
5,False


### ILOC

In [71]:
df_books.iloc[:]

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [72]:
df_books.iloc[:4]

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction


In [73]:
df_books.iloc[0]

Titles         10-Day Green Smoothie Cleanse
Author                              JJ Smith
User Rating                              4.7
Reviews                                17350
Price                                      8
Year                                    2016
Genre                            Non Fiction
Name: 0, dtype: object

In [74]:
df_books.iloc[3]

Titles         1984 (Signet Classics)
Author                  George Orwell
User Rating                       4.7
Reviews                         21424
Price                               6
Year                             2017
Genre                         Fiction
Name: 3, dtype: object

In [75]:
# BÚSQUEDA POR FILAS Y COLUMNAS(POR INDICE)
df_books.iloc[:, 0:3]

Unnamed: 0,Titles,Author,User Rating
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7
1,11/22/63: A Novel,Stephen King,4.6
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7
3,1984 (Signet Classics),George Orwell,4.7
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8
...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7


In [76]:
df_books.iloc[1,3]

2052

In [77]:
df_books.iloc[1,3] * 2

4104

In [78]:
# LOS TRES PRIMEROS REGISTROS EN FILAS Y LAS DOS PRIMERAS COLUMNAS
df_books.iloc[:3, 2:]

Unnamed: 0,User Rating,Reviews,Price,Year,Genre
0,4.7,17350,8,2016,Non Fiction
1,4.6,2052,22,2011,Fiction
2,4.7,18979,15,2018,Non Fiction


### AGREGAR O ELIMINAR DATOS CON PANDAS

In [79]:
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [80]:
# ELIMINARE LA COLUMNA "GENRE" DEL DATAFRAME - 0 FILAS 1 COLUMNAS
# SE BORRA DE L SALIDA PERO NO DEL DATAFRAME
df_books.drop("Genre", axis=1)

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019
...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018


In [81]:
df_books.head(3)

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction


In [82]:
# PARA ELIMINAR LO DEL DATAFRAME
df_books.drop("Genre", axis=1, inplace=True)

In [83]:
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019
...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018


In [84]:
df_books = df_books.drop("Year", axis=1)
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8
1,11/22/63: A Novel,Stephen King,4.6,2052,22
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15
3,1984 (Signet Classics),George Orwell,4.7,21424,6
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12
...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8


In [85]:
df_books = pd.read_csv("sample_data/bestsellers with categories.csv", sep=",", header=0, names=["Titles", "Author", "User Rating", "Reviews", "Price", "Year", "Genre"])
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [86]:
del df_books["Genre"]
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019
...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018


### ELIMINAR FILAS

In [87]:
df_books.drop(0, axis=0).head(3) #ELIMINAR EL RENGLÓN CON EL INDICE 0

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017


In [88]:
df_books.drop([0, 1, 2, 3], axis=0).head(3) #ELIMINAR EL RENGLÓN CON EL INDICE 0, 1, 2, 3 QUE ESTÁN EN LA LISTA


Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019
5,A Dance with Dragons (A Song of Ice and Fire),George R. R. Martin,4.4,12643,11,2011
6,A Game of Thrones / A Clash of Kings / A Storm...,George R. R. Martin,4.7,19735,30,2014


In [89]:
df_books.drop(range(0, 10), axis=0).head(10) #ELIMINAR EL RENGLÓN CON EL INDICE 0 al 10 con un rango

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year
10,A Man Called Ove: A Novel,Fredrik Backman,4.6,23848,8,2017
11,A Patriot's History of the United States: From...,Larry Schweikart,4.6,460,2,2010
12,A Stolen Life: A Memoir,Jaycee Dugard,4.6,4149,32,2011
13,A Wrinkle in Time (Time Quintet),Madeleine L'Engle,4.5,5153,5,2018
14,"Act Like a Lady, Think Like a Man: What Men Re...",Steve Harvey,4.6,5013,17,2009
15,Adult Coloring Book Designs: Stress Relief Col...,Adult Coloring Book Designs,4.5,2313,4,2016
16,Adult Coloring Book: Stress Relieving Animal D...,Blue Star Coloring,4.6,2925,6,2015
17,Adult Coloring Book: Stress Relieving Patterns,Blue Star Coloring,4.4,2951,6,2015
18,Adult Coloring Books: A Coloring Book for Adul...,Coloring Books for Adults,4.5,2426,8,2015
19,Alexander Hamilton,Ron Chernow,4.8,9198,13,2016


### AGREGAR COLUMNAS

In [90]:
# AGREGAR UNA NUEVA COLUMNA CON UN VALOR POR DEFECTO NO NUMÉRICO CON NUMPY
df_books["Nueva Columna"] = np.nan
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Nueva Columna
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,


In [91]:
df_books.head(3)

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Nueva Columna
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,


In [92]:
# SABER CUANTAS FILAS HAY EN AL DATAFRAME
df_books.shape[0]

550

In [93]:
data = np.arange(0, df_books.shape[0])
data

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [94]:
df_books["Nueva Columna"] = data
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Nueva Columna
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,0
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,1
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,2
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,3
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,4
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,545
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,546
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,547
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,548


### AGREGAR FILAS
Se puede usar el método append para agregar filas a un DataFrame. Pero esta deprecado [Removal of prior version deprecations/changes](https://pandas.pydata.org/docs/whatsnew/v2.0.0.html#removal-of-prior-version-deprecations-changes), por lo que se recomienda usar concat [pandas.concat](https://pandas.pydata.org/docs/reference/api/pandas.concat.html#pandas.concat).

In [95]:
# GREGAR LAS MIMAS FILAS AL DATAFRAME
pd.concat([df_books, df_books])

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Nueva Columna
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,0
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,1
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,2
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,3
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,4
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,545
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,546
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,547
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,548


### MANEJO DE DATOS NULOS

`np.nan` es tomado como `Float`  
`pd.NA` es tomado como `Entero`  
`None` es tomado como `Texto`  
y a la vez todos representan "Valores Nulos"

In [96]:
dict = {
    "Col1": [1, 2, 3, np.nan],
    "Col2": [np.nan, 5, 6, np.nan],
    "Col3": ["a", "b", "c", None]
}
df = pd.DataFrame(dict)
df

Unnamed: 0,Col1,Col2,Col3
0,1.0,,a
1,2.0,5.0,b
2,3.0,6.0,c
3,,,


In [97]:
df.isnull()

Unnamed: 0,Col1,Col2,Col3
0,False,True,False
1,False,False,False
2,False,False,False
3,True,True,True


In [98]:
df.isnull() * 1

Unnamed: 0,Col1,Col2,Col3
0,0,1,0
1,0,0,0
2,0,0,0
3,1,1,1


In [99]:
df.fillna("Missing")

Unnamed: 0,Col1,Col2,Col3
0,1.0,Missing,a
1,2.0,5.0,b
2,3.0,6.0,c
3,Missing,Missing,Missing


In [100]:
# SI ENCUENTRA UN VALOR NULL EN EL DATAFRAME REMPLAZARLO POR LA MEDIA 
df.fillna(df.mean())

TypeError: can only concatenate str (not "int") to str

In [None]:
df

Unnamed: 0,Col1,Col2,Col3
0,1.0,,a
1,2.0,5.0,b
2,3.0,6.0,c
3,,,


El método para reemplazar nulos de **interpolación** visto en clase es muy interesante. Tiene variados argumentos que la hacen muy poderosa.

Algunos son:  
`METHOD`: método usado par interpolar. Por default es el linear. Pero existen otros:

- `Time`: para interpolar entre intervalos de tiempo.
- `Index`: reemplaza el nulo por el valor del index.
- `Pad`: reemplaza el nulo por algún valor existente en el dataframe.

Se puede especificar que la interpolación sea cuadrática, cúbica , polinómica, entre otros.
Se puede especificar que se reemplace por el cero, o el valor contiguo más cercano, entre otros.  
`AXIS`: tiene 3 opciones:

En dirección de los index (axis = 0).  
En dirección de las columnas (axis = 1).  

`None`: ninguna. Esta es la opción default.
- `LIMIT` (opcional) : el número máximo de NULOS consecutivos que se pueden reemplazar. Tiene que ser mayor a cero.
- `LIMIT_DIRECTION` (opcional): idem anterior pero en alguna dirección. Puede ser:

Forward: hacia adelante.  
Backward: hacia atrás.  
Both: ambos.  

[pandas.DataFrame.interpolate](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html)

In [None]:
# REMPLASAR LOS VALORES NULOS CON interpolación
df.interpolate()

Unnamed: 0,Col1,Col2,Col3
0,1.0,,a
1,2.0,5.0,b
2,3.0,6.0,c
3,3.0,6.0,


In [None]:
df.interpolate(method='linear', limit_direction='both', axis=0)

Unnamed: 0,Col1,Col2,Col3
0,1.0,5.0,a
1,2.0,5.0,b
2,3.0,6.0,c
3,3.0,6.0,


In [None]:
# ELIMINAR LOS VALORES NULOS
df.dropna()

Unnamed: 0,Col1,Col2,Col3
1,2.0,5.0,b
2,3.0,6.0,c


### FILTRADO POR CONDICIONES

In [None]:
df_books = pd.read_csv("sample_data/bestsellers with categories.csv", sep=",", header=0, names=["Titles", "Author", "User Rating", "Reviews", "Price", "Year", "Genre"])
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [None]:
mayor_a_2016 = df_books["Year"] > 2016
mayor_a_2016

0      False
1      False
2       True
3       True
4       True
       ...  
545     True
546    False
547     True
548     True
549     True
Name: Year, Length: 550, dtype: bool

In [None]:
df_books[mayor_a_2016]

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
7,A Gentleman in Moscow: A Novel,Amor Towles,4.7,19699,15,2017,Fiction
8,"A Higher Loyalty: Truth, Lies, and Leadership",James Comey,4.7,5983,3,2018,Non Fiction
...,...,...,...,...,...,...,...
544,Wonder,R. J. Palacio,4.8,21625,9,2017,Fiction
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [None]:
df_books[df_books["Year"] > 2016]

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
7,A Gentleman in Moscow: A Novel,Amor Towles,4.7,19699,15,2017,Fiction
8,"A Higher Loyalty: Truth, Lies, and Leadership",James Comey,4.7,5983,3,2018,Non Fiction
...,...,...,...,...,...,...,...
544,Wonder,R. J. Palacio,4.8,21625,9,2017,Fiction
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [None]:
df_books_fiction = df_books["Genre"] == "Fiction"
df_books_fiction

0      False
1       True
2      False
3       True
4      False
       ...  
545     True
546    False
547    False
548    False
549    False
Name: Genre, Length: 550, dtype: bool

In [None]:
df_books[df_books_fiction & mayor_a_2016]

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
7,A Gentleman in Moscow: A Novel,Amor Towles,4.7,19699,15,2017,Fiction
10,A Man Called Ove: A Novel,Fredrik Backman,4.6,23848,8,2017,Fiction
13,A Wrinkle in Time (Time Quintet),Madeleine L'Engle,4.5,5153,5,2018,Fiction
40,"Brown Bear, Brown Bear, What Do You See?",Bill Martin Jr.,4.9,14344,5,2017,Fiction
...,...,...,...,...,...,...,...
509,To Kill a Mockingbird,Harper Lee,4.8,26234,7,2019,Fiction
529,What Should Danny Do? (The Power to Choose Ser...,Adir Levy,4.8,8170,13,2019,Fiction
534,Where the Crawdads Sing,Delia Owens,4.8,87841,15,2019,Fiction
544,Wonder,R. J. Palacio,4.8,21625,9,2017,Fiction


In [None]:
# FILTRAR POR MENORES O IGUALES A 2016
df_books[~mayor_a_2016]

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
5,A Dance with Dragons (A Song of Ice and Fire),George R. R. Martin,4.4,12643,11,2011,Fiction
6,A Game of Thrones / A Clash of Kings / A Storm...,George R. R. Martin,4.7,19735,30,2014,Fiction
9,A Man Called Ove: A Novel,Fredrik Backman,4.6,23848,8,2016,Fiction
...,...,...,...,...,...,...,...
540,Wonder,R. J. Palacio,4.8,21625,9,2013,Fiction
541,Wonder,R. J. Palacio,4.8,21625,9,2014,Fiction
542,Wonder,R. J. Palacio,4.8,21625,9,2015,Fiction
543,Wonder,R. J. Palacio,4.8,21625,9,2016,Fiction


### FUNCIONES PRINCIPALES

In [None]:
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [None]:
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 550 entries, 0 to 549
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Titles       550 non-null    object 
 1   Author       550 non-null    object 
 2   User Rating  550 non-null    float64
 3   Reviews      550 non-null    int64  
 4   Price        550 non-null    int64  
 5   Year         550 non-null    int64  
 6   Genre        550 non-null    object 
dtypes: float64(1), int64(3), object(3)
memory usage: 30.2+ KB


In [None]:
df_books.describe()

Unnamed: 0,User Rating,Reviews,Price,Year
count,550.0,550.0,550.0,550.0
mean,4.618364,11953.281818,13.1,2014.0
std,0.22698,11731.132017,10.842262,3.165156
min,3.3,37.0,0.0,2009.0
25%,4.5,4058.0,7.0,2011.0
50%,4.7,8580.0,11.0,2014.0
75%,4.8,17253.25,16.0,2017.0
max,4.9,87841.0,105.0,2019.0


In [None]:
df_books.tail(3)

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction
549,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2019,Non Fiction


In [None]:
df_books.head(3)

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction


In [None]:
# CUANTA MEMORIA OCUPA EL DATAFRAME
df_books.memory_usage(deep=True)

Index            128
Titles         59737
Author         39078
User Rating     4400
Reviews         4400
Price           4400
Year            4400
Genre          36440
dtype: int64

In [None]:
df_books["Author"].value_counts()

Author
Jeff Kinney                           12
Rick Riordan                          11
Suzanne Collins                       11
Gary Chapman                          11
American Psychological Association    10
                                      ..
Geneen Roth                            1
Alan Moore                             1
Sara Gruen                             1
Hillary Rodham Clinton                 1
Randall Munroe                         1
Name: count, Length: 248, dtype: int64

In [None]:
#ELIMINAR DIPLICADOS EN EL DATAFRAME
df_books.drop_duplicates()

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [None]:
df_books.sort_values(by="User Rating", ascending=False)

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
486,The Wonderful Things You Will Be,Emily Winfield Martin,4.9,8842,10,2016,Fiction
420,The Legend of Zelda: Hyrule Historia,Patrick Thorpe,4.9,5396,20,2013,Fiction
84,Dog Man: Brawl of the Wild: From the Creator o...,Dav Pilkey,4.9,7235,4,2019,Fiction
83,Dog Man: Brawl of the Wild: From the Creator o...,Dav Pilkey,4.9,7235,4,2018,Fiction
82,Dog Man: A Tale of Two Kitties: From the Creat...,Dav Pilkey,4.9,4786,8,2017,Fiction
...,...,...,...,...,...,...,...
392,The Goldfinch: A Novel (Pulitzer Prize for Fic...,Donna Tartt,3.9,33844,20,2013,Fiction
107,Fifty Shades of Grey: Book One of the Fifty Sh...,E L James,3.8,47265,14,2013,Fiction
106,Fifty Shades of Grey: Book One of the Fifty Sh...,E L James,3.8,47265,14,2012,Fiction
132,Go Set a Watchman: A Novel,Harper Lee,3.6,14982,19,2015,Fiction


In [None]:
df_books[df_books["Author"].str.contains("Smith")]

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
184,Instant Pot Pressure Cooker Cookbook: 500 Ever...,Jennifer Smith,4.4,7396,13,2019,Non Fiction
185,Instant Pot Pressure Cooker Cookbook: 500 Ever...,Jennifer Smith,4.4,7396,13,2018,Non Fiction
298,Shred: The Revolutionary Diet: 6 Weeks 4 Inche...,Ian K. Smith M.D.,4.1,2272,6,2013,Non Fiction
490,The Wonky Donkey,Craig Smith,4.8,30183,4,2018,Fiction
491,The Wonky Donkey,Craig Smith,4.8,30183,4,2019,Fiction


In [None]:
df_books.sample(3)

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
445,The Paris Wife: A Novel,Paula McLain,4.3,3759,16,2011,Fiction
310,StrengthsFinder 2.0,Gallup,4.0,5069,17,2015,Non Fiction
278,Publication Manual of the American Psychologic...,American Psychological Association,4.5,8580,46,2016,Non Fiction


### GROUPBY

Permite agrupar datos en función de los demás. Es decir, hacer el análisis del DataFrame en función de una de las columnas.

[Comparison with SQL - LINK](https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_sql.html)

In [None]:
df_books = pd.read_csv("sample_data/bestsellers with categories.csv", sep=",", header=0, names=["Name", "Author", "User Rating", "Reviews", "Price", "Year", "Genre"])
df_books

Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [None]:
# CONTAR EL NÚMERO DE VECES QUE SE REPITE UN VALOR EN UNA COLUMNA, POR EJEMPLO EL AUTOR
df_books.groupby("Author").count()

Unnamed: 0_level_0,Name,User Rating,Reviews,Price,Year,Genre
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abraham Verghese,2,2,2,2,2,2
Adam Gasiewski,1,1,1,1,1,1
Adam Mansbach,1,1,1,1,1,1
Adir Levy,1,1,1,1,1,1
Admiral William H. McRaven,1,1,1,1,1,1
...,...,...,...,...,...,...
Walter Isaacson,3,3,3,3,3,3
William Davis,2,2,2,2,2,2
William P. Young,2,2,2,2,2,2
Wizards RPG Team,3,3,3,3,3,3


In [None]:
# Se utiliza el método `agg()` para agregar los datos de cada autor. El diccionario pasado a
# `agg()` especifica qué columnas agregar y cómo agregarlas. En este caso, se promedian la 
# "Calificación del usuario" y las "Reseñas", mientras que el "Precio" se agrega para mostrar los valores mínimo y máximo.
# "Author" es el indice
df_books.groupby("Author").agg({"User Rating": "mean", "Reviews": "mean", "Price": ["min", "max"]})


Unnamed: 0_level_0,User Rating,Reviews,Price,Price
Unnamed: 0_level_1,mean,mean,min,max
Author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Abraham Verghese,4.600000,4866.000000,11,11
Adam Gasiewski,4.400000,3113.000000,6,6
Adam Mansbach,4.800000,9568.000000,9,9
Adir Levy,4.800000,8170.000000,13,13
Admiral William H. McRaven,4.700000,10199.000000,11,11
...,...,...,...,...
Walter Isaacson,4.566667,6222.666667,20,21
William Davis,4.400000,7497.000000,6,6
William P. Young,4.600000,19720.000000,8,8
Wizards RPG Team,4.800000,16990.000000,27,27


In [None]:
# "Author" es el indice
df_books.groupby("Author").sum()

Unnamed: 0_level_0,Name,User Rating,Reviews,Price,Year,Genre
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abraham Verghese,Cutting for StoneCutting for Stone,9.2,9732,22,4021,FictionFiction
Adam Gasiewski,Milk and Vine: Inspirational Quotes From Class...,4.4,3113,6,2017,Non Fiction
Adam Mansbach,Go the F**k to Sleep,4.8,9568,9,2011,Fiction
Adir Levy,What Should Danny Do? (The Power to Choose Ser...,4.8,8170,13,2019,Fiction
Admiral William H. McRaven,Make Your Bed: Little Things That Can Change Y...,4.7,10199,11,2017,Non Fiction
...,...,...,...,...,...,...
Walter Isaacson,Leonardo da VinciSteve JobsSteve Jobs,13.7,18668,61,6040,Non FictionNon FictionNon Fiction
William Davis,"Wheat Belly: Lose the Wheat, Lose the Weight, ...",8.8,14994,12,4025,Non FictionNon Fiction
William P. Young,The Shack: Where Tragedy Confronts EternityThe...,9.2,39440,16,4026,FictionFiction
Wizards RPG Team,Player's Handbook (Dungeons & Dragons)Player's...,14.4,50970,81,6054,FictionFictionFiction


In [None]:
# "Author" es el indice
df_books.groupby("Author").min()

Unnamed: 0_level_0,Name,User Rating,Reviews,Price,Year,Genre
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abraham Verghese,Cutting for Stone,4.6,4866,11,2010,Fiction
Adam Gasiewski,Milk and Vine: Inspirational Quotes From Class...,4.4,3113,6,2017,Non Fiction
Adam Mansbach,Go the F**k to Sleep,4.8,9568,9,2011,Fiction
Adir Levy,What Should Danny Do? (The Power to Choose Ser...,4.8,8170,13,2019,Fiction
Admiral William H. McRaven,Make Your Bed: Little Things That Can Change Y...,4.7,10199,11,2017,Non Fiction
...,...,...,...,...,...,...
Walter Isaacson,Leonardo da Vinci,4.5,3014,20,2011,Non Fiction
William Davis,"Wheat Belly: Lose the Wheat, Lose the Weight, ...",4.4,7497,6,2012,Non Fiction
William P. Young,The Shack: Where Tragedy Confronts Eternity,4.6,19720,8,2009,Fiction
Wizards RPG Team,Player's Handbook (Dungeons & Dragons),4.8,16990,27,2017,Fiction


In [None]:
# "Author" es el indice
df_books.groupby("Author").max()

Unnamed: 0_level_0,Name,User Rating,Reviews,Price,Year,Genre
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abraham Verghese,Cutting for Stone,4.6,4866,11,2011,Fiction
Adam Gasiewski,Milk and Vine: Inspirational Quotes From Class...,4.4,3113,6,2017,Non Fiction
Adam Mansbach,Go the F**k to Sleep,4.8,9568,9,2011,Fiction
Adir Levy,What Should Danny Do? (The Power to Choose Ser...,4.8,8170,13,2019,Fiction
Admiral William H. McRaven,Make Your Bed: Little Things That Can Change Y...,4.7,10199,11,2017,Non Fiction
...,...,...,...,...,...,...
Walter Isaacson,Steve Jobs,4.6,7827,21,2017,Non Fiction
William Davis,"Wheat Belly: Lose the Wheat, Lose the Weight, ...",4.4,7497,6,2013,Non Fiction
William P. Young,The Shack: Where Tragedy Confronts Eternity,4.6,19720,8,2017,Fiction
Wizards RPG Team,Player's Handbook (Dungeons & Dragons),4.8,16990,27,2019,Fiction


In [None]:
# AGRUPAR POR AUTOR Y HACER LA SUMATORIA POR EL NOMBRE(label) DE AUTOR "William Davis"
# "Author" ES EL INDICE
df_books.groupby("Author").sum().loc["William Davis"]

Name           Wheat Belly: Lose the Wheat, Lose the Weight, ...
User Rating                                                  8.8
Reviews                                                    14994
Price                                                         12
Year                                                        4025
Genre                                     Non FictionNon Fiction
Name: William Davis, dtype: object

In [None]:
# AGRUPAR POR AUTOR Y HACER LA SUMATORIA CON CON VALORES PLANOS
# AUTOR ES UNA COLUMNA Y EL INDICE ES EL INDICE POR DEFECTO
df_books.groupby("Author").sum().reset_index()


Unnamed: 0,Author,Name,User Rating,Reviews,Price,Year,Genre
0,Abraham Verghese,Cutting for StoneCutting for Stone,9.2,9732,22,4021,FictionFiction
1,Adam Gasiewski,Milk and Vine: Inspirational Quotes From Class...,4.4,3113,6,2017,Non Fiction
2,Adam Mansbach,Go the F**k to Sleep,4.8,9568,9,2011,Fiction
3,Adir Levy,What Should Danny Do? (The Power to Choose Ser...,4.8,8170,13,2019,Fiction
4,Admiral William H. McRaven,Make Your Bed: Little Things That Can Change Y...,4.7,10199,11,2017,Non Fiction
...,...,...,...,...,...,...,...
243,Walter Isaacson,Leonardo da VinciSteve JobsSteve Jobs,13.7,18668,61,6040,Non FictionNon FictionNon Fiction
244,William Davis,"Wheat Belly: Lose the Wheat, Lose the Weight, ...",8.8,14994,12,4025,Non FictionNon Fiction
245,William P. Young,The Shack: Where Tragedy Confronts EternityThe...,9.2,39440,16,4026,FictionFiction
246,Wizards RPG Team,Player's Handbook (Dungeons & Dragons)Player's...,14.4,50970,81,6054,FictionFictionFiction


In [None]:
# AGRUPAR POR AUTOR Y USAR FUNCIONES DE AGREGACIÓN PARA OBTENER EL VALOR MÁXIMO Y MÍNIMO
# "Author" ES EL INDICE
df_books.groupby("Author").agg(["min", "max"])

Unnamed: 0_level_0,Name,Name,User Rating,User Rating,Reviews,Reviews,Price,Price,Year,Year,Genre,Genre
Unnamed: 0_level_1,min,max,min,max,min,max,min,max,min,max,min,max
Author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Abraham Verghese,Cutting for Stone,Cutting for Stone,4.6,4.6,4866,4866,11,11,2010,2011,Fiction,Fiction
Adam Gasiewski,Milk and Vine: Inspirational Quotes From Class...,Milk and Vine: Inspirational Quotes From Class...,4.4,4.4,3113,3113,6,6,2017,2017,Non Fiction,Non Fiction
Adam Mansbach,Go the F**k to Sleep,Go the F**k to Sleep,4.8,4.8,9568,9568,9,9,2011,2011,Fiction,Fiction
Adir Levy,What Should Danny Do? (The Power to Choose Ser...,What Should Danny Do? (The Power to Choose Ser...,4.8,4.8,8170,8170,13,13,2019,2019,Fiction,Fiction
Admiral William H. McRaven,Make Your Bed: Little Things That Can Change Y...,Make Your Bed: Little Things That Can Change Y...,4.7,4.7,10199,10199,11,11,2017,2017,Non Fiction,Non Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...
Walter Isaacson,Leonardo da Vinci,Steve Jobs,4.5,4.6,3014,7827,20,21,2011,2017,Non Fiction,Non Fiction
William Davis,"Wheat Belly: Lose the Wheat, Lose the Weight, ...","Wheat Belly: Lose the Wheat, Lose the Weight, ...",4.4,4.4,7497,7497,6,6,2012,2013,Non Fiction,Non Fiction
William P. Young,The Shack: Where Tragedy Confronts Eternity,The Shack: Where Tragedy Confronts Eternity,4.6,4.6,19720,19720,8,8,2009,2017,Fiction,Fiction
Wizards RPG Team,Player's Handbook (Dungeons & Dragons),Player's Handbook (Dungeons & Dragons),4.8,4.8,16990,16990,27,27,2017,2019,Fiction,Fiction


In [None]:
# Agrupar por Author, obtener el minimo y maximo de la columna "Reviews" y sumar los valores de la columna "User Rating"
df_books.groupby("Author").agg({"Reviews": ["min", "max"], "User Rating": "sum"})

Unnamed: 0_level_0,Reviews,Reviews,User Rating
Unnamed: 0_level_1,min,max,sum
Author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Abraham Verghese,4866,4866,9.2
Adam Gasiewski,3113,3113,4.4
Adam Mansbach,9568,9568,4.8
Adir Levy,8170,8170,4.8
Admiral William H. McRaven,10199,10199,4.7
...,...,...,...
Walter Isaacson,3014,7827,13.7
William Davis,7497,7497,8.8
William P. Young,19720,19720,9.2
Wizards RPG Team,16990,16990,14.4


In [None]:
df_books.groupby("Author").agg({"Reviews": ["min", "max"], "User Rating": "sum"}).sort_values(by=("Author"), ascending=False)

Unnamed: 0_level_0,Reviews,Reviews,User Rating
Unnamed: 0_level_1,min,max,sum
Author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Zhi Gang Sha,37,220,9.2
Wizards RPG Team,16990,16990,14.4
William P. Young,19720,19720,9.2
William Davis,7497,7497,8.8
Walter Isaacson,3014,7827,13.7
...,...,...,...
Admiral William H. McRaven,10199,10199,4.7
Adir Levy,8170,8170,4.8
Adam Mansbach,9568,9568,4.8
Adam Gasiewski,3113,3113,4.4


In [None]:
df_books.groupby("Author").agg({"Reviews": ["min", "max"], "User Rating": "sum"}).sort_values([("User Rating", "sum")], ascending=False)

Unnamed: 0_level_0,Reviews,Reviews,User Rating
Unnamed: 0_level_1,min,max,sum
Author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Jeff Kinney,3181,9413,57.6
Rick Riordan,548,6982,52.5
Gary Chapman,803,25554,52.1
Suzanne Collins,16949,32122,51.3
American Psychological Association,8580,8580,45.0
...,...,...,...
Elizabeth Strout,4519,4519,4.2
Pierre Dukan,2023,2023,4.1
Ian K. Smith M.D.,2272,2272,4.1
Chris Cleave,1467,1467,4.1


In [None]:
df_books.groupby("Author").agg({"Reviews": ["min", "max"], "User Rating": "sum"}).sample(3)

Unnamed: 0_level_0,Reviews,Reviews,User Rating
Unnamed: 0_level_1,min,max,sum
Author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Chris Kyle,15921,15921,4.6
Don Miguel Ruiz,23308,23308,28.2
Robert Munsch,18613,18613,9.6


In [None]:
df_books.groupby("Author").agg(["min", "max"]).sample()

Unnamed: 0_level_0,Name,Name,User Rating,User Rating,Reviews,Reviews,Price,Price,Year,Year,Genre,Genre
Unnamed: 0_level_1,min,max,min,max,min,max,min,max,min,max,min,max
Author,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Bessel van der Kolk M.D.,"The Body Keeps the Score: Brain, Mind, and Bod...","The Body Keeps the Score: Brain, Mind, and Bod...",4.8,4.8,12361,12361,12,12,2019,2019,Non Fiction,Non Fiction


In [None]:
df_books.groupby(["Author", "Year"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,User Rating,Reviews,Price,Genre
Author,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abraham Verghese,2010,1,1,1,1,1
Abraham Verghese,2011,1,1,1,1,1
Adam Gasiewski,2017,1,1,1,1,1
Adam Mansbach,2011,1,1,1,1,1
Adir Levy,2019,1,1,1,1,1
...,...,...,...,...,...,...
Wizards RPG Team,2017,1,1,1,1,1
Wizards RPG Team,2018,1,1,1,1,1
Wizards RPG Team,2019,1,1,1,1,1
Zhi Gang Sha,2009,1,1,1,1,1


### COMBINANDO DATAFRAMES
![combinando-dataframes](imgs/merge-data-frames.webp)

In [101]:
df1 = pd.DataFrame({'A':['A0', 'A1', 'A2','A3'],
	'B':['B0', 'B1', 'B2','B3'],
	'C':['C0', 'C1', 'C2','C3'],
	'D':['D0', 'D1', 'D2','D3']})
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [102]:
df2 = pd.DataFrame({'A':['A4', 'A5', 'A6','A7'],
	'B':['B4', 'B5', 'B6','B7'],
	'C':['C4', 'C5', 'C6','C7'],
	'D':['D4', 'D5', 'D6','D7']})
df2

Unnamed: 0,A,B,C,D
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7


### CONCATENAR DATAFRAMES (UNIR)

In [104]:
# CONCATENAR DATAFRAMES POR FILAS AXIS=0
# el indice se reinicia
pd.concat([df1, df2])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
0,A4,B4,C4,D4
1,A5,B5,C5,D5
2,A6,B6,C6,D6
3,A7,B7,C7,D7


In [106]:
# CONCATENAR DATAFRAMES POR FILAS AXIS=0 REINICIANDO EL INDICE
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [107]:
# CONCATENAR DATAFRAMES POR COLUMNA AXIS=1
pd.concat([df1, df2], axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,A4,B4,C4,D4
1,A1,B1,C1,D1,A5,B5,C5,D5
2,A2,B2,C2,D2,A6,B6,C6,D6
3,A3,B3,C3,D3,A7,B7,C7,D7


### MERGE

In [108]:
izq = pd.DataFrame({'key' : ['k0', 'k1', 'k2','k3'],
 'A' : ['A0', 'A1', 'A2','A3'],
'B': ['B0', 'B1', 'B2','B3']})
izq

Unnamed: 0,key,A,B
0,k0,A0,B0
1,k1,A1,B1
2,k2,A2,B2
3,k3,A3,B3


In [109]:
der = pd.DataFrame({'key' : ['k0', 'k1', 'k2','k3'],
 'C' : ['C0', 'C1', 'C2','C3'],
'D': ['D0', 'D1', 'D2','D3']})
der

Unnamed: 0,key,C,D
0,k0,C0,D0
1,k1,C1,D1
2,k2,C2,D2
3,k3,C3,D3


In [111]:
izq.merge(der)

Unnamed: 0,key,A,B,C,D
0,k0,A0,B0,C0,D0
1,k1,A1,B1,C1,D1
2,k2,A2,B2,C2,D2
3,k3,A3,B3,C3,D3


In [112]:
# HACER EL MERGE POR LA COLUMNA "key"
izq.merge(der, on="key")

Unnamed: 0,key,A,B,C,D
0,k0,A0,B0,C0,D0
1,k1,A1,B1,C1,D1
2,k2,A2,B2,C2,D2
3,k3,A3,B3,C3,D3


In [113]:
izq = pd.DataFrame({'key_1' : ['k0', 'k1', 'k2','k3'],
 'A' : ['A0', 'A1', 'A2','A3'],
'B': ['B0', 'B1', 'B2','B3']})

der = pd.DataFrame({'key_2' : ['k0', 'k1', 'k2','k3'],
 'C' : ['C0', 'C1', 'C2','C3'],
'D': ['D0', 'D1', 'D2','D3']})

In [115]:
izq.merge(der, left_on="key_1", right_on="key_2")

Unnamed: 0,key_1,A,B,key_2,C,D
0,k0,A0,B0,k0,C0,D0
1,k1,A1,B1,k1,C1,D1
2,k2,A2,B2,k2,C2,D2
3,k3,A3,B3,k3,C3,D3


In [116]:
izq = pd.DataFrame({'key_1' : ['k0', 'k1', 'k2','k3'],
 'A' : ['A0', 'A1', 'A2','A3'],
'B': ['B0', 'B1', 'B2','B3']})

der = pd.DataFrame({'key_2' : ['k0', 'k1', 'k2', np.nan],
 'C' : ['C0', 'C1', 'C2','C3'],
'D': ['D0', 'D1', 'D2','D3']})

In [117]:
der

Unnamed: 0,key_2,C,D
0,k0,C0,D0
1,k1,C1,D1
2,k2,C2,D2
3,,C3,D3


In [118]:
# HACE EL MERGE TIPO INNER JOIN(POR DEFECTO)
izq.merge(der, left_on="key_1", right_on="key_2")

Unnamed: 0,key_1,A,B,key_2,C,D
0,k0,A0,B0,k0,C0,D0
1,k1,A1,B1,k1,C1,D1
2,k2,A2,B2,k2,C2,D2


In [124]:
# HACE EL MERGE TIPO INNER JOIN(POR DEFECTO) INDICANDO QUE LEFT TIENE PRIORIDAD
izq.merge(der, left_on="key_1", right_on="key_2", how="left")

Unnamed: 0,key_1,A,B,key_2,C,D
0,k0,A0,B0,k0,C0,D0
1,k1,A1,B1,k1,C1,D1
2,k2,A2,B2,k2,C2,D2
3,k3,A3,B3,,,


### JOIN
![Python Pandas Join](imgs/Python-Pandas-Join.jpg)

In [3]:
izq = pd.DataFrame({'A': ['A0','A1','A2'],
  'B':['B0','B1','B2']},
  index=['k0','k1','k2'])

der =pd.DataFrame({'C': ['C0','C1','C2'],
  'D':['D0','D1','D2']},
  index=['k0','k2','k3'])

In [4]:
izq

Unnamed: 0,A,B
k0,A0,B0
k1,A1,B1
k2,A2,B2


In [5]:
der

Unnamed: 0,C,D
k0,C0,D0
k2,C1,D1
k3,C2,D2


In [6]:
izq.join(der)

Unnamed: 0,A,B,C,D
k0,A0,B0,C0,D0
k1,A1,B1,,
k2,A2,B2,C1,D1


In [7]:
izq.join(der, how="inner")

Unnamed: 0,A,B,C,D
k0,A0,B0,C0,D0
k2,A2,B2,C1,D1


In [8]:
izq.join(der, how="left")

Unnamed: 0,A,B,C,D
k0,A0,B0,C0,D0
k1,A1,B1,,
k2,A2,B2,C1,D1


In [9]:
izq.join(der, how="right")

Unnamed: 0,A,B,C,D
k0,A0,B0,C0,D0
k2,A2,B2,C1,D1
k3,,,C2,D2


In [10]:
izq.join(der, how="outer")

Unnamed: 0,A,B,C,D
k0,A0,B0,C0,D0
k1,A1,B1,,
k2,A2,B2,C1,D1
k3,,,C2,D2


### PIVOT Y MELT

#### pivot_table
Esta función puede traer recuerdos a las personas interesadas en el mundo del SQL, ya que Oracle, PostgreSQL y otros motores de bases de datos la tienen implementada desde hace muchos años. Pivot, básicamente, transforma los valores de determinadas columnas o filas en los índices de un nuevo DataFrame, y la intersección de estos es el valor resultante.



In [11]:
df_books = pd.read_csv("sample_data/bestsellers with categories.csv", sep=",", header=0, names=["Titles", "Author", "User Rating", "Reviews", "Price", "Year", "Genre"])
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [17]:
df_books.head(5)

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction


In [21]:
df_books.pivot_table(index="Author", columns="Genre", values="User Rating")


Genre,Fiction,Non Fiction
Author,Unnamed: 1_level_1,Unnamed: 2_level_1
Abraham Verghese,4.6,
Adam Gasiewski,,4.400000
Adam Mansbach,4.8,
Adir Levy,4.8,
Admiral William H. McRaven,,4.700000
...,...,...
Walter Isaacson,,4.566667
William Davis,,4.400000
William P. Young,4.6,
Wizards RPG Team,4.8,


Como resultado, los valores de `Author` pasan a formar el índice por fila y los valores de `Genre`  
pasan a formar parte de los índices por columna, y el User Rating se mantiene como valor.

Por supuesto, para este caso, un `Author` suele tener un solo género literario,  
así que no es una transformación muy útil, pero veamos si podemos lograr algo mejor.

In [20]:
df_books.pivot_table(index="Author", columns="Genre", values="User Rating", aggfunc="count")

Genre,Fiction,Non Fiction
Author,Unnamed: 1_level_1,Unnamed: 2_level_1
Abraham Verghese,2.0,
Adam Gasiewski,,1.0
Adam Mansbach,1.0,
Adir Levy,1.0,
Admiral William H. McRaven,,1.0
...,...,...
Walter Isaacson,,3.0
William Davis,,2.0
William P. Young,2.0,
Wizards RPG Team,3.0,


In [23]:
df_books.pivot_table(index='Genre', columns='Year', values='User Rating', aggfunc='sum')

Year,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Fiction,110.2,92.3,97.0,94.4,109.1,134.3,79.1,89.6,113.7,99.5,96.4
Non Fiction,119.0,135.6,130.9,132.2,118.6,96.8,153.3,144.3,119.3,133.9,140.6


En este caso tenemos por cada género, la suma a lo largo de los años. Esto es mucho más interesante,  
La mejor noticia es que no solo podemos obtener la suma, también podemos obtener la media,  
la desviación estándar, el conteo, la varianza, etc.

Únicamente con cambiar el parámetro `aggfunc` que traduce función de agrupamiento.

### MELT
El método melt toma las columnas del DataFrame y las pasa a filas,  
con dos nuevas columnas para especificar la antigua columna y el valor que traía.

In [26]:
df_books[["Titles", "Genre"]].head(5)

Unnamed: 0,Titles,Genre
0,10-Day Green Smoothie Cleanse,Non Fiction
1,11/22/63: A Novel,Fiction
2,12 Rules for Life: An Antidote to Chaos,Non Fiction
3,1984 (Signet Classics),Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",Non Fiction


In [27]:
df_books[["Titles", "Genre"]].head(5).melt()

Unnamed: 0,variable,value
0,Titles,10-Day Green Smoothie Cleanse
1,Titles,11/22/63: A Novel
2,Titles,12 Rules for Life: An Antidote to Chaos
3,Titles,1984 (Signet Classics)
4,Titles,"5,000 Awesome Facts (About Everything!) (Natio..."
5,Genre,Non Fiction
6,Genre,Fiction
7,Genre,Non Fiction
8,Genre,Fiction
9,Genre,Non Fiction


Cada resultado de las dos columnas pasa a una fila de este modo a tipo `llave:valor`

In [28]:
df_books.melt(id_vars="Year", value_vars="Genre")

Unnamed: 0,Year,variable,value
0,2016,Genre,Non Fiction
1,2011,Genre,Fiction
2,2018,Genre,Non Fiction
3,2017,Genre,Fiction
4,2019,Genre,Non Fiction
...,...,...,...
545,2019,Genre,Fiction
546,2016,Genre,Non Fiction
547,2017,Genre,Non Fiction
548,2018,Genre,Non Fiction


Simplemente, podemos seleccionar las columnas que no quiero hacer `melt` usando el parámetro `id_vars`

Para este caso `Year` y también la única columna que quiero aplicar el `melt`, para este caso `Genre` con la propiedad `value_vars`

In [29]:
df_books.melt(id_vars="Year", value_vars=["Genre", "Author"])

Unnamed: 0,Year,variable,value
0,2016,Genre,Non Fiction
1,2011,Genre,Fiction
2,2018,Genre,Non Fiction
3,2017,Genre,Fiction
4,2019,Genre,Non Fiction
...,...,...,...
1095,2019,Author,Jeff Kinney
1096,2016,Author,Jen Sincero
1097,2017,Author,Jen Sincero
1098,2018,Author,Jen Sincero


In [30]:
df_books.melt(id_vars="Year", value_vars=["Genre", "Author"], var_name="Columna")

Unnamed: 0,Year,Columna,value
0,2016,Genre,Non Fiction
1,2011,Genre,Fiction
2,2018,Genre,Non Fiction
3,2017,Genre,Fiction
4,2019,Genre,Non Fiction
...,...,...,...
1095,2019,Author,Jeff Kinney
1096,2016,Author,Jen Sincero
1097,2017,Author,Jen Sincero
1098,2018,Author,Jen Sincero


### APPLY

In [31]:
df_books = pd.read_csv("sample_data/bestsellers with categories.csv", sep=",", header=0, names=["Titles", "Author", "User Rating", "Reviews", "Price", "Year", "Genre"])
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction


In [32]:
def two_times(x):
 return x * 2

In [33]:
df_books["User Rating"].apply(two_times)

0      9.4
1      9.2
2      9.4
3      9.4
4      9.6
      ... 
545    9.8
546    9.4
547    9.4
548    9.4
549    9.4
Name: User Rating, Length: 550, dtype: float64

In [34]:
df_books["User Rating_2"] = df_books["User Rating"].apply(two_times)
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre,User Rating_2
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction,9.4
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction,9.2
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction,9.4
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction,9.4
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction,9.6
...,...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction,9.8
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction,9.4
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction,9.4
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction,9.4


In [35]:
df_books["User Rating_2"] = df_books["User Rating"].apply(lambda x: x * 3)
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre,User Rating_2
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction,14.1
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction,13.8
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction,14.1
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction,14.1
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction,14.4
...,...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction,14.7
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction,14.1
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction,14.1
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction,14.1


In [36]:
df_books["User Rating_2"] = df_books.apply(lambda x: x["User Rating"] * 2 if x["Genre"] == "Fiction" else x["User Rating"], axis=1)
df_books

Unnamed: 0,Titles,Author,User Rating,Reviews,Price,Year,Genre,User Rating_2
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction,4.7
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction,9.2
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction,4.7
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction,9.4
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction,4.8
...,...,...,...,...,...,...,...,...
545,Wrecking Ball (Diary of a Wimpy Kid Book 14),Jeff Kinney,4.9,9413,8,2019,Fiction,9.8
546,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2016,Non Fiction,4.7
547,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2017,Non Fiction,4.7
548,You Are a Badass: How to Stop Doubting Your Gr...,Jen Sincero,4.7,14331,8,2018,Non Fiction,4.7
