< [Online Version Python Data Science Handbook](https://github.com/jakevdp/PythonDataScienceHandbook)| [Menu](https://)>
# 3. Data Manipulation with Pandas 
* Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a DataFrame.
* In this chapter, we will focus on the mechanics of using **Series**, **DataFrame**, and related structures effectively
* More detailed documentation, along with tutorials and other resources, can be found at http://pandas.pydata.org/.

## XXXX
* XXXX
* XXX

In [15]:
import pandas as pd
import numpy as np

class display(object):
    """Display HTML representation of multiple objects"""
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""
    def __init__(self, *args):
        self.args = args
        
    def _repr_html_(self):
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)
    
    def __repr__(self):
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

# SERIES

## The bad way

In [67]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
df = pd.Series(populations, index=index)
df

(California, 2000)    33871648
(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
(Texas, 2010)         25145561
dtype: int64

In [68]:
df[('California', 2010):('Texas', 2000)]

(California, 2010)    37253956
(New York, 2000)      18976457
(New York, 2010)      19378102
(Texas, 2000)         20851820
dtype: int64

In [69]:
df[[i for i in df.index if i[1] == 2010]]

(California, 2010)    37253956
(New York, 2010)      19378102
(Texas, 2010)         25145561
dtype: int64

## The Better Way: Pandas MultiIndex

In [70]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           )

In [83]:
df_test = df.reindex(index)
df_test

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [84]:
df_test[:, 2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [85]:
# unstack > https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.unstack.html
# Devuelve un DataFrame con un nuevo "level" como columna mediante el index pivotado
display('df_test.unstack()', 'df_test.unstack(level=-1)', 'df_test.unstack(level=0)')

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561

Unnamed: 0,California,New York,Texas
2000,33871648,18976457,20851820
2010,37253956,19378102,25145561


In [87]:
df_test.unstack().stack()

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [88]:
df_test = pd.DataFrame({'total': df_test,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
df_test

Unnamed: 0,Unnamed: 1,total,under18
California,2000,33871648,9267089
California,2010,37253956,9284094
New York,2000,18976457,4687374
New York,2010,19378102,4318033
Texas,2000,20851820,5906301
Texas,2010,25145561,6879014


In [89]:
df_test = df_test['under18'] / df_test['total']
display('df_test.unstack()')

Unnamed: 0,2000,2010
California,0.273594,0.249211
New York,0.24701,0.222831
Texas,0.283251,0.273568


# DATAFRAME

In [305]:
# Make a MultiIndex from the cartesian product of multiple iterables.
index = pd.MultiIndex.from_product([[2013, 2014], [1, 2]], names=['year', 'visit'])
columns = pd.MultiIndex.from_product([['Bob','Guido','Sue'], ['HR','Temp']], names=['subject','type'])
df = pd.DataFrame(np.random.rand(4, 6),
                  index=index,
                  columns=columns)
df

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,0.086281,0.389059,0.380218,0.762121,0.715832,0.062086
2013,2,0.821915,0.251752,0.950678,0.160238,0.417696,0.104296
2014,1,0.476176,0.059724,0.564192,0.136666,0.987059,0.069839
2014,2,0.524305,0.902393,0.325693,0.230522,0.159248,0.027653


### INDEX & COLUMN NAMES

In [282]:
df.index.names = ['year','visit']
df.columns.names = ['subject','type']

### RESET_INDEX() & SET_INDEX()

In [326]:
# convert the old index is added as a column
df.reset_index()
df.reset_index(level=['year','visit'])
df.reset_index(level=['year','visit'], col_level=0)

subject,year,visit,Bob,Bob,Guido,Guido,Sue,Sue
type,Unnamed: 1_level_1,Unnamed: 2_level_1,HR,Temp,HR,Temp,HR,Temp
0,2013,1,0.086281,0.389059,0.380218,0.762121,0.715832,0.062086
1,2013,2,0.821915,0.251752,0.950678,0.160238,0.417696,0.104296
2,2014,1,0.476176,0.059724,0.564192,0.136666,0.987059,0.069839
3,2014,2,0.524305,0.902393,0.325693,0.230522,0.159248,0.027653


In [331]:
# We can place it in another column level
display('df.reset_index(level="year", col_level=0)','df.reset_index(level="year", col_level=1)')

subject,year,Bob,Bob,Guido,Guido,Sue,Sue
type,Unnamed: 1_level_1,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,2013,0.086281,0.389059,0.380218,0.762121,0.715832,0.062086
2,2013,0.821915,0.251752,0.950678,0.160238,0.417696,0.104296
1,2014,0.476176,0.059724,0.564192,0.136666,0.987059,0.069839
2,2014,0.524305,0.902393,0.325693,0.230522,0.159248,0.027653

subject,Unnamed: 1_level_0,Bob,Bob,Guido,Guido,Sue,Sue
type,year,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,2013,0.086281,0.389059,0.380218,0.762121,0.715832,0.062086
2,2013,0.821915,0.251752,0.950678,0.160238,0.417696,0.104296
1,2014,0.476176,0.059724,0.564192,0.136666,0.987059,0.069839
2,2014,0.524305,0.902393,0.325693,0.230522,0.159248,0.027653


In [337]:
# we can specify under which one with the parameter col_fill
display('df.reset_index(level="year", col_level=0, col_fill="CORE")','df.reset_index(level="year", col_level=1, col_fill="CORE")')

subject,year,Bob,Bob,Guido,Guido,Sue,Sue
type,CORE,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,2013,0.086281,0.389059,0.380218,0.762121,0.715832,0.062086
2,2013,0.821915,0.251752,0.950678,0.160238,0.417696,0.104296
1,2014,0.476176,0.059724,0.564192,0.136666,0.987059,0.069839
2,2014,0.524305,0.902393,0.325693,0.230522,0.159248,0.027653

subject,CORE,Bob,Bob,Guido,Guido,Sue,Sue
type,year,HR,Temp,HR,Temp,HR,Temp
visit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,2013,0.086281,0.389059,0.380218,0.762121,0.715832,0.062086
2,2013,0.821915,0.251752,0.950678,0.160238,0.417696,0.104296
1,2014,0.476176,0.059724,0.564192,0.136666,0.987059,0.069839
2,2014,0.524305,0.902393,0.325693,0.230522,0.159248,0.027653


In [344]:
# remove the old index and set the new ones through columns
df.set_index([("Bob", "HR"),("Guido", "HR")])

Unnamed: 0_level_0,subject,Bob,Guido,Sue,Sue
Unnamed: 0_level_1,type,Temp,Temp,HR,Temp
"(Bob, HR)","(Guido, HR)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0.086281,0.380218,0.389059,0.762121,0.715832,0.062086
0.821915,0.950678,0.251752,0.160238,0.417696,0.104296
0.476176,0.564192,0.059724,0.136666,0.987059,0.069839
0.524305,0.325693,0.902393,0.230522,0.159248,0.027653


In [332]:
# drop parameter to avoid the old index being added as a column
df.reset_index(drop=True)

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
0,0.086281,0.389059,0.380218,0.762121,0.715832,0.062086
1,0.821915,0.251752,0.950678,0.160238,0.417696,0.104296
2,0.476176,0.059724,0.564192,0.136666,0.987059,0.069839
3,0.524305,0.902393,0.325693,0.230522,0.159248,0.027653


In [299]:
df = df.sort_index()
df

subject,year,visit,Bob,Bob,Guido,Guido,Sue,Sue
type,Unnamed: 1_level_1,Unnamed: 2_level_1,HR,Temp,HR,Temp,HR,Temp
0,2013,1,0.591394,0.199464,0.231985,0.079515,0.635766,0.78459
1,2013,2,0.137454,0.677708,0.560458,0.700524,0.689017,0.997988
2,2014,1,0.581993,0.509665,0.931039,0.427988,0.476549,0.279972
3,2014,2,0.647763,0.861086,0.384841,0.328627,0.0074,0.084427


### FILTER BY COLUMN

In [179]:
display('df["Guido"]', 'df.loc[:, "Guido"]', 'df.iloc[:, 2:4]')

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,0.134034,0.838502
2013,2,0.497756,0.384821
2014,1,0.503793,0.633109
2014,2,0.528611,0.560056

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,0.134034,0.838502
2013,2,0.497756,0.384821
2014,1,0.503793,0.633109
2014,2,0.528611,0.560056

Unnamed: 0_level_0,subject,Guido,Guido
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,0.134034,0.838502
2013,2,0.497756,0.384821
2014,1,0.503793,0.633109
2014,2,0.528611,0.560056


In [187]:
display('df["Guido", "HR"].to_frame()', 'df.loc[:, ("Guido", "HR")].to_frame()', 'df.iloc[:, 2:3]')

Unnamed: 0_level_0,Unnamed: 1_level_0,Guido
Unnamed: 0_level_1,Unnamed: 1_level_1,HR
year,visit,Unnamed: 2_level_2
2013,1,0.134034
2013,2,0.497756
2014,1,0.503793
2014,2,0.528611

Unnamed: 0_level_0,Unnamed: 1_level_0,Guido
Unnamed: 0_level_1,Unnamed: 1_level_1,HR
year,visit,Unnamed: 2_level_2
2013,1,0.134034
2013,2,0.497756
2014,1,0.503793
2014,2,0.528611

Unnamed: 0_level_0,subject,Guido
Unnamed: 0_level_1,type,HR
year,visit,Unnamed: 2_level_2
2013,1,0.134034
2013,2,0.497756
2014,1,0.503793
2014,2,0.528611


### FILTER BY ROW

In [270]:
# all columns
df[:2]
df.loc[2013]
df.loc[(2013, 1):(2013, 2)]
df.loc[[(2013, 1),(2013, 2)]]
df.iloc[:2, :]

Unnamed: 0_level_0,subject,Bob,Bob,Guido,Guido,Sue,Sue
Unnamed: 0_level_1,type,HR,Temp,HR,Temp,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2013,1,0.047475,0.866058,0.134034,0.838502,0.709231,0.405188
2013,2,0.69758,0.119472,0.497756,0.384821,0.240598,0.021321


In [261]:
# filter by columns
df.loc[:(2013, 2), 'Guido']
df.loc[:(2013, 2), ('Guido','HR'):('Guido','Temp')]
df.loc[:(2013, 2), [('Guido','HR'),('Guido','Temp')]]
df.iloc[:2, 2:4]
df.iloc[:2, [2,3]]

Unnamed: 0_level_0,subject,Guido,Guido
Unnamed: 0_level_1,type,HR,Temp
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2
2013,1,0.134034,0.838502
2013,2,0.497756,0.384821


In [262]:
# get value filtering by specific row and column
df.loc[(2013, 1), ('Guido','HR')]

0.13403380777617369

### FILTER BY SUBSET

In [272]:
# df.loc[(:, 1), (:, "HR")] > Devuelve ERROR, se debe hacer mediante IndexSlice

# The bad way
df.loc[[(2013, 1),(2014, 1)], [('Bob','HR'),('Guido','HR'),('Sue','HR')]]

# The best way
idx = pd.IndexSlice
df.loc[idx[:, 1], idx[:, 'HR']]

Unnamed: 0_level_0,subject,Bob,Guido,Sue
Unnamed: 0_level_1,type,HR,HR,HR
year,visit,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2013,1,0.047475,0.134034,0.709231
2014,1,0.137234,0.503793,0.303124


### STACKING & UNSTACKING INDICES

In [347]:
df.unstack(level=0)
df.unstack(level='year')

subject,Bob,Bob,Bob,Bob,Guido,Guido,Guido,Guido,Sue,Sue,Sue,Sue
type,HR,HR,Temp,Temp,HR,HR,Temp,Temp,HR,HR,Temp,Temp
year,2013,2014,2013,2014,2013,2014,2013,2014,2013,2014,2013,2014
visit,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
1,0.086281,0.476176,0.389059,0.059724,0.380218,0.564192,0.762121,0.136666,0.715832,0.987059,0.062086,0.069839
2,0.821915,0.524305,0.251752,0.902393,0.950678,0.325693,0.160238,0.230522,0.417696,0.159248,0.104296,0.027653


In [350]:
df.unstack(level=1)
df.unstack(level='visit')

subject,Bob,Bob,Bob,Bob,Guido,Guido,Guido,Guido,Sue,Sue,Sue,Sue
type,HR,HR,Temp,Temp,HR,HR,Temp,Temp,HR,HR,Temp,Temp
visit,1,2,1,2,1,2,1,2,1,2,1,2
year,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
2013,0.086281,0.821915,0.389059,0.251752,0.380218,0.950678,0.762121,0.160238,0.715832,0.417696,0.062086,0.104296
2014,0.476176,0.524305,0.059724,0.902393,0.564192,0.325693,0.136666,0.230522,0.987059,0.159248,0.069839,0.027653


In [355]:
df.unstack(level=['year','visit']).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0
subject,type,year,visit,Unnamed: 4_level_1
Bob,HR,2013,1,0.086281
Bob,HR,2013,2,0.821915
Bob,HR,2014,1,0.476176
Bob,HR,2014,2,0.524305
Bob,Temp,2013,1,0.389059
Bob,Temp,2013,2,0.251752
Bob,Temp,2014,1,0.059724
Bob,Temp,2014,2,0.902393
Guido,HR,2013,1,0.380218
Guido,HR,2013,2,0.950678


### DATA AGGREGATIONS ON MULTI-INDICES

In [357]:
df.mean(level='year')
df.mean(axis=0, level='year')

subject,Bob,Bob,Guido,Guido,Sue,Sue
type,HR,Temp,HR,Temp,HR,Temp
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2013,0.454098,0.320406,0.665448,0.461179,0.566764,0.083191
2014,0.50024,0.481059,0.444942,0.183594,0.573153,0.048746


In [356]:
df.mean(axis=1, level='type')

Unnamed: 0_level_0,type,HR,Temp
year,visit,Unnamed: 2_level_1,Unnamed: 3_level_1
2013,1,0.39411,0.404422
2013,2,0.730096,0.172095
2014,1,0.675809,0.088743
2014,2,0.336415,0.386856
