In [1]:
import pandas as pd

In [11]:
# Load the Big Mac prices, parsing "Date" as datetimes and storing the
# repeated country names as a categorical column.
bigmac = pd.read_csv("bigmac.csv", parse_dates=["Date"]).astype({"Country": "category"})
bigmac.head(3)

Unnamed: 0,Date,Country,Price in US Dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35


In [12]:
# Summarize the frame: row count, per-column non-null counts, dtypes and memory usage.
bigmac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 652 non-null    datetime64[ns]
 1   Country              652 non-null    category      
 2   Price in US Dollars  652 non-null    float64       
dtypes: category(1), datetime64[ns](1), float64(1)
memory usage: 13.5 KB


In [17]:
# Show the dtype of every column.
bigmac.dtypes

Date                   datetime64[ns]
Country                      category
Price in US Dollars           float64
dtype: object

## **Create a MultiIndex with the set_index Method**

In [20]:
# Start this section from a fresh copy of the data: "Date" parsed as
# datetimes, "Country" stored as a categorical column.
bigmac = pd.read_csv("bigmac.csv", parse_dates=["Date"]).astype({"Country": "category"})
bigmac.head(3)

Unnamed: 0,Date,Country,Price in US Dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35


### A boa prática aqui é utilizar a coluna com a menor quantidade de valores únicos nas camadas mais externas dos multi-indices, pois elas irão empacotar as camadas internas

In [27]:
# Count the distinct values in each column; this guides which column should be the outer index level.
bigmac.nunique()

Date                    12
Country                 58
Price in US Dollars    330
dtype: int64

In [28]:
# Build a two-level MultiIndex with "Date" as the outer level and "Country"
# as the inner level. The result is rebound instead of using inplace=True,
# which keeps the statement chainable and makes the data lineage explicit.
bigmac = bigmac.set_index(keys=["Date", "Country"])

In [29]:
# Preview the first rows; "Date" and "Country" are now index levels rather than columns.
bigmac.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14


In [40]:
# Sort every index level, starting with the outermost level and moving inward.
# Rebinding the result avoids inplace=True.
bigmac = bigmac.sort_index()

In [41]:
# Preview the first rows after sorting the index.
bigmac.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97


In [42]:
# Inspect the MultiIndex itself: one (Date, Country) tuple per row.
bigmac.index

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['Date', 'Country'], length=652)

In [43]:
# Names of the index levels, outermost first.
bigmac.index.names

FrozenList(['Date', 'Country'])

In [44]:
# Confirm the class of the row index object.
type(bigmac.index)

pandas.core.indexes.multi.MultiIndex

In [53]:
# A single MultiIndex entry is a tuple holding one value per level.
bigmac.index[0]

(Timestamp('2010-01-01 00:00:00'), 'Argentina')

In [38]:
# Passing a full (Date, Country) tuple to .loc selects the row with that key.
bigmac.loc[('2016-01-01', 'Brazil')]

Price in US Dollars    3.35
Name: (2016-01-01 00:00:00, Brazil), dtype: float64

In [54]:
# An entry taken from the MultiIndex can be passed straight back to .loc as the row key.
bigmac.loc[bigmac.index[0]]

Price in US Dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

## **Extract Index Level Values with the ```get_level_values()``` Method**

In [72]:
# Read the CSV directly into a (Date, Country) MultiIndex and sort it.
bigmac = (
    pd.read_csv("bigmac.csv", parse_dates=["Date"], index_col=["Date", "Country"])
    .sort_index()
)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [64]:
# Inspect the MultiIndex whose levels are extracted in the next cells.
bigmac.index

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['Date', 'Country'], length=652)

In [70]:
# The level can be given by name or by position. Both calls are evaluated,
# but only the last expression of the cell is displayed.
bigmac.index.get_level_values("Date") # returns all values of the index level passed as argument
bigmac.index.get_level_values(0)

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)

In [71]:
# Same extraction for the inner level, by name and then by position.
# Only the last expression of the cell is displayed.
bigmac.index.get_level_values("Country")
bigmac.index.get_level_values(1)

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

## **Change Index Level Name with the ```.set_names()``` Method**

In [73]:
# Fresh, sorted copy of the data indexed by (Date, Country) for the renaming examples.
bigmac = (
    pd.read_csv("bigmac.csv", parse_dates=["Date"], index_col=["Date", "Country"])
    .sort_index()
)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [75]:
# set_names returns a renamed copy of the index; assign it back to apply the new level names.
bigmac.index = bigmac.index.set_names(["Day", "Location"])

In [77]:
# Preview the frame to check the renamed index levels.
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Day,Location,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [78]:
# In-place alternative to reassigning bigmac.index: with inplace=True the
# level names are changed on the existing index object and nothing is returned.
bigmac.index.set_names(names = ["Day", "Location"], inplace = True)

In [79]:
# How do we rename just one index level?

# Pass "level" to pick the level to rename. The renamed index is only displayed here;
# it is not assigned back, so bigmac itself keeps its current level names.
bigmac.index.set_names(names = "Date", level = 0)

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['Date', 'Location'], length=652)

## **The ```.sort_index()``` Method on a MultiIndex Dataframe**

In [85]:
# Load with a (Date, Country) MultiIndex but leave the rows in file order,
# so the sorting examples below have something to do.
bigmac = pd.read_csv(
    "bigmac.csv",
    parse_dates=["Date"],
    index_col=["Date", "Country"],
)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35


In [84]:
# Number of levels in the MultiIndex.
bigmac.index.nlevels

2

In [87]:
# Each index level can get its own sort direction:
# here "Date" is ascending and "Country" is descending.

bigmac = bigmac.sort_index(ascending=[True, False])

In [91]:
# Preview the result: earliest date first, countries in reverse alphabetical order.
bigmac.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Uruguay,3.32
2010-01-01,United States,3.58
2010-01-01,Ukraine,1.83
2010-01-01,UAE,2.99
2010-01-01,Turkey,3.83


In [90]:
# A single index level can be chosen as the primary sort key:

# level="Country" makes "Country" the first sort key. The other level is NOT ignored:
# sort_remaining defaults to True, so "Date" is still sorted within each country,
# as the displayed result shows. The sorted frame is only displayed, not assigned.
bigmac.sort_index(level = "Country")

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-07-01,Argentina,3.56
2011-07-01,Argentina,4.84
2012-01-01,Argentina,4.64
2012-07-01,Argentina,4.16
...,...,...
2014-01-01,Vietnam,2.84
2014-07-01,Vietnam,2.83
2015-01-01,Vietnam,2.81
2015-07-01,Vietnam,2.75


## **Extract Rows from a ```MultiIndex Dataframe```**

In [92]:
# Fresh, sorted copy of the data indexed by (Date, Country) for the row-selection examples.
bigmac = (
    pd.read_csv("bigmac.csv", parse_dates=["Date"], index_col=["Date", "Country"])
    .sort_index()
)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [95]:
# This syntax is ambiguous and confusing, because the second argument of .loc is usually a column name.
# Only the last expression of the cell is displayed.
bigmac.loc["2010-01-01", "Argentina"]
# Same syntax, but with a column name instead of an index label --> returns a pandas Series.
bigmac.loc["2010-01-01", "Price in US Dollars"]

Country
Argentina         1.84
Australia         3.98
Brazil            4.76
Britain           3.67
Canada            3.97
Chile             3.18
China             1.83
Colombia          3.91
Costa Rica        3.52
Czech Republic    3.71
Denmark           5.99
Egypt             2.38
Euro area         4.84
Hong Kong         1.91
Hungary           3.86
Indonesia         2.24
Israel            3.99
Japan             3.50
Latvia            3.09
Lithuania         2.87
Malaysia          2.08
Mexico            2.50
New Zealand       3.61
Norway            7.02
Pakistan          2.42
Peru              2.81
Philippines       2.21
Poland            2.86
Russia            2.34
Saudi Arabia      2.67
Singapore         3.19
South Africa      2.46
South Korea       2.98
Sri Lanka         1.83
Sweden            5.51
Switzerland       6.30
Taiwan            2.36
Thailand          2.11
Turkey            3.83
UAE               2.99
Ukraine           1.83
United States     3.58
Uruguay           3.32
Name: Price in US Dollars, dtype: float64

#### Dado o que comentamos acima, a documentação do pandas recomenda que, quando tivermos um objeto MultiIndex e quisermos extrair registros com base nesses rótulos múltiplos, o primeiro argumento da propriedade 'loc' encapsule (em uma tupla) todas as informações dos níveis do índice e que o segundo argumento contenha apenas nomes de colunas.

In [97]:
# Unambiguous form: the full row key goes in a tuple as the first argument,
# the column label is the second argument.
bigmac.loc[("2010-01-01", "Brazil"), "Price in US Dollars"]

4.76

In [105]:
# There is a difference between filtering with a tuple and with a single value in parentheses (or not :p):

# Slice between two full (Date, Country) keys; both endpoints appear in the result.
bigmac.loc[("2010-01-01", "Argentina"):("2012-01-01", "Australia")]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2011-07-01,United States,4.07
2011-07-01,Uruguay,4.88
2011-07-01,Venezuela,6.52
2012-01-01,Argentina,4.64


In [None]:
# Positional access ignores the index labels: fetch the first row by its integer position.
# (The original `bigmac.iloc[]` was an unfinished cell and a SyntaxError.)
bigmac.iloc[0]

## **The ```transpose``` Method**

In [112]:
# Fresh, sorted copy of the data indexed by (Date, Country) for the transpose example.
bigmac = (
    pd.read_csv("bigmac.csv", parse_dates=["Date"], index_col=["Date", "Country"])
    .sort_index()
)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [118]:
# Transpose: the columns become the row index and the row index becomes the columns.
# NOTE(review): this rebinds bigmac to its own transpose, so re-running the cell flips it back.
bigmac = bigmac.transpose()
bigmac.head()

Date,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,...,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01
Country,Argentina,Australia,Brazil,Britain,Canada,Chile,China,Colombia,Costa Rica,Czech Republic,...,Switzerland,Taiwan,Thailand,Turkey,UAE,Ukraine,United States,Uruguay,Venezuela,Vietnam
Price in US Dollars,1.84,3.98,4.76,3.67,3.97,3.18,1.83,3.91,3.52,3.71,...,6.44,2.08,3.09,3.41,3.54,1.54,4.93,3.74,0.66,2.67


In [124]:
# After the transpose the MultiIndex lives on the columns: the first argument selects the
# row label, the second slices the columns between two (Date, Country) keys.
bigmac.loc[("Price in US Dollars",), ("2010-01-01", "Brazil"):("2010-01-01", "Costa Rica")]

Date,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01
Country,Brazil,Britain,Canada,Chile,China,Colombia,Costa Rica
Price in US Dollars,4.76,3.67,3.97,3.18,1.83,3.91,3.52


## **The ```swaplevel``` Method**

In [125]:
# Fresh, sorted copy of the data indexed by (Date, Country) for the swaplevel examples.
bigmac = (
    pd.read_csv("bigmac.csv", parse_dates=["Date"], index_col=["Date", "Country"])
    .sort_index()
)
bigmac.head(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67


In [130]:
# Reverse the order of the two index levels by swapping them on the index itself.
# NOTE(review): each run swaps the levels again, so running this cell twice restores the
# original order; the recorded preview in the next cell still shows (Date, Country).
bigmac.index = bigmac.index.swaplevel()

In [131]:
# Preview the frame to check the current order of the index levels.
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [132]:
# The arguments are the labels of the two levels to swap.
# The swapped frame is only displayed; bigmac is not modified.
bigmac.swaplevel(i = "Date", j = "Country")

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Country,Date,Unnamed: 2_level_1
Argentina,2010-01-01,1.84
Australia,2010-01-01,3.98
Brazil,2010-01-01,4.76
Britain,2010-01-01,3.67
Canada,2010-01-01,3.97
...,...,...
Ukraine,2016-01-01,1.54
United States,2016-01-01,4.93
Uruguay,2016-01-01,3.74
Venezuela,2016-01-01,0.66


## **The ```.stack()``` Method**

In [141]:
# Here it makes sense to use country as the outer level: each country has several records
# for different years, so it ends up repeating fewer values than the dates would.
stats = pd.read_csv(
    "worldstats.csv",
    parse_dates=["year"],
    index_col=["country", "year"],
)
stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015-01-01,392022276.0,2530102000000.0
Arab World,2014-01-01,384222592.0,2873600000000.0
Arab World,2013-01-01,376504253.0,2846994000000.0
Arab World,2012-01-01,368802611.0,2773270000000.0
Arab World,2011-01-01,361031820.0,2497945000000.0


Colunas ainda são indices na posição 0

In [147]:
stats.stack() 

# Stacks the columns into the index: every combination of index labels + column label gets its own stacked value.
# The data in the DataFrame is not changed; we are only looking at it from a different perspective.
# With two columns, this also doubles the number of records.
# The result is a one-dimensional object (a Series).

country     year                  
Arab World  2015-01-01  Population    3.920223e+08
                        GDP           2.530102e+12
            2014-01-01  Population    3.842226e+08
                        GDP           2.873600e+12
            2013-01-01  Population    3.765043e+08
                                          ...     
Zimbabwe    1962-01-01  GDP           1.117602e+09
            1961-01-01  Population    3.876638e+06
                        GDP           1.096647e+09
            1960-01-01  Population    3.752390e+06
                        GDP           1.052990e+09
Length: 22422, dtype: float64

In [148]:
# Convert the stacked Series back into a DataFrame with a single value column.
stats.stack().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015-01-01,Population,3.920223e+08
Arab World,2015-01-01,GDP,2.530102e+12
Arab World,2014-01-01,Population,3.842226e+08
Arab World,2014-01-01,GDP,2.873600e+12
Arab World,2013-01-01,Population,3.765043e+08
...,...,...,...
Zimbabwe,1962-01-01,GDP,1.117602e+09
Zimbabwe,1961-01-01,Population,3.876638e+06
Zimbabwe,1961-01-01,GDP,1.096647e+09
Zimbabwe,1960-01-01,Population,3.752390e+06


## **The ```.unstack()``` Method, Part 1**

In [149]:
# Load the world statistics indexed by (country, year) and sort the index.
world = (
    pd.read_csv("worldstats.csv", index_col=["country", "year"])
    .sort_index()
)
world.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1960,8994793.0,537777800.0
Afghanistan,1961,9164945.0,548888900.0
Afghanistan,1962,9343772.0,546666700.0


In [155]:
# Stack the columns into a third, innermost index level; the result is a Series
# that the unstack examples below start from.
s = world.stack()
s.head(2)

country      year            
Afghanistan  1960  Population    8.994793e+06
                   GDP           5.377778e+08
dtype: float64

In [160]:
# Each unstack() pivots the innermost index level into the columns. After three calls on
# the three-level Series every level has been moved, and the result is again a Series,
# now indexed by (column name, year, country), with NaN where a combination has no data.
s.unstack().unstack().unstack()

            year  country           
Population  1960  Afghanistan           8.994793e+06
                  Albania                        NaN
                  Algeria               1.112489e+07
                  Andorra                        NaN
                  Angola                         NaN
                                            ...     
GDP         2015  West Bank and Gaza    1.267740e+10
                  World                 7.343364e+13
                  Yemen, Rep.                    NaN
                  Zambia                2.120156e+10
                  Zimbabwe              1.389294e+10
Length: 28224, dtype: float64