In [1]:
import pandas as pd

In [2]:
# Read the Big Mac prices, parsing "Date" as datetime and storing "Country"
# as a categorical column.
bigmac = (
    pd.read_csv("bigmac.csv", parse_dates=["Date"])
    .astype({"Country": "category"})
)
bigmac.head(3)

Unnamed: 0,Date,Country,Price in US Dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35


In [3]:
bigmac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 652 non-null    datetime64[ns]
 1   Country              652 non-null    category      
 2   Price in US Dollars  652 non-null    float64       
dtypes: category(1), datetime64[ns](1), float64(1)
memory usage: 13.5 KB


In [4]:
bigmac.dtypes

Date                   datetime64[ns]
Country                      category
Price in US Dollars           float64
dtype: object

## **Create a MultiIndex with the set_index Method**

In [5]:
# Start this section from a fresh copy of the data: "Date" parsed as datetime,
# "Country" converted to a categorical column.
raw_prices = pd.read_csv("bigmac.csv", parse_dates=["Date"])
bigmac = raw_prices.astype({"Country": "category"})
del raw_prices  # keep the notebook namespace unchanged apart from `bigmac`
bigmac.head(3)

Unnamed: 0,Date,Country,Price in US Dollars
0,2016-01-01,Argentina,2.39
1,2016-01-01,Australia,3.74
2,2016-01-01,Brazil,3.35


### A boa prática aqui é utilizar a coluna com a menor quantidade de valores únicos como o nível mais externo do MultiIndex, pois cada valor desse nível irá agrupar os valores dos níveis internos

In [6]:
# Count the distinct values in each column; the column with the fewest
# distinct values is the candidate for the outermost index level.
bigmac.nunique()

Date                    12
Country                 58
Price in US Dollars    330
dtype: int64

In [7]:
# Move "Date" and "Country" out of the columns and into a two-level MultiIndex
# ("Date" is the outer level, "Country" the inner one). The result is bound
# back to `bigmac` explicitly instead of mutating the frame with inplace=True,
# which keeps the step chainable and makes the state change visible.
bigmac = bigmac.set_index(keys=["Date", "Country"])

In [8]:
bigmac.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35
2016-01-01,Britain,4.22
2016-01-01,Canada,4.14


In [9]:
# Sort by every index level, starting with the outermost level and moving
# inwards. The sorted frame is rebound to `bigmac` rather than sorted with
# inplace=True, so the cell reads as an explicit transformation.
bigmac = bigmac.sort_index()

In [10]:
bigmac.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97


In [11]:
bigmac.index

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['Date', 'Country'], length=652)

In [12]:
bigmac.index.names

FrozenList(['Date', 'Country'])

In [13]:
type(bigmac.index)

pandas.core.indexes.multi.MultiIndex

In [14]:
# A single MultiIndex entry is a tuple holding one label per index level.
bigmac.index[0]

(Timestamp('2010-01-01 00:00:00'), 'Argentina')

In [15]:
# Select one row by passing a tuple with a label for each index level
# (Date first, then Country).
bigmac.loc[('2016-01-01', 'Brazil')]

Price in US Dollars    3.35
Name: (2016-01-01 00:00:00, Brazil), dtype: float64

In [16]:
# An entry taken from the index is itself a valid row key for .loc.
bigmac.loc[bigmac.index[0]]

Price in US Dollars    1.84
Name: (2010-01-01 00:00:00, Argentina), dtype: float64

## **Extract Index Level Values with the ```get_level_values()``` Method**

In [17]:
# Build the (Date, Country) MultiIndex while reading the file, then sort it.
bigmac = (
    pd.read_csv(
        "bigmac.csv",
        parse_dates=["Date"],
        index_col=["Date", "Country"],
    )
    .sort_index()
)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [18]:
bigmac.index

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['Date', 'Country'], length=652)

In [19]:
# get_level_values returns every value of the requested index level. The level
# can be named or given by position; both spellings must agree, so the two
# results are compared instead of silently discarding the first one.
dates_by_name = bigmac.index.get_level_values("Date")
dates_by_position = bigmac.index.get_level_values(0)
assert dates_by_name.equals(dates_by_position)

dates_by_position

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01', '2016-01-01', '2016-01-01',
               '2016-01-01', '2016-01-01'],
              dtype='datetime64[ns]', name='Date', length=652, freq=None)

In [20]:
# Same lookup for the inner level: by name and by position. The two results
# are checked for equality rather than leaving the first one unused.
countries_by_name = bigmac.index.get_level_values("Country")
countries_by_position = bigmac.index.get_level_values(1)
assert countries_by_name.equals(countries_by_position)

countries_by_position

Index(['Argentina', 'Australia', 'Brazil', 'Britain', 'Canada', 'Chile',
       'China', 'Colombia', 'Costa Rica', 'Czech Republic',
       ...
       'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'UAE', 'Ukraine',
       'United States', 'Uruguay', 'Venezuela', 'Vietnam'],
      dtype='object', name='Country', length=652)

## **Change Index Level Name with the ```.set_names()``` Method**

In [21]:
# Fresh, sorted copy of the data with a (Date, Country) MultiIndex.
bigmac_unsorted = pd.read_csv(
    "bigmac.csv",
    parse_dates=["Date"],
    index_col=["Date", "Country"],
)
bigmac = bigmac_unsorted.sort_index()
del bigmac_unsorted  # only `bigmac` is meant to stay in the namespace
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [22]:
# set_names returns a new index carrying the given level names; assigning it
# back to bigmac.index is what makes the rename take effect on the frame.
bigmac.index = bigmac.index.set_names(names = ["Day", "Location"])

In [23]:
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Day,Location,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [24]:
# Alternative spelling of the same rename: with inplace = True the existing
# index object is renamed directly, so nothing has to be assigned back.
bigmac.index.set_names(names = ["Day", "Location"], inplace = True)

In [25]:
# How do we rename just one index level?
# NOTE: the call below is neither assigned nor run in place, so it only
# displays the renamed index; bigmac itself keeps its current level names.

bigmac.index.set_names(names = "Date", level = 0) # pass "level" to choose which index level receives the new name

MultiIndex([('2010-01-01',      'Argentina'),
            ('2010-01-01',      'Australia'),
            ('2010-01-01',         'Brazil'),
            ('2010-01-01',        'Britain'),
            ('2010-01-01',         'Canada'),
            ('2010-01-01',          'Chile'),
            ('2010-01-01',          'China'),
            ('2010-01-01',       'Colombia'),
            ('2010-01-01',     'Costa Rica'),
            ('2010-01-01', 'Czech Republic'),
            ...
            ('2016-01-01',    'Switzerland'),
            ('2016-01-01',         'Taiwan'),
            ('2016-01-01',       'Thailand'),
            ('2016-01-01',         'Turkey'),
            ('2016-01-01',            'UAE'),
            ('2016-01-01',        'Ukraine'),
            ('2016-01-01',  'United States'),
            ('2016-01-01',        'Uruguay'),
            ('2016-01-01',      'Venezuela'),
            ('2016-01-01',        'Vietnam')],
           names=['Date', 'Location'], length=652)

## **The ```.sort_index()``` Method on a MultiIndex Dataframe**

In [26]:
# Load with a (Date, Country) MultiIndex but leave the rows in file order,
# so the sorting options can be demonstrated below.
bigmac = pd.read_csv(
    "bigmac.csv",
    parse_dates=["Date"],
    index_col=["Date", "Country"],
)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2016-01-01,Argentina,2.39
2016-01-01,Australia,3.74
2016-01-01,Brazil,3.35


In [27]:
bigmac.index.nlevels

2

In [28]:
# Each index level can have its own sort direction: "Date" ascending,
# "Country" descending. The sorted frame is rebound to `bigmac` instead of
# being sorted with inplace=True, which keeps the transformation explicit.

bigmac = bigmac.sort_index(ascending=[True, False])

In [29]:
bigmac.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Uruguay,3.32
2010-01-01,United States,3.58
2010-01-01,Ukraine,1.83
2010-01-01,UAE,2.99
2010-01-01,Turkey,3.83


In [30]:
# We can choose which index level drives the sort:

bigmac.sort_index(level = "Country") # sorts by the level given in 'level' first; because sort_remaining defaults to True, the other levels are then used to order rows that share the same Country

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-07-01,Argentina,3.56
2011-07-01,Argentina,4.84
2012-01-01,Argentina,4.64
2012-07-01,Argentina,4.16
...,...,...
2014-01-01,Vietnam,2.84
2014-07-01,Vietnam,2.83
2015-01-01,Vietnam,2.81
2015-07-01,Vietnam,2.75


## **Extract Rows from a ```MultiIndex Dataframe```**

In [31]:
# Sorted (Date, Country)-indexed frame used for the row-extraction examples.
bigmac = (
    pd.read_csv("bigmac.csv", index_col=["Date", "Country"], parse_dates=["Date"])
    .sort_index()
)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [32]:
# Only the last expression of the cell is displayed; the first line is kept to contrast the two spellings.
bigmac.loc["2010-01-01", "Argentina"] # this syntax is ambiguous and confusing, because the second argument to .loc is usually a column name
bigmac.loc["2010-01-01", "Price in US Dollars"] # same syntax, but with a column name instead of an index label --> returns a pandas Series

Country
Argentina         1.84
Australia         3.98
Brazil            4.76
Britain           3.67
Canada            3.97
Chile             3.18
China             1.83
Colombia          3.91
Costa Rica        3.52
Czech Republic    3.71
Denmark           5.99
Egypt             2.38
Euro area         4.84
Hong Kong         1.91
Hungary           3.86
Indonesia         2.24
Israel            3.99
Japan             3.50
Latvia            3.09
Lithuania         2.87
Malaysia          2.08
Mexico            2.50
New Zealand       3.61
Norway            7.02
Pakistan          2.42
Peru              2.81
Philippines       2.21
Poland            2.86
Russia            2.34
Saudi Arabia      2.67
Singapore         3.19
South Africa      2.46
South Korea       2.98
Sri Lanka         1.83
Sweden            5.51
Switzerland       6.30
Taiwan            2.36
Thailand          2.11
Turkey            3.83
UAE               2.99
Ukraine           1.83
United States     3.58
Uruguay           3.32
Nam

#### Dado o que comentamos acima, a documentação do pandas recomenda que, ao extrair registros de um objeto com MultiIndex a partir desses rótulos múltiplos, o primeiro argumento da propriedade 'loc' seja uma tupla que encapsule todos os rótulos dos níveis do índice e que o segundo argumento contenha apenas nomes de colunas.

In [33]:
# Unambiguous form: the tuple carries one label per index level and the
# second argument names the column.
bigmac.loc[("2010-01-01", "Brazil"), "Price in US Dollars"]

4.76

In [34]:
# There is a difference between filtering with a tuple and with a single value in parentheses (or not :p):
# here both slice endpoints are full (Date, Country) tuples; label-based slices in .loc include both endpoints.

bigmac.loc[("2010-01-01", "Argentina"):("2012-01-01", "Australia")]

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2011-07-01,United States,4.07
2011-07-01,Uruguay,4.88
2011-07-01,Venezuela,6.52
2012-01-01,Argentina,4.64


## **The ```transpose``` Method**

In [36]:
# Sorted (Date, Country)-indexed frame that the transpose example starts from.
bigmac = (
    pd.read_csv(
        "bigmac.csv",
        index_col=["Date", "Country"],
        parse_dates=["Date"],
    )
    .sort_index()
)
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76


In [37]:
# NOTE(review): this rebinds `bigmac` to its own transpose, so running the cell a second time flips the frame back to its original orientation.
bigmac = bigmac.T # swaps the axes: the column labels become the index and the row index becomes the columns
bigmac.head()

Date,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,...,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01,2016-01-01
Country,Argentina,Australia,Brazil,Britain,Canada,Chile,China,Colombia,Costa Rica,Czech Republic,...,Switzerland,Taiwan,Thailand,Turkey,UAE,Ukraine,United States,Uruguay,Venezuela,Vietnam
Price in US Dollars,1.84,3.98,4.76,3.67,3.97,3.18,1.83,3.91,3.52,3.71,...,6.44,2.08,3.09,3.41,3.54,1.54,4.93,3.74,0.66,2.67


In [38]:
# After the transpose the MultiIndex lives on the columns, so the tuple slice goes in the second (column) position of .loc.
bigmac.loc[("Price in US Dollars",), ("2010-01-01", "Brazil"):("2010-01-01", "Costa Rica")] # the columns now carry the multi-level index

Date,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01,2010-01-01
Country,Brazil,Britain,Canada,Chile,China,Colombia,Costa Rica
Price in US Dollars,4.76,3.67,3.97,3.18,1.83,3.91,3.52


## **The ```swaplevel``` Method**

In [39]:
# Sorted (Date, Country)-indexed frame used by the swaplevel examples.
bigmac = (
    pd.read_csv("bigmac.csv", parse_dates=["Date"], index_col=["Date", "Country"])
    .sort_index()
)
bigmac.head(4)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67


In [40]:
# Swap the two index levels so the index becomes (Country, Date). swaplevel
# already returns the frame with the swapped index, so bind that result
# directly instead of building it only to copy its index back.
# NOTE: the swap toggles, so running this cell twice restores (Date, Country).
bigmac = bigmac.swaplevel()

In [41]:
bigmac.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Country,Date,Unnamed: 2_level_1
Argentina,2010-01-01,1.84
Australia,2010-01-01,3.98
Brazil,2010-01-01,4.76


In [42]:
bigmac.swaplevel(i = "Date", j = "Country") # the arguments are the names of the two index levels to swap

Unnamed: 0_level_0,Unnamed: 1_level_0,Price in US Dollars
Date,Country,Unnamed: 2_level_1
2010-01-01,Argentina,1.84
2010-01-01,Australia,3.98
2010-01-01,Brazil,4.76
2010-01-01,Britain,3.67
2010-01-01,Canada,3.97
...,...,...
2016-01-01,Ukraine,1.54
2016-01-01,United States,4.93
2016-01-01,Uruguay,3.74
2016-01-01,Venezuela,0.66


## **The ```.stack()``` Method**

In [43]:
# "country" is the outer index level here: every country has many records
# (one per year), so its labels repeat far less often than the dates would
# if "year" were the outer level.
stats = pd.read_csv(
    "worldstats.csv",
    parse_dates=["year"],
    index_col=["country", "year"],
)
stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015-01-01,392022276.0,2530102000000.0
Arab World,2014-01-01,384222592.0,2873600000000.0
Arab World,2013-01-01,376504253.0,2846994000000.0
Arab World,2012-01-01,368802611.0,2773270000000.0
Arab World,2011-01-01,361031820.0,2497945000000.0


Colunas ainda são indices na posição 0

In [44]:
stats.stack()

# Stacks the column labels into a new innermost index level, giving one value per combination of index labels + column label.
# The data itself is not changed; we are only looking at it from a different perspective.
# With two columns (Population, GDP), each original row yields up to two stacked entries.
# The result is also one-dimensional: a Series rather than a DataFrame.

country     year                  
Arab World  2015-01-01  Population    3.920223e+08
                        GDP           2.530102e+12
            2014-01-01  Population    3.842226e+08
                        GDP           2.873600e+12
            2013-01-01  Population    3.765043e+08
                                          ...     
Zimbabwe    1962-01-01  GDP           1.117602e+09
            1961-01-01  Population    3.876638e+06
                        GDP           1.096647e+09
            1960-01-01  Population    3.752390e+06
                        GDP           1.052990e+09
Length: 22422, dtype: float64

In [45]:
# Wrap the stacked Series in a single-column DataFrame for tabular display.
stats.stack().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015-01-01,Population,3.920223e+08
Arab World,2015-01-01,GDP,2.530102e+12
Arab World,2014-01-01,Population,3.842226e+08
Arab World,2014-01-01,GDP,2.873600e+12
Arab World,2013-01-01,Population,3.765043e+08
...,...,...,...
Zimbabwe,1962-01-01,GDP,1.117602e+09
Zimbabwe,1961-01-01,Population,3.876638e+06
Zimbabwe,1961-01-01,GDP,1.096647e+09
Zimbabwe,1960-01-01,Population,3.752390e+06


## **The ```.unstack()``` Method, Part 1**

In [46]:
# (country, year)-indexed world statistics, sorted by both index levels.
world = (
    pd.read_csv("worldstats.csv", index_col=["country", "year"])
    .sort_index()
)
world.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1960,8994793.0,537777800.0
Afghanistan,1961,9164945.0,548888900.0
Afghanistan,1962,9343772.0,546666700.0


In [47]:
# Stack the Population / GDP column labels into a third, unnamed index level;
# `s` is the resulting Series that the unstack examples operate on.
s = world.stack()
s.head(2)

country      year            
Afghanistan  1960  Population    8.994793e+06
                   GDP           5.377778e+08
dtype: float64

In [48]:
# Each call moves the current innermost index level to the columns; after the third call no index level is left to keep, so the result is a Series again.
s.unstack().unstack().unstack() # unstacks the innermost level of the data set each time

            year  country           
Population  1960  Afghanistan           8.994793e+06
                  Albania                        NaN
                  Algeria               1.112489e+07
                  Andorra                        NaN
                  Angola                         NaN
                                            ...     
GDP         2015  West Bank and Gaza    1.267740e+10
                  World                 7.343364e+13
                  Yemen, Rep.                    NaN
                  Zambia                2.120156e+10
                  Zimbabwe              1.389294e+10
Length: 28224, dtype: float64

## **The ```.unstack()``` Method, Part 2**

In [49]:
# (country, year)-indexed world statistics, kept in file order.
world = pd.read_csv(
    "worldstats.csv",
    index_col=["country", "year"],
)
world.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Arab World,2015,392022276.0,2530102000000.0
Arab World,2014,384222592.0,2873600000000.0
Arab World,2013,376504253.0,2846994000000.0


In [50]:
# Stacked Series with index levels (country, year, <column label>); the
# third level is at position 2 and has no name.
s = world.stack()
s.head(3)

country     year            
Arab World  2015  Population    3.920223e+08
                  GDP           2.530102e+12
            2014  Population    3.842226e+08
dtype: float64

In [51]:
s.unstack(2)

# Unstacking the level that holds the 'Population' / 'GDP' labels turns those labels back into column headers.
# Unstacking a level whose labels are data values (for example year = 1960, year = 1961, ...) instead creates one column per distinct value.

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1960,8994793.0,5.377778e+08
Afghanistan,1961,9164945.0,5.488889e+08
Afghanistan,1962,9343772.0,5.466667e+08
Afghanistan,1963,9531555.0,7.511112e+08
Afghanistan,1964,9728645.0,8.000000e+08
...,...,...,...
Zimbabwe,2011,14255592.0,1.095623e+10
Zimbabwe,2012,14565482.0,1.239272e+10
Zimbabwe,2013,14898092.0,1.349023e+10
Zimbabwe,2014,15245855.0,1.419691e+10


In [52]:
s.unstack(-1) # -1 (the innermost level) is the default, so this is the same as s.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Population,GDP
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1960,8994793.0,5.377778e+08
Afghanistan,1961,9164945.0,5.488889e+08
Afghanistan,1962,9343772.0,5.466667e+08
Afghanistan,1963,9531555.0,7.511112e+08
Afghanistan,1964,9728645.0,8.000000e+08
...,...,...,...
Zimbabwe,2011,14255592.0,1.095623e+10
Zimbabwe,2012,14565482.0,1.239272e+10
Zimbabwe,2013,14898092.0,1.349023e+10
Zimbabwe,2014,15245855.0,1.419691e+10


In [53]:
# If the level we want to unstack has a name, that name can be passed as the "level" argument

s.unstack("year")

Unnamed: 0_level_0,year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,Population,8.994793e+06,9.164945e+06,9.343772e+06,9.531555e+06,9.728645e+06,9.935358e+06,1.014884e+07,1.036860e+07,1.059979e+07,1.084951e+07,...,2.518362e+07,2.587754e+07,2.652874e+07,2.720729e+07,2.796221e+07,2.880917e+07,2.972680e+07,3.068250e+07,3.162751e+07,3.252656e+07
Afghanistan,GDP,5.377778e+08,5.488889e+08,5.466667e+08,7.511112e+08,8.000000e+08,1.006667e+09,1.400000e+09,1.673333e+09,1.373333e+09,1.408889e+09,...,7.057598e+09,9.843842e+09,1.019053e+10,1.248694e+10,1.593680e+10,1.793024e+10,2.053654e+10,2.004633e+10,2.005019e+10,1.919944e+10
Albania,Population,,,,,,,,,,,...,2.992547e+06,2.970017e+06,2.947314e+06,2.927519e+06,2.913021e+06,2.904780e+06,2.900247e+06,2.896652e+06,2.893654e+06,2.889167e+06
Albania,GDP,,,,,,,,,,,...,8.992642e+09,1.070101e+10,1.288135e+10,1.204421e+10,1.192695e+10,1.289087e+10,1.231978e+10,1.278103e+10,1.327796e+10,1.145560e+10
Algeria,Population,1.112489e+07,1.140486e+07,1.169015e+07,1.198513e+07,1.229597e+07,1.262695e+07,1.298027e+07,1.335420e+07,1.374438e+07,1.414444e+07,...,3.374933e+07,3.426197e+07,3.481106e+07,3.540179e+07,3.603616e+07,3.671713e+07,3.743943e+07,3.818614e+07,3.893433e+07,3.966652e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Yemen, Rep.",GDP,,,,,,,,,,,...,1.908173e+10,2.563367e+10,3.039720e+10,2.845950e+10,3.090675e+10,3.107886e+10,3.207477e+10,3.595450e+10,,
Zambia,Population,3.049586e+06,3.142848e+06,3.240664e+06,3.342894e+06,3.449266e+06,3.559687e+06,3.674088e+06,3.792864e+06,3.916928e+06,4.047479e+06,...,1.238151e+07,1.273868e+07,1.311458e+07,1.350785e+07,1.391744e+07,1.434353e+07,1.478658e+07,1.524609e+07,1.572134e+07,1.621177e+07
Zambia,GDP,6.987397e+08,6.823597e+08,6.792797e+08,7.043397e+08,8.226397e+08,1.061200e+09,1.239000e+09,1.340639e+09,1.573739e+09,1.926399e+09,...,1.275686e+10,1.405696e+10,1.791086e+10,1.532834e+10,2.026555e+10,2.345952e+10,2.550306e+10,2.804552e+10,2.713464e+10,2.120156e+10
Zimbabwe,Population,3.752390e+06,3.876638e+06,4.006262e+06,4.140804e+06,4.279561e+06,4.422132e+06,4.568320e+06,4.718612e+06,4.874113e+06,5.036321e+06,...,1.312794e+07,1.329780e+07,1.349546e+07,1.372100e+07,1.397390e+07,1.425559e+07,1.456548e+07,1.489809e+07,1.524586e+07,1.560275e+07


## **The .unstack() Method, Part 3**

In [54]:
# Sorted (country, year)-indexed frame, then its stacked Series form.
world = (
    pd.read_csv("worldstats.csv", index_col=["country", "year"])
    .sort_index()
)
s = world.stack()
s.head(3)

country      year            
Afghanistan  1960  Population    8.994793e+06
                   GDP           5.377778e+08
             1961  Population    9.164945e+06
dtype: float64

In [55]:
# The two calls are NOT equivalent, and only the last expression of the cell is displayed.
s.unstack(level = [1, 0]) # order matters: level 1 then level 0 --> columns nested as (year, country); this result is discarded
s.unstack(level = ["country", "year"]) # order matters: columns nested as (country, year); this is the result shown

country,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,Afghanistan,...,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe,Zimbabwe
year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Population,8994793.0,9164945.0,9343772.0,9531555.0,9728645.0,9935358.0,10148840.0,10368600.0,10599790.0,10849510.0,...,13127940.0,13297800.0,13495460.0,13721000.0,13973900.0,14255590.0,14565480.0,14898090.0,15245860.0,15602750.0
GDP,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,1400000000.0,1673333000.0,1373333000.0,1408889000.0,...,5443896000.0,5291950000.0,4415703000.0,8157077000.0,9422161000.0,10956230000.0,12392720000.0,13490230000.0,14196910000.0,13892940000.0


In [56]:
s.unstack("year", fill_value = 0) # fill_value replaces the missing values produced by unstacking with the given value

Unnamed: 0_level_0,year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,Population,8.994793e+06,9.164945e+06,9.343772e+06,9.531555e+06,9.728645e+06,9.935358e+06,1.014884e+07,1.036860e+07,1.059979e+07,1.084951e+07,...,2.518362e+07,2.587754e+07,2.652874e+07,2.720729e+07,2.796221e+07,2.880917e+07,2.972680e+07,3.068250e+07,3.162751e+07,3.252656e+07
Afghanistan,GDP,5.377778e+08,5.488889e+08,5.466667e+08,7.511112e+08,8.000000e+08,1.006667e+09,1.400000e+09,1.673333e+09,1.373333e+09,1.408889e+09,...,7.057598e+09,9.843842e+09,1.019053e+10,1.248694e+10,1.593680e+10,1.793024e+10,2.053654e+10,2.004633e+10,2.005019e+10,1.919944e+10
Albania,Population,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,2.992547e+06,2.970017e+06,2.947314e+06,2.927519e+06,2.913021e+06,2.904780e+06,2.900247e+06,2.896652e+06,2.893654e+06,2.889167e+06
Albania,GDP,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,8.992642e+09,1.070101e+10,1.288135e+10,1.204421e+10,1.192695e+10,1.289087e+10,1.231978e+10,1.278103e+10,1.327796e+10,1.145560e+10
Algeria,Population,1.112489e+07,1.140486e+07,1.169015e+07,1.198513e+07,1.229597e+07,1.262695e+07,1.298027e+07,1.335420e+07,1.374438e+07,1.414444e+07,...,3.374933e+07,3.426197e+07,3.481106e+07,3.540179e+07,3.603616e+07,3.671713e+07,3.743943e+07,3.818614e+07,3.893433e+07,3.966652e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Yemen, Rep.",GDP,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,1.908173e+10,2.563367e+10,3.039720e+10,2.845950e+10,3.090675e+10,3.107886e+10,3.207477e+10,3.595450e+10,0.000000e+00,0.000000e+00
Zambia,Population,3.049586e+06,3.142848e+06,3.240664e+06,3.342894e+06,3.449266e+06,3.559687e+06,3.674088e+06,3.792864e+06,3.916928e+06,4.047479e+06,...,1.238151e+07,1.273868e+07,1.311458e+07,1.350785e+07,1.391744e+07,1.434353e+07,1.478658e+07,1.524609e+07,1.572134e+07,1.621177e+07
Zambia,GDP,6.987397e+08,6.823597e+08,6.792797e+08,7.043397e+08,8.226397e+08,1.061200e+09,1.239000e+09,1.340639e+09,1.573739e+09,1.926399e+09,...,1.275686e+10,1.405696e+10,1.791086e+10,1.532834e+10,2.026555e+10,2.345952e+10,2.550306e+10,2.804552e+10,2.713464e+10,2.120156e+10
Zimbabwe,Population,3.752390e+06,3.876638e+06,4.006262e+06,4.140804e+06,4.279561e+06,4.422132e+06,4.568320e+06,4.718612e+06,4.874113e+06,5.036321e+06,...,1.312794e+07,1.329780e+07,1.349546e+07,1.372100e+07,1.397390e+07,1.425559e+07,1.456548e+07,1.489809e+07,1.524586e+07,1.560275e+07


## **The ```.pivot()``` Method**

In [57]:
# parse_dates converts the "Date" column to datetime; "Salesman" is then
# stored as a categorical column.
sales = (
    pd.read_csv("salesmen.csv", parse_dates=["Date"])
    .astype({"Salesman": "category"})
)
sales.head(3)

Unnamed: 0,Date,Salesman,Revenue
0,2016-01-01,Bob,7172
1,2016-01-02,Bob,6362
2,2016-01-03,Bob,5982


In [58]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1830 entries, 0 to 1829
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      1830 non-null   datetime64[ns]
 1   Salesman  1830 non-null   category      
 2   Revenue   1830 non-null   int64         
dtypes: category(1), datetime64[ns](1), int64(1)
memory usage: 30.7 KB


In [59]:
sales["Salesman"].value_counts()

Bob       366
Dave      366
Jeb       366
Oscar     366
Ronald    366
Name: Salesman, dtype: int64

### Abaixo está uma sugestão de alteração do nosso quadro de dados para quando tivermos um número pequeno de registros no nosso dataset + uma quantidade pequena de valores exclusivos para uma coluna desse dataset

In [60]:
# Reshape to one row per Date with one Revenue column per Salesman.
df_pivoted = sales.pivot(
    index=["Date"],
    columns="Salesman",
    values=["Revenue"],
)
df_pivoted

Unnamed: 0_level_0,Revenue,Revenue,Revenue,Revenue,Revenue
Salesman,Bob,Dave,Jeb,Oscar,Ronald
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2016-01-01,7172,1864,4430,5250,2639
2016-01-02,6362,8278,8026,8661,4951
2016-01-03,5982,4226,5188,7075,2703
2016-01-04,7917,3868,3144,2524,4258
2016-01-05,7837,2287,938,2793,7771
...,...,...,...,...,...
2016-12-27,2045,2843,6666,835,2981
2016-12-28,100,8888,1243,3073,6129
2016-12-29,4115,9490,3498,6424,7662
2016-12-30,2577,3594,8858,7088,2570


In [61]:
len(df_pivoted) # pivoting cut the row count considerably: one row per Date instead of one row per (Date, Salesman) pair

366

## **The ```.pivot_table()``` Method**

In [63]:
foods = pd.read_csv("foods.csv")
foods.head()

Unnamed: 0,First Name,Gender,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15.66
1,Eric,Male,Stamford,Daily,Chalupa,10.56
2,Charles,Male,New York,Never,Sushi,42.14
3,Anna,Female,Philadelphia,Once,Ice Cream,11.01
4,Deborah,Female,Philadelphia,Daily,Chalupa,23.49


In [78]:
# Total "Spend" for each (Gender, Item) row group, split into one column per
# (Frequency, City) combination.
foods.pivot_table(
    values=["Spend"],
    index=["Gender", "Item"],
    columns=["Frequency", "City"],
    aggfunc="sum",
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend,Spend
Unnamed: 0_level_1,Frequency,Daily,Daily,Daily,Monthly,Monthly,Monthly,Never,Never,Never,Often,...,Once,Seldom,Seldom,Seldom,Weekly,Weekly,Weekly,Yearly,Yearly,Yearly
Unnamed: 0_level_2,City,New York,Philadelphia,Stamford,New York,Philadelphia,Stamford,New York,Philadelphia,Stamford,New York,...,Stamford,New York,Philadelphia,Stamford,New York,Philadelphia,Stamford,New York,Philadelphia,Stamford
Gender,Item,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
Female,Burger,262.67,231.68,144.66,171.86,215.05,238.49,97.89,218.97,90.97,94.96,...,95.05,63.16,409.05,97.53,184.35,32.0,155.02,259.3,123.17,307.03
Female,Burrito,224.45,321.57,195.63,122.74,34.28,67.94,189.73,382.3,366.34,103.6,...,168.01,167.54,198.11,234.49,39.69,62.82,230.91,71.26,116.75,172.98
Female,Chalupa,43.19,23.49,95.7,158.37,289.96,161.98,35.15,121.97,156.36,39.73,...,40.59,160.0,274.51,175.25,171.52,84.41,204.69,157.82,504.44,348.16
Female,Donut,478.1,247.4,124.35,284.53,50.25,229.3,56.07,433.58,157.33,130.63,...,158.24,90.81,183.25,104.66,285.56,208.8,440.06,62.95,175.23,224.48
Female,Ice Cream,262.19,177.69,92.88,92.53,74.51,125.85,206.15,156.03,77.66,232.26,...,279.33,242.35,203.1,117.73,227.62,285.28,125.03,151.67,159.86,15.24


In [82]:
# Smallest "Spend" for each (Gender, Item) row group, one column per City.
foods.pivot_table(
    values=["Spend"],
    index=["Gender", "Item"],
    columns=["City"],
    aggfunc="min",
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Spend,Spend,Spend
Unnamed: 0_level_1,City,New York,Philadelphia,Stamford
Gender,Item,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,Burger,2.25,1.97,6.24
Female,Burrito,1.02,1.04,1.18
Female,Chalupa,1.96,9.35,9.09
Female,Donut,3.15,2.13,1.68
Female,Ice Cream,13.39,7.61,8.8


In [84]:
# Alternative spelling: the top-level pd.pivot_table function, which takes
# the DataFrame through its "data" argument. Here it averages "Spend".

pd.pivot_table(
    data=foods,
    values=["Spend"],
    index=["Gender", "Item"],
    columns=["City"],
    aggfunc="mean",
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Spend,Spend,Spend
Unnamed: 0_level_1,City,New York,Philadelphia,Stamford
Gender,Item,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Female,Burger,51.626667,52.87871,45.037778
Female,Burrito,42.563043,52.098571,53.532647
Female,Chalupa,46.135789,52.291562,64.094
Female,Donut,46.670323,54.642,48.734118
Female,Ice Cream,56.356296,46.225625,46.910455


## **The ```pd.melt()``` Method**

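Realiza uma operação inversa à dos métodos 'pivot()' / 'pivot_table()': recebe um dataset em formato largo (uma coluna por categoria, como Q1, Q2, Q3 e Q4) e o converte novamente para o formato longo, com uma linha para cada combinação de identificador e categoria. Os valores em si não são alterados — o método apenas reorganiza a tabela e não desfaz agregações já calculadas.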

In [86]:
# NOTE(review): this rebinds `sales`, which earlier held the salesmen.csv
# frame, to a different data set (one row per salesman, one column per quarter).
sales = pd.read_csv("quarters.csv")
sales

Unnamed: 0,Salesman,Q1,Q2,Q3,Q4
0,Boris,602908,233879,354479,32704
1,Bob,43790,514863,297151,544493
2,Tommy,392668,113579,430882,247231
3,Travis,834663,266785,749238,570524
4,Donald,580935,411379,110390,651572
5,Ted,656644,70803,375948,321388
6,Jeb,486141,600753,742716,404995
7,Stacy,479662,742806,770712,2501
8,Morgan,992673,879183,37945,293710


In [93]:
# id_vars    --> columns to keep as they are in the new DataFrame
# var_name   --> name of the column that receives the headers not listed in id_vars
# value_name --> name of the column that receives the values stored under those headers
pd.melt(
    frame=sales,
    id_vars=["Salesman"],
    var_name="Quarter",
    value_name="Revenue",
)

Unnamed: 0,Salesman,Quarter,Revenue
0,Boris,Q1,602908
1,Bob,Q1,43790
2,Tommy,Q1,392668
3,Travis,Q1,834663
4,Donald,Q1,580935
5,Ted,Q1,656644
6,Jeb,Q1,486141
7,Stacy,Q1,479662
8,Morgan,Q1,992673
9,Boris,Q2,233879
