# Series

Objeto básico do Pandas.

Análogo a um dicionário (índice/chaves e valores)

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Labels and values live in separate NumPy arrays; the Series pairs them up
# position by position.
índice = np.array(list('abc'))
dados = np.array([1, 2, 3])
series0 = pd.Series(data=dados, index=índice)
series0

a    1
b    2
c    3
dtype: int64

Aceita qualquer tipo de dados tanto como índice quanto como dado. Por exemplo, funções:

In [6]:
# Values need not be numbers: each entry here is a built-in function object,
# labelled 1, 2 and 3.
series1 = pd.Series(data=[sum, max, print], index=[1, 2, 3])
series1

1      <built-in function sum>
2      <built-in function max>
3    <built-in function print>
dtype: object

## Operações aritméticas com series

In [10]:
# First season: xG values keyed by fixture name.
matches0 = ['Liverpool vs Man City', 'Luton Town vs Burnley', 'Arsenal vs Tottenham']
xG0 = [5.76, 1.53, 3.89]
season0 = pd.Series(data=xG0, index=matches0)

# Second season: shares two fixtures with the first one and has one of its own.
matches1 = ['Liverpool vs Man City', 'West Ham vs Man Utd', 'Arsenal vs Tottenham']
xG1 = [4.34, 1.98, 4.67]
season1 = pd.Series(data=xG1, index=matches1)

# Addition pairs values by index label, not by position.
season0 + season1

Arsenal vs Tottenham      8.56
Liverpool vs Man City    10.10
Luton Town vs Burnley      NaN
West Ham vs Man Utd        NaN
dtype: float64

Foram somados os valores de mesmo índice em ambos os operandos e, para os índices que não estavam presentes em todos os operandos, o resultado ficou NaN.

A principal ideia do Pandas é fazer operações baseadas nos índices, como essa.

# Data Frames

Principal objeto do Pandas.

Basicamente um conjunto de series.

Muito semelhante a uma tabela do Excel.

In [39]:
# Row labels 1..6 and column labels 'A'..'G'; the random matrix is sized from them.
linhas = np.arange(1, 7)
colunas = list('ABCDEFG')
dados = np.random.randn(len(linhas), len(colunas))

df0 = pd.DataFrame(data=dados, index=linhas, columns=colunas)
df0

Unnamed: 0,A,B,C,D,E,F,G
1,-0.513578,-2.280642,0.176946,-0.885264,-0.240493,0.296436,-0.512543
2,0.664333,0.41444,1.096201,-0.390138,-0.461409,-0.456621,-0.524396
3,0.72942,0.291071,-0.233774,0.347071,-1.023211,1.174574,-1.42617
4,-1.140624,0.327971,-0.196065,-0.511918,-0.960672,0.869374,-0.052482
5,-0.986853,-0.183727,-0.353851,1.158218,-0.439221,0.004605,-1.548726
6,0.443604,-1.381404,0.376131,-0.258115,0.965624,-1.808706,-0.42638


### Acessando as linhas e colunas

In [40]:
df0['D'] # returns column D as a Series

1   -0.885264
2   -0.390138
3    0.347071
4   -0.511918
5    1.158218
6   -0.258115
Name: D, dtype: float64

In [41]:
# Inspect the type of a single selected column.
type(df0['B'])

pandas.core.series.Series

Repare que as colunas são series, assim como as linhas.

In [42]:
df0[['B', 'D', 'G']] # a list of labels selects several columns at once

Unnamed: 0,B,D,G
1,-2.280642,-0.885264,-0.512543
2,0.41444,-0.390138,-0.524396
3,0.291071,0.347071,-1.42617
4,0.327971,-0.511918,-0.052482
5,-0.183727,1.158218,-1.548726
6,-1.381404,-0.258115,-0.42638


In [43]:
df0['A'][5] # one specific value: column 'A' first, then the row labelled 5 within it

-0.9868525553788156

In [44]:
df0.loc[3, 'B'] # alternative form: .loc takes the row label and the column label together

0.2910707572337753

In [46]:
# Select a sub-frame: rows labelled 1, 3 and 5, restricted to columns C, D and G.
df0.loc[[1, 3, 5], ['C', 'D', 'G']]

Unnamed: 0,C,D,G
1,0.176946,-0.885264,-0.512543
3,-0.233774,0.347071,-1.42617
5,-0.353851,1.158218,-1.548726


In [47]:
df0.iloc[1:5, 0:2] # iloc selects by integer position: row positions 1-4 and column positions 0-1

Unnamed: 0,A,B
2,0.664333,0.41444
3,0.72942,0.291071
4,-1.140624,0.327971
5,-0.986853,-0.183727


### Definição de uma nova coluna

In [27]:
# A new column needs exactly one value per existing row, so the sample size is
# taken from the frame itself rather than hard-coded; the cell then keeps working
# even after rows have been dropped from df0.
df0['H'] = np.random.randint(0, 100, len(df0))
df0

Unnamed: 0,A,B,C,D,E,F,G,H
1,1.48159,1.969031,-1.562609,0.660918,-0.245025,-1.024508,0.69269,8
2,-1.410155,-0.862282,0.660701,1.364336,0.359741,0.182681,-1.206179,14
3,-0.402238,-0.999434,-1.575302,-0.445132,-0.509585,0.002012,-1.097855,4
4,-0.740087,-0.996299,-0.251916,-0.79756,0.997484,2.468255,-0.416426,34
5,0.652998,1.522702,1.008412,1.314283,-0.496594,-0.552968,-0.252362,28
6,-0.016693,0.609687,0.735745,-0.641789,0.636975,-1.856851,-0.914444,45


### Deletando linhas e colunas

In [36]:
# NOTE: the drop is in-place, so running this cell a second time raises KeyError
# because label 4 is already gone.
df0.drop(4, inplace=True) # deletes the row whose index label is 4
df0

Unnamed: 0,A,B,C,D,E,F,G
1,2.112972,-0.524794,0.378813,1.571393,0.484992,-0.06287,1.329515
2,-1.392032,1.749623,-1.797409,1.069238,1.846337,-0.649699,-0.836691
3,-0.565516,-0.897866,-2.0386,-0.116191,-1.566686,0.65069,0.70658
5,1.630562,-0.164614,0.340406,2.029252,-0.785929,-0.491308,-0.746148
6,0.64779,-0.408039,-0.633284,-1.221384,0.614792,-0.240898,-0.710083


Repare que o parâmetro inplace foi assinalado com True, para que a alteração fosse realizada no DF original em vez de só gerar uma cópia com a alteração.

In [37]:
df0.drop('C', axis=1, inplace=True) # deletes column C (axis=1 makes drop look up the label among the columns)
df0

Unnamed: 0,A,B,D,E,F,G
1,2.112972,-0.524794,1.571393,0.484992,-0.06287,1.329515
2,-1.392032,1.749623,1.069238,1.846337,-0.649699,-0.836691
3,-0.565516,-0.897866,-0.116191,-1.566686,0.65069,0.70658
5,1.630562,-0.164614,2.029252,-0.785929,-0.491308,-0.746148
6,0.64779,-0.408039,-1.221384,0.614792,-0.240898,-0.710083


Repare que o parâmetro axis foi ajustado para 1, indicando ao interpretador que a operação seria feita numa coluna.

### Seleção condicional

In [50]:
# Row and column labels for the xG table.
linhas = ['Liverpool', 'Arsenal', 'Tottenham', 'Man City']
colunas = ['Real Madrid', 'Valência', 'Barcelona']

# 12 random integers in [50, 600), laid out as 4 rows x 3 columns and divided
# by 100 to obtain two-decimal values.
xG = np.random.randint(50, 600, 12).reshape(len(linhas), len(colunas)) / 100

matchesxG = pd.DataFrame(data=xG, index=linhas, columns=colunas)
matchesxG

Unnamed: 0,Real Madrid,Valência,Barcelona
Liverpool,2.42,3.85,2.71
Arsenal,4.95,1.17,3.21
Tottenham,3.72,4.29,4.13
Man City,2.02,3.84,1.51


In [52]:
# Element-wise comparison producing a boolean frame. The mask has to be built
# from matchesxG itself (not df0) so that its row/column labels match the frame
# it is used to filter.
cond = matchesxG > 2
cond

Unnamed: 0,Real Madrid,Valência,Barcelona
Liverpool,False,False,True
Arsenal,True,False,True
Tottenham,False,True,True
Man City,False,True,True


É criado um Data Frame com dados booleanos indicando se o dado do DF original cumpre a condição.

In [53]:
# Index the frame with the boolean frame `cond`.
matchesxG[cond]

Unnamed: 0,Real Madrid,Valência,Barcelona
Liverpool,,,2.71
Arsenal,4.95,,3.21
Tottenham,,4.29,4.13
Man City,,3.84,1.51


Os valores que não cumprem a condição ficam com valores nulos.

In [55]:
# Filter rows using a boolean Series built from the 'Barcelona' column.
matchesxG[matchesxG['Barcelona'] > 2]

Unnamed: 0,Real Madrid,Valência,Barcelona
Liverpool,2.42,3.85,2.71
Arsenal,4.95,1.17,3.21
Tottenham,3.72,4.29,4.13


Retornou um DF apenas com as linhas onde a coluna Barcelona é > 2.

In [61]:
# Two row conditions combined with & (element-wise AND). Each comparison is
# parenthesised because & binds more tightly than > and <.
matchesxG[(matchesxG['Real Madrid'] > 3) & (matchesxG['Barcelona'] < 4)]

Unnamed: 0,Real Madrid,Valência,Barcelona
Arsenal,4.95,1.17,3.21


É possível combinar mais de uma condição com os operadores lógicos elemento a elemento (`&` para E, `|` para OU), colocando cada condição entre parênteses.

In [64]:
matchesxG[matchesxG['Valência'] < 2]['Barcelona'] # 'Barcelona' values of the rows where 'Valência' < 2

Arsenal    3.21
Name: Barcelona, dtype: float64

In [73]:
matchesxG[(matchesxG['Real Madrid'] < 2.5) | (matchesxG['Barcelona'] < 3)] # | is the element-wise OR

Unnamed: 0,Real Madrid,Valência,Barcelona
Liverpool,2.42,3.85,2.71
Man City,2.02,3.84,1.51


### Set e Reset Índices

In [74]:
# In-place reset: the previous row labels move into a regular column named 'index'.
matchesxG.reset_index(inplace=True)
matchesxG

Unnamed: 0,index,Real Madrid,Valência,Barcelona
0,Liverpool,2.42,3.85,2.71
1,Arsenal,4.95,1.17,3.21
2,Tottenham,3.72,4.29,4.13
3,Man City,2.02,3.84,1.51


O índice volta a ser o padrão (0, 1, 2, ...) e os antigos índices viram uma nova coluna.

In [76]:
# Add a column holding the new labels, then promote that column to be the row index.
matchesxG['new_index'] = ['A', 'B', 'C', 'D']
matchesxG.set_index(keys='new_index', inplace=True)
matchesxG

Unnamed: 0_level_0,index,Real Madrid,Valência,Barcelona
new_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,Liverpool,2.42,3.85,2.71
B,Arsenal,4.95,1.17,3.21
C,Tottenham,3.72,4.29,4.13
D,Man City,2.02,3.84,1.51


É possível definir o índice como sendo uma das colunas

# Índices multiníveis

In [78]:
# Outer level: two group labels, three rows each.
outside = ['G1'] * 3 + ['G2'] * 3
# Inner level: the numbers 1..3 repeated inside each group.
inside = [1, 2, 3] * 2
# Pair the two levels row by row and build the two-level index from the pairs.
mult_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [79]:
# 6x4 frame of standard-normal draws; rows carry the two-level index `mult_index`.
dfm = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=mult_index,
    columns=['A', 'B', 'C', 'D'],
)
dfm

Unnamed: 0,Unnamed: 1,A,B,C,D
G1,1,0.329978,-0.656507,0.730698,-0.20158
G1,2,1.248446,-0.769487,0.582923,1.359677
G1,3,1.341377,-0.825268,-0.874363,0.230298
G2,1,-0.12138,0.067832,-0.493369,0.190043
G2,2,0.898001,-1.251476,-0.07976,0.106813
G2,3,-1.641031,0.142648,-0.549596,0.864095


Gerou um Data Frame com índice multinível (2 grupos no nível externo, cada um com 3 entradas no nível interno).

In [81]:
dfm.loc['G1'] # slicing by one of the outer-level labels returns another DataFrame

Unnamed: 0,A,B,C,D
1,0.329978,-0.656507,0.730698,-0.20158
2,1.248446,-0.769487,0.582923,1.359677
3,1.341377,-0.825268,-0.874363,0.230298


In [83]:
dfm.loc['G1'].loc[2] # chained .loc: group 'G1' first, then row 2 inside it; the tuple key dfm.loc[('G1', 2)] reaches the same row in one step

A    1.248446
B   -0.769487
C    0.582923
D    1.359677
Name: 2, dtype: float64

In [85]:
dfm.index.names = ['grupo', 'numero'] # gives a name to each index level (outer, inner)
dfm

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C,D
grupo,numero,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
G1,1,0.329978,-0.656507,0.730698,-0.20158
G1,2,1.248446,-0.769487,0.582923,1.359677
G1,3,1.341377,-0.825268,-0.874363,0.230298
G2,1,-0.12138,0.067832,-0.493369,0.190043
G2,2,0.898001,-1.251476,-0.07976,0.106813
G2,3,-1.641031,0.142648,-0.549596,0.864095


In [86]:
dfm.xs('G2') # alternative way of accessing elements of the multi-level DF

Unnamed: 0_level_0,A,B,C,D
numero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.12138,0.067832,-0.493369,0.190043
2,0.898001,-1.251476,-0.07976,0.106813
3,-1.641031,0.142648,-0.549596,0.864095


In [88]:
dfm.xs(2, level='numero') # xs (cross-section) can select on the inner level directly

Unnamed: 0_level_0,A,B,C,D
grupo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G1,1.248446,-0.769487,0.582923,1.359677
G2,0.898001,-1.251476,-0.07976,0.106813


Retornou todos os valores de índice interno 2 com seus respectivos grupos (níveis externos).

In [10]:
# Two-level row index: state (outer) x metric (inner).
outside = ['PR', 'PR', 'PR', 'RS', 'RS', 'RS']
inside = ['IDH', 'População', 'PIB','IDH', 'População', 'PIB']
mult_index2 = list(zip(outside, inside))
mult_index2 = pd.MultiIndex.from_tuples(mult_index2)

# One random triple per (state, metric) row: suffix 0 feeds the PR rows,
# suffix 1 the RS rows. Each triple supplies the three column values of its row.
IDHs0 = np.random.randint(500, 900, 3) / 1000
pop0 = np.random.randint(300000, 1000000, 3)
# The PIB upper bound (3,000,000,000) does not fit in a 32-bit integer. Where
# NumPy's default integer is 32-bit, randint raises "high is out of bounds for
# int32", so a 64-bit dtype is requested explicitly.
PIB0 = np.random.randint(1000000000, 3000000000, 3, dtype=np.int64)
IDHs1 = np.random.randint(500, 900, 3) / 1000
pop1 = np.random.randint(300000, 1000000, 3)
PIB1 = np.random.randint(1000000000, 3000000000, 3, dtype=np.int64)

# Concatenating in index order and reshaping to 6x3 puts each triple on its own
# row; mixing floats and ints makes the whole array float.
dados = np.concatenate((IDHs0, pop0, PIB0, IDHs1, pop1, PIB1)).reshape(6,3)
pesquisa = pd.DataFrame(dados, mult_index2, ['Capital', 'Litoral', 'Fronteira'])
pesquisa.index.names = ['Estado', 'Métricas']
pesquisa

Unnamed: 0_level_0,Unnamed: 1_level_0,Capital,Litoral,Fronteira
Estado,Métricas,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PR,IDH,0.856,0.506,0.806
PR,População,375791.0,877312.0,769062.0
PR,PIB,2181807000.0,1531303000.0,2539823000.0
RS,IDH,0.686,0.538,0.612
RS,População,487782.0,351042.0,882391.0
RS,PIB,2805094000.0,1716979000.0,2271103000.0


In [12]:
# Select the 'Capital' column; the result keeps the two-level row index.
pesquisa['Capital']

Estado  Métricas 
PR      IDH          8.560000e-01
        População    3.757910e+05
        PIB          2.181807e+09
RS      IDH          6.860000e-01
        População    4.877820e+05
        PIB          2.805094e+09
Name: Capital, dtype: float64

Exibe as métricas das capitais.

In [13]:
# Select every row whose outer-level ('Estado') label is 'PR'.
pesquisa.loc['PR']

Unnamed: 0_level_0,Capital,Litoral,Fronteira
Métricas,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IDH,0.856,0.506,0.806
População,375791.0,877312.0,769062.0
PIB,2181807000.0,1531303000.0,2539823000.0


Exibe as métricas das diferentes regiões e cidades do Paraná.

In [14]:
# Chained .loc: the 'RS' block first, then its 'IDH' row.
pesquisa.loc['RS'].loc['IDH']

Capital      0.686
Litoral      0.538
Fronteira    0.612
Name: IDH, dtype: float64

Exibe a linha de IDH do Rio Grande do Sul.

In [15]:
# Cross-section on the outer level: all metrics for 'RS'.
pesquisa.xs('RS')

Unnamed: 0_level_0,Capital,Litoral,Fronteira
Métricas,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IDH,0.686,0.538,0.612
População,487782.0,351042.0,882391.0
PIB,2805094000.0,1716979000.0,2271103000.0


In [16]:
# Cross-section on the inner level: the 'População' row of every state.
pesquisa.xs('População', level='Métricas')

Unnamed: 0_level_0,Capital,Litoral,Fronteira
Estado,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PR,375791.0,877312.0,769062.0
RS,487782.0,351042.0,882391.0


Recorta as populações de ambos os estados.

# Dados Ausentes

In [20]:
# Row labels 1..4 and one column per drink.
unidade = np.arange(1, 5)
refris = ['CocaCola', 'Água', 'Pepsi', 'Fanta', 'CocaZero']

# 20 random integers in [0, 200), arranged 4x5 and divided by 100 -> values in [0, 2).
dados = np.random.randint(0, 200, 20).reshape(len(unidade), len(refris)) / 100

gas_no_refri = pd.DataFrame(data=dados, index=unidade, columns=refris)
gas_no_refri.index.names = ['Unidade']
gas_no_refri

Unnamed: 0,CocaCola,Água,Pepsi,Fanta,CocaZero
1,1.74,0.12,0.0,0.05,0.15
2,1.3,1.07,1.33,0.81,0.04
3,0.51,1.56,1.47,1.64,0.17
4,1.24,1.33,1.65,1.67,0.02


Unnamed: 0_level_0,CocaCola,Água,Pepsi,Fanta,CocaZero
Unidade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.74,0.12,0.0,0.05,0.15
2,1.3,1.07,1.33,0.81,0.04
3,0.51,1.56,1.47,1.64,0.17
4,1.24,1.33,1.65,1.67,0.02
