## Series

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Sample inputs reused by the Series examples below: index labels, a plain
# list, a NumPy array built from that list, and a label -> value dict.
labels = ['a', 'b', 'c']
minha_lista = [10, 20, 30]
arr = np.array(minha_lista)
d = {'a': 10, 'b': 20, 'c': 30}

In [4]:
# Build a Series from a list; with no index given, the default 0..n-1 integer index is used.
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [5]:
# Pass the values and the index labels explicitly by keyword.
pd.Series(data=labels, index=minha_lista)

10    a
20    b
30    c
dtype: object

In [6]:
# Same construction with positional arguments: data first, index second.
pd.Series(labels, minha_lista)

10    a
20    b
30    c
dtype: object

In [7]:
# From a dict, the keys become the index and the values become the data.
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [8]:
# First sample Series: one value per country, indexed by country name.
ser1 = pd.Series(
    data=[100, 525, 312, 414],
    index=['EUA', 'Alemanha', 'Russia', 'Japão'],
)

In [9]:
# Second sample Series: same values as ser1, but 'Itália' replaces 'Russia'
# in the index so the two Series only partially share labels.
ser2 = pd.Series(
    data=[100, 525, 312, 414],
    index=['EUA', 'Alemanha', 'Itália', 'Japão'],
)

In [10]:
# Passing a list of labels selects several entries at once and returns a Series.
ser1[['EUA', 'Alemanha']]

EUA         100
Alemanha    525
dtype: int64

In [11]:
# Addition aligns on index labels: labels present in only one Series give NaN,
# and the result is float64 even though both inputs hold integers.
ser1 + ser2

Alemanha    1050.0
EUA          200.0
Itália         NaN
Japão        828.0
Russia         NaN
dtype: float64

## DataFrames

In [12]:
from numpy.random import randn

In [13]:
# Build a DataFrame of random values (standard normal draws) and set the
# row index and the column labels.
df = pd.DataFrame(
    np.random.randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns="W X Y Z".split(),
)

In [14]:
# Display the frame: rows A-E, columns W-Z.
df

Unnamed: 0,W,X,Y,Z
A,-1.074104,-0.536053,-0.795247,-0.725271
B,0.0985,0.66834,0.267298,0.177967
C,-0.131849,-0.93293,-1.298283,0.55415
D,0.496395,0.057136,-0.461104,-0.09536
E,-2.12644,-0.31159,-0.931416,1.890829


In [15]:
# Selecting a single column by name returns a Series.
df['W']

A   -1.074104
B    0.098500
C   -0.131849
D    0.496395
E   -2.126440
Name: W, dtype: float64

In [16]:
# Confirm that single-column selection yields a Series.
type(df['W'])

pandas.core.series.Series

In [17]:
# Passing the column names as a list returns a DataFrame instead, restricted
# to just the columns we asked for.

df[['W', 'X']]


Unnamed: 0,W,X
A,-1.074104,-0.536053
B,0.0985,0.66834
C,-0.131849,-0.93293
D,0.496395,0.057136
E,-2.12644,-0.31159


In [18]:
# Confirm that list-based column selection yields a DataFrame.
type(df[['W', 'X']])

pandas.core.frame.DataFrame

In [19]:
# Create a column by assigning to a new key (here, the values of column W).
df['new'] = df['W']

In [20]:
# Show the frame again; it now carries the extra `new` column.
df

Unnamed: 0,W,X,Y,Z,new
A,-1.074104,-0.536053,-0.795247,-0.725271,-1.074104
B,0.0985,0.66834,0.267298,0.177967,0.0985
C,-0.131849,-0.93293,-1.298283,0.55415,-0.131849
D,0.496395,0.057136,-0.461104,-0.09536,0.496395
E,-2.12644,-0.31159,-0.931416,1.890829,-2.12644


In [21]:
# Remove a column: axis=1 means columns, axis=0 means rows.
# The result is displayed here but not assigned back to df.

df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,-1.074104,-0.536053,-0.795247,-0.725271
B,0.0985,0.66834,0.267298,0.177967
C,-0.131849,-0.93293,-1.298283,0.55415
D,0.496395,0.057136,-0.461104,-0.09536
E,-2.12644,-0.31159,-0.931416,1.890829


In [22]:
# Note that the original frame was not changed: `new` is still present.
df

Unnamed: 0,W,X,Y,Z,new
A,-1.074104,-0.536053,-0.795247,-0.725271,-1.074104
B,0.0985,0.66834,0.267298,0.177967,0.0985
C,-0.131849,-0.93293,-1.298283,0.55415,-0.131849
D,0.496395,0.057136,-0.461104,-0.09536,0.496395
E,-2.12644,-0.31159,-0.931416,1.890829,-2.12644


In [23]:
# To modify the original DataFrame itself, pass inplace=True.
df.drop('new', axis=1, inplace=True)

In [24]:
# The `new` column is now gone from df itself.
df

Unnamed: 0,W,X,Y,Z
A,-1.074104,-0.536053,-0.795247,-0.725271
B,0.0985,0.66834,0.267298,0.177967
C,-0.131849,-0.93293,-1.298283,0.55415
D,0.496395,0.057136,-0.461104,-0.09536
E,-2.12644,-0.31159,-0.931416,1.890829


In [25]:
# Row selection by label.
# The row comes back as a Series whose index is the column labels (W-Z),
# which is why it is displayed vertically.

df.loc['A']

W   -1.074104
X   -0.536053
Y   -0.795247
Z   -0.725271
Name: A, dtype: float64

In [26]:
# To select more than one row, pass a list with the row labels.

df.loc[['A', 'B']]

Unnamed: 0,W,X,Y,Z
A,-1.074104,-0.536053,-0.795247,-0.725271
B,0.0985,0.66834,0.267298,0.177967


In [27]:
# A list of row labels can be combined with the single column we want.

df.loc[['A', 'B'], 'W']

A   -1.074104
B    0.098500
Name: W, dtype: float64

In [28]:
# With iloc, selection is by the integer position of the row and column
# (0-based): row 0 is 'A' and column 2 is 'Y'.

df.iloc[0, 2]

-0.7952470795432338

In [29]:
# Slice to keep only part of the frame.
# Rows 1:-1 drop the first and last rows; columns 1:3 keep positions 1 and 2
# (X and Y), which drops the first and last of the four columns.
df.iloc[1:-1, 1:3]

Unnamed: 0,X,Y
B,0.66834,0.267298
C,-0.93293,-1.298283
D,0.057136,-0.461104


In [30]:
# Show the full frame again for reference.
df

Unnamed: 0,W,X,Y,Z
A,-1.074104,-0.536053,-0.795247,-0.725271
B,0.0985,0.66834,0.267298,0.177967
C,-0.131849,-0.93293,-1.298283,0.55415
D,0.496395,0.057136,-0.461104,-0.09536
E,-2.12644,-0.31159,-0.931416,1.890829


In [31]:
# Conditional filter: keep only the values greater than 0.
# Cells that fail the condition are shown as NaN; no rows are removed.

df[df > 0]

Unnamed: 0,W,X,Y,Z
A,,,,
B,0.0985,0.66834,0.267298,0.177967
C,,,,0.55415
D,0.496395,0.057136,,
E,,,,1.890829


In [32]:
# Comparing a single column gives a boolean Series: True on the rows where
# Y is greater than zero, False elsewhere.
df['Y'] > 0

A    False
B     True
C    False
D    False
E    False
Name: Y, dtype: bool

In [33]:
# Using that boolean Series as a filter keeps only the rows (with all their
# columns) where column Y is greater than zero.

df[df['Y'] > 0]

Unnamed: 0,W,X,Y,Z
B,0.0985,0.66834,0.267298,0.177967


In [34]:
# When the filter needs two conditions, wrap each one in parentheses and
# combine them with & (element-wise AND).

df[(df['Y'] > 0) & (df['X'] > 0)]

Unnamed: 0,W,X,Y,Z
B,0.0985,0.66834,0.267298,0.177967


In [35]:
# The index object holding the row labels.
df.index

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [36]:
# Move the current labels into a column named 'index' and number the rows 0..n-1.
# The result is only displayed; df itself keeps its A-E index.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,-1.074104,-0.536053,-0.795247,-0.725271
1,B,0.0985,0.66834,0.267298,0.177967
2,C,-0.131849,-0.93293,-1.298283,0.55415
3,D,0.496395,0.057136,-0.461104,-0.09536
4,E,-2.12644,-0.31159,-0.931416,1.890829


In [37]:
# Five new labels (one per row of df), split from a space-separated string.
novoind = 'CA NY WY OR CO'.split()

In [38]:
# Store the new labels as a regular column first; this modifies df.
df['novo_index'] = novoind

In [39]:
# df now carries the novo_index column alongside W-Z.
df

Unnamed: 0,W,X,Y,Z,novo_index
A,-1.074104,-0.536053,-0.795247,-0.725271,CA
B,0.0985,0.66834,0.267298,0.177967,NY
C,-0.131849,-0.93293,-1.298283,0.55415,WY
D,0.496395,0.057136,-0.461104,-0.09536,OR
E,-2.12644,-0.31159,-0.931416,1.890829,CO


In [42]:
# Promote novo_index to the index. reset_index() runs first so the old A-E
# labels are kept as an 'index' column instead of being discarded.
df.reset_index().set_index('novo_index')

Unnamed: 0_level_0,index,W,X,Y,Z
novo_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,A,-1.074104,-0.536053,-0.795247,-0.725271
NY,B,0.0985,0.66834,0.267298,0.177967
WY,C,-0.131849,-0.93293,-1.298283,0.55415
OR,D,0.496395,0.057136,-0.461104,-0.09536
CO,E,-2.12644,-0.31159,-0.931416,1.890829


## Hierarquia de índices e índices múltiplos

In [43]:
# Index levels: the outer level names the group, the inner level numbers the
# rows within each group.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2

# Pair the two levels position by position into (outer, inner) tuples.
hier_index = [(group, number) for group, number in zip(outside, inside)]

In [44]:
# The zipped result: a list of (outside, inside) tuples.
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [46]:
# Convert the list of tuples into a MultiIndex; the name hier_index is
# rebound from the list to the MultiIndex object.
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [47]:
# Inspect the resulting MultiIndex.
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [48]:
# Build a 6x2 frame of random values indexed by the two-level MultiIndex.
# Note: this rebinds df, replacing the frame used in the DataFrames section.
df = pd.DataFrame(np.random.randn(6,2), index=hier_index, columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.706579,-0.66584
G1,2,-0.264563,0.036406
G1,3,0.957776,0.100616
G2,1,0.707455,0.715317
G2,2,0.321449,0.123247
G2,3,-0.769762,-0.39456


In [49]:
# loc: pick group 'G1' on the outer level, then the row labelled 1 inside it.
df.loc['G1'].loc[1]

A   -0.706579
B   -0.665840
Name: 1, dtype: float64

In [54]:
# The index levels can be given names (outer level first, then inner).
df.index.names = ['Grupo', 'Número']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Grupo,Número,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.706579,-0.66584
G1,2,-0.264563,0.036406
G1,3,0.957776,0.100616
G2,1,0.707455,0.715317
G2,2,0.321449,0.123247
G2,3,-0.769762,-0.39456


In [55]:
# With named levels, a cross-section is easy to take: every row whose
# 'Número' level equals 1, across all groups.

df.xs(1, level='Número')

Unnamed: 0_level_0,A,B
Grupo,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.706579,-0.66584
G2,0.707455,0.715317
