# Pandas

In [1]:
from pandas import Series, DataFrame

In [2]:
import pandas as pd
import numpy as np
# Seed NumPy's legacy global random state so any random draws are reproducible.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default size (in inches) for every matplotlib figure.
plt.rc('figure', figsize=(10, 6))
# Keep the current row-display limit in a constant before capping it at 20 rows.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
# Print NumPy arrays with 4 decimals and without scientific notation for small values.
np.set_printoptions(precision=4, suppress=True)

## Estruturas de Dados no Pandas

### DataFrame

In [3]:
# Column-oriented input: each key becomes a column, each list holds that column's values.
data = dict(
    estado=['PR', 'SC', 'RS', 'RJ', 'MG', 'SP'],
    ano=[2000, 2001, 2002, 2001, 2002, 2003],
    desempenho=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
# No index is passed, so the rows get the default integer labels 0..5.
frame = pd.DataFrame(data)

In [63]:
# Render the whole frame (six rows, three columns).
frame

Unnamed: 0,estado,ano,desempenho
0,PR,2000,1.5
1,SC,2001,1.7
2,RS,2002,3.6
3,RJ,2001,2.4
4,MG,2002,2.9
5,SP,2003,3.2


In [4]:
# head() with no argument shows only the first five rows.
frame.head()

Unnamed: 0,estado,ano,desempenho
0,PR,2000,1.5
1,SC,2001,1.7
2,RS,2002,3.6
3,RJ,2001,2.4
4,MG,2002,2.9


In [5]:
# `columns` picks which dict keys to use and fixes their left-to-right order.
pd.DataFrame(data, columns=['ano', 'estado'])

Unnamed: 0,ano,estado
0,2000,PR
1,2001,SC
2,2002,RS
3,2001,RJ
4,2002,MG
5,2003,SP


In [7]:
# 'débito' is not a key of `data`, so that column is created holding only missing values.
frame2 = pd.DataFrame(
    data,
    columns=['ano', 'estado', 'desempenho', 'débito'],
    index=['um', 'dois', 'três', 'quatro', 'cinco', 'seis'],
)
frame2

Unnamed: 0,ano,estado,desempenho,débito
um,2000,PR,1.5,
dois,2001,SC,1.7,
três,2002,RS,3.6,
quatro,2001,RJ,2.4,
cinco,2002,MG,2.9,
seis,2003,SP,3.2,


In [8]:
# The column labels are themselves stored in an Index object.
frame2.columns

Index(['ano', 'estado', 'desempenho', 'débito'], dtype='object')

In [9]:
# Bracket access returns one column as a Series that keeps the frame's row labels.
frame2['estado']

um        PR
dois      SC
três      RS
quatro    RJ
cinco     MG
seis      SP
Name: estado, dtype: object

In [10]:
# Attribute access retrieves the same kind of column Series as frame2['ano'].
frame2.ano

um        2000
dois      2001
três      2002
quatro    2001
cinco     2002
seis      2003
Name: ano, dtype: int64

In [11]:
# .loc with a row label returns that row as a Series indexed by the column names.
frame2.loc['três']

ano           2002
estado          RS
desempenho     3.6
débito         NaN
Name: três, dtype: object

In [15]:
# Assigning a scalar fills every row of the column with that value.
frame2['débito'] = 16.5
frame2

Unnamed: 0,ano,estado,desempenho,débito
um,2000,PR,1.5,16.5
dois,2001,SC,1.7,16.5
três,2002,RS,3.6,16.5
quatro,2001,RJ,2.4,16.5
cinco,2002,MG,2.9,16.5
seis,2003,SP,3.2,16.5


In [13]:
# Assigning an array sets one value per row, in row order (six values for six rows).
frame2['débito'] = np.arange(6.0)
frame2

Unnamed: 0,ano,estado,desempenho,débito
um,2000,PR,1.5,0.0
dois,2001,SC,1.7,1.0
três,2002,RS,3.6,2.0
quatro,2001,RJ,2.4,3.0
cinco,2002,MG,2.9,4.0
seis,2003,SP,3.2,5.0


In [16]:
# Assigning a Series aligns on row labels: rows missing from `val` end up as NaN.
val = pd.Series({'dois': -1.2, 'quatro': -1.5, 'cinco': -1.7})
frame2['débito'] = val
frame2

Unnamed: 0,ano,estado,desempenho,débito
um,2000,PR,1.5,
dois,2001,SC,1.7,-1.2
três,2002,RS,3.6,
quatro,2001,RJ,2.4,-1.5
cinco,2002,MG,2.9,-1.7
seis,2003,SP,3.2,


In [17]:
# Assigning to a label that does not exist yet creates the column;
# here it holds a boolean flag that is True only where estado equals 'SC'.
frame2['novo estado'] = frame2['estado'].eq('SC')
frame2

Unnamed: 0,ano,estado,desempenho,débito,novo estado
um,2000,PR,1.5,,False
dois,2001,SC,1.7,-1.2,True
três,2002,RS,3.6,,False
quatro,2001,RJ,2.4,-1.5,False
cinco,2002,MG,2.9,-1.7,False
seis,2003,SP,3.2,,False


In [18]:
# `del` removes the column from frame2 itself; listing the columns confirms it is gone.
del frame2['novo estado']
frame2.columns

Index(['ano', 'estado', 'desempenho', 'débito'], dtype='object')

In [19]:
# Nested dict: outer keys are state codes, inner keys are years mapped to a value.
pop = {
    'SP': {2001: 2.4, 2002: 2.9},
    'SC': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

In [20]:
# From a nested dict, outer keys become columns and inner keys become row labels;
# a year missing for one state shows up as NaN in that state's column.
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,SP,SC
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [21]:
# .T transposes the frame: rows become columns and vice versa.
frame3.T

Unnamed: 0,2001,2002,2000
SP,2.4,2.9,
SC,1.7,3.6,1.5


In [22]:
# With an explicit index only those row labels are kept; 2003 appears in neither
# inner dict, so its row is entirely missing values.
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,SP,SC
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [23]:
# A dict of Series also builds a frame. Take the first two rows of each column
# by position (.iloc makes the positional intent explicit).
pdata = {
    'SC': frame3['SC'].iloc[:2],
    'SP': frame3['SP'].iloc[:2],
}
pd.DataFrame(pdata)

Unnamed: 0,SC,SP
2001,1.7,2.4
2002,3.6,2.9


In [24]:
# Name both axes; the names are shown in the frame's header when it is displayed.
frame3.index.name = 'ano'
frame3.columns.name = 'estado'
frame3

estado,SP,SC
ano,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [25]:
# .values exposes the data as a two-dimensional NumPy array (labels are dropped).
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [26]:
# Columns of different types (ints, strings, floats) force an array of dtype=object.
frame2.values

array([[2000, 'PR', 1.5, nan],
       [2001, 'SC', 1.7, -1.2],
       [2002, 'RS', 3.6, nan],
       [2001, 'RJ', 2.4, -1.5],
       [2002, 'MG', 2.9, -1.7],
       [2003, 'SP', 3.2, nan]], dtype=object)

### Objetos Index

In [27]:
# Small Series whose labels are the letters a, b, c; then inspect its Index.
obj = pd.Series(range(3), index=list('abc'))
obj.index


Index(['a', 'b', 'c'], dtype='object')

In [28]:
# An Index is immutable, so item assignment raises TypeError. Catch the error and
# print its message so this demonstration does not abort a Restart & Run All.
try:
    obj.index[1] = 'd'
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: Index does not support mutable operations

In [29]:
# Build an Index explicitly and hand it to a Series.
labels = pd.Index(np.arange(3))
print(labels)
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
print(obj2)
# Identity check (`is`), not a content comparison: True means the Series' index
# is the very same Index object as `labels`.
obj2.index is labels

Index([0, 1, 2], dtype='int64')
0    1.5
1   -2.5
2    0.0
dtype: float64


True

In [30]:
print(frame3)
print(frame3.columns)
# `in` tests label membership on either axis: 'MT' is not a column, 2002 is a row label.
print('MT' in frame3.columns)
print(2002 in frame3.index)

estado   SP   SC
ano             
2001    2.4  1.7
2002    2.9  3.6
2000    NaN  1.5
Index(['SP', 'SC'], dtype='object', name='estado')
False
True


## Funcionalidades Essenciais

### Reindexação

In [31]:
# Series with deliberately unsorted labels, used as the input for reindexing below.
obj = pd.Series({'d': 4.5, 'b': 7.2, 'a': -5.3, 'c': 3.6})
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [32]:
# reindex returns a new Series arranged in the requested label order;
# 'e' does not exist in obj, so it is introduced as NaN.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)
# Fill the newly introduced label by plain assignment.
obj2['e'] = 2
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


a   -5.3
b    7.2
c    3.6
d    4.5
e    2.0
dtype: float64

In [38]:
# Sparse integer labels (0, 3, 6) leave gaps to be filled when reindexing.
obj3 = pd.Series({0: 'azul', 3: 'roxo', 6: 'amarelo'})
print(obj3)
# method='ffill' carries the last seen value forward into each new label.
obj3.reindex(range(9), method='ffill')

0       azul
3       roxo
6    amarelo
dtype: object


0       azul
1       azul
2       azul
3       roxo
4       roxo
5       roxo
6    amarelo
7    amarelo
8    amarelo
dtype: object

In [39]:
# 3x3 frame of the integers 0..8 whose row labels skip 'b'.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['SC', 'PR', 'RS'],
)
print(frame)
# Reindexing the rows inserts 'b' as an all-NaN row (the values display as floats).
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

   SC  PR  RS
a   0   1   2
c   3   4   5
d   6   7   8


Unnamed: 0,SC,PR,RS
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [42]:
# Reindex along the columns: 'SP' does not exist yet, so it comes back as all-NaN.
# NOTE(review): 'PR' is listed twice, which yields a duplicated PR column in the
# result -- confirm this is intentional.
states = ['SC', 'PR', 'SP', 'RS', 'PR']
frame.reindex(columns=states)

Unnamed: 0,SC,PR,SP,RS,PR.1
a,0,1,,2,1
c,3,4,,5,4
d,6,7,,8,7


In [43]:
# Add an 'SP' column from a Series whose labels match the frame's rows (a, c, d).
val = pd.Series({'a': 3, 'c': 2, 'd': 6})
frame['SP'] = val
frame

Unnamed: 0,SC,PR,RS,SP
a,0,1,2,3
c,3,4,5,2
d,6,7,8,6


### Descartando entradas de um eixo

In [44]:
obj = pd.Series(np.arange(5.0), index=list('abcde'))
print(obj)
# drop() hands back a new Series without the given labels...
objr = obj.drop(['d', 'c'])
print(objr)
# ...while the source Series still holds all five entries.
print(obj)
# With inplace=True the labels are removed from obj itself.
obj.drop(['d', 'c'], inplace=True)
print(obj)

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
e    4.0
dtype: float64
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
a    0.0
b    1.0
e    4.0
dtype: float64


In [45]:
# 4x4 frame of the integers 0..15: states as row labels, number words as columns.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['SC', 'RS', 'PR', 'SP'],
    columns=['um', 'dois', 'três', 'quatro'],
)
data

Unnamed: 0,um,dois,três,quatro
SC,0,1,2,3
RS,4,5,6,7
PR,8,9,10,11
SP,12,13,14,15


In [46]:
# drop() with a list of labels removes those rows from the returned copy;
# `data` itself is not modified.
data.drop(['SC', 'RS'])

Unnamed: 0,um,dois,três,quatro
PR,8,9,10,11
SP,12,13,14,15


In [47]:
# axis=1 and axis='columns' both target the column axis.
# NOTE(review): the first call's result is neither assigned nor displayed -- a
# notebook cell only renders its last expression, so only the second drop is shown.
data.drop('dois', axis=1)
data.drop(['dois', 'quatro'], axis='columns')

Unnamed: 0,um,três
SC,0,2
RS,4,6
PR,8,10
SP,12,14


### Indexação, Seleção e Filtragem

In [48]:
# Series indexing demo. `pd` and `np` are already imported by the notebook's import
# cell at the top, so the duplicate imports that used to open this cell were dropped.
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj)
print(obj['b'])                # single label
# Integer keys on a string-labelled Series mean positions, not labels. Plain
# obj[1] / obj[[1, 3]] relied on a deprecated positional fallback of
# Series.__getitem__, so positions are spelled out with .iloc instead.
print(obj.iloc[1])             # single position
print(obj.iloc[2:4])           # positional slice, end-exclusive
print(obj[['b', 'a', 'd']])    # list of labels, returned in the requested order
print(obj.iloc[[1, 3]])        # list of positions
print(obj[obj < 5])            # boolean mask

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
1.0
1.0
c    2.0
d    3.0
dtype: float64
b    1.0
a    0.0
d    3.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64


In [49]:
# Slicing with labels includes the end label ('c' is part of the result).
obj['a':'c']

a    0.0
b    1.0
c    2.0
dtype: float64

In [50]:
# Assigning through a label slice overwrites every selected entry of obj in place.
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [51]:
# Rebuild the 4x4 demo frame (integers 0..15) for the selection examples.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['SC', 'RS', 'PR', 'SP'],
    columns=['um', 'dois', 'três', 'quatro'],
)
print(data)
# One label selects a single column as a Series...
print(data['dois'])
# ...and a list of labels selects a sub-frame in the order requested.
data[['três', 'um']]

    um  dois  três  quatro
SC   0     1     2       3
RS   4     5     6       7
PR   8     9    10      11
SP  12    13    14      15
SC     1
RS     5
PR     9
SP    13
Name: dois, dtype: int64


Unnamed: 0,três,um
SC,2,0
RS,6,4
PR,10,8
SP,14,12


In [52]:
# First two rows, selected by position (.iloc states the positional intent explicitly).
data.iloc[0:2]


Unnamed: 0,um,dois,três,quatro
SC,0,1,2,3
RS,4,5,6,7


In [53]:
# A boolean Series inside [] keeps only the rows where the condition is True.
data[data['três'] > 5]

Unnamed: 0,um,dois,três,quatro
RS,4,5,6,7
PR,8,9,10,11
SP,12,13,14,15


In [54]:
# Comparing the whole frame gives a same-shaped frame of booleans.
print(data < 5)
# Assigning through that mask overwrites every matching cell of `data` in place,
# so later cells see the modified values.
data[data < 5] = 6
data

       um   dois   três  quatro
SC   True   True   True    True
RS   True  False  False   False
PR  False  False  False   False
SP  False  False  False   False


Unnamed: 0,um,dois,três,quatro
SC,6,6,6,6
RS,6,5,6,7
PR,8,9,10,11
SP,12,13,14,15


#### Seleção com loc e iloc

In [55]:
print(data)
# .loc is label-based: one row label plus a list of column labels gives a Series.
data.loc['SC', ['dois', 'três']]

    um  dois  três  quatro
SC   6     6     6       6
RS   6     5     6       7
PR   8     9    10      11
SP  12    13    14      15


dois    6
três    6
Name: SC, dtype: int64

In [56]:
# .iloc is position-based: row position(s) first, then column position(s).
# NOTE(review): the first two expressions are evaluated but never displayed --
# a notebook cell only renders its last expression.
data.iloc[2, [3, 0, 1]]
data.iloc[2]
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,quatro,um,dois
RS,7,6,5
PR,11,8,9


In [57]:
# Label slice up to and including 'SC', restricted to two columns.
print(data.loc[:'SC', ['dois', 'três']])
# Positional column slice first, then a boolean row filter on the 'dois' column.
data.iloc[:, :4][data['dois'] > 5]

    dois  três
SC     6     6


Unnamed: 0,um,dois,três,quatro
SC,6,6,6,6
PR,8,9,10,11
SP,12,13,14,15
