In [1]:
import pandas as pd

In [2]:
obj = pd.Series([4, 7, -5, 3])

In [None]:
obj

In [3]:
print(obj.values)

[ 4  7 -5  3]


In [4]:
print(obj.index)

RangeIndex(start=0, stop=4, step=1)


In [5]:
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [6]:
print(obj[1])
print(obj2['d'])

7
4


In [7]:
obj2[['a', 'b', 'c']]

a   -5
b    7
c    3
dtype: int64

In [8]:
#retorna valores maiores que 0
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [9]:
'b' in obj2

True

In [10]:
'e' in obj2

False

Criando Series a partir de um Dicionário

In [11]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)

print(obj3)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


Alterando os índices a partir de um novo array (se não existe um valor correspondente ao índice, considera not a number)

In [12]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)

print(obj4)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [13]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [14]:
#ou
obj4.notnull() 

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

O próprio objeto Series e seu índice podem ter um nome

In [15]:
obj4.name = 'population'

obj4.index.name = 'state'

print(obj4)

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


Criando um DataFrame

In [16]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

frame = pd.DataFrame(data)

frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


Criando o 'frame2' a partir de 'data'

In [17]:
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}

frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])

In [18]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [19]:
#alterando todos os valores de uma coluna
frame2['debt'] = 16.6

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.6
two,2001,Ohio,1.7,16.6
three,2002,Ohio,3.6,16.6
four,2001,Nevada,2.4,16.6
five,2002,Nevada,2.9,16.6
six,2003,Nevada,3.2,16.6


In [20]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [21]:
frame2['eastern'] = frame2.state == 'Ohio'

frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [22]:
#deletando uma coluna
del frame2['eastern']

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


Dicionários aninhados

In [24]:
#itens mais externos são as colunas
#e os itens mais internos são os índices e seus respectivos valores
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)

frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


Transposição do dataframe

In [25]:
#trocando linha por coluna 
#não altera o dataframe original, gera um novo

frame3_transposta = frame3.T
frame3_transposta

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


Objetos Index

In [26]:
#index são imutáveis
obj = pd.Series(range(3), index=['a', 'b', 'c'])

index = obj.index

print(index)

Index(['a', 'b', 'c'], dtype='object')


In [27]:
#pode contem rótulos duplicados
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
print(dup_labels)

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')


In [28]:
#reindexação 
obj5 = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj5)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64


In [29]:
#reindex não altera o objeto inicial
#gera um novo objeto
obj6 = obj5.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj6)

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [30]:
#method (ffill - forward fill)
obj7 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj7)

0      blue
2    purple
4    yellow
dtype: object


In [31]:
obj7.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [33]:
import numpy as np

In [34]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)),
        index=['a', 'c', 'd'],
        columns=['Ohio', 'Texas', 'California'])

frame2 = frame.reindex(['a', 'b', 'c', 'd'])

In [35]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [36]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [37]:
#drop descarta eixos
#não altera o objeto inicial
#também pode descartar linhas e colunas
obj8 = pd.Series(np.arange(5,), index=['a', 'b', 'c', 'd', 'e'])
print(obj8)

a    0
b    1
c    2
d    3
e    4
dtype: int64


In [38]:
obj8.drop(['d', 'c'])

a    0
b    1
e    4
dtype: int64

In [39]:
print(obj8[[1, 3]])

b    1
d    3
dtype: int64


In [40]:
print(obj8['b':'c'])

b    1
c    2
dtype: int64


Somando dados

In [41]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [42]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

df2.loc[1, 'b'] = np.nan #linha 1 coluna b

df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


Funções mais usadas


In [47]:
print(frame['b'].min())
print(frame['b'].max())
print(frame['b'].mean())
print(frame['b'].sum())

-0.648608312483635
1.4513687814201626
0.8064173514188976
3.2256694056755904


Função apply

In [43]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [44]:
f = lambda x: x.max() - x.min()
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame.apply(f)

b    2.099977
d    3.892355
e    2.632713
dtype: float64

In [45]:
frame.apply(f, axis=1)

Utah      2.868869
Ohio      3.197504
Texas     2.354952
Oregon    1.667337
dtype: float64

In [48]:
format = lambda x: '%.2f' % x

frame.applymap(format)

Unnamed: 0,b,d,e
Utah,1.45,2.44,-0.43
Ohio,-0.65,-1.45,1.74
Texas,1.03,1.47,-0.89
Oregon,1.39,-0.28,-0.18


In [49]:
frame['e'].map(format) #função map tambem aplica uma função em todos os elementos

Utah      -0.43
Ohio       1.74
Texas     -0.89
Oregon    -0.18
Name: e, dtype: object

In [50]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index() #ordenando pelo índice

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [51]:
frame.sort_index(axis=1) #ordenando pelas colunas

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [52]:
obj10 = pd.Series([4, 7, -3, 2])
obj10.sort_values() #ordenando pelos valores

2   -3
3    2
0    4
1    7
dtype: int64

In [53]:
#ordenando pelos valores por coluna
frame4 = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame4.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [54]:
frame4.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1
