# Pandas

## Estrutura de Dados

In [1]:
from pandas import Series, DataFrame
import pandas as pd
obj = Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [2]:
print(obj.values)
print(obj.index) #obj.index.values

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


In [3]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [4]:
import numpy as np
print(obj2[(obj2 > 0) & (obj2 < 5)])
print(obj2 * 2)
print(np.exp(obj2))

d    4
c    3
dtype: int64
d     8
b    14
a   -10
c     6
dtype: int64
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [5]:
# Dicionario Chave: Valor
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [6]:
states = ['Oregon', 'Texas','California', 'Ohio'] #Lista
obj4 = Series(sdata, index=states)
obj4

Oregon        16000.0
Texas         71000.0
California        NaN
Ohio          35000.0
dtype: float64

In [7]:
print(pd.isnull(obj4)) # == obj4.isnull()
print(pd.notnull(obj4))

Oregon        False
Texas         False
California     True
Ohio          False
dtype: bool
Oregon         True
Texas          True
California    False
Ohio           True
dtype: bool


In [8]:
print(obj3)
print(obj4)
obj3 + obj4

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
Oregon        16000.0
Texas         71000.0
California        NaN
Ohio          35000.0
dtype: float64


California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [9]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
Oregon        16000.0
Texas         71000.0
California        NaN
Ohio          35000.0
Name: population, dtype: float64

In [10]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

In [11]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = DataFrame(data)
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [12]:
df2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                index=['one', 'two', 'three', 'four', 'five'])
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [13]:
print(df['state'])
print(df.year)

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object
0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64


In [14]:
print(df2.loc['four']) #label
print(df.iloc[0]) #int

year       2001
state    Nevada
pop         2.4
debt        NaN
Name: four, dtype: object
state    Ohio
year     2000
pop       1.5
Name: 0, dtype: object


In [15]:
df2['debt'] = np.arange(5)

In [16]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df2['debt'] = val
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [17]:
df2['eastern'] = df2.state == 'Ohio'
df2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [18]:
del df2['eastern']
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [19]:
df.values

array([['Ohio', 2000, 1.5],
       ['Ohio', 2001, 1.7],
       ['Ohio', 2002, 3.6],
       ['Nevada', 2001, 2.4],
       ['Nevada', 2002, 2.9]], dtype=object)

In [20]:
obj = Series(range(3), index=['a', 'b', 'c'])
obj.index.values

array(['a', 'b', 'c'], dtype=object)

In [21]:
print('state' in df2.columns)
print(0 in df.index)

True
True


## Aplicação de Função e Mapeamento

In [38]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [42]:
df = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
np.abs(df) #retorna valor absoluto

Unnamed: 0,b,d,e
Utah,0.276574,0.679767,0.181593
Ohio,0.022937,0.298083,0.078167
Texas,0.061269,0.152902,0.355311
Oregon,1.458879,2.251214,0.284254


In [43]:
f = lambda x: x.max() - x.min()
print(df.apply(f))
print(df.apply(f, axis=1))

b    1.481816
d    2.930980
e    0.639565
dtype: float64
Utah      0.956341
Ohio      0.321020
Texas     0.416580
Oregon    1.966960
dtype: float64


In [45]:
def f2(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

df.apply(f2)

Unnamed: 0,b,d,e
min,-1.458879,-2.251214,-0.284254
max,0.022937,0.679767,0.355311


In [46]:
format2 = lambda x: '%.2f' % x
df.applymap(format2)

Unnamed: 0,b,d,e
Utah,-0.28,0.68,0.18
Ohio,0.02,-0.3,-0.08
Texas,-0.06,0.15,0.36
Oregon,-1.46,-2.25,-0.28


## Ordenação e Ranking

In [27]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
df2 = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(obj)
print(df2)
print(obj.sort_index())
print(df2.sort_index())
print(df2.sort_index(axis=1))

d    0
a    1
b    2
c    3
dtype: int64
       d  a  b  c
three  0  1  2  3
one    4  5  6  7
a    1
b    2
c    3
d    0
dtype: int64
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
       a  b  c  d
three  1  2  3  0
one    5  6  7  4


In [28]:
obj = Series([4, 7, -3, 2])
obj.sort_values() #igual para pandas

2   -3
3    2
0    4
1    7
dtype: int64

In [29]:
df3 = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(df3)
print(df3.sort_index(by='b'))
print(df3.sort_index(by=['a','b']))

   b  a
0  4  0
1  7  1
2 -3  0
3  2  1
   b  a
2 -3  0
3  2  1
0  4  0
1  7  1
   b  a
2 -3  0
0  4  0
3  2  1
1  7  1


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [49]:
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'nota': [8, 7, 7.5, 10, 5]}
df4 = DataFrame(data)
print(df4)
df4['rank'] = df4['nota'].rank(ascending=0)
df4.sort_values('rank')

    name  nota
0  Jason   8.0
1  Molly   7.0
2   Tina   7.5
3   Jake  10.0
4    Amy   5.0


Unnamed: 0,name,nota,rank
3,Jake,10.0,1.0
0,Jason,8.0,2.0
2,Tina,7.5,3.0
1,Molly,7.0,4.0
4,Amy,5.0,5.0


## Sumarização e Estatística Descritiva

In [31]:
df5 = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df5)
print(df5.sum())
print(df5.sum(axis=1))
print(df5.count())

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
one    9.25
two   -5.80
dtype: float64
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
one    3
two    2
dtype: int64


In [32]:
df5.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [33]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [34]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

## Manipulação de Valores Faltantes

In [35]:
string_data = Series(['laranja', 'uva', np.nan, 'abacate'])
print(string_data)
print(string_data.isnull())
string_data[0] = None
print(string_data.isnull())

0    laranja
1        uva
2        NaN
3    abacate
dtype: object
0    False
1    False
2     True
3    False
dtype: bool
0     True
1    False
2     True
3    False
dtype: bool


In [36]:
data = DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                  [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
print(data)
cleaned = data.dropna()
print(cleaned)
data.dropna(how='all')

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
     0    1    2
0  1.0  6.5  3.0


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [37]:
print(data.fillna(0))
print(data.fillna(data.mean()))

     0    1    2
0  1.0  6.5  3.0
1  1.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
1  1.0  6.5  3.0
2  1.0  6.5  3.0
3  1.0  6.5  3.0
