# Pandas

## Estrutura de Dados

### Series

In [None]:
from pandas import Series, DataFrame
import pandas as pd
obj = Series([4, 7, -5, 3])
obj

In [None]:
print(obj.values)
print(obj.index) #obj.index.values

In [None]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

Comparado com o numpy array, você pode usar rótulos no índice quando seleciona um único valor ou um conjunto de valores:

In [None]:
obj2["a"]

In [None]:
obj2["d"] = 6 #atribuição

In [None]:
obj2[["c", "a", "d"]] #uma lista de índices

In [None]:
import numpy as np
print(obj2[obj2 > 0])
print()
print(obj2 * 2) #OPERAÇÃO vetorizada
print()
print(np.exp(obj2))

In [None]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Podemos converter a série num dicionário novamente usando o método `to_dict`:

In [None]:
obj3.to_dict()

In [None]:
states = ['Oregon', 'Texas','California', 'Ohio']
obj4 = Series(sdata, index=states)
obj4

In [None]:
print(pd.isna(obj4)) #isnull
print(pd.notna(obj4)) # notnull

In [None]:
print(obj3)
print()
print(obj4)
print()
obj3 + obj4

In [None]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

In [None]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

### DataFrame

In [None]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = DataFrame(data)
df

Para grandes DataFrames, podemos usar os métodos `head()`e `tail()`para visualizar apenas uma parte dos dados:

In [None]:
df.head() #mostra as 5 primeiras linhas

In [None]:
df.tail(2) #mostra as 2 últimas linhas

In [None]:
df2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                index=['one', 'two', 'three', 'four', 'five'])
df2

In [None]:
df2.columns

In [None]:
print(df['state'])
print()
print(df.year)
print()
print(df['year'])

In [None]:
print(df2.loc['four']) #label
print()
print(df.iloc[0]) #int

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
print(data)
data.loc['Colorado'] #seleciona a linha cujo índice é Colorado

In [None]:
print(data)
data.loc[["Colorado", "New York"]] #seleciona as linhas Colorado e New York

In [None]:
data.loc["Colorado", ["two", "three"]] #seleciona a linha Colorado e as colunas two e three

In [None]:
print(data)
data.iloc[2]#linha 2

In [None]:
print(data)
data.iloc[[2, 1]] #linhas 2 e 1, nessa ordem

In [None]:
print(data)
data.iloc[2, [3, 0, 1]] #linha 2, colunas 3, 0 e 1, nessa ordem

In [None]:
print(data)
data.iloc[[1, 2], [3, 0, 1]] #linhas 1 e 2, colunas 3,0 e 1, nessa ordem

In [None]:
# todas as linhas, as 3 primeiras colunas, desde que seja maior que 5
data.iloc[:, :3][data.three > 5] 

In [None]:
data.loc[data.three >= 5]

In [None]:
df2['debt'] = np.arange(len(df2))
df2

In [None]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df2['debt'] = val
df2

In [None]:
df2['eastern'] = df2.state == 'Ohio'
df2

In [None]:
del df2['eastern']
df2

In [None]:
df2.values # df2.to_numpy()

In [None]:
obj = Series(range(3), index=['a', 'b', 'c'])
obj.index.values

In [None]:
print('state' in df2.columns)
print(0 in df.index)

## Eliminando entradas de um dos eixos

In [None]:
import numpy as np
obj = pd.Series(np.arange(5.), index=["a", "b", "c", "d", "e"])
obj

In [None]:
new_obj = obj.drop("c")
new_obj

In [None]:
obj.drop(["d", "c"])

In [None]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=["Ohio", "Colorado", "Utah", "New York"],
                    columns=["one", "two", "three", "four"])
data

In [None]:
data.drop(index=["Colorado", "Ohio"]) #linhas

In [None]:
data.drop(columns=["two"]) #colunas

In [None]:
data.drop("two", axis=1)
data.drop("Utah", axis=0)

In [None]:
data.drop(["two", "four"], axis="columns")

## Aplicação de Função e Mapeamento

In [None]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [None]:
list('abc')

In [None]:
df = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df)
print()
np.abs(df) #retorna valor absoluto

In [None]:
f = lambda x: x.max() - x.min()
print(df.apply(f))
print()
print(df.apply(f, axis=1))

In [None]:
def f2(x):
    return Series([x.min(), x.max()], index=['min', 'max'])

df.apply(f2)

In [None]:
df

In [None]:
format2 = lambda x: '%.2f' % x
df.applymap(format2)

## Ordenação e Ranking

In [None]:
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
df2 = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(obj)
print()
print(df2)
print()
print(obj.sort_index())
print()
print(df2.sort_index())
print()
print(df2.sort_index(axis=1))

In [None]:
obj = Series([4, 7, -3, 2])
obj.sort_values(ascending=False) #igual para pandas DataFrame

In [None]:
data = {'name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 
        'nota': [8, 7, 7.5, 10, 8]}
df4 = DataFrame(data)
print(df4)
print()
df4['rank'] = df4['nota'].rank(ascending=0)
df4.sort_values('rank')

## Sumarização e Estatística Descritiva

In [None]:
df5 = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df5)
print()
print(df5.sum())
print()
print(df5.sum(axis=1))
print()
print(df5.count())
print()
print(df5.size)

In [None]:
df5.size

In [None]:
df5.describe()

In [None]:
obj = pd.Series(["a", "a", "b", "c"] * 4)
obj

In [None]:
obj.describe()

In [None]:
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques

In [None]:
mask = obj.isin(['b', 'c'])
mask

## Manipulação de Valores Faltantes 

In [None]:
string_data = Series(['laranja', 'uva', np.nan, 'abacate'])
print(string_data)
print()
print(string_data.isnull())
string_data[0] = None
print()
print(string_data.isnull())

In [None]:
data = DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                  [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
print(data)
cleaned = data.dropna()
print('\n',cleaned)
print()
data.dropna(how='all')

In [None]:
print(data.fillna(0))
print()
print(data.fillna(data.mean()))

## TODO Section

### Manipulação de DataFrame

        > Crie, a partir do dicionário abaixo, um DataFrame cujo index seja os valores da variável labels
        > encontre a média dos valores da coluna age e preencha os valores faltantes dessa coluna com o valor da média
        > crie uma nova coluna chamada 'rank', que mostre os animais que receberam mais visitas
        > qual o animal que recebeu a maior quantidade de visitas? Use o método max()

In [129]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}


data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [130]:
#resposta
df = DataFrame(data, index=labels)
media = df['age'].mean()
df = df.fillna(media)

df['rank'] = df['visits'].rank(ascending=1)
df.sort_values('rank', ascending=False)

df[df['rank'] == df['rank'].max()].animal.unique()

df.sort_values('rank', ascending=False)

Unnamed: 0,animal,age,visits,priority,rank
b,cat,3.0,3,yes,9.0
d,dog,3.4375,3,yes,9.0
f,cat,2.0,3,no,9.0
c,snake,0.5,2,no,6.0
e,dog,5.0,2,no,6.0
i,dog,7.0,2,no,6.0
a,cat,2.5,1,yes,2.5
g,snake,4.5,1,no,2.5
h,cat,3.4375,1,yes,2.5
j,dog,3.0,1,no,2.5


## Carregamento e Armazenamento de Dados

### Arquivo CSV

In [131]:
import pandas as pd
poke = pd.read_csv('bases/Pokemon.csv')
poke.head(n=10)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
9,7,Squirtle,Water,,314,44,48,65,50,64,43,1,False


### Arquivo JSON

In [132]:
obj = """
{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""
print(type(obj))
print(obj)

<class 'str'>

{"name": "Wes",
"places_lived": ["United States", "Spain", "Germany"],
"pet": null,
"siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
{"name": "Katie", "age": 33, "pet": "Cisco"}]
}



In [133]:
import json
result = json.loads(obj)
result

{'name': 'Wes',
 'places_lived': ['United States', 'Spain', 'Germany'],
 'pet': None,
 'siblings': [{'name': 'Scott', 'age': 25, 'pet': 'Zuko'},
  {'name': 'Katie', 'age': 33, 'pet': 'Cisco'}]}

In [None]:
type(result)

In [134]:
asjson = json.dumps(result)
print(type(asjson))
asjson

<class 'str'>


'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Katie", "age": 33, "pet": "Cisco"}]}'

In [135]:
siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])
siblings

Unnamed: 0,name,age
0,Scott,25
1,Katie,33


## Combinação de Dados

In [136]:
import pandas as pd

In [137]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})

df2 = pd.DataFrame({'key': ['a', 'b', 'd','b'],
                 'data2': range(4)})

pd.merge(df1,df2) #default inner

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,6,1
5,b,6,3
6,a,2,0
7,a,4,0
8,a,5,0


In [139]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})

df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})

print(df3)
print(df4)
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

  lkey  data1
0    b      0
1    b      1
2    a      2
3    c      3
4    a      4
5    a      5
6    b      6
  rkey  data2
0    a      0
1    b      1
2    d      2


Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [None]:
df3

In [None]:
df4

In [None]:
pd.merge(df3, df4, how='outer',left_on='lkey',right_on='rkey')

In [None]:
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')

In [None]:
import numpy as np
arr = np.arange(12).reshape((3, 4))
print(arr)
print()
np.concatenate([arr, arr], axis=1)

In [None]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
pd.concat([s1, s2, s3])

In [None]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), 
                index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), 
                index=['a', 'c'],
                columns=['three', 'four'])
print(df1)
print()
print(df2)
print()
pd.concat([df1, df2], axis=1)

## TODO Section

### Manipulação de Dados usando Pandas

Usando o dataset Pokemon.csv, faça:

    1) Verifique em qual(is) coluna(s) existem valores faltantes
    2) Preencha os valores faltantes da coluna Type 2 com os valores correspondentes da coluna Type 1
    3) Crie um DataFrame a partir dos dados originais contendo apenas pokemons lendários. Imprima os 5 primeiros
    4) Use apply/applymap para passar todos os valores das colunas Name, Type 1 e Type 2 para minúscula

In [211]:
poke = pd.read_csv('bases/Pokemon.csv')
poke.head(n=10)
df = pd.DataFrame(poke)

In [207]:
# Resposta 1
df.columns[df.isna().any()]

Index(['Type 2'], dtype='object')

In [218]:
# Resposta 2
df['Type 2'].fillna(df['Type 1'], inplace=True)
df


Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,Fire,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [223]:
# Resposta 3
df_legendary = poke[poke.Legendary == True]
df_legendary

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
156,144,Articuno,Ice,Flying,580,90,85,100,95,125,85,1,True
157,145,Zapdos,Electric,Flying,580,90,90,85,125,90,100,1,True
158,146,Moltres,Fire,Flying,580,90,100,90,125,85,90,1,True
162,150,Mewtwo,Psychic,Psychic,680,106,110,90,154,90,130,1,True
163,150,MewtwoMega Mewtwo X,Psychic,Fighting,780,106,190,100,154,100,130,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [227]:
# Resposta 4
poke[['Name', 'Type 1', 'Type 2']] = poke[['Name', 'Type 1', 'Type 2']].applymap(lambda x: x.lower())
poke

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,bulbasaur,grass,poison,318,45,49,49,65,65,45,1,False
1,2,ivysaur,grass,poison,405,60,62,63,80,80,60,1,False
2,3,venusaur,grass,poison,525,80,82,83,100,100,80,1,False
3,3,venusaurmega venusaur,grass,poison,625,80,100,123,122,120,80,1,False
4,4,charmander,fire,fire,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,diancie,rock,fairy,600,50,100,150,100,150,50,6,True
796,719,dianciemega diancie,rock,fairy,700,50,160,110,160,110,110,6,True
797,720,hoopahoopa confined,psychic,ghost,600,80,110,60,150,130,70,6,True
798,720,hoopahoopa unbound,psychic,dark,680,80,160,60,170,130,80,6,True
