# Pandas

## O que é o pandas?

Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool,
built on top of the Python programming language.

## Instalação do Pandas

<strong>PyPI</strong> <br />
pip install pandas

<strong>Conda</strong> <br />
conda install pandas

In [1]:
# Importação do Pandas

import pandas as pd

### Series no Pandas

In [2]:
# Build a small integer Series from literal values (default 0..6 index)
sample_serie = pd.Series(data=[1, 4, 6, 5, 7, 10, 6])

In [3]:
# Print the plain-text representation of the Series
print(sample_serie)

0     1
1     4
2     6
3     5
4     7
5    10
6     6
dtype: int64


In [4]:
# Summary statistics of the Series: count, mean, std, min, quartiles, max
sample_serie.describe()

count     7.000000
mean      5.571429
std       2.760262
min       1.000000
25%       4.500000
50%       6.000000
75%       6.500000
max      10.000000
dtype: float64

In [5]:
# Build a 3x2 frame from a list of rows; no column names are given,
# so the columns get the default integer labels 0 and 1
df = pd.DataFrame(
    data=[
        ['Messias', 123456],
        ['Verônica', 789456],
        ['Miguel', 456789],
    ]
)

In [6]:
# (number of rows, number of columns) of the frame
df.shape

(3, 2)

In [7]:
# Display the frame; the columns carry integer labels
df

Unnamed: 0,0,1
0,Messias,123456
1,Verônica,789456
2,Miguel,456789


In [8]:
# Same rows as before, this time with explicit column names
df = pd.DataFrame(
    data=[
        ['Messias', 123456],
        ['Verônica', 789456],
        ['Miguel', 456789],
    ],
    columns=['nome', 'number'],
)

In [9]:
# Display the frame with its named columns
df

Unnamed: 0,nome,number
0,Messias,123456
1,Verônica,789456
2,Miguel,456789


In [10]:
# Select a single column by its label
df['nome']

0     Messias
1    Verônica
2      Miguel
Name: nome, dtype: object

In [11]:
# Arithmetic mean of the 'number' column
df['number'].mean()

456567.0

# PyDataset

In [12]:
import pydataset

In [13]:
# Called with no arguments, data() lists the available datasets (id and title)
pydataset.data()

Unnamed: 0,dataset_id,title
0,AirPassengers,Monthly Airline Passenger Numbers 1949-1960
1,BJsales,Sales Data with Leading Indicator
2,BOD,Biochemical Oxygen Demand
3,Formaldehyde,Determination of Formaldehyde
4,HairEyeColor,Hair and Eye Color of Statistics Students
...,...,...
752,VerbAgg,Verbal Aggression item responses
753,cake,Breakage Angle of Chocolate Cakes
754,cbpp,Contagious bovine pleuropneumonia
755,grouseticks,Data on red grouse ticks from Elston et al. 2001


In [14]:
# Load the dataset whose id is 'titanic'
titanic = pydataset.data('titanic')

In [15]:
# First five rows
titanic.head()

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes


In [16]:
# Last five rows
titanic.tail()

Unnamed: 0,class,age,sex,survived
1312,3rd class,child,women,no
1313,3rd class,child,women,no
1314,3rd class,child,women,no
1315,3rd class,child,women,no
1316,3rd class,child,women,no


In [17]:
# Per-column summary: count, unique, top (most frequent value) and freq
titanic.describe()

Unnamed: 0,class,age,sex,survived
count,1316,1316,1316,1316
unique,3,2,2,2
top,3rd class,adults,man,no
freq,706,1207,869,817


In [18]:
# Number of rows for each distinct value of 'class'
titanic['class'].value_counts()

3rd class    706
1st class    325
2nd class    285
Name: class, dtype: int64

In [19]:
# Number of entries in the dataset listing
len(pydataset.data())

757

# DB.py

É necessário pesquisar qual lib é exigida por cada banco de dados e quais parâmetros são necessários para se conectar a ele.

In [20]:
from db import DB

In [21]:
# Open the SQLite file through db.py
database = DB(filename='./original/logs.sqlite3', dbtype='sqlite')

Indexing schema. This will take a second...finished!


In [22]:
# List the tables found in the database, with their columns
database.tables

Refreshing schema. Please wait...done!


Schema,Table,Columns
public,log,"id, path, user_id, date"


In [23]:
# NOTE(review): despite the `_df` suffix this binds the table object itself
# (it displays as the column schema), not a DataFrame of rows
log_df = database.tables.log

In [24]:
# Display the table object (shows its column schema)
log_df

Column,Type,Foreign Keys,Reference Keys
id,integer,,
path,text,,
user_id,integer,,
date,numeric,,


In [25]:
# Fetch rows of the `log` table via .all()
# (presumably returned as a DataFrame — confirm against the db.py docs)
log_df = database.tables.log.all()

In [26]:
# Display the fetched rows
log_df

Unnamed: 0,id,path,user_id,date
0,1,/,3,2017-01-03T11:41:00
1,2,/pandas/,4,2017-01-03T11:37:00
2,3,/videos/,5,2017-01-03T10:47:00
3,4,/,2,2017-01-03T11:51:00
4,5,/python-para-zumbis/,5,2017-01-03T10:32:00
5,6,/cursos/,3,2017-01-03T11:31:00
6,7,/videos/,4,2017-01-03T10:39:00
7,8,/cursos/,5,2017-01-03T10:55:00
8,9,/,4,2017-01-03T11:50:00
9,10,/pandas/,1,2017-01-03T11:38:00


In [27]:
# Run a raw SQL query: only the log entries whose user_id is 3
log_df = database.query('select * from log where user_id = 3')

In [28]:
# Display the query result
log_df

Unnamed: 0,id,path,user_id,date
0,1,/,3,2017-01-03T11:41:00
1,6,/cursos/,3,2017-01-03T11:31:00
2,13,/django-ecommerce/,3,2017-01-03T11:59:00
3,16,/cursos/,3,2017-01-03T11:39:00
4,22,/django-ecommerce/,3,2017-01-03T10:35:00
5,28,/django-ecommerce/,3,2017-01-03T11:57:00
6,34,/pandas/,3,2017-01-03T11:43:00
7,38,/videos/,3,2017-01-03T11:58:00
8,47,/pandas/,3,2017-01-03T10:55:00
9,51,/python-para-zumbis/,3,2017-01-03T11:57:00


Também é possível executar uma consulta a partir de um arquivo SQL:

log_df = database.query_from_file('arquivo.sql')

In [29]:
# Query every row of the log table
log_df = database.query('select * from log')

In [30]:
# Number of rows returned by the query
len(log_df)

100

# Carregar Planilhas de Dados para o Pandas

In [31]:
# Load the CSV; the file uses ';' as its field separator
copacabana = pd.read_csv(
    './original/copacabana.csv',
    sep=';',
)

In [32]:
# First five rows
copacabana.head()

Unnamed: 0,Posicao,Quartos,Vagas,DistIpanema,DistPraia,DistFavela,RendaMedia,RendaMovel,RendaMovelRua,Vu2009,Mes,Idade,Tipologia,AreaConstruida,VAL_UNIT,X,Y
0,1,3.0,0.01,1144,311,146,969501,1028834,999168,1750,509,37.0,1,95,4379,685365.07,7457802.68
1,0,2.0,0.01,2456,502,254,1472861,1137759,1305310,2300,484,30.0,1,71,6479,685941.55,7459001.32
2,0,2.0,0.01,2448,772,229,1803724,1512475,1658100,2350,920,44.0,1,58,12414,685627.39,7459080.52
3,0,2.0,0.01,1615,428,310,1124331,1370600,1247466,2200,930,43.0,1,88,11250,685438.2001,7458268.28
4,0,2.0,1.0,2358,586,287,1165764,1177933,1171849,2150,918,42.0,1,68,13382,685764.384,7458954.513


In [33]:
# Load the Excel spreadsheet into a DataFrame
populacao_pe = pd.read_excel('./original/total_populacao_pernambuco.xls')

In [34]:
# First five rows
populacao_pe.head()

Unnamed: 0,Código do município,Nome do município,Total da população 2000,Total de homens,Total de mulheres,Total da população urbana,Total da população rural,Total da população 2010
0,2600054,Abreu e Lima,89039.0,45165.0,49263.0,86589.0,7839.0,94428.0
1,2600104,Afogados da Ingazeira,32922.0,16790.0,18301.0,27406.0,7685.0,35091.0
2,2600203,Afrânio,15014.0,8751.0,8837.0,5859.0,11729.0,17588.0
3,2600302,Agrestina,20036.0,10938.0,11742.0,16955.0,5725.0,22680.0
4,2600401,Água Preta,28531.0,16581.0,16465.0,18708.0,14338.0,33046.0


### Realizando Filtros / Seleção em Dataframe

In [35]:
# Column labels of the frame
copacabana.columns

Index(['Posicao', 'Quartos', 'Vagas', 'DistIpanema', 'DistPraia', 'DistFavela',
       'RendaMedia', 'RendaMovel', 'RendaMovelRua', 'Vu2009', 'Mes', 'Idade',
       'Tipologia', 'AreaConstruida', 'VAL_UNIT', 'X', 'Y'],
      dtype='object')

In [36]:
# Summary statistics of the 'Quartos' column
copacabana['Quartos'].describe()

count    1675.000000
mean        1.767510
std         1.142523
min         0.010000
25%         1.000000
50%         2.000000
75%         3.000000
max         6.000000
Name: Quartos, dtype: float64

In [37]:
# Boolean-mask selection: rows where 'Quartos' equals 6
copacabana.loc[copacabana['Quartos'] == 6]

Unnamed: 0,Posicao,Quartos,Vagas,DistIpanema,DistPraia,DistFavela,RendaMedia,RendaMovel,RendaMovelRua,Vu2009,Mes,Idade,Tipologia,AreaConstruida,VAL_UNIT,X,Y
748,1,6.0,2.0,2500,35,743,1524600,1275377,1399989,4100,360,58.0,1,668,4491,686456.25,7458801.05


In [38]:
# Boolean-mask selection: rows where 'Quartos' equals 5
copacabana.loc[copacabana['Quartos'] == 5]

Unnamed: 0,Posicao,Quartos,Vagas,DistIpanema,DistPraia,DistFavela,RendaMedia,RendaMovel,RendaMovelRua,Vu2009,Mes,Idade,Tipologia,AreaConstruida,VAL_UNIT,X,Y
173,1,5.0,0.01,2696,154,712,1083455,649733,866594,2000,425,55.0,1,38,6316,686522.98,7458989.87
1521,1,5.0,3.0,3044,31,609,1524600,1168222,1346411,4100,405,34.0,1,430,5000,686965.3201,7459109.71


## Dados Categóricos

Tipos de dados por colunas

In [39]:
# Column labels of the frame
titanic.columns

Index(['class', 'age', 'sex', 'survived'], dtype='object')

In [41]:
# Summary of the 'class' column: count, unique, top and freq
titanic['class'].describe()

count          1316
unique            3
top       3rd class
freq            706
Name: class, dtype: object

In [43]:
# Number of bytes reported for the column's data
titanic['class'].nbytes

10528

In [44]:
%%time
# Time an element-wise equality comparison on the 'class' column
titanic['class'] == '3rd class'

Wall time: 4.01 ms


1       False
2       False
3       False
4       False
5       False
        ...  
1312     True
1313     True
1314     True
1315     True
1316     True
Name: class, Length: 1316, dtype: bool

In [45]:
# Convert the column to the categorical dtype
titanic['class'] = titanic['class'].astype('category')

In [46]:
# Check that the summary is unchanged after the conversion
# (same count, unique, top and freq)
titanic['class'].describe()

count          1316
unique            3
top       3rd class
freq            706
Name: class, dtype: object

In [47]:
# The number of bytes used went down after the conversion
titanic['class'].nbytes

1340

In [48]:
%%time
# Time the same equality comparison, now on the categorical column
titanic['class'] == '3rd class'

Wall time: 929 µs


1       False
2       False
3       False
4       False
5       False
        ...  
1312     True
1313     True
1314     True
1315     True
1316     True
Name: class, Length: 1316, dtype: bool

## Problemas de Dados Perdidos (missing data)

Utilizaremos Pandas e Numpy.

Essa técnica consiste em consertar conjuntos de dados que porventura tenham perdido alguma informação durante o processo de coleta.

In [50]:
import numpy as np

In [53]:
# Toy records with deliberately missing entries (np.nan) spread over
# every column; row 3 is missing in all of them
dados = dict(
    nome=['João', 'Maria', 'José', np.nan, 'Pedro', 'Judas', 'Tiago'],
    sexo=['M', 'F', 'M', np.nan, 'M', 'M', np.nan],
    idade=[14, 13, np.nan, np.nan, 15, 13, 14],
    nota=[4, 10, 7, np.nan, 8, 9, 7],
)

df = pd.DataFrame(data=dados)

In [54]:
# Display the frame, including its missing values
df

Unnamed: 0,nome,sexo,idade,nota
0,João,M,14.0,4.0
1,Maria,F,13.0,10.0
2,José,M,,7.0
3,,,,
4,Pedro,M,15.0,8.0
5,Judas,M,13.0,9.0
6,Tiago,,14.0,7.0


In [58]:
# Strategy: drop every row that has at least
# one missing value

df.dropna()

Unnamed: 0,nome,sexo,idade,nota
0,João,M,14.0,4.0
1,Maria,F,13.0,10.0
4,Pedro,M,15.0,8.0
5,Judas,M,13.0,9.0


In [59]:
# Drop only the rows in which all values are missing

df.dropna(how='all')

Unnamed: 0,nome,sexo,idade,nota
0,João,M,14.0,4.0
1,Maria,F,13.0,10.0
2,José,M,,7.0
4,Pedro,M,15.0,8.0
5,Judas,M,13.0,9.0
6,Tiago,,14.0,7.0


In [60]:
# Add a 'serie' column filled entirely with NaN
df['serie'] = np.nan

In [61]:
# Display the frame with the new all-NaN 'serie' column
df

Unnamed: 0,nome,sexo,idade,nota,serie
0,João,M,14.0,4.0,
1,Maria,F,13.0,10.0,
2,José,M,,7.0,
3,,,,,
4,Pedro,M,15.0,8.0,
5,Judas,M,13.0,9.0,
6,Tiago,,14.0,7.0,


In [62]:
# Drop the columns in which every value is missing:
# with axis=1 pandas checks columns instead of rows

df.dropna(how='all', axis=1)

Unnamed: 0,nome,sexo,idade,nota
0,João,M,14.0,4.0
1,Maria,F,13.0,10.0
2,José,M,,7.0
3,,,,
4,Pedro,M,15.0,8.0
5,Judas,M,13.0,9.0
6,Tiago,,14.0,7.0


In [64]:
# thresh sets the minimum number of non-missing values a row must have
# to be kept; here rows with fewer than 3 are dropped

df.dropna(thresh=3)

Unnamed: 0,nome,sexo,idade,nota,serie
0,João,M,14.0,4.0,
1,Maria,F,13.0,10.0,
2,José,M,,7.0,
4,Pedro,M,15.0,8.0,
5,Judas,M,13.0,9.0,
6,Tiago,,14.0,7.0,


In [65]:
# Strategy: fill the missing values with a given value.

# Note: the filled result is only displayed here; the dataset itself
# is not modified

df['serie'].fillna(8)

0    8.0
1    8.0
2    8.0
3    8.0
4    8.0
5    8.0
6    8.0
Name: serie, dtype: float64

In [66]:
# To change the dataset itself, assign the filled column back to the frame.
# Calling fillna(..., inplace=True) on the result of df['serie'] is chained
# assignment: it is deprecated in pandas 2.x and leaves df untouched once
# Copy-on-Write is enabled (the default from pandas 3.0).

df['serie'] = df['serie'].fillna(8)

In [70]:
# Reset 'serie' to all-NaN and display the frame
df['serie'] = np.nan
df

Unnamed: 0,nome,sexo,idade,nota,serie
0,João,M,14.0,4.0,
1,Maria,F,13.0,10.0,
2,José,M,,7.0,
3,,,,,
4,Pedro,M,15.0,8.0,
5,Judas,M,13.0,9.0,
6,Tiago,,14.0,7.0,


In [71]:
# Strategy: fill the missing ages with the mean of the column.
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on df['idade']: that chained form is deprecated
# in pandas 2.x and does not modify df once Copy-on-Write is enabled.

df['idade'] = df['idade'].fillna(df['idade'].mean())

In [72]:
# Display the frame after filling the missing ages
df

Unnamed: 0,nome,sexo,idade,nota,serie
0,João,M,14.0,4.0,
1,Maria,F,13.0,10.0,
2,José,M,13.8,7.0,
3,,,13.8,,
4,Pedro,M,15.0,8.0,
5,Judas,M,13.0,9.0,
6,Tiago,,14.0,7.0,


In [74]:
# Strategy using filters: keep only the rows where both
# 'nome' and 'sexo' are present

df.loc[
    df['nome'].notnull()
    & df['sexo'].notnull()
]

Unnamed: 0,nome,sexo,idade,nota,serie
0,João,M,14.0,4.0,
1,Maria,F,13.0,10.0,
2,José,M,13.8,7.0,
4,Pedro,M,15.0,8.0,
5,Judas,M,13.0,9.0,
