# Curso: (Mini) Introdução ao Python para processamento e análise de dados 
Maurício Garcia, março 2023

## Revisando Pandas

In [1]:
import pandas as pd


## Acessando Google Sheet

Obter o link de compartilhamento (*shareable link*) da planilha:  
https://docs.google.com/spreadsheets/d/1bRTW8_ZQzyqLaNSR5cP_mw9Yb_yEAS1t/edit?usp=share_link&ouid=116157839543199465705&rtpof=true&sd=true  

Colar no navegador e ele irá gerar esse link:  
https://docs.google.com/spreadsheets/d/1bRTW8_ZQzyqLaNSR5cP_mw9Yb_yEAS1t/edit#gid=1357010444  

Usar esse link no comando abaixo:

In [2]:
# Address the browser shows after opening the shareable link (editor view of the sheet).
url = (
    'https://docs.google.com/spreadsheets/d/'
    '1bRTW8_ZQzyqLaNSR5cP_mw9Yb_yEAS1t'
    '/edit#gid=1357010444'
)

In [3]:
# Turn the interactive "edit" address into the CSV export address that
# pd.read_csv can download directly; the gid value after the marker is kept.
edit_marker = '/edit#gid='
export_marker = '/export?format=csv&gid='
if edit_marker in url:
    url = url.replace(edit_marker, export_marker)
elif export_marker not in url:
    # A bare str.replace would leave an unexpected link untouched without any
    # warning, and the problem would likely only surface later when the CSV
    # is downloaded and parsed. Fail here with a clear message instead.
    raise ValueError(
        f"Unexpected Google Sheets URL: expected it to contain {edit_marker!r}, got {url!r}"
    )

In [4]:
# RG and CPF are identifiers, not quantities: read them as text so leading
# zeros are preserved and they are never treated as numbers by accident.
df = pd.read_csv(url, dtype={'RG': str, 'CPF': str})

In [5]:
# Preview the first five rows of the freshly loaded sheet.
df.head(n=5)

Unnamed: 0,Nome,RG,CPF,Sexo,Nascimento
0,Clarice Gonzales,88829099450,49492282913,F,2008-02-25
1,Maria Isis Ramos,43437187759,83067956448,F,1999-02-27
2,Daniela Palma,58946054283,32723668271,M,1982-03-03
3,Maria da Conceição Antunes,23000712291,39961145321,F,1973-03-05
4,Valentina Williams,68948263392,39936634316,F,1956-03-09


### Criando novas colunas

In [6]:
# Parse the birth-date column into real datetimes, kept in a separate column
# so the original 'Nascimento' values stay available for comparison.
birth_dates = pd.to_datetime(df['Nascimento'])
df['Nascimento2'] = birth_dates

In [7]:
# 'Idade' is the elapsed time since birth expressed in whole DAYS (.dt.days),
# measured against the moment this cell runs, so values change from run to run.
today = pd.to_datetime('today')
elapsed = today - df['Nascimento2']
df['Idade'] = elapsed.dt.days

In [8]:
# Boolean mask marking the rows whose 'Sexo' is 'M'; reused by later cells.
masculino = df['Sexo'].eq('M')
# Binary target: 1 where the mask is True, 0 everywhere else.
df['Target'] = masculino.astype('int64')

In [9]:
# Preview the first five rows again, now including the derived columns.
df.head(n=5)

Unnamed: 0,Nome,RG,CPF,Sexo,Nascimento,Nascimento2,Idade,Target
0,Clarice Gonzales,88829099450,49492282913,F,2008-02-25,2008-02-25,5488,0
1,Maria Isis Ramos,43437187759,83067956448,F,1999-02-27,1999-02-27,8773,0
2,Daniela Palma,58946054283,32723668271,M,1982-03-03,1982-03-03,14978,1
3,Maria da Conceição Antunes,23000712291,39961145321,F,1973-03-05,1973-03-05,18263,0
4,Valentina Williams,68948263392,39936634316,F,1956-03-09,1956-03-09,24468,0


### Contando COLUNAS QUALITATIVAS

In [10]:
# Number of non-null 'Target' entries among the rows selected by the mask.
target_masculino = df.loc[masculino, 'Target']
target_masculino.count()

8

In [11]:
# Frequency of each 'Target' value, most frequent first (the pandas defaults, spelled out).
df['Target'].value_counts(sort=True, ascending=False)

0    10
1     8
Name: Target, dtype: int64

In [12]:
# Frequency of each 'Sexo' category, most frequent first (the pandas defaults, spelled out).
df['Sexo'].value_counts(sort=True, ascending=False)

F    10
M     8
Name: Sexo, dtype: int64

### Calculando COLUNAS QUANTITATIVAS

In [13]:
# Total of 'Idade' over all rows; missing values are skipped (pandas default).
df['Idade'].sum(skipna=True)

219599

In [14]:
# Same total, restricted to the rows selected by the `masculino` mask.
idade_masculino = df.loc[masculino, 'Idade']
idade_masculino.sum()

80039

In [15]:
# Arithmetic mean of 'Idade'; missing values are skipped (pandas default).
df['Idade'].mean(skipna=True)

12199.944444444445

In [16]:
# Sample standard deviation of 'Idade' (ddof=1 is the pandas default).
df['Idade'].std(ddof=1)

5886.964447701102

In [17]:
# Largest 'Idade' value; missing values are skipped (pandas default).
df['Idade'].max(skipna=True)

24468

In [18]:
# Smallest 'Idade' value; missing values are skipped (pandas default).
df['Idade'].min(skipna=True)

4758

### Crosstab QUALI x QUALI

In [19]:
# Contingency table of raw counts: 'Sexo' on the rows, 'Target' on the columns.
pd.crosstab(index=df['Sexo'], columns=df['Target'])

Target,0,1
Sexo,Unnamed: 1_level_1,Unnamed: 2_level_1
F,10,0
M,0,8


In [20]:
# Same counts, with an extra 'All' row and column holding the totals.
pd.crosstab(index=df['Sexo'], columns=df['Target'], margins=True)

Target,0,1,All
Sexo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,10,0,10
M,0,8,8
All,10,8,18


In [21]:
# Proportions computed within each row (every 'Sexo' row sums to 1).
pd.crosstab(index=df['Sexo'], columns=df['Target'], normalize='index')

Target,0,1
Sexo,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1.0,0.0
M,0.0,1.0


In [22]:
# Proportions computed within each column (every 'Target' column sums to 1).
pd.crosstab(index=df['Sexo'], columns=df['Target'], normalize='columns')

Target,0,1
Sexo,Unnamed: 1_level_1,Unnamed: 2_level_1
F,1.0,0.0
M,0.0,1.0


### Pivot Table QUALI x QUANT

In [23]:
# Uma coluna

In [24]:
# One row per 'Sexo'; with no aggfunc given, pivot_table averages 'Idade'.
pd.pivot_table(
    df,
    index=['Sexo'],  # rows
    values=['Idade'],
)

Unnamed: 0_level_0,Idade
Sexo,Unnamed: 1_level_1
F,13956.0
M,10004.875


In [25]:
# One row per 'Sexo'; `len` reports how many 'Idade' entries fall in each group.
pd.pivot_table(
    df,
    index=['Sexo'],  # rows
    values=['Idade'],
    aggfunc=[len],
)

Unnamed: 0_level_0,len
Unnamed: 0_level_1,Idade
Sexo,Unnamed: 1_level_2
F,10
M,8


In [26]:
import numpy as np

In [27]:
# Group size and mean of 'Idade' for each 'Sexo'.
# The mean is requested by name ('mean') rather than as np.mean: recent pandas
# versions warn when a NumPy callable is passed as an aggregation, and the
# string form produces the same 'mean' column without the warning.
df.pivot_table(
    index=['Sexo'],  # rows
    values=['Idade'],
    aggfunc=[len, 'mean'],
)

Unnamed: 0_level_0,len,mean
Unnamed: 0_level_1,Idade,Idade
Sexo,Unnamed: 1_level_2,Unnamed: 2_level_2
F,10,13956.0
M,8,10004.875


In [28]:
# Duas colunas

In [29]:
# Mean 'Idade' (the default aggregation) for every 'Sexo' x 'Target' pair;
# combinations with no rows show up as NaN.
pd.pivot_table(
    df,
    index=['Sexo'],      # rows
    columns=['Target'],  # columns
    values=['Idade'],
)

Unnamed: 0_level_0,Idade,Idade
Target,0,1
Sexo,Unnamed: 1_level_2,Unnamed: 2_level_2
F,13956.0,
M,,10004.875


In [30]:
# Group size for every 'Sexo' x 'Target' pair; combinations with no rows show up as NaN.
pd.pivot_table(
    df,
    index=['Sexo'],      # rows
    columns=['Target'],  # columns
    values=['Idade'],
    aggfunc=[len],
)

Unnamed: 0_level_0,len,len
Unnamed: 0_level_1,Idade,Idade
Target,0,1
Sexo,Unnamed: 1_level_3,Unnamed: 2_level_3
F,10.0,
M,,8.0


In [31]:
# Group size and mean of 'Idade' for every 'Sexo' x 'Target' pair.
# The mean is requested by name ('mean') rather than as np.mean: recent pandas
# versions warn when a NumPy callable is passed as an aggregation, and the
# string form produces the same 'mean' columns without the warning.
df.pivot_table(
    index=['Sexo'],      # rows
    columns=['Target'],  # columns
    values=['Idade'],
    aggfunc=[len, 'mean'],
)

Unnamed: 0_level_0,len,len,mean,mean
Unnamed: 0_level_1,Idade,Idade,Idade,Idade
Target,0,1,0,1
Sexo,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
F,10.0,,13956.0,
M,,8.0,,10004.875
