<a href="https://colab.research.google.com/github/luiz-bcardoso/UFN-DataScience/blob/main/pandas_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [239]:
import pandas as pd
import numpy as np

---

### Usando Series

In [240]:
# Monthly sales, keyed by month label (insertion order becomes the index order)
vendas = pd.Series({'Jan': 100, 'Fev': 200, 'Mar': 300, 'Abr': 250})
print(vendas)

Jan    100
Fev    200
Mar    300
Abr    250
dtype: int64


In [241]:
# Show the structure and the summary statistics of the Series.
# Series.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
vendas.info()
print(vendas.describe())

<class 'pandas.core.series.Series'>
Index: 4 entries, Jan to Abr
Series name: None
Non-Null Count  Dtype
--------------  -----
4 non-null      int64
dtypes: int64(1)
memory usage: 64.0+ bytes
None
count      4.000000
mean     212.500000
std       85.391256
min      100.000000
25%      175.000000
50%      225.000000
75%      262.500000
max      300.000000
dtype: float64


In [242]:
# First two and last two entries, one block after the other
print(vendas.head(2), vendas.tail(2), sep="\n")

Jan    100
Fev    200
dtype: int64
Mar    300
Abr    250
dtype: int64


In [243]:
# Count, total, mean, median, maximum, minimum and standard deviation,
# emitted as a single multi-line report
print("\n".join([
    f"Quantidade: {vendas.count()} meses",
    f"Total: R$ {vendas.sum()}",
    f"Média: R$ {vendas.mean()}",
    f"Mediana: R$ {vendas.median()}",
    f"Máximo: R$ {vendas.max()}",
    f"Mínimo: R$ {vendas.min()}",
    f"Desvio Padrão: R$ {vendas.std():.2f}",
]))

Quantidade: 4 meses
Total: R$ 850
Média: R$ 212.5
Mediana: R$ 225.0
Máximo: R$ 300
Mínimo: R$ 100
Desvio Padrão: R$ 85.39


In [244]:
# Access by index label (a label slice includes both endpoints)
print(vendas['Jan'])
print(vendas['Jan':'Mar'])

# Access by position.
# `vendas[0]` on a Series with a non-integer index relies on a deprecated
# fallback (integer key treated as a position) and raises a FutureWarning on
# recent pandas, so positional access goes through .iloc instead.
print(vendas.iloc[0])
print(vendas.iloc[0:3])

# Explicit accessors: .loc is label-based, .iloc is position-based
print(vendas.loc['Jan'])
print(vendas.loc['Jan':'Mar'])
print(vendas.iloc[0])
print(vendas.iloc[0:3])

100
Jan    100
Fev    200
Mar    300
dtype: int64
100
Jan    100
Fev    200
Mar    300
dtype: int64
100
Jan    100
Fev    200
Mar    300
dtype: int64
100
Jan    100
Fev    200
Mar    300
dtype: int64


  print(vendas[0])


In [245]:
# Inclusive lower and upper bounds used by the sales filter
valor_min, valor_max = 200, 280

In [246]:
# Boolean mask: True where sales fall within [valor_min, valor_max]
# (Series.between includes both bounds by default)
filtro = vendas.between(valor_min, valor_max)
print(filtro, '\n')
print(vendas.loc[filtro])

Jan    False
Fev     True
Mar    False
Abr     True
dtype: bool 

Fev    200
Abr    250
dtype: int64


In [247]:
# Report which months matched the filter, followed by their sales values
print(
    f"Os meses de venderam entre {valor_min} a {valor_max} foram {vendas[filtro].index.tolist()}",
    f"Com vendas de R$ {vendas[filtro].values.tolist()}",
    sep="\n",
)

Os meses de venderam entre 200 a 280 foram ['Fev', 'Abr']
Com vendas de R$ [200, 250]


---

### Usando DataFrame

In [248]:
# Column-oriented product table: one list per column
dados = dict(
    produto=['sorvete', 'refrigerante', 'batata frita'],
    preco=[8.00, 5.50, 7.00],
    estoque=[120, 85, 60],
)

df = pd.DataFrame(data=dados)
df

Unnamed: 0,produto,preco,estoque
0,sorvete,8.0,120
1,refrigerante,5.5,85
2,batata frita,7.0,60


In [249]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,preco,estoque
count,3.0,3.0
mean,6.833333,88.333333
std,1.258306,30.138569
min,5.5,60.0
25%,6.25,72.5
50%,7.0,85.0
75%,7.5,102.5
max,8.0,120.0


In [250]:
# Structure overview: index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   produto  3 non-null      object 
 1   preco    3 non-null      float64
 2   estoque  3 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 204.0+ bytes


In [251]:
print(df.dtypes)

# Cast the stock column from integer to string
df['estoque'] = df['estoque'].astype(str)

print(df.dtypes)

# Cast it back to an integer (32-bit here, narrower than the initial int64)
df['estoque'] = df['estoque'].astype(np.int32)

print(df.dtypes)

produto     object
preco      float64
estoque      int64
dtype: object
produto     object
preco      float64
estoque     object
dtype: object
produto     object
preco      float64
estoque      int32
dtype: object


In [252]:
# Compute the total stock value per row (preco × estoque) and store it
# in a new `valor_total` column
df['valor_total'] = df['preco'] * df['estoque']
df

Unnamed: 0,produto,preco,estoque,valor_total
0,sorvete,8.0,120,960.0
1,refrigerante,5.5,85,467.5
2,batata frita,7.0,60,420.0


In [253]:
# Remove the valor_total column again (drop returns a new frame, rebound to df)
df = df.drop('valor_total', axis='columns')
df

Unnamed: 0,produto,preco,estoque
0,sorvete,8.0,120
1,refrigerante,5.5,85
2,batata frita,7.0,60


---

### Exemplos reais usando pandas (df)

In [254]:
# One week of bike rentals: day label, rental count, and a rain flag per day
dados_bike = dict(
    dia=['Seg', 'Ter', 'Qua', 'Qui', 'Sex', 'Sab', 'Dom'],
    alugueis=[32, 41, 35, 39, 60, 150, 140],
    chuva=[True, False, True, False, False, False, True],
)
df_bike = pd.DataFrame(data=dados_bike)

df_bike

Unnamed: 0,dia,alugueis,chuva
0,Seg,32,True
1,Ter,41,False
2,Qua,35,True
3,Qui,39,False
4,Sex,60,False
5,Sab,150,False
6,Dom,140,True


In [255]:
# Days with more than 50 rentals
filtro = df_bike['alugueis'] > 50
df_bike.loc[filtro]

Unnamed: 0,dia,alugueis,chuva
4,Sex,60,False
5,Sab,150,False
6,Dom,140,True


In [256]:
# Total rentals on dry days versus rainy days.
# `chuva` is a boolean column, so the dry-day mask is its negation (~) rather
# than a comparison against False; .loc[mask, column] selects rows and column
# in a single step instead of chaining two indexing operations.
dias_sem_chuva = ~df_bike['chuva']
print(f"Dias sem chuva: {df_bike.loc[dias_sem_chuva, 'alugueis'].sum()} alugueis")
print(f"Dias com chuva: {df_bike.loc[~dias_sem_chuva, 'alugueis'].sum()} alugueis")


Dias sem chuva: 290 alugueis
Dias com chuva: 207 alugueis


In [257]:
# Average rentals: rainy days first, then dry days
print(
    f"Media com chuvas: {df_bike[~dias_sem_chuva]['alugueis'].mean()}",
    f"Media sem chuvas: {df_bike[dias_sem_chuva]['alugueis'].mean()}",
    sep="\n",
)

Media com chuvas: 69.0
Media sem chuvas: 72.5


In [258]:
def verificar_movimento(alguel: float, limite: float = 50) -> str:
    """Classify a rental count as a busy or a calm day.

    Args:
        alguel: Number of rentals to classify. The parameter name is kept
            unchanged so existing keyword callers keep working.
        limite: Minimum rental count for a day to count as busy. Defaults
            to 50, the threshold that used to be hard-coded.

    Returns:
        'Movimentado' when ``alguel >= limite``, otherwise 'Calmo'.
    """
    return 'Movimentado' if alguel >= limite else 'Calmo'

In [259]:
# Label every day by applying verificar_movimento to its rental count,
# storing the result in a new `movimento` column
df_bike['movimento'] = df_bike['alugueis'].apply(verificar_movimento)
df_bike

Unnamed: 0,dia,alugueis,chuva,movimento
0,Seg,32,True,Calmo
1,Ter,41,False,Calmo
2,Qua,35,True,Calmo
3,Qui,39,False,Calmo
4,Sex,60,False,Movimentado
5,Sab,150,False,Movimentado
6,Dom,140,True,Movimentado


---

### Carregando dados usando fontes externas

In [260]:
# Load the raw glucose CSV straight from GitHub; the file is read as-is,
# with no cleaning applied at this point
df_glicose = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/RobertsonWeb/materiais-numpy-pandas/refs/heads/main/datasets/glicose_data_suja.csv',
    sep=',',
)
df_glicose

Unnamed: 0,Dia Semana,Data,Antes Comer / Depois Comer,Resultado,Dose Insulina,kcal,carb,noite de sono,padel,musculacao R,musculacao H,pilates,corrida,caminhada,tenis,sauna,bike,natacao,eliptico,volei de areia
0,,,,,,,,1-May,,,,,,,,,,,,
1,Sexta,2012.0,ac,96.0,6.0,2714.0,309.0,4,0.0,60.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Sábado,2012.0,ac,90.0,6.0,2665.0,334.0,4,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Domingo,2012.0,ac,105.0,6.0,2008.0,262.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Segunda,2012.0,ac,86.0,6.0,2117.0,291.0,5,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
720,Segunda,2014.0,ac,98.0,12.0,2626.0,157.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
721,Terça,2014.0,ac,92.0,12.0,2296.0,276.0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
722,Quarta,2014.0,ac,91.0,12.0,2453.0,177.0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
723,Quinta,2014.0,ac,99.0,12.0,2565.0,246.0,4,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
