In [2]:
import pandas as pd

# Relative path to the previously saved raw obesity dataset (CSV).
file_path = "../dataset/raw/ObesityDataSet_raw_and_data_sinthetic.csv"

# Load the CSV into a DataFrame with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

# Sanity check: print the first rows as plain text.
first_rows = df.head()
print(first_rows)


    Age  Gender  Height  Weight        CALC FAVC  FCVC  NCP  SCC SMOKE  CH2O  \
0  21.0  Female    1.62    64.0          no   no   2.0  3.0   no    no   2.0   
1  21.0  Female    1.52    56.0   Sometimes   no   3.0  3.0  yes   yes   3.0   
2  23.0    Male    1.80    77.0  Frequently   no   2.0  3.0   no    no   2.0   
3  27.0    Male    1.80    87.0  Frequently   no   3.0  3.0   no    no   2.0   
4  22.0    Male    1.78    89.8   Sometimes   no   2.0  1.0   no    no   2.0   

  family_history_with_overweight  FAF  TUE       CAEC                 MTRANS  \
0                            yes  0.0  1.0  Sometimes  Public_Transportation   
1                            yes  3.0  0.0  Sometimes  Public_Transportation   
2                            yes  2.0  1.0  Sometimes  Public_Transportation   
3                             no  2.0  0.0  Sometimes                Walking   
4                             no  0.0  0.0  Sometimes  Public_Transportation   

            NObeyesdad  
0        Norm

In [5]:
# Drop duplicated records: a row is flagged when it repeats an earlier row
# across every column, so the first occurrence of each row is the one kept.
is_repeated = df.duplicated()
df = df[~is_repeated]

# Confirm the removal by reporting how many rows are left.
print(f"Número de registros após remoção de duplicatas: {df.shape[0]}")


Número de registros após remoção de duplicatas: 2087


In [None]:
# Columns to be stored with pandas' "category" dtype.
# Note that the list also contains "NObeyesdad", so that column is one-hot
# encoded below together with the others.
categorical_cols = [
    "Gender",
    "CALC",
    "FAVC",
    "SCC",
    "SMOKE",
    "family_history_with_overweight",
    "CAEC",
    "MTRANS",
    "NObeyesdad",
]

# Cast the selected columns and write them back into `df`.
as_category = df[categorical_cols].astype("category")
df[categorical_cols] = as_category

# One-Hot Encoding: every level of every listed column becomes its own
# indicator column (drop_first=False keeps all levels). The result is stored
# in `df_encoded`; `df` itself keeps the categorical columns.
df_encoded = pd.get_dummies(data=df, columns=categorical_cols, drop_first=False)

# Show the first rows of the encoded frame to inspect the new columns.
encoded_preview = df_encoded.head()
print("\n🔹 Novas colunas após One-Hot Encoding:")
print(encoded_preview)


🔹 Novas colunas após One-Hot Encoding:
    Age  Height  Weight  FCVC  NCP  CH2O  FAF  TUE  Gender_Male  \
0  21.0    1.62    64.0   2.0  3.0   2.0  0.0  1.0        False   
1  21.0    1.52    56.0   3.0  3.0   3.0  3.0  0.0        False   
2  23.0    1.80    77.0   2.0  3.0   2.0  2.0  1.0         True   
3  27.0    1.80    87.0   3.0  3.0   2.0  2.0  0.0         True   
4  22.0    1.78    89.8   2.0  1.0   2.0  0.0  0.0         True   

   CALC_Frequently  ...  MTRANS_Bike  MTRANS_Motorbike  \
0            False  ...        False             False   
1            False  ...        False             False   
2             True  ...        False             False   
3             True  ...        False             False   
4            False  ...        False             False   

   MTRANS_Public_Transportation  MTRANS_Walking  NObeyesdad_Normal_Weight  \
0                          True           False                      True   
1                          True           False       

In [7]:
# Detect and handle outliers
# Goal: identify and remove extreme values that could distort the model.

import numpy as np

# Only float64/int64 columns take part in the outlier check.
numerical_cols = df.select_dtypes(include=["float64", "int64"]).columns
numeric_values = df[numerical_cols]

# Per-column quartiles and interquartile range (IQR = Q3 - Q1).
quartiles = numeric_values.quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

# Outlier limits: 1.5 * IQR below Q1 and above Q3, computed per column.
whisker = 1.5 * IQR
lower_bound = Q1 - whisker
upper_bound = Q3 + whisker

# A row is flagged when ANY numeric value lies strictly outside its column's
# limits. Missing values compare as False on both sides, so they never flag
# a row.
too_low = numeric_values.lt(lower_bound, axis="columns")
too_high = numeric_values.gt(upper_bound, axis="columns")
has_outlier = (too_low | too_high).any(axis=1)

# Keep only the rows without any flagged value.
df_cleaned = df[~has_outlier]

# Report how many rows were present before and after the filter.
print(f"Registros antes da remoção de outliers: {df.shape[0]}")
print(f"Registros após a remoção de outliers: {df_cleaned.shape[0]}")

# Rebind `df` to the filtered frame.
# NOTE(review): because `df` is overwritten here, re-running this cell
# recomputes the limits on the already-filtered data and filters again.
df = df_cleaned


Registros antes da remoção de outliers: 2087
Registros após a remoção de outliers: 1388


In [4]:
# Persist the treated dataset as CSV. The target sits inside the "raw"
# dataset folder, as the original note for this cell states.
processed_file_path = "../dataset/raw/ObesityDataSet_cleaned.csv"

# index=False leaves the DataFrame index out of the written file.
df.to_csv(path_or_buf=processed_file_path, index=False)

print(f"✅ Dataset tratado salvo em: {processed_file_path}")


✅ Dataset tratado salvo em: ../dataset/raw/ObesityDataSet_cleaned.csv
