In [5]:
import pandas as pd

# Load the raw penguins dataset (path is relative to the notebook's working directory).
df = pd.read_csv('../data/raw/penguins2.csv')

# 1. Missing-data handling
# Report the null count per column of the raw frame before anything is changed.
missing_values = df.isnull().sum()
print("Valores faltantes por coluna:")
print(missing_values)

# Drop every column that has fewer than 50% non-null values.
threshold = 0.5 * df.shape[0]
df_cleaned = df.dropna(axis=1, thresh=threshold)

# Numeric columns are imputed with their mean, all other columns with their
# most frequent value. The same column selection is reused for the outlier step.
numeric_cols = df_cleaned.select_dtypes(include=['int64', 'float64']).columns

# Collect one fill value per column that actually has gaps, then fill in a
# single call. The previous `df_cleaned[column].fillna(..., inplace=True)`
# was chained assignment: it warns on pandas 2.x and silently does nothing
# once Copy-on-Write is enabled, leaving the NaNs in place.
fill_values = {}
for column in df_cleaned.columns:
    if not df_cleaned[column].isna().any():
        continue
    if column in numeric_cols:
        column_mean = df_cleaned[column].mean()
        if pd.notna(column_mean):
            fill_values[column] = column_mean
    else:
        column_modes = df_cleaned[column].mode()
        # mode() is empty when the column has no non-null value at all;
        # indexing [0] would raise, so such a column is left untouched.
        if not column_modes.empty:
            fill_values[column] = column_modes[0]

df_cleaned = df_cleaned.fillna(fill_values)

print("\nDataFrame após tratamento de dados faltantes:")
print(df_cleaned)

# 2. Outlier handling
print("\nEstatísticas descritivas:")
print(df_cleaned.describe())

# Per-column interquartile range of the numeric columns.
Q1 = df_cleaned[numeric_cols].quantile(0.25)
Q3 = df_cleaned[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# Cell-wise flag: True where a value lies outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Computed once and shared by `outliers` and `df_final`.
outlier_mask = (
    (df_cleaned[numeric_cols] < (Q1 - 1.5 * IQR))
    | (df_cleaned[numeric_cols] > (Q3 + 1.5 * IQR))
)

# Indexing with a boolean DataFrame keeps the frame's shape and blanks (NaN)
# every cell that is not flagged, so `outliers` shows only the offending values.
outliers = df_cleaned[outlier_mask]

# Keep only the rows in which no numeric column is flagged as an outlier.
df_final = df_cleaned[~outlier_mask.any(axis=1)]

print("\nDataFrame após tratamento de dados discrepantes:")
print(df_final)


Valores faltantes por coluna:
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

DataFrame após tratamento de dados faltantes:
    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0    Adelie  Torgersen        39.10000       18.70000         181.000000   
1    Adelie  Torgersen        39.50000       17.40000         186.000000   
2    Adelie  Torgersen        40.30000       18.00000         195.000000   
3    Adelie  Torgersen        43.92193       17.15117         200.915205   
4    Adelie  Torgersen        36.70000       19.30000         193.000000   
..      ...        ...             ...            ...                ...   
339  Gentoo     Biscoe        43.92193       17.15117         200.915205   
340  Gentoo     Biscoe        46.80000       14.30000         215.000000   
341  Gentoo     Biscoe        50.40000       15.70000     