In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

Carregando os dados de uma planilha Excel

In [None]:
# Read the 'dados_tratados.xlsx' workbook into the working DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_excel(io='dados_tratados.xlsx')

Verificando e tratando valores ausentes nas colunas numéricas

In [None]:
# Columns that the rest of the notebook imputes, scales and feeds to the
# outlier detector.
numerical_features = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'restecg', 'thalch',
    'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num2',
]

# Fill missing entries with the mean of their own column. The guard skips the
# mean computation entirely when no value is missing.
# NOTE(review): some of these columns look like categorical codes (e.g. 'sex',
# 'cp'); a mean may not be a valid category there — confirm this is intended.
if df[numerical_features].isna().any(axis=None):
    column_means = df[numerical_features].mean()
    df[numerical_features] = df[numerical_features].fillna(column_means)

Normalizando as colunas numéricas (padronização z-score: média 0 e desvio padrão 1)

In [None]:
# Standardise every feature column with StandardScaler's default settings
# (per-column mean removed, scaled to unit variance). The fitted scaler is kept
# in `scaler` so the same transformation can be reused or inverted later.
scaler = StandardScaler()
scaler.fit(df[numerical_features])
# Overwrites the original values in place: from here on `df` holds scaled data.
df[numerical_features] = scaler.transform(df[numerical_features])

Aplicando o Isolation Forest para detectar outliers

In [None]:
# Fit an Isolation Forest on the feature columns. `contamination=0.05` sets the
# expected share of outliers and `random_state=42` fixes the seed so the result
# is reproducible.
iso_forest = IsolationForest(random_state=42, contamination=0.05)

# fit_predict returns one label per row; the labels are stored both in the
# `outliers` array and as a helper column on the frame.
df['outliers'] = outliers = iso_forest.fit_predict(df[numerical_features])

Removendo os outliers identificados

In [None]:
# Keep only the rows labelled 1 and discard the helper column afterwards.
# Gotcha: this cell removes 'outliers', so re-running it without re-running the
# detection cell first raises a KeyError.
df = df.loc[df['outliers'].eq(1)].drop(columns=['outliers'])

Verificando o resultado

In [None]:
# Preview the first rows of the filtered frame. A bare last-line expression is
# rendered by the notebook's rich display (full-width table), whereas print()
# produces wrapped plain text.
df.head()

   id       age       sex        cp  trestbps      chol   restecg    thalch  \
0   1  0.948726  0.686202  2.251775  0.757525 -0.264900  1.016684  0.017197   
1   2  1.392002  0.686202 -0.877985  1.611220  0.760415  1.016684 -1.821905   
2   3  1.392002  0.686202 -0.877985 -0.665300 -0.342283  1.016684 -0.902354   
3   4 -1.932564  0.686202  0.165268 -0.096170  0.063974 -0.996749  1.637359   
4   5 -1.489288 -1.457296  1.208521 -0.096170 -0.825922  1.016684  0.980537   

      exang   oldpeak     slope        ca      thal      num2  hd  
0 -0.696631  1.305086  2.274579 -0.723095  2.461950 -0.764198   0  
1  1.435481  0.577838  0.649113  2.503851 -0.835207  0.866450   1  
2  1.435481  1.577804  0.649113  1.428203  0.813372  0.051126   1  
3 -0.696631  2.395958  2.274579 -0.723095 -0.835207 -0.764198   0  
4 -0.696631  0.486932 -0.976352 -0.723095 -0.835207 -0.764198   0  


Gerando nova planilha para uso com os dados tratados

In [None]:
# Write the resulting frame to a new workbook. `index=False` leaves the
# DataFrame index out of the file.
df.to_excel(excel_writer='dados_normalizados.xlsx', index=False)