In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score


In [8]:
# Load 'processed_dataset.csv' (path is relative to the kernel's working
# directory) into the DataFrame every later cell works on.
df = pd.read_csv(filepath_or_buffer='processed_dataset.csv')

In [9]:
# Cap upper-tail outliers of AGE and DAYS_STAY at the Tukey upper fence
# (Q3 + 1.5 * IQR). Only the upper tail is capped; low values are left alone.
# NOTE(review): the quartiles are computed on the whole frame, before the
# train/test split performed further down, and DAYS_STAY (used as the target
# below) is capped as well -- confirm this is intended, since test rows
# influence the cap despite the `_train` suffix in the variable names.
for col in ('AGE', 'DAYS_STAY'):
    column_values = df[col]
    Q1_train = column_values.quantile(0.25)
    Q3_train = column_values.quantile(0.75)
    IQR_train = Q3_train - Q1_train
    # Compute the fence once and reuse it for both the test and the replacement.
    upper_fence = Q3_train + 1.5 * IQR_train
    exceeds_fence = column_values > upper_fence
    df[col] = np.where(exceeds_fence, upper_fence, column_values)

In [10]:
# Target: DAYS_STAY. Features: every remaining column except the target
# itself and the UNIQUE_ID column.
y = df['DAYS_STAY']
X = df.drop(columns=['DAYS_STAY', 'UNIQUE_ID'])

# Hold out 30% of the rows for testing (70:30 split); the fixed seed makes
# the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Show one row to inspect the feature columns.
X.head(1)

Unnamed: 0,GENDER,AGE,Diagnosis_Certain Conditions originating in the Perinatal Period,"Diagnosis_Complications of Pregnancy, Childbirth, and the Puerperium",Diagnosis_Congenital Anomalies,Diagnosis_Diseases of the Blood and Blood-forming Organs,Diagnosis_Diseases of the Circulatory System,Diagnosis_Diseases of the Digestive System,Diagnosis_Diseases of the Genitourinary System,Diagnosis_Diseases of the Musculoskeletal System and Connective Tissue,...,Diagnosis_Neoplasms,Diagnosis_Supplementary Classification of External Causes of Injury and Poisoning,Diagnosis_Supplementary Classification of Factors influencing Health Status and Contact with Health Services,"Diagnosis_Symptoms, Signs and Ill-defined Conditions",Insurance_Government,Insurance_Medicaid,Insurance_Medicare,Insurance_Private,Insurance_Self Pay,avg_days
0,0,50.0,0,0,0,1,1,1,1,1,...,0,0,0,1,0,0,0,0,1,11


In [11]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so test rows do not influence the
# learned mean/variance.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Hyperparameter grid searched exhaustively below (3 * 4 * 3 = 36 candidates).
param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [None, 3, 5, 10],
    'min_samples_split': [7, 3, 5]
}

# Base estimator; the fixed seed keeps the forest reproducible between runs.
clf = RandomForestClassifier(random_state=42)

# GridSearchCV evaluates every parameter combination with 5-fold CV.
# n_jobs=-1 runs those independent fits on all available cores; with the
# seeded estimator this is not expected to change the scores, only the
# wall-clock time.
model = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1)

# Train on the (unscaled) training features.
model.fit(X_train, y_train)

# Print the best parameters found
print("Best Params: ", model.best_params_)

# Predict on the held-out split with the refitted best estimator
predictions = model.predict(X_test)

# Exact-match accuracy: a prediction only counts when it equals the true
# DAYS_STAY value exactly.
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy * 100}%')

# DAYS_STAY is numeric, so exact-match accuracy treats a miss by 1 the same
# as a miss by 10. Also report how far off the predictions are, in the
# target's own units. RMSE is taken via np.sqrt so it does not depend on the
# `squared=` keyword of mean_squared_error.
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print('MAE: ', mae)
print('RMSE: ', rmse)

# Cross-validation. `model` is the GridSearchCV object, so each of the 5
# outer folds re-runs the whole grid search on its training part (nested
# CV) over the full X, y; the n_jobs setting above is carried over to the
# clones used here.
cross_val = cross_val_score(model, X, y, cv=5)
print('Cross Validation: ', cross_val)
print('Average CV Score: ', cross_val.mean() * 100, '%')

Best Params:  {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 400}
Accuracy: 17.29497541400554%
Cross Validation:  [0.17777213 0.17439593 0.17583722 0.16685036 0.16193302]
Average CV Score:  17.135773171469022 %
