In [18]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [19]:
# CSV file that every later cell of this notebook reads through `df`.
DATASET_PATH = 'processed_dataset.csv'
df = pd.read_csv(DATASET_PATH)

In [20]:
# Preview the first 10 rows of the loaded frame as a quick sanity check.
df.head(10)

Unnamed: 0,UNIQUE_ID,GENDER,DAYS_STAY,AGE,Diagnosis_Certain Conditions originating in the Perinatal Period,"Diagnosis_Complications of Pregnancy, Childbirth, and the Puerperium",Diagnosis_Congenital Anomalies,Diagnosis_Diseases of the Blood and Blood-forming Organs,Diagnosis_Diseases of the Circulatory System,Diagnosis_Diseases of the Digestive System,...,Diagnosis_Neoplasms,Diagnosis_Supplementary Classification of External Causes of Injury and Poisoning,Diagnosis_Supplementary Classification of Factors influencing Health Status and Contact with Health Services,"Diagnosis_Symptoms, Signs and Ill-defined Conditions",Insurance_Government,Insurance_Medicaid,Insurance_Medicare,Insurance_Private,Insurance_Self Pay,avg_days
0,10000_187813,0,20,50,0,0,0,1,1,1,...,0,0,0,1,0,0,0,0,1,14
1,10001_118420,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,10
2,10002_132138,0,2,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,15
3,10003_144039,0,10,41,0,0,0,1,1,0,...,0,1,0,0,0,0,0,1,0,12
4,10004_161106,0,3,53,0,0,0,0,1,0,...,0,1,1,0,0,0,1,0,0,12
5,10004_164713,0,17,52,0,0,0,0,1,1,...,0,1,1,1,0,0,1,0,0,12
6,10005_181146,1,2,0,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0,0,17
7,10006_142345,1,8,70,0,0,0,1,1,1,...,0,1,1,1,0,0,1,0,0,14
8,10007_166331,1,8,55,0,0,1,0,1,0,...,0,0,0,1,0,0,0,1,0,17
9,10008_180596,1,34,0,1,0,0,0,0,0,...,0,0,1,1,0,0,0,1,0,15


In [21]:
def _cap_upper_outliers(values):
    """Clamp entries above the upper IQR fence (Q3 + 1.5 * IQR) to that fence.

    The quartiles are taken from ``values`` itself. Only the upper tail is
    changed; entries at or below the fence pass through as they are. The
    result is the array produced by ``np.where``, not a Series.
    """
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    upper_fence = upper_quartile + 1.5 * (upper_quartile - lower_quartile)
    return np.where(values > upper_fence, upper_fence, values)


# NOTE(review): the fences are computed from every row of `df`, and DAYS_STAY
# is capped as well as AGE. If a train/test split follows, confirm that
# deriving the fence from all rows (and capping the target) is intended.
for column in ('AGE', 'DAYS_STAY'):
    df[column] = _cap_upper_outliers(df[column])

In [22]:
# Features are every column except the target and the UNIQUE_ID column.
TARGET_COLUMN = 'DAYS_STAY'
ID_COLUMN = 'UNIQUE_ID'

X = df.drop(columns=[TARGET_COLUMN, ID_COLUMN])
y = df[TARGET_COLUMN]

# Split 70:30 into train and test; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Show one row of the feature matrix to confirm which columns remain.
X.head(1)

Unnamed: 0,GENDER,AGE,Diagnosis_Certain Conditions originating in the Perinatal Period,"Diagnosis_Complications of Pregnancy, Childbirth, and the Puerperium",Diagnosis_Congenital Anomalies,Diagnosis_Diseases of the Blood and Blood-forming Organs,Diagnosis_Diseases of the Circulatory System,Diagnosis_Diseases of the Digestive System,Diagnosis_Diseases of the Genitourinary System,Diagnosis_Diseases of the Musculoskeletal System and Connective Tissue,...,Diagnosis_Neoplasms,Diagnosis_Supplementary Classification of External Causes of Injury and Poisoning,Diagnosis_Supplementary Classification of Factors influencing Health Status and Contact with Health Services,"Diagnosis_Symptoms, Signs and Ill-defined Conditions",Insurance_Government,Insurance_Medicaid,Insurance_Medicare,Insurance_Private,Insurance_Self Pay,avg_days
0,0,50.0,0,0,0,1,1,1,1,1,...,0,0,0,1,0,0,0,0,1,14


In [23]:
# Build a ridge regressor with alpha=1 and train it on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
ridge = Ridge(alpha=1).fit(X_train, y_train)

# Print the fitted coefficients.
print(ridge.coef_)

# Predict the target for the held-out test rows.
y_pred = ridge.predict(X_test)

[-0.22773021 -0.01147306  7.21748878  1.41540455  3.65111508  0.95586944
  1.5490177   1.29938983  1.03248132 -0.01927645  0.6592174   2.68466966
  3.28156508  0.0248092   2.35127038  2.55286698 -0.64381437  1.05270873
 -0.12099067 -0.99179669  0.66225298  0.43452013  0.28382089  0.1218523
  0.21252345 -1.05271676 -0.3745799 ]


In [24]:
# Score the test-set predictions with each metric in turn:
# mean absolute error, mean squared error, then R².
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Print the three metrics, one per line.
print(f"MAE: {mae}", f"MSE: {mse}", f"R² score: {r2}", sep="\n")

MAE: 4.653826688635875
MSE: 35.69365123326285
R² score: 0.22506809445963583


In [25]:
# `alphas` lists the candidate regularisation strengths to try.
# RidgeCV is imported with the other dependencies in the notebook's import cell.
ridge_cv = RidgeCV(alphas=[0.01, 0.1, 1, 10, 100])

# Train the model; the built-in cross-validation selects among `alphas`
# during fit.
ridge_cv.fit(X_train, y_train)

# Print the alpha that cross-validation selected.
print(ridge_cv.alpha_)

# Predict the target for the held-out test rows.
# This reassigns `y_pred`, so any earlier predictions stored under that name
# are replaced.
y_pred = ridge_cv.predict(X_test)


1.0


In [26]:
# Score whatever `y_pred` currently holds against the test targets:
# mean absolute error, mean squared error, then R².
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Print the three metrics, one per line.
print(f"MAE: {mae}", f"MSE: {mse}", f"R² score: {r2}", sep="\n")

MAE: 4.6538266933681705
MSE: 35.69365124181228
R² score: 0.22506809427402208
