# **TCC - Regressão Linear**
Morgana Weber

# Imports e leitura dos arquivos

In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error,  r2_score

In [18]:
patients = pd.read_csv('patients.csv')
admissions = pd.read_csv('admissions.csv')

data = pd.merge(patients, admissions, on='SUBJECT_ID')
#data.head()

# Feature Engineering

In [19]:
data['DOB'] = pd.to_datetime(data['DOB'])
data['ADMITTIME'] = pd.to_datetime(data['ADMITTIME'])

data['DOB_YEAR'] = data['DOB'].dt.year
data['ADMIT_YEAR'] = data['ADMITTIME'].dt.year

#calcula a fake idade
data['AGE'] = data['ADMIT_YEAR'] - data['DOB_YEAR']

#cria uma coluna dizendo se internou mais de uma vez
data['PREV_ADMISSION'] = data.groupby('SUBJECT_ID').cumcount()
data['PREV_ADMISSION_CHECK'] = data['PREV_ADMISSION'].apply(lambda x: '1' if x > 0 else '0')

#tempo de estadia
data['ADMITTIME'] = pd.to_datetime(data['ADMITTIME'])
data['DISCHTIME'] = pd.to_datetime(data['DISCHTIME'])
data['AVG_DAYS_STAY'] = (data['DISCHTIME'] - data['ADMITTIME']).dt.days

#agrega por diagnostico e calcula a média
avg_days_by_diagnosis = data.groupby('DIAGNOSIS')['AVG_DAYS_STAY'].mean().reset_index()

patients_per_diagnosis = data.groupby('DIAGNOSIS')['SUBJECT_ID'].nunique().reset_index()
patients_per_diagnosis.columns = ['DIAGNOSIS', 'NUMBER_OF_PATIENTS']

sorted_patients_per_diagnosis = patients_per_diagnosis.sort_values('NUMBER_OF_PATIENTS', ascending=False)

# Para exibir a tabela
#print(sorted_patients_per_diagnosis)

#converte genero pra binário
data['GENDER'] = data['GENDER'].map({'M': 0, 'F': 1})

selected_data = data[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'INSURANCE', 'DIAGNOSIS','AGE', 'GENDER', 'PREV_ADMISSION_CHECK']]

#adiciona o tempo médio de acordo com o diagnostico
selected_data = pd.merge(selected_data, avg_days_by_diagnosis, on='DIAGNOSIS', how='left')
selected_data.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,INSURANCE,DIAGNOSIS,AGE,GENDER,PREV_ADMISSION_CHECK,AVG_DAYS_STAY
0,249,116935,2149-12-17 20:41:00,2149-12-31 14:55:00,Medicare,UNSTABLE ANGINA;ASTHMA;BRONCHITIS,74,1,0,13.0
1,249,149546,2155-02-03 20:16:00,2155-02-14 11:15:00,Medicare,GI BLEED/ CHEST PAIN,80,1,1,10.0
2,249,158975,2156-04-27 15:33:00,2156-05-14 15:30:00,Medicare,GI BLEEDING\COLONOSCOPY,81,1,1,16.0
3,250,124271,2188-11-12 09:22:00,2188-11-22 12:00:00,Self Pay,PNEUMONIA;R/O TB,24,1,0,10.0
4,251,117937,2110-07-27 06:46:00,2110-07-29 15:23:00,Private,INTRACRANIAL HEAD BLEED,20,0,0,2.5


# Modelo - XGBoost

In [20]:
label_encoder = LabelEncoder()
categorical_features = ['INSURANCE', 'DIAGNOSIS', 'GENDER','PREV_ADMISSION_CHECK']

for feature in categorical_features:
    selected_data[feature] = label_encoder.fit_transform(selected_data[feature])

# Drop Columns with datetime
selected_data = selected_data.drop(columns=['ADMITTIME', 'DISCHTIME'])

# Normalização dos dados    
scaler = StandardScaler()
selected_data_scaled = pd.DataFrame(scaler.fit_transform(selected_data), columns=selected_data.columns)

# Separação dos dados em recursos e valor-alvo
X = selected_data_scaled.drop(['AVG_DAYS_STAY'], axis=1)
y = selected_data_scaled['AVG_DAYS_STAY']

y = y.replace([np.inf, -np.inf], np.nan)
y = y.dropna()
X = X.loc[y.index, :]

# Separação dos dados em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Treinamento do modelo
xgb_model = XGBRegressor(objective ='reg:squarederror')
xgb_model.fit(X_train, y_train)

# Previsão
y_pred_train = xgb_model.predict(X_train)
y_pred = xgb_model.predict(X_test)

# Avaliação do Modelo
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Agora vamos imprimir as métricas.
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R² score: {r2}")

Treino MSE: 0.5624790547358346
Teste MSE: 1.0744747822930398


# Analisando resultados