# **TCC - Regressão Linear**
Morgana Weber

# Imports e leitura dos arquivos

In [26]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [27]:
patients = pd.read_csv('patients.csv')
admissions = pd.read_csv('admissions.csv')
diagnoses = pd.read_csv('diagnoses_icd.csv')

data = pd.merge(patients, admissions, on='SUBJECT_ID')
data = pd.merge(data, diagnoses, on=['SUBJECT_ID', 'HADM_ID'])
csv_data = data[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE', 'DOB', 'GENDER', 'ADMITTIME', 'DISCHTIME', 'INSURANCE', 'DIAGNOSIS']]

#data.head()

#categoriza o dignostico com base no icd-9
categories = {
    '001': 'Infectious and Parasitic Diseases',
    '140': 'Neoplasms',
    '240': 'Endocrine, Nutritional and Metabolic Diseases, and Immunity Disorders',
    '280': 'Diseases of the Blood and Blood-forming Organs',
    '290': 'Mental Disorders',
    '320': 'Diseases of the Nervous System and Sense Organs',
    '390': 'Diseases of the Circulatory System',
    '460': 'Diseases of the Respiratory System',
    '520': 'Diseases of the Digestive System',
    '580': 'Diseases of the Genitourinary System',
    '630': 'Complications of Pregnancy, Childbirth, and the Puerperium',
    '680': 'Diseases of the Skin and Subcutaneous Tissue',
    '710': 'Diseases of the Musculoskeletal System and Connective Tissue',
    '740': 'Congenital Anomalies',
    '760': 'Certain Conditions originating in the Perinatal Period',
    '780': 'Symptoms, Signs and Ill-defined Conditions',
    '800': 'Injury and Poisoning',
    'E80': 'Supplementary Classification of External Causes of Injury and Poisoning',
    'V01': 'Supplementary Classification of Factors influencing Health Status and Contact with Health Services',
    'M80': 'Morphology of Neoplasms'
}

# Feature Engineering


In [28]:
#data = pd.merge(patients, admissions, on='SUBJECT_ID')

# Transformação dos dados
data['DOB'] = pd.to_datetime(data['DOB'])
data['ADMITTIME'] = pd.to_datetime(data['ADMITTIME'])
data['DOB_YEAR'] = data['DOB'].dt.year
data['ADMIT_YEAR'] = data['ADMITTIME'].dt.year
data['AGE'] = data['DOB_YEAR'] - data['ADMIT_YEAR']

data['ADMITTIME'] = pd.to_datetime(data['ADMITTIME'])
data['DISCHTIME'] = pd.to_datetime(data['DISCHTIME'])
data['DAYS_STAY'] = (data['DISCHTIME'] - data['ADMITTIME']).dt.days

data['PREV_ADMISSION'] = data.groupby('SUBJECT_ID').cumcount()
data['PREV_ADMISSION_CHECK'] = data['PREV_ADMISSION'].apply(lambda x: '1' if x > 0 else '0')

data = data.dropna(subset=['ICD9_CODE'])

def new_categories(code):
    if pd.isnull(code):  # Tratar valores nulos
        return 'N/A'
    elif isinstance(code, float):  # Tratar valores inválidos (float)
        return 'Invalid'
    else:
        digits = code[:3]
        if digits[0] == 'V':
            return categories['V01']
        if digits[0] == 'E': 
            return categories['E80']
        if digits[0] == 'M':
            return categories['M80']
        if not digits[0] in ['0','1','2','3','4','5','6','7','8','9']:
            return 'Other'
        if int(digits) >= 800 and int(digits) <= 999:
            return categories['800']
        else:
            previous_key = None
            for key in categories:
                #print(code)
                if int(digits) < int(key):
                    if previous_key is not None:
                        return categories[previous_key]
                    return categories[key]
                previous_key = key
            return 'Other'

data['DIAGNOSIS_CATEGORY'] = data['ICD9_CODE'].apply(new_categories)

data['B_GENDER'] = data['GENDER'].map({'M': 0, 'F': 1})

insurance_dummies= pd.get_dummies(data['INSURANCE'], prefix='INSURANCE')
diagnosis_dummies = pd.get_dummies(data['DIAGNOSIS_CATEGORY'], prefix='DIAGNOSIS_CATEGORY')

csv_data = data[['ADMITTIME', 'DISCHTIME','SUBJECT_ID', 'HADM_ID','INSURANCE', 'DIAGNOSIS_CATEGORY','DIAGNOSIS','AGE', 'B_GENDER', 'PREV_ADMISSION_CHECK', 'DAYS_STAY']]
#csv_data.to_csv('output.csv')

csv_data.head()

Unnamed: 0,ADMITTIME,DISCHTIME,SUBJECT_ID,HADM_ID,INSURANCE,DIAGNOSIS_CATEGORY,DIAGNOSIS,AGE,B_GENDER,PREV_ADMISSION_CHECK,DAYS_STAY
0,2149-12-17 20:41:00,2149-12-31 14:55:00,249,116935,Medicare,Diseases of the Respiratory System,UNSTABLE ANGINA;ASTHMA;BRONCHITIS,-74,1,0,13
1,2149-12-17 20:41:00,2149-12-31 14:55:00,249,116935,Medicare,Diseases of the Respiratory System,UNSTABLE ANGINA;ASTHMA;BRONCHITIS,-74,1,1,13
2,2149-12-17 20:41:00,2149-12-31 14:55:00,249,116935,Medicare,Diseases of the Circulatory System,UNSTABLE ANGINA;ASTHMA;BRONCHITIS,-74,1,1,13
3,2149-12-17 20:41:00,2149-12-31 14:55:00,249,116935,Medicare,Diseases of the Circulatory System,UNSTABLE ANGINA;ASTHMA;BRONCHITIS,-74,1,1,13
4,2149-12-17 20:41:00,2149-12-31 14:55:00,249,116935,Medicare,Injury and Poisoning,UNSTABLE ANGINA;ASTHMA;BRONCHITIS,-74,1,1,13


# Tratamento dos dados

In [4]:
for col in ['AGE', 'DAYS_STAY']:
    Q1_train = csv_data[col].quantile(0.25)
    Q3_train = csv_data[col].quantile(0.75)
    IQR_train = Q3_train - Q1_train
    csv_data[col] = np.where((csv_data[col] > (Q3_train + 1.5*IQR_train)), (Q3_train + 1.5*IQR_train), csv_data[col])

# Separação entre treino e teste
data_train, data_test = train_test_split(csv_data, test_size=0.3, random_state=12)

# Cálculo de 'AVG_DAYS_STAY' para os dados de treinamento
# data_train['ADMITTIME'] = pd.to_datetime(data_train['ADMITTIME'])
# data_train['DISCHTIME'] = pd.to_datetime(data_train['DISCHTIME'])
# data_train['AVG_DAYS_STAY'] = (data_train['DISCHTIME'] - data_train['ADMITTIME']).dt.days

# Agrega por diagnostico e calcula a média
#avg_days_by_diagnosis_train = data_train.groupby('DIAGNOSIS')['AVG_DAYS_STAY'].mean().reset_index()


# Adiciona a coluna com o tempo médio de acordo com o diagnóstico
#data_train = pd.merge(data_train, avg_days_by_diagnosis_train, on='DIAGNOSIS', how='left')
#data_test = pd.merge(data_test, avg_days_by_diagnosis_train, on='DIAGNOSIS', how='left')

# Conversão do gênero para binario
# data_train['GENDER'] = data_train['GENDER'].map({'M': 0, 'F': 1})
# data_test['GENDER'] = data_test['GENDER'].map({'M': 0, 'F': 1})

data_train.head()

# def new_categories(code):
#     if pd.isnull(code):  # Tratar valores nulos
#         return 'N/A'
#     elif isinstance(code, float):  # Tratar valores inválidos (float)
#         return 'Invalid'
#     else:
#         digits = code[:3]
#         if digits[0] == 'V':
#             return categories['V01']
#         if digits[0] == 'E': 
#             return categories['E80']
#         if digits[0] == 'M':
#             return categories['M80']
#         if not digits[0] in ['0','1','2','3','4','5','6','7','8','9']:
#             return 'Other'
#         if int(digits) >= 800 and int(digits) <= 999:
#             return categories['800']
#         else:
#             previous_key = None
#             for key in categories:
#                 #print(code)
#                 if int(digits) < int(key):
#                     if previous_key is not None:
#                         return categories[previous_key]
#                     return categories[key]
#                 previous_key = key
#             return 'Other'

# data_train['DIAGNOSIS_CATEGORY'] = data_train['ICD9_CODE'].apply(new_categories)
# data_test['DIAGNOSIS_CATEGORY'] = data_test['ICD9_CODE'].apply(new_categories)

# Seleciona as variáveis que serão utilizadas para prever o tempo de internação
data_model_train = data_train[['INSURANCE', 'DIAGNOSIS_CATEGORY','AGE', 'GENDER', 'PREV_ADMISSION_CHECK']]
data_model_test = data_test[['INSURANCE', 'DIAGNOSIS_CATEGORY','AGE', 'GENDER', 'PREV_ADMISSION_CHECK']]

#data_model_train.head()

# Cria variáveis dummies para diagnóstico
# insurance_dummies_train = pd.get_dummies(data_model_train['INSURANCE'], prefix='INSURANCE')
# diagnosis_dummies_train = pd.get_dummies(data_model_train['DIAGNOSIS_CATEGORY'], prefix='DIAGNOSIS_CATEGORY')

# insurance_dummies_test = pd.get_dummies(data_model_test['INSURANCE'], prefix='INSURANCE')
# diagnosis_dummies_test = pd.get_dummies(data_model_test['DIAGNOSIS_CATEGORY'], prefix='DIAGNOSIS_CATEGORY')

# Combinação dos dados com os dummies
# model_data_train = pd.concat([data_model_train, insurance_dummies_train, diagnosis_dummies_train], axis=1)
# model_data_test = pd.concat([data_model_test, insurance_dummies_test, diagnosis_dummies_test], axis=1)

data_model_test = data_model_test.reindex(columns = data_model_train.columns, fill_value=0)

# model_data_train = model_data_train.dropna(subset=['AVG_DAYS_STAY'])
# model_data_test = model_data_test.dropna(subset=['AVG_DAYS_STAY'])

# Tratamento de outliers
# for col in ['AGE', 'DAYS_STAY']:
#     Q1_train = model_data_train[col].quantile(0.25)
#     Q3_train = model_data_train[col].quantile(0.75)
#     IQR_train = Q3_train - Q1_train
#     model_data_train[col] = np.where((model_data_train[col] > (Q3_train + 1.5*IQR_train)), (Q3_train + 1.5*IQR_train), model_data_train[col])

#     Q1_test = model_data_test[col].quantile(0.25)
#     Q3_test = model_data_test[col].quantile(0.75)
#     IQR_test = Q3_test - Q1_test
#     model_data_test[col] = np.where((model_data_test[col] > (Q3_test + 1.5*IQR_test)), (Q3_test + 1.5*IQR_test), model_data_test[col])

y_train = data_model_train['AVG_DAYS_STAY']
#y_test = model_data_test.drop('AVG_DAYS_STAY')
y_test = data_model_test['AVG_DAYS_STAY']

# Atualização de X
X_train = data_model_train.drop(['INSURANCE', 'DIAGNOSIS_CATEGORY'], axis=1)
X_test = data_model_test.drop(['INSURANCE', 'DIAGNOSIS_CATEGORY'], axis=1)

# print(X_train)
# print(X_test)

# Normalização
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

KeyError: 'AVG_DAYS_STAY'

# Linear Regression


In [None]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

# Cálculo das métricas
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f'Training MSE: {mse_train}, Testing MSE: {mse_test}')
print(f'Training MAE: {mae_train}, Testing MAE: {mae_test}')
print(f'Training R2 Score: {r2_train}, Testing R2 Score: {r2_test}')

Training MSE: 4.9991824156599544e-29, Testing MSE: 4.9766847327623614e-29
Training MAE: 5.3755239841984095e-15, Testing MAE: 5.362041811302779e-15
Training R2 Score: 1.0, Testing R2 Score: 1.0
