In [38]:
import pandas as pd
import numpy as np
import os

df = pd.read_csv("/home/mc2731/Final_Model/cleaned_dataset_2.csv")
print(df.head)

# Coverting categorical values to numeric using binary encoding
binary_cols = ['change', 'diabetes_med', 'readmitted']
for col in binary_cols:
    df[col] = df[col].map({'no': 0, 'yes': 1})

encoding = {'no': 0, 'normal': 1, 'high': 2}
df['glucose_test'] = df['glucose_test'].map(encoding)
df['A1Ctest'] = df['A1Ctest'].map(encoding)

binary_cols2 = ['diag_1', 'diag_2', 'diag_3']
encoding2 = {'Other': 0, 'Respiratory': 1, 'Circulatory': 2, 'Injury': 3, 'Diabetes': 3}

df['diag_1'] = df['diag_1'].map(encoding2)
df['diag_2'] = df['diag_2'].map(encoding2)
df['diag_3'] = df['diag_3'].map(encoding2)

<bound method NDFrame.head of            age  time_in_hospital  n_lab_procedures  n_procedures  \
0      [70-80)                 8                72             1   
1      [70-80)                 3                34             2   
2      [50-60)                 5                45             0   
3      [70-80)                 2                36             0   
4      [60-70)                 1                42             0   
...        ...               ...               ...           ...   
19469  [60-70)                 2                61             4   
19470  [80-90)                 2                66             0   
19471  [70-80)                 5                12             0   
19472  [70-80)                 2                61             3   
19473  [50-60)                10                37             1   

       n_medications  n_outpatient  n_inpatient  n_emergency  \
0                 18             2            0            0   
1                 13     

In [39]:
# After looking at our graph for the medical specialty, we decided it would be best to use label encoding

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['medical_specialty_encoded'] = label_encoder.fit_transform(df['medical_specialty'])

# Dropping the original column
df = df.drop(columns=['medical_specialty'])

print(df)

           age  time_in_hospital  n_lab_procedures  n_procedures  \
0      [70-80)                 8                72             1   
1      [70-80)                 3                34             2   
2      [50-60)                 5                45             0   
3      [70-80)                 2                36             0   
4      [60-70)                 1                42             0   
...        ...               ...               ...           ...   
19469  [60-70)                 2                61             4   
19470  [80-90)                 2                66             0   
19471  [70-80)                 5                12             0   
19472  [70-80)                 2                61             3   
19473  [50-60)                10                37             1   

       n_medications  n_outpatient  n_inpatient  n_emergency  diag_1  diag_2  \
0                 18             2            0            0     2.0     1.0   
1                 13   

In [40]:
# Since age is represented as a range, which is not suitable for our logistic regression model, we convert the age ranges into their corresponding midpoints.
def convert_age_to_midpoint(age_range):
    lower, upper = map(int, age_range.strip('[]()').split('-'))
    return (lower + upper) / 2

df['age'] = df['age'].apply(convert_age_to_midpoint)

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

df = df.dropna()
X = df.drop('readmitted', axis=1)  
y = df['readmitted']               

#To build to model, we split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Next, we initialized and trained the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1] 

In [47]:
# We wanted to retreive the coefficients of our logistic model to understand which variables are the most significant predictors

coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', key=abs, ascending=False)

# Sorting the absolute values of the coefficients
coefficients['Abs_Coefficient'] = coefficients['Coefficient'].abs()
coefficients_sorted = coefficients.sort_values(by='Abs_Coefficient', ascending=False)

print(coefficients_sorted)

                      Feature  Coefficient  Abs_Coefficient
6                 n_inpatient     0.329137         0.329137
7                 n_emergency     0.231650         0.231650
5                n_outpatient     0.160172         0.160172
14               diabetes_med     0.145634         0.145634
11               glucose_test    -0.123424         0.123424
3                n_procedures    -0.076353         0.076353
15  medical_specialty_encoded    -0.051205         0.051205
8                      diag_1     0.044913         0.044913
12                    A1Ctest    -0.038811         0.038811
9                      diag_2     0.032447         0.032447
13                     change     0.030812         0.030812
1            time_in_hospital     0.024232         0.024232
4               n_medications     0.019351         0.019351
10                     diag_3     0.013963         0.013963
2            n_lab_procedures    -0.002290         0.002290
0                         age    -0.0021

In [48]:
# Next, we evaluated the model

from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import mean_squared_error

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Confusion Matrix:\n", conf_matrix)

mse = mean_squared_error(y_test, y_pred_proba)
print(f"Mean Squared Error: {mse}")

Accuracy: 0.6137862796833773
Precision: 0.6398809523809523
Confusion Matrix:
 [[1216  363]
 [ 808  645]]
Mean Squared Error: 0.23108934772244635
