# Load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#Dataset from https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/PX9K2R
#Read Excel https://datatofish.com/read_excel/
df = pd.read_excel (r'C:\Users\moson\Downloads\Hyperchloremia _and_DKA_Dataset.xlsx')

# Clean/Scale Data

In [2]:
# these col have empty data so drop
df = df.drop("Hospital AKI, Time of Onset from inition of DKA treatment (hours)", axis = 1)
df = df.drop("Duration of Admission Acute Kidney Injury (hours)", axis = 1)
df.dropna(inplace=True)
df = df.reset_index()

# Train/test split

In [3]:
# https://stackoverflow.com/questions/24147278/how-do-i-create-test-and-train-samples-from-one-dataframe-with-pandas
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
from sklearn.preprocessing import MinMaxScaler

X_train = train.drop("Time to Final DKA Resolution (hours)", axis=1)
Y_train = train["Time to Final DKA Resolution (hours)"]
X_test  = test.drop("Time to Final DKA Resolution (hours)", axis=1).copy()
Y_test = test["Time to Final DKA Resolution (hours)"]
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

scaler = MinMaxScaler()
train_model = scaler.fit(X_train)
test_model = scaler.fit(X_test)
X_train = train_model.transform(X_train)
X_test = test_model.transform(X_test)

(80, 48) (80,) (20, 48) (20,)


In [5]:
from sklearn.metrics import mean_squared_error
from sklearn import linear_model

las = linear_model.Lasso(alpha=1)
las.fit(X_train, Y_train)
las_predictions = las.predict(X_test)
print('Lasso Root Mean Squared Error:', mean_squared_error(Y_test, las_predictions, squared = False))

Lasso Root Mean Squared Error: 11.554641011464003


In [6]:
cv_las = linear_model.LassoCV(alphas=np.arange(0.07, 1, 0.01), cv=5)
cv_las.fit(X_train, Y_train)
cvlas_predictions = cv_las.predict(X_test)

print('Best Alpha: %f' % cv_las.alpha_)
print('LassoCV Root Mean Squared Error:', mean_squared_error(Y_test, cvlas_predictions, squared = False))

Best Alpha: 0.070000
LassoCV Root Mean Squared Error: 7.386731536769981


In [7]:
from joblib import dump

dump(cv_las, './../savedModels/models.joblib')

['./../savedModels/models.joblib']