In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import sklearn as sk 

from matplotlib import style
style.use('dark_background')

In [2]:
train_data = pd.read_csv('train_data.csv')
train_data = train_data.drop(['ID','HealthServiceArea'], axis = 1)
train_data.head()

Unnamed: 0,Gender,Race,TypeOfAdmission,CCSProcedureCode,APRSeverityOfIllnessCode,PaymentTypology,BirthWeight,EmergencyDepartmentIndicator,AverageCostInCounty,AverageChargesInCounty,AverageCostInFacility,AverageChargesInFacility,AverageIncomeInZipCode,LengthOfStay
0,F,Other Race,Newborn,228,1,Medicaid,3700,N,2611,9227,1751,8951,45,1
1,M,Black/African American,Newborn,228,1,Medicaid,2900,N,3242,8966,3338,6409,34,1
2,M,Other Race,Newborn,220,1,Private Health Insurance,3200,N,3155,11381,4980,9323,45,1
3,F,Other Race,Newborn,0,1,Private Health Insurance,3300,N,3155,11381,5826,15680,59,1
4,F,Other Race,Newborn,228,1,Medicaid,2600,N,2611,9227,6000,14344,59,1


In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59966 entries, 0 to 59965
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Gender                        59966 non-null  object
 1   Race                          59966 non-null  object
 2   TypeOfAdmission               59966 non-null  object
 3   CCSProcedureCode              59966 non-null  int64 
 4   APRSeverityOfIllnessCode      59966 non-null  int64 
 5   PaymentTypology               59966 non-null  object
 6   BirthWeight                   59966 non-null  int64 
 7   EmergencyDepartmentIndicator  59966 non-null  object
 8   AverageCostInCounty           59966 non-null  int64 
 9   AverageChargesInCounty        59966 non-null  int64 
 10  AverageCostInFacility         59966 non-null  int64 
 11  AverageChargesInFacility      59966 non-null  int64 
 12  AverageIncomeInZipCode        59966 non-null  int64 
 13  LengthOfStay    

In [4]:
train_data.describe()

Unnamed: 0,CCSProcedureCode,APRSeverityOfIllnessCode,BirthWeight,AverageCostInCounty,AverageChargesInCounty,AverageCostInFacility,AverageChargesInFacility,AverageIncomeInZipCode,LengthOfStay
count,59966.0,59966.0,59966.0,59966.0,59966.0,59966.0,59966.0,59966.0,59966.0
mean,155.404229,1.254594,3336.298903,2372.80669,7979.126922,2396.414318,7958.472668,59.811143,2.538405
std,89.541978,0.546207,446.244475,639.755096,3220.291347,1248.501189,3859.294711,21.47017,1.171246
min,-1.0,1.0,2500.0,712.0,1243.0,457.0,1120.0,28.0,1.0
25%,115.0,1.0,3000.0,2041.0,4620.0,1551.0,4438.0,45.0,2.0
50%,220.0,1.0,3300.0,2533.0,9227.0,1967.0,7945.0,55.0,2.0
75%,228.0,1.0,3600.0,2785.0,10644.0,2895.0,11619.0,74.0,3.0
max,231.0,4.0,7500.0,3242.0,11381.0,8114.0,18466.0,115.0,10.0


In [5]:
X = train_data.drop('LengthOfStay', axis = 1)
Y = train_data['LengthOfStay']
Y_convert = Y.apply(lambda x: 0 if int(x) <= 3 else 1)

In [6]:
Y.value_counts()

2     25000
3     16000
1      8895
4      7504
5      1342
6       557
7       346
8       145
9        97
10       80
Name: LengthOfStay, dtype: int64

In [7]:
Y_convert.value_counts()

0    49895
1    10071
Name: LengthOfStay, dtype: int64

In [8]:
for col in X.columns:
    print(X[col].value_counts())
    print('---------------------------')

M    30978
F    28987
U        1
Name: Gender, dtype: int64
---------------------------
White                     32943
Other Race                18314
Black/African American     8183
Multi-racial                526
Name: Race, dtype: int64
---------------------------
Newborn      58741
Emergency      659
Urgent         412
Elective       154
Name: TypeOfAdmission, dtype: int64
---------------------------
 228    19886
 115    13628
 0      11189
 220    10773
 231     2981
-1        769
 216      740
Name: CCSProcedureCode, dtype: int64
---------------------------
1    47953
2     8760
3     3252
4        1
Name: APRSeverityOfIllnessCode, dtype: int64
---------------------------
Medicaid                     28723
Private Health Insurance     15608
Blue Cross/Blue Shield       12073
Self-Pay                      1984
Federal/State/Local/VA         849
Managed Care, Unspecified      545
Miscellaneous/Other            118
Medicare                        44
Unknown                        

## Dummy Encode

In [9]:
# Select columns for one-hot encoding
# CSS Procedure Code not sure
encode_col = ['Gender','Race','TypeOfAdmission','PaymentTypology','EmergencyDepartmentIndicator']
X_copy = X.copy()


for col in encode_col:
    # create new dataframe for each type of encode_column
    dummiesDF = pd.get_dummies(X_copy[col])
    # merge the new dataframe into the orginal
    X_copy = pd.concat([X_copy, dummiesDF], axis='columns')
    # Drop the original column after encoding
    if col == 'Gender':
        X_copy = X_copy.drop([col,'U'], axis = 1)
    else:
        X_copy = X_copy.drop(col, axis = 1)

X_copy.columns


Index(['CCSProcedureCode', 'APRSeverityOfIllnessCode', 'BirthWeight',
       'AverageCostInCounty', 'AverageChargesInCounty',
       'AverageCostInFacility', 'AverageChargesInFacility',
       'AverageIncomeInZipCode', 'F', 'M', 'Black/African American',
       'Multi-racial', 'Other Race', 'White', 'Elective', 'Emergency',
       'Newborn', 'Urgent', 'Blue Cross/Blue Shield', 'Federal/State/Local/VA',
       'Managed Care, Unspecified', 'Medicaid', 'Medicare',
       'Miscellaneous/Other', 'Private Health Insurance', 'Self-Pay',
       'Unknown', 'N', 'Y'],
      dtype='object')

In [10]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(X_copy,Y_convert,test_size = 0.3, stratify = Y)

print(train_X.shape)
print(test_X.shape)
print(train_Y.shape)
print(test_Y.shape)

(41976, 29)
(17990, 29)
(41976,)
(17990,)


In [11]:
# Reset index after spliting

def df_reset_index(x):
    return x.reset_index().drop('index', axis = 1)

train_X = df_reset_index(train_X)
test_X = df_reset_index(test_X)
train_Y = df_reset_index(train_Y)
test_Y = df_reset_index(test_Y)

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train_X)
train_X_scale = scaler.transform(train_X)
test_X_scale = scaler.transform(test_X)
train_Y_ravel = train_Y.values.ravel()

In [13]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(random_state = 0, max_iter = 1000)
log_reg.fit(train_X_scale, train_Y_ravel)
print('Intercept:', log_reg.intercept_)
print('Coefficient/Theta', log_reg.coef_)

Intercept: [-1.76891642]
Coefficient/Theta [[ 0.07764193  0.59920665 -0.06911987 -0.00353355 -0.03155733  0.13345883
   0.02048114  0.26401642 -0.45927853 -0.38400441  0.09617262 -0.02537596
  -0.01446672 -0.04822616 -0.0797998  -0.1780288   0.1432542   0.03258158
   0.03249744 -0.0137461   0.03041437 -0.05449524 -0.01392996  0.0178171
   0.0412354  -0.03386521  0.02218569 -0.0236418   0.0236418 ]]


In [14]:
from sklearn.metrics import accuracy_score, mean_squared_error

log_pred = log_reg.predict(test_X_scale)
print('Accuracy:',accuracy_score(test_Y,log_pred))
print('Mean squared error ', mean_squared_error(test_Y, log_pred))

Accuracy: 0.836242356864925
Mean squared error  0.16375764313507504


In [15]:
unique, counts = np.unique(log_pred, return_counts=True)
unique_Y, counts_Y = np.unique(test_Y, return_counts=True)
print('Prediction:',dict(zip(unique, counts)))
print('Test_Y:',dict(zip(unique_Y, counts_Y)))

Prediction: {0: 17331, 1: 659}
Test_Y: {0: 14969, 1: 3021}


## Lasso

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(X_copy,Y,test_size = 0.3, stratify = Y)

X_train = df_reset_index(X_train)
X_test = df_reset_index(X_test)
Y_train = df_reset_index(Y_train)
Y_test = df_reset_index(Y_test)

print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(41976, 29)
(17990, 29)
(41976, 1)
(17990, 1)


In [17]:
scaler_1 = StandardScaler()
scaler_1.fit(X_train)
X_train_scale = scaler_1.transform(X_train)
X_test_scale = scaler.transform(X_test)
Y_train_ravel = Y_train.values.ravel()

In [20]:
from sklearn.linear_model import Lasso 

# A list to hold different values of alpha
Log_Reg_Para = [0.01,0.05,0.1,0.25,0.5,0.75,1]

for para in Log_Reg_Para:
    # Create the logistic regression object
    lasso = Lasso(alpha = para, max_iter=1000)
    
    # Fit the model
    lasso.fit(X_train_scale, Y_train_ravel)
    
    # Predict
    preds = lasso.predict(X_test_scale)
    
    # MSE & Accuracy
    mse = mean_squared_error(Y_test, preds)
    print("Alpha MSE",para,":",mse)
    print('---------------------------')

Alpha MSE 0.01 : 1.1337791679192737
---------------------------
Alpha MSE 0.05 : 1.1525707848205176
---------------------------
Alpha MSE 0.1 : 1.1866960825751196
---------------------------
Alpha MSE 0.25 : 1.2761192951157316
---------------------------
Alpha MSE 0.5 : 1.371049997371983
---------------------------
Alpha MSE 0.75 : 1.371049997371983
---------------------------
Alpha MSE 1 : 1.371049997371983
---------------------------


In [27]:
# Select alpha = 0.01 model
# Create the logistic regression object
lasso = Lasso(alpha = 0.01, max_iter=1000)

# Fit the model
lasso.fit(X_train_scale, Y_train_ravel)

# Predict
preds = lasso.predict(X_test_scale)
print(preds)

[2.43787798 2.1576745  2.26740438 ... 2.54571715 2.39852767 2.43725535]


In [26]:
for i in range (len(preds)):
    if preds[i] <= 3:
        preds[i] = 0
    else:
        preds[i] = 1

print('Accuracy:',accuracy_score(test_Y,preds))

Accuracy: 0.8320733740967204
