In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# SettingWithCopyWarning lives in pandas.errors from pandas 1.5 onwards and was
# only importable from pandas.core.common in older releases. Try both locations
# so the notebook imports cleanly on any version; if neither provides the
# class there is nothing to silence.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)




In [None]:
# Load Results.csv into `results`; in this notebook it is only displayed in the
# next cell and is not referenced by the preprocessing or modelling cells.
results=pd.read_csv('Results.csv')

In [None]:
results


# Preprocessing and Split

In [None]:

# Load the source dataset. `df` is rebound/modified by most of the cells that
# follow, so re-run this cell first when restarting the pipeline.
df=pd.read_csv('data_v5.csv')


df

#### Duplicates and Scrambling

In [None]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
df.duplicated().sum()

In [None]:
# Remove exact duplicate rows (the first occurrence is kept), then shuffle all
# rows with a fixed seed and renumber the index from 0.
df = (
    df.drop_duplicates(keep='first')
      .sample(frac=1, random_state=4)
      .reset_index(drop=True)
)


In [None]:
# Number of distinct ADMIT_NUMBER values. It should equal the number of rows
# if every admission appears exactly once.
len(df['ADMIT_NUMBER'].unique())

In [None]:
df[df.duplicated(['ADMIT_NUMBER'], keep=False)]

In [None]:
# Row count after de-duplication; used as the denominator for the
# length-of-stay percentages computed in the following cells.
rows=len(df)

#### Capping Length of Stay (LoS)

In [None]:
# Percentage of admissions whose length of stay is at most 14 days.
100 * len(df.loc[df['LENGTH_OF_STAY_DAYS'] <= 14]) / rows

In [None]:
# Percentage of admissions with a length of stay of exactly 0 days.
100 * len(df.loc[df['LENGTH_OF_STAY_DAYS'] == 0]) / rows

In [None]:
boxplot = df.boxplot(column=['LENGTH_OF_STAY_DAYS'])

In [None]:
# Keep only stays of at most 14 days. Rows with a missing length of stay are
# removed as well, because a NaN comparison evaluates to False.
df = df.loc[df['LENGTH_OF_STAY_DAYS'] <= 14]
# Re-draw the box plot on the capped data.
boxplot = df.boxplot(column=['LENGTH_OF_STAY_DAYS'])

In [None]:
# Drop the identifier columns in a single call.
df = df.drop(columns=['UR_NUMBER', 'PATIENT_NUMBER', 'ANE_NUMBER', 'ADMIT_NUMBER'])
    

In [None]:
df['AGE_ON_ADMISSION'][df['AGE_ON_ADMISSION']<0]

In [None]:
# Negative ages are invalid: replace them with NaN so they are imputed with
# the other numeric nulls later on. Series.mask returns a new column that is
# assigned back, instead of the previous chained-indexing assignment
# (df[col][mask] = None), which depends on writing through a temporary object
# and does not update `df` under pandas copy-on-write.
df['AGE_ON_ADMISSION'] = df['AGE_ON_ADMISSION'].mask(df['AGE_ON_ADMISSION'] < 0)

### Add time and date features

In [None]:
from datetime import date
import calendar

from govuk_bank_holidays.bank_holidays import BankHolidays

# Parse the raw text columns; ADMISSION_DATE is written as day/month/year.
df['ADMISSION_DATE']=pd.to_datetime(df['ADMISSION_DATE'], format ="%d/%m/%Y")
df['ADMISSION_TIME']=pd.to_datetime(df['ADMISSION_TIME'])

# Calendar features are stored as strings so that the later dtype-based split
# (object columns -> categorical) treats them as categories, not numbers.
df['DAY_OF_WEEK']=df['ADMISSION_DATE'].dt.day_name().apply(str)
df['ADMISSION_YEAR']=df['ADMISSION_DATE'].dt.year.apply(str)
df['ADMISSION_MONTH']=df['ADMISSION_DATE'].dt.month.apply(str)
df['ADMISSION_HOUR']=df['ADMISSION_TIME'].dt.hour.apply(str)

# weekday: Monday=0 ... Sunday=6, so >= 5 means Saturday or Sunday.
df['IsWeekend']=(df['ADMISSION_DATE'].dt.weekday >= 5).apply(str)

# Bank-holiday flag. The dates come from the library's public get_holidays()
# API rather than the private _get_known_holiday_date_set() helper, which may
# change without notice. The dates are converted to pandas timestamps so the
# membership test compares like with like.
# NOTE(review): no division is passed to get_holidays(); check the library
# docs for which nations' holidays that covers and confirm it is intended.
bank_holidays = BankHolidays()
holiday_dates = pd.to_datetime(
    [holiday['date'] for holiday in bank_holidays.get_holidays()]
)
df['IS_HOLIDAY']=df['ADMISSION_DATE'].isin(holiday_dates).apply(str)


df

In [None]:
df['IS_HOLIDAY'].unique()

In [None]:
# Print, for each wait-time column, how many entries are negative.
for wait_col in ('TRIAGE_TO_SEEN_WAIT',
                 'ARRIVAL_TO_TREATMENT_WAIT',
                 'ARRIVAL_TO_CONCLUSION_WAIT'):
    print(len(df.loc[df[wait_col] < 0, wait_col]))

In [None]:
# Lots of entries are legitimately zero, so negative values cannot simply be
# replaced with zero (they would become indistinguishable from real zeros).
df['TRIAGE_TO_SEEN_WAIT'][df['TRIAGE_TO_SEEN_WAIT']==0]

In [None]:
# Replace negative wait times with NaN so they are imputed together with the
# other numeric nulls later. Series.mask + assignment is used instead of
# chained indexing (df[col][mask] = None), which does not reliably write back
# to `df` and fails silently under pandas copy-on-write.
# NOTE(review): ARRIVAL_TO_CONCLUSION_WAIT is counted in the check cell but is
# not cleaned here; confirm it has no negative values.
for wait_col in ['TRIAGE_TO_SEEN_WAIT', 'ARRIVAL_TO_TREATMENT_WAIT']:
    df[wait_col] = df[wait_col].mask(df[wait_col] < 0)



In [None]:
# The raw date/time columns are no longer needed once the derived calendar
# features exist.
df = df.drop(columns=['ADMISSION_DATE', 'ADMISSION_TIME'])

In [None]:
# Readmission flag: 'True' when DAYS_SINCE_LAST_DISCHARGE <= 30, else 'False'.
# A missing DAYS_SINCE_LAST_DISCHARGE compares as False. The flag is stored as
# a string so it is handled as a categorical feature downstream.
df['READMITTED'] = (df['DAYS_SINCE_LAST_DISCHARGE'] <= 30).map(str)

In [None]:
df[['READMITTED','DAYS_SINCE_LAST_DISCHARGE']]

In [None]:
# The raw day count is replaced by the READMITTED flag created above.
df = df.drop(columns=['DAYS_SINCE_LAST_DISCHARGE'])

## Data Exploration

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
explore_num = df.select_dtypes(include=['int64', 'float64'])

In [None]:
# fig=plt.subplots(figsize=(12, 21))
# i=0
# for feature in explore_num.columns:
#     if feature not in ['LENGTH_OF_STAY_DAYS']:
#         i+=1
#         plt.subplot(13, 3, i)
#         sns.distplot(df[feature])
#         plt.tight_layout()

In [None]:
# fig=plt.subplots(figsize=(12, 21))
# i=0
# for feature in explore_num.columns:
#     if feature not in ['LENGTH_OF_STAY_DAYS']:
#         i+=1
#         plt.subplot(13, 3, i)
#         sns.scatterplot(df[feature], df['LENGTH_OF_STAY_DAYS'])
#         plt.tight_layout()

### Reduce and Combine Ward Categories

In [None]:
df['ADMIT_WARD'].unique()

In [None]:
df['ADMIT_WARD'].value_counts()

In [None]:
# Collapse detailed ward labels into broader categories: PICU, HDU and the
# burns unit become their own categories, and ordinary wards lose their colour.
#
# na=False: rows with a missing ADMIT_WARD count as "no match". Without it
# str.contains returns NaN for them and the boolean indexing raises.
# regex=False: the patterns are literal substrings, not regular expressions.
# The column is re-read on every step because each step sees the labels
# written by the previous one.
# NOTE(review): 'Burn Unit ' has a trailing space; kept as-is so existing
# category labels do not change.
ward_relabels = [
    ('PICU', 'PICU'),
    ('HDU', 'HDU'),
    ('Burns', 'Burn Unit '),
    ('1C Yellow', 'neonatal'),   # exception: 1C is neonatal
]
for pattern, label in ward_relabels:
    matches = df['ADMIT_WARD'].str.contains(pattern, na=False, regex=False)
    df.loc[matches, 'ADMIT_WARD'] = label

# Possible refinement (not applied): differentiate wards with ensuite rooms
# from those without (orange has no ensuite; yellow, green and blue do).

# Remaining "Ward ..." labels: keep only the first 7 characters, which drops
# the colour suffix.
is_ward = df['ADMIT_WARD'].str.contains('Ward', na=False, regex=False)
df.loc[is_ward, 'ADMIT_WARD'] = df.loc[is_ward, 'ADMIT_WARD'].str[0:7]


df['ADMIT_WARD'].unique()


In [None]:
df['ADMIT_WARD'].value_counts()

In [None]:
df['ETHNICITY'].unique()

In [None]:
df['ETHNICITY'].value_counts()

### Split Data

In [None]:
# X deliberately still contains the target column LENGTH_OF_STAY_DAYS: it is
# needed for the correlation heat map and is only dropped from X_train/X_test
# afterwards. Any step fitted on X_train before that drop sees the target.
X=df
#X=X.drop(columns='LENGTH_OF_STAY_DAYS',axis=1). dont drop yet because needed for heat map
Y=df.LENGTH_OF_STAY_DAYS
# 80/20 train/test split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=4)

In [None]:
X_train

In [None]:
# Split each partition by dtype: int64/float64 columns form the numeric part,
# object (string) columns form the categorical part.
NUMERIC_DTYPES = ['int64', 'float64']
CATEGORICAL_DTYPES = ['object']

xtrain_num = X_train.select_dtypes(include=NUMERIC_DTYPES)
xtest_num = X_test.select_dtypes(include=NUMERIC_DTYPES)
xtrain_cat = X_train.select_dtypes(include=CATEGORICAL_DTYPES)
xtest_cat = X_test.select_dtypes(include=CATEGORICAL_DTYPES)

In [None]:
xtrain_num.info()

In [None]:
xtrain_cat.info()

## Fill Null Data

In [None]:
def analysis(data):
    """Summarise a DataFrame column by column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of `data`, with the columns "Data Type" (dtype),
        "Unique Count" (distinct non-null values) and "Null Count" (number of
        missing values).
    """
    dtypes = data.dtypes
    unique_counts = data.nunique()
    null_counts = data.isnull().sum()
    return pd.DataFrame({
        "Data Type": dtypes,
        "Unique Count": unique_counts,
        "Null Count": null_counts,
    })

In [None]:
for col in xtrain_num.columns:
    print(col)

In [None]:
for col in xtrain_cat.columns:
    print(col)

In [None]:
analysis(xtrain_num)


In [None]:
analysis(xtrain_num)[analysis(xtrain_num)["Null Count"]>0]

In [None]:

# Earlier idea for DAYS_SINCE_LAST_DISCHARGE (no longer a column): 0 is a real
# value there, so nulls would have needed -1; it was turned into the
# categorical READMITTED flag instead.

# 'Total_LOS_(within_last_12mth)' is cumulative, so a missing value is filled
# with 0. The same rule is applied to train AND test: previously only the
# training set was filled, so test-set nulls fell through to the median fill
# and the two partitions were imputed differently.
LOS_12M_COL = 'Total_LOS_(within_last_12mth)'
xtrain_num[LOS_12M_COL] = xtrain_num[LOS_12M_COL].fillna(0)
xtest_num[LOS_12M_COL] = xtest_num[LOS_12M_COL].fillna(0)

# Numeric training columns that still contain nulls.
train_null_summary = analysis(xtrain_num)
train_null_summary[train_null_summary["Null Count"]>0]

In [None]:

# Impute the remaining numeric nulls in the training set with each column's
# training median, then list any columns that still contain nulls.
train_medians = xtrain_num.median()
xtrain_num = xtrain_num.fillna(train_medians)
remaining_nulls = analysis(xtrain_num)
remaining_nulls[remaining_nulls["Null Count"]>0]

In [None]:
# Impute test-set nulls with the TRAINING medians (not the test medians), so
# no statistic computed on the test set enters preprocessing.
xtest_num = xtest_num.fillna(xtrain_num.median())
# Numeric test columns that still contain nulls after imputation.
analysis(xtest_num)[analysis(xtest_num)["Null Count"]>0]

In [None]:
analysis(xtrain_cat)


In [None]:
# A missing AEOBS is treated as 'N'. The filled column is assigned back
# explicitly: fillna(inplace=True) on a column selection operates on a
# temporary object and does not update the frame under pandas copy-on-write.
xtrain_cat['AEOBS'] = xtrain_cat['AEOBS'].fillna('N')
xtest_cat['AEOBS'] = xtest_cat['AEOBS'].fillna('N')
analysis(xtrain_cat)

In [None]:


# Every remaining categorical null becomes an explicit 'NA_<column>' category,
# identically in train and test.
impute_cat_features = list(xtrain_cat.columns)

for feat in impute_cat_features:
    missing_label = 'NA_' + feat
    # Assign back instead of fillna(inplace=True) on a column selection, which
    # does not reliably modify the parent frame.
    xtrain_cat[feat] = xtrain_cat[feat].fillna(missing_label)
    xtest_cat[feat] = xtest_cat[feat].fillna(missing_label)


In [None]:
analysis(xtrain_cat)[analysis(xtrain_cat)["Null Count"]>0]

In [None]:
xtrain_cat['ETHNICITY'].value_counts()

## Deal with Outliers (only applies to numerical)

In [None]:
xtrain_num[xtrain_num['RespiratoryRate']<=10]

In [None]:
xtrain_num['RespiratoryRate'].value_counts()

In [None]:
xtrain_num.columns

In [None]:
analysis(xtrain_num)

In [None]:
#from scipy.stats.mstats import winsorize
from feature_engine.outliers import Winsorizer

# Columns whose outliers are capped. Left out on purpose: AGE_ON_ADMISSION,
# LENGTH_OF_STAY_DAYS, IMD_Decile_DOOGAl, 'IMD_QUINTILE_DOOGAL',
# 'Index_of_Multiple_Deprivation_Decile', 'Income_Decile', 'Employment_Decile',
# 'Education_and_Skills_Decile', 'IDACI_Decile', 'TRIAGE_PRIORITY'.
winsor_columns = [
    'PulseRate', 'RespiratoryRate', 'SP02', 'Temperature',
    'Distance_from_Home_Address', 'Population', 'Households',
    'IMD_DOOGAL', 'Distance to station', 'Average Income',
    'Index_of_Multiple_Deprivation_Rank',
    'ARRIVAL_TO_TREATMENT_WAIT', 'TRIAGE_TO_SEEN_WAIT',
    'ARRIVAL_TO_CONCLUSION_WAIT',
    'Total_LOS_(within_last_12mth)', 'Admits_last_12mth',
]

# IQR-based capping on both tails with fold=2 (see the feature_engine docs for
# the exact bound formula).
capper = Winsorizer(
    capping_method='iqr',
    tail='both',
    fold=2,
    variables=winsor_columns,
)

# Caps are learned from the training set only and then applied to both sets.
capper.fit(xtrain_num)
xtrain_num = capper.transform(xtrain_num)
xtest_num = capper.transform(xtest_num)

# Upper caps learned per column.
capper.right_tail_caps_

In [None]:
capper.left_tail_caps_

In [None]:
pd.set_option('display.max_columns', None)
xtrain_num.describe()

## Remove rare labels/categories (only applies to categorical)

In [None]:
xtrain_cat.columns

In [None]:
# Group infrequent categories of every categorical column under one label.
from feature_engine.encoding import RareLabelEncoder
# NOTE(review): an earlier note says 'ADMISSION_YEAR' is excluded, but no
# `variables` argument is passed below, so nothing is excluded explicitly.
# Confirm the intended behaviour.

# Purpose: avoid overfitting to tiny categories; fit the general pattern and
# gauge the effect of the different categories on LoS.
# TODO (from original notes): consider lowering tol from 0.02 (2%) to 0.01 (1%).
# replace_with='Rare' is the label given to the grouped categories.
encoder = RareLabelEncoder(tol=0.02, n_categories=2, replace_with='Rare')

# Fit on the training set only, so category frequencies come from training data.
encoder.fit(xtrain_cat)

# Apply the same grouping to train and test.
xtrain_cat = encoder.transform(xtrain_cat)
xtest_cat = encoder.transform(xtest_cat)

# Mapping learned per column during fit.
encoder.encoder_dict_


In [None]:
xtrain_cat['ETHNICITY'].unique()


In [None]:
#same as before just replaced black with rare

xtrain_cat['ETHNICITY'].value_counts()

In [None]:
analysis(xtrain_cat)

## Encoding

In [None]:
from feature_engine.encoding import OrdinalEncoder
from feature_engine.encoding import OneHotEncoder

# Step 1: TriageCat is encoded with the 'ordered' method, which uses the
# target passed to fit (Y_train). NOTE: this rebinds `encoder`, which held the
# RareLabelEncoder from the previous section.
encoder = OrdinalEncoder(encoding_method='ordered', variables=['TriageCat'])

# fit the encoder (training data only)
encoder.fit(xtrain_cat, Y_train)

# transform the data
xtrain_cat= encoder.transform(xtrain_cat)
xtest_cat= encoder.transform(xtest_cat)

# Step 2: every other categorical column gets the 'arbitrary' integer
# encoding. `encoder` is rebound again, so the TriageCat encoder above is no
# longer reachable after this line.
encoder = OrdinalEncoder(encoding_method='arbitrary', variables=['ADMIT_SOURCE', 'ADMIT_WARD', 'ADMIT_SPECIALTY', 'ADMIT_CONSULTANT',
       'AVPU', 'Visit_Reason', 'AEOBS', 'PRIMARY_DIAG_CODE',
       'PATIENT_POSTCODE', 'ARRIVAL_TRANSPORT', 'GENDER', 'ETHNICITY',
       'SAFE_GUARDING', 'Patient_Has_Learning_Disability',
       'Patient_is_Looked_After_Child', 'REFER_SOURCE', 'County', 'District',
       'Ward', 'Country', 'Constituency', 'Rural/urban', 'Region',
       'Middle layer super output area', 'Nearest station',
       'DOCTOR_ALLOCATED_CODE', 'ReferralSource',
       'ExaminationDoctor', 'AccomodationStatus', 'SITUATION_CODE', 'DIABETES',
       'DAY_OF_WEEK', 'ADMISSION_YEAR', 'ADMISSION_MONTH', 'ADMISSION_HOUR',
       'IsWeekend', 'IS_HOLIDAY', 'READMITTED'])

# fit the encoder (training data only)
encoder.fit(xtrain_cat, Y_train)

# transform the data; the test set reuses the mapping learned on train
xtrain_cat= encoder.transform(xtrain_cat)
xtest_cat= encoder.transform(xtest_cat)

In [None]:
xtrain_cat

In [None]:
xtrain_cat.columns


# Combine categorical and Numerical Data

In [None]:
# Re-assemble the training features: numeric and encoded categorical parts
# side by side (axis=1), aligned on the row index.
X_train=pd.concat([xtrain_num,xtrain_cat], axis=1)
X_train

In [None]:
# Same re-assembly for the test features.
X_test=pd.concat([xtest_num,xtest_cat], axis=1)
X_test

In [None]:
X_train.describe()

In [None]:
analysis(xtrain_cat)

## Remove Correlated Features

In [None]:
from feature_engine.selection import SmartCorrelatedSelection
from lightgbm import LGBMRegressor


lgbm=LGBMRegressor(random_state=4)

# X_train still carries the target column (it is kept for the heat map that
# follows). It must not take part in the correlation-based selection:
# otherwise the target is scored as a feature against itself, and either it
# or a genuinely predictive feature correlated with it gets dropped.
# Restricting `variables` to the real features leaves the target column
# untouched by fit/transform.
TARGET_COLUMN = 'LENGTH_OF_STAY_DAYS'
candidate_features = [col for col in X_train.columns if col != TARGET_COLUMN]

# Within each group of features whose pairwise Pearson correlation exceeds
# 0.85, keep the one with the best cross-validated model performance.
tr = SmartCorrelatedSelection(
    variables=candidate_features,
    method="pearson",
    threshold=0.85,
    selection_method="model_performance",
    estimator=lgbm,
    scoring='neg_mean_absolute_error'
)
# Fit on the training data only, then apply the same column drop to both sets.
tr.fit(X_train, Y_train)
X_train=tr.transform(X_train)
X_test=tr.transform(X_test)


In [None]:
print(tr.correlated_feature_sets_)
print(tr.features_to_drop_)

In [None]:
X_train.info()

In [None]:
X_train.describe()

# Feature Selection

## Heat Map

In [None]:
# Feature-selection heat map: pairwise Pearson correlation of all columns
# (the target column is still part of X_train at this point).
corr = X_train.corr()

# Absolute correlation of every column with the target.
cor_target = corr["LENGTH_OF_STAY_DAYS"].abs()

# Keep features whose absolute correlation with the target exceeds 0.001
# (a very permissive threshold).
heatmap_features = cor_target[cor_target > 0.001]

f, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr, ax=ax)



In [None]:
##features from Heat Map
heatmap_features.sort_values(ascending=False)

In [None]:
# The heat map is done, so remove the target column from both feature sets
# before any model-based feature selection.
X_train = X_train.drop(columns=['LENGTH_OF_STAY_DAYS'])
X_test = X_test.drop(columns=['LENGTH_OF_STAY_DAYS'])

## Recursive Feature Addition (RFA)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from feature_engine.selection import RecursiveFeatureAddition
from sklearn.linear_model import LinearRegression

regressor = RandomForestRegressor(n_estimators=10,max_depth=5,random_state=4)

In [None]:

#linear_model = LinearRegression()
# Recursive feature addition with the random-forest `regressor` defined in the
# previous cell, scored by 5-fold cross-validated negative MAE.
# NOTE: `tr` is rebound here; it previously held the SmartCorrelatedSelection
# transformer.
tr = RecursiveFeatureAddition(estimator=regressor, scoring="neg_mean_absolute_error", cv=5, threshold=.00001)

# `rfa` is X_train reduced to the columns the selector kept.
rfa=tr.fit_transform(X_train, Y_train)


In [None]:
# The features RFA actually kept are exactly the columns of the transformed
# frame. The previous code took the first `num` entries of
# tr.feature_importances_, i.e. the top-N by importance, which is a different
# set whenever the selector rejects a highly ranked feature and accepts a
# lower ranked one.
rfa_features = list(rfa.columns)
num = len(rfa_features)

In [None]:
rfa_features

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# regressor = RandomForestRegressor(n_estimators=10, max_depth=5,random_state=4)

# #===========================================================================
# # perform a scikit-learn Recursive Feature Elimination (RFE)
# #===========================================================================
# from sklearn.feature_selection import RFE
# # here we want only one final feature, we do this to produce a ranking
# ##see if there is argument threshold, otherwise use RFA that data team uses
# rfe = RFE(regressor)
# rfe.fit(X_train, Y_train)






In [None]:
# from operator import itemgetter
# features = X_train.columns.to_list()
# RFR_feat_sorted=[]
# for x, y in (sorted(zip(rfe.ranking_ , features), key=itemgetter(0))):
#     RFR_feat_sorted.append(y)

In [None]:
# RFR_feat_sorted[0:15]

## Lasso 

In [None]:

##%%timeit
# Feature selection with Lasso: keep every feature whose fitted coefficient is
# non-zero.
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel


# TODO (from original notes): review the cross-validation parameters and max_iter.
# NOTE(review): this runs on the unscaled features (scaling happens in a later
# section), and the Lasso penalty is sensitive to feature scale; confirm this
# is intended.

# 10-fold cross-validated Lasso.
lasso = LassoCV(cv=10, random_state=4, max_iter=2000).fit(X_train, Y_train)
# Absolute coefficient size, one entry per column of X_train.
importance = np.abs(lasso.coef_)
# `feature_names` is not used again in this cell.
feature_names = np.array(X_train.columns.values.tolist())

# Columns with a strictly positive absolute coefficient.
lasso_features = np.array(X_train.columns)[importance > 0.0000]
list(lasso_features)


#before scaling


## SelectKBest (univariate F-test)

In [None]:
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2, f_regression
from numpy import array 

# TODO (from original notes): consider cross-validating this selection.
# Keep the 15 features with the highest univariate f_regression score.
select = SelectKBest(score_func=f_regression, k=15)
# `z` holds the reduced training matrix; only the fitted `select` is used by
# the next cell.
z=select.fit_transform(X_train, Y_train) 



In [None]:
# Boolean mask over X_train's columns marking the features SelectKBest kept.
# Named `kbest_mask` rather than `filter`, which would shadow the Python
# builtin for the rest of the notebook.
kbest_mask = select.get_support()
features = array(X_train.columns)

# Report the real number of selected features; the old message said
# "best 25" although the selector was configured with k=15.
print(f"Selected best {int(kbest_mask.sum())}:")
k_best_feat=features[kbest_mask]
print(k_best_feat)

In [None]:
lasso_features=list(lasso_features)
k_best_feat=list(k_best_feat)
rfa_features=list(rfa_features)

In [None]:
print('lasso:', lasso_features)
print('k_best:', k_best_feat)
print('rfa:', rfa_features)


In [None]:
old_lasso=['PulseRate', 'Distance_from_Home_Address', 'IMD_DOOGAL', 'Average Income', 
           'Index_of_Multiple_Deprivation_Rank', 'TRIAGE_TO_SEEN_WAIT', 'ARRIVAL_TO_CONCLUSION_WAIT', 'ADMIT_WARD']

old_k=['AGE_ON_ADMISSION', 'PulseRate', 'RespiratoryRate', 'SP02', 'Distance_from_Home_Address', 
        'TRIAGE_PRIORITY', 'ARRIVAL_TO_CONCLUSION_WAIT', 'Total_LOS_(within_last_12mth)', 
        'Admits_last_12mth', 'ADMIT_WARD', 'ADMIT_SPECIALTY', 'ADMIT_CONSULTANT', 'AVPU', 
        'Visit_Reason', 'AEOBS', 'PRIMARY_DIAG_CODE', 'SAFE_GUARDING', 'Patient_Has_Learning_Disability', 
        'County', 'District', 'Constituency', 'Rural/urban', 'Region', 'TriageCat', 'READMITTED']

old_rfa=['ADMIT_SPECIALTY', 'AEOBS', 'ADMIT_WARD', 'Total_LOS_(within_last_12mth)', 
         'SP02', 'TRIAGE_PRIORITY', 'ARRIVAL_TO_CONCLUSION_WAIT', 'PRIMARY_DIAG_CODE', 
         'ExaminationDoctor', 'ADMISSION_HOUR', 'PulseRate', 'Patient_Has_Learning_Disability', 
         'AGE_ON_ADMISSION', 'District']

In [None]:
len(rfa_features)

In [None]:
max(Y_train)

# Scaling

In [None]:
from sklearn.preprocessing import StandardScaler


# Standardise every feature using statistics learned on the training set only.
column_names=X_train.columns
scaler = StandardScaler()
scaler.fit(X_train)

# scaler.transform returns a plain NumPy array. Rebuild the DataFrames with
# the ORIGINAL row index as well as the column names, so X_train/X_test stay
# aligned with Y_train/Y_test by label. Previously the index was reset to
# 0..n-1 while the targets kept the shuffled index.
X_train=pd.DataFrame(scaler.transform(X_train), columns=column_names, index=X_train.index)
X_test=pd.DataFrame(scaler.transform(X_test), columns=column_names, index=X_test.index)



In [None]:
# def intersection(ls1,ls2,ls3):
#     s1=set(ls1)
#     s2=set(ls2)
#     s3=set(ls3)
#     new=s1.intersection(s2,s3)
#     return list(new)
# inter=intersection(lasso_features,rfa_features, k_best_feat)

# Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# One train/test pair per feature-selection method, cut from the scaled frames.
lasso_train=X_train[lasso_features]
lasso_test=X_test[lasso_features]

rfa_train=X_train[rfa_features]
rfa_test=X_test[rfa_features]

kbest_train=X_train[k_best_feat]
kbest_test=X_test[k_best_feat]

# inter_train=X_train[inter]
# inter_test=X_test[inter]

# Union of the three selections. dict.fromkeys de-duplicates while keeping
# first-seen order, so the column order is the same on every run.
# list(set(...)) gave an order that changes between kernel restarts (string
# hashing is randomised), which makes order-sensitive models irreproducible.
unique=list(dict.fromkeys(lasso_features+k_best_feat+rfa_features))
unique_train=X_train[unique]
unique_test=X_test[unique]


In [None]:
# Maps a column name of the `metric` table to its (train, test) feature pair.
# Only the full feature set is registered, so the evaluation loop fills only
# the 'All Feat' column; the Lasso / RFA / K-best / union subsets built above
# are not evaluated unless they are added here.
method_to_feat={'All Feat': (X_train,X_test)}

In [None]:
# Empty results table: one row per (model, metric) pair and one column per
# feature-selection method. The cells are filled by the evaluation loop.
model_names = ['Linear Regression', 'Random Forest', 'Decision Tree', 'SVR', 'LGBM', 'CatBoost']
metric_names = ['features', 'CV MAE', 'MAE', 'MSE', 'RMSE', 'R^2']
indices = [model_names, metric_names]

index = pd.MultiIndex.from_product(indices, names=['Models', 'Metrics'])
feature_set_names = ['Lasso Feat', 'RFA Feat', 'K_best Feat', 'Union', 'All Feat']
metric = pd.DataFrame(index=index, columns=feature_set_names)
metric

In [None]:
metric.index

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
# Import the estimator class directly. The previous `from sklearn import tree`
# followed by `tree = (..., tree.DecisionTreeRegressor(...))` rebound the
# module name to a tuple, so any later use of the sklearn `tree` module failed.
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Each entry is (row label in the `metric` table, unfitted estimator).
# RandomForestRegressor is imported in the RFA section.
linear=('Linear Regression',LinearRegression())
tree=('Decision Tree', DecisionTreeRegressor(criterion='absolute_error',random_state=4))
random=('Random Forest', RandomForestRegressor(n_estimators=50,max_depth=5, random_state=4, criterion='absolute_error'))
svr=('SVR', SVR())

lgbm=('LGBM', LGBMRegressor(random_state=4))
cat=('CatBoost',CatBoostRegressor(random_state=4, loss_function='MAE'))

# Models actually evaluated; `random` and `svr` are defined but left out.
algorithms=[linear,tree,lgbm,cat]

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean

from sklearn.preprocessing import StandardScaler
from sklearn import metrics

# Evaluate every model on every registered feature set and record the scores
# in the `metric` table.
for alg in algorithms:
    alg_str, reg = alg

    for method in method_to_feat:
        # Local names on purpose: the loop used to overwrite the notebook-level
        # X_train / X_test, leaving them pointing at whichever feature set was
        # processed last.
        train_features, test_features = method_to_feat[method]

        # 5-fold cross-validation on the training data. scikit-learn reports
        # the NEGATED MAE for this scorer, so the sign is flipped to store a
        # real (positive) MAE under the 'CV MAE' label.
        cv = RepeatedKFold(n_splits=5, n_repeats=1, random_state=4)
        n_scores = cross_val_score(reg, train_features, Y_train,
                                   scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
        cv_mae = -mean(n_scores)

        # Fit on the full training set and predict the held-out test set.
        reg = reg.fit(train_features, Y_train)
        y_pred = reg.predict(test_features)

        mae = metrics.mean_absolute_error(Y_test, y_pred)
        mse = metrics.mean_squared_error(Y_test, y_pred)
        rmse = np.sqrt(mse)  # reuse mse instead of recomputing it
        # R^2 from the predictions already made (no second predict call).
        r_square = metrics.r2_score(Y_test, y_pred)

        # Single-step .loc[(row), column] assignment. The chained form
        # metric.loc[row][column] = value writes to a temporary row object and
        # is not guaranteed to update `metric` (it never does under pandas
        # copy-on-write).
        metric.loc[(alg_str, 'features'), method] = len(train_features.columns)
        metric.loc[(alg_str, 'CV MAE'), method] = cv_mae
        metric.loc[(alg_str, 'MAE'), method] = mae
        metric.loc[(alg_str, 'MSE'), method] = mse
        metric.loc[(alg_str, 'RMSE'), method] = rmse
        metric.loc[(alg_str, 'R^2'), method] = r_square

        # Short progress line instead of printing the whole table each time.
        print(f"{alg_str} / {method}: CV MAE={cv_mae:.4f}, test MAE={mae:.4f}")



metric


In [None]:
metric