# Nursing Study
### Explores disability datasets using various Machine Learning models
Payton Burks  
10/20/2022

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split

from mysklearn.classification import MyKNeighborsClassifier as kNN

## Prepare Dataframe

In [3]:
# Load the CHRIL survey export and discard its first column
# (presumably a row-number column written by the export -- TODO confirm).
chril_raw = pd.read_csv("data/CHRILexcel.csv")
df = chril_raw.drop(columns=chril_raw.columns[0])

# Show which variables are available for modelling.
print(df.columns)

def create_n_dim_array(df, cols):
    """Convert selected DataFrame columns into a row-major list of lists.

    Args:
        df (pandas.DataFrame): source table.
        cols (iterable of str): column labels to extract, in output order.

    Returns:
        list of list: one inner list per row of ``df`` (in positional row
            order), holding that row's value for each label in ``cols``.

    Raises:
        KeyError: if a label in ``cols`` is not a column of ``df``.
    """
    # Look each column up once instead of once per cell.
    selected = [df[col] for col in cols]

    # Rows are addressed by position (.iloc), not by index label, so the
    # result is correct even when the frame's index is not 0..n-1 (for
    # example after dropna() or filtering without reset_index()).
    return [[column.iloc[i] for column in selected] for i in range(len(df))]

Index(['RUCA_3cat', 'healthstatus_1', 'healthstatus_2ph', 'healthstatus_3mh',
       'transp_access', 'pas_yn_88', 'pas_yn_99', 'commpart_1', 'commpart_3',
       'commpart_5', 'himde_ssi_apply', 'himde_ssdi_apply', 'insurance_esi_yn',
       'insurance_mrkt_yn', 'emp_yn', 'emp_hrs', 'demo_age', 'demo_gender',
       'demo_raceeth_categorical', 'demo_marital', 'demo_kids',
       'demo_education', 'demo_hrms_disability', 'demo_dis_ageonset',
       'demo_dis_primarytype', 'demo_dis_selfcat', 'demo_acs_deaf',
       'demo_acs_vision'],
      dtype='object')


## Decision Tree
Begin exploration of data using Decision Trees.  
* Pull out response variables
    * emp_yn (whether or not employed) - discrete
    * emp_hrs (hrs of employment) - continuous  
* Load in all other variables into Decision Tree training set

In [6]:
def create_model(X, y, model, ft_names, figname, cls_names=None):
    """Fit a tree model and save a drawing of the fitted tree under ``output/``.

    Args:
        X: training features, passed unchanged to ``model.fit``.
        y: training targets, passed unchanged to ``model.fit``.
        model: unfitted estimator exposing ``fit``; the value returned by
            ``fit`` must be drawable by ``sklearn.tree.plot_tree``.
        ft_names: feature names shown on the split nodes (any iterable of
            labels, e.g. a list or ``DataFrame.columns``), or None.
        figname (str): file name of the saved figure, relative to ``output/``.
        cls_names: class labels forwarded to ``plot_tree`` (optional).

    Returns:
        None. The only result is the image file written to ``output/``.
    """
    clf = model.fit(X, y)

    # Hand plot_tree a plain list: some scikit-learn releases appear to
    # validate feature_names strictly and reject a pandas Index.
    feature_names = None if ft_names is None else list(ft_names)

    out_path = 'output/' + figname
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    fig = plt.figure(figsize=(30, 30))
    try:
        # plot_tree draws on the current axes, i.e. on the figure just created.
        tree.plot_tree(clf, feature_names=feature_names, class_names=cls_names, fontsize=25, filled=True)
        fig.savefig(out_path)
    finally:
        # Always release the (large) figure, even if plotting or saving fails.
        plt.close(fig)

# Predictor columns: every column printed for df above except the two
# employment responses (emp_yn, emp_hrs), which are used as targets below.
index_names = [
    'RUCA_3cat',
    'healthstatus_1', 'healthstatus_2ph', 'healthstatus_3mh',
    'transp_access',
    'pas_yn_88', 'pas_yn_99',
    'commpart_1', 'commpart_3', 'commpart_5',
    'himde_ssi_apply', 'himde_ssdi_apply',
    'insurance_esi_yn', 'insurance_mrkt_yn',
    'demo_age', 'demo_gender', 'demo_raceeth_categorical',
    'demo_marital', 'demo_kids', 'demo_education',
    'demo_hrms_disability', 'demo_dis_ageonset',
    'demo_dis_primarytype', 'demo_dis_selfcat',
    'demo_acs_deaf', 'demo_acs_vision',
]

# Feature matrix, plus one single-column target matrix per response variable.
X = create_n_dim_array(df, index_names)
y_yn, y_hrs = (create_n_dim_array(df, [response]) for response in ('emp_yn', 'emp_hrs'))

# Baseline trees on the unmodified predictor set: a classifier for emp_yn
# and a regressor for emp_hrs, both capped at depth 3.
create_model(
    X,
    y_yn,
    model=tree.DecisionTreeClassifier(max_depth=3),
    ft_names=index_names,
    figname='Raw Classifier.png',
    cls_names=['not employed', 'employed'],
)
create_model(
    X,
    y_hrs,
    model=tree.DecisionTreeRegressor(max_depth=3),
    ft_names=index_names,
    figname='Raw Regressor.png',
)

## Decision Tree pt. 2
Edits
* Joined **pas_yn_88** and **pas_yn_99** into one column, **pas_yn**, which is 1 if a patient answered 1 - _(yes I need help)_ - in either column and 0 otherwise
* Removed the following based on Jesus' report on significance:
    * **commpart_3, commpart_5, himde_ssi_apply, insurance_esi_yn,   
    insurance_mrkt_yn, demo_hrms_disability, demo_dis_selfcat, demo_acs_deaf**

In [5]:
# Collapse pas_yn_88 / pas_yn_99 into a single indicator column: 1 when the
# respondent answered 1 in EITHER column, 0 otherwise. The earlier row loop
# required 1 in BOTH columns, which contradicted the rule stated in the notes.
pas_yn = ((df['pas_yn_88'] == 1) | (df['pas_yn_99'] == 1)).astype(int).tolist()
df['pas_yn'] = pas_yn

# Remove the two source columns now merged into pas_yn, the variables judged
# not significant, and the two response variables (emp_yn, emp_hrs).
dropped_columns = [
    'pas_yn_88', 'pas_yn_99',
    'commpart_3', 'commpart_5',
    'himde_ssi_apply',
    'insurance_esi_yn', 'insurance_mrkt_yn',
    'demo_hrms_disability', 'demo_dis_selfcat', 'demo_acs_deaf',
    'emp_yn', 'emp_hrs',
]
df2 = df.drop(columns=dropped_columns)

# Plain list of feature names so the plotting code never receives a pandas Index.
ft_names_mod = list(df2.columns)
X_mod = create_n_dim_array(df2, ft_names_mod)

# EMP YN
create_model(X_mod, y_yn, tree.DecisionTreeClassifier(max_depth=3), ft_names_mod, 'Modified Employment Classifier', ['not employed', 'employed'])
# EMP HRS
create_model(X_mod, y_hrs, tree.DecisionTreeRegressor(max_depth=3), ft_names_mod, 'Modified Employment Hours Regressor')

## Decision Tree pt. 3
Edits
* Removed all variables related to insurance
* Entirely removed **pas_yn** variable

In [11]:
# Third predictor set: on top of the pt. 2 removals, drop the remaining
# insurance/benefit variable (himde_ssdi_apply) and the merged pas_yn
# indicator. Previously only himde_ssdi_apply was dropped, so pas_yn was
# still fed to the "INS_PAS" models despite being described as removed.
df3 = df2.drop(columns=['himde_ssdi_apply', 'pas_yn'])

# Plain list of feature names so the plotting code never receives a pandas Index.
ft_names_mod_2 = list(df3.columns)
X_mod_2 = create_n_dim_array(df3, ft_names_mod_2)

# EMP YN
create_model(X_mod_2, y_yn, tree.DecisionTreeClassifier(max_depth=3), ft_names_mod_2, 'INS_PAS Modified Employment Classifier', ['not employed', 'employed'])
# EMP HRS
create_model(X_mod_2, y_hrs, tree.DecisionTreeRegressor(max_depth=3), ft_names_mod_2, 'INS_PAS Modified Employment Regressor')

## Other Dataset - SPS Data
New dataset. Blank cells are converted to NaN with a regex replacement (pandas/NumPy), and every row containing a missing value is dropped.

In [18]:
# Load the SPS data, turn empty / whitespace-only cells into NaN, keep only
# complete rows, and renumber the surviving rows from 0.
SPS_df = (
    pd.read_csv("data/SPS.csv")
    .replace(r'^\s*$', np.nan, regex=True)
    .dropna(axis=0)
    .reset_index(drop=True)
)

# Pull the two response variables out before they are removed from the predictors.
response_columns = ['emp_hrs', 'emp_Trans']
y_sps_hrs, y_sps_yn = (SPS_df[name] for name in response_columns)

SPS_df = SPS_df.drop(columns=response_columns)

# NOTE(review): the printed columns include KUID, which looks like a respondent
# identifier and is still used as a predictor below -- confirm this is intended.
print(SPS_df.columns)

X3 = create_n_dim_array(SPS_df, SPS_df.columns)

# EMP YN
create_model(
    X3,
    y_sps_yn,
    tree.DecisionTreeClassifier(max_depth=3),
    SPS_df.columns,
    'SPS Classifier',
    ['not employed', 'employed'],
)
# EMP HRS
create_model(
    X3,
    y_sps_hrs,
    tree.DecisionTreeRegressor(max_depth=3),
    SPS_df.columns,
    'SPS Regressor',
)

Index(['KUID', 'expansion_status', 'himde_ssi_approved', 'himde_ssdi_approved',
       'benefits_yn', 'insur_trans', 'healthstatus_1', 'HS1_trans',
       'healthstatus_2ph', 'healthstatus_3mh', 'healthstatus_4',
       'healthstatus_5smoking', 'healthstatus_8', 'healthstatus_9',
       'healthstatus_10', 'assist_Trans', 'pas_job', 'transp_access',
       'RUCA_3cat', 'income_householdsize', 'income_fpl_level_2019',
       'income_fpl_level', 'demo_age', 'demo_age_group', 'demo_gender',
       'demo_raceeth_categorical', 'demo_marital', 'demo_kids',
       'demo_education', 'demo_dis_ageonset', 'demo_dis_primarytype_2020',
       'demo_livewith'],
      dtype='object')


## kNN
An exploratory analysis of the kNN model, not yet optimized

In [3]:
# select variables to be used: two predictors and the employment flag as target
X1 = create_n_dim_array(df, ["RUCA_3cat", "insurance_esi_yn"])
y1 = df["emp_yn"].values.tolist()

# train-test split (10% held out). The split is seeded so the accuracy printed
# below is the same on every Restart & Run All; an unseeded split reshuffles
# the rows on each run and makes the reported number irreproducible.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X1, y1, test_size=0.1, random_state=SPLIT_SEED
)

# kNN with the classifier's default settings
test_kNN = kNN()
test_kNN.fit(X_train, y_train)

pred = test_kNN.predict(X_test)

# accuracy: share of held-out rows whose predicted label equals the true label
cor = sum(1 for actual, predicted in zip(y_test, pred) if actual == predicted)

acc = cor/len(X_test)
print(acc)

0.7794117647058824
