# Data Evaluation <a id='data-evaluation'></a>

In [1]:
# standard libraries
import pandas as pd
import numpy as np
import os
from IPython.display import Image

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from tabulate import tabulate

# data pre-processing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# prediction models
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

#from kneed import KneeLocator
from scipy import stats

# import warnings filter
'''import warnings
warnings.filterwarnings('ignore')
from warnings import simplefilter 
simplefilter(action='ignore', category=FutureWarning)'''



# Loading Data <a id='loading-data'></a>

In [2]:
# Location of the ID lookup file; the reads in this section pull separate tables out of it by row offset.
url = 'https://raw.githubusercontent.com/olmosjorge28/QTW-SPRING-2022/main/dataset_diabetes/IDs_mapping.csv'
# First table: the 8 data rows under the first header line (admission_type_id -> description).
admission_type_mapping = pd.read_csv(url, nrows=8, index_col=0)
admission_type_mapping

Unnamed: 0_level_0,description
admission_type_id,Unnamed: 1_level_1
1,Emergency
2,Urgent
3,Elective
4,Newborn
5,Not Available
6,
7,Trauma Center
8,Not Mapped


Missing admission types can potentially be deduced from the medical specialty.

In [3]:
# Second table in the same file: skip the first 10 lines, then read 30 rows
# (discharge_disposition_id -> description).
discharge_disposition_mapping = pd.read_csv(url, nrows=30,skiprows=10, index_col=0 )
discharge_disposition_mapping 

Unnamed: 0_level_0,description
discharge_disposition_id,Unnamed: 1_level_1
1,Discharged to home
2,Discharged/transferred to another short term h...
3,Discharged/transferred to SNF
4,Discharged/transferred to ICF
5,Discharged/transferred to another type of inpa...
6,Discharged/transferred to home with home healt...
7,Left AMA
8,Discharged/transferred to home under care of H...
9,Admitted as an inpatient to this hospital
10,Neonate discharged to another hospital for neo...


In [4]:
# Third table: everything after the first 42 lines of the file
# (admission_source_id -> description).
admission_source_mapping = pd.read_csv(url,skiprows=42, index_col=0 )
admission_source_mapping

Unnamed: 0_level_0,description
admission_source_id,Unnamed: 1_level_1
1,Physician Referral
2,Clinic Referral
3,HMO Referral
4,Transfer from a hospital
5,Transfer from a Skilled Nursing Facility (SNF)
6,Transfer from another health care facility
7,Emergency Room
8,Court/Law Enforcement
9,Not Available
10,Transfer from critial access hospital


# Referral, Transfer, Emergency, Normal Delivery, Other Delivery, Other/NA

In [5]:
# NOTE(review): this rebinds `url`, which the mapping cells above also read; re-running
# those cells after this one would read diabetic_data.csv instead of IDs_mapping.csv.
url = 'https://raw.githubusercontent.com/olmosjorge28/QTW-SPRING-2022/main/dataset_diabetes/diabetic_data.csv'
# Treat '?' entries as missing values (NaN) while parsing.
# NOTE(review): the truncated output under this cell looks like a pandas warning
# (possibly DtypeWarning) - confirm, and consider an explicit dtype= or low_memory=False.
df = pd.read_csv(url,na_values='?')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [6]:
# Show every column name as a plain Python list.
df.columns.tolist()

['encounter_id',
 'patient_nbr',
 'race',
 'gender',
 'age',
 'weight',
 'admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'time_in_hospital',
 'payer_code',
 'medical_specialty',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'diag_1',
 'diag_2',
 'diag_3',
 'number_diagnoses',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'readmitted']

## Types of Data

### Qualitative

#### Nominal:

#### Ordinal:

### Quantitative

#### Discrete

#### Continuous

## Remap Id Mappings

In [7]:
def remapIds(*new_mappings: tuple) -> dict:
    """Apply a lookup table to each given Series and collect the results by name.

    Each positional argument is indexed as ``(series, result_name, lookup)``.
    The returned dict maps every ``result_name`` to ``series.map(lookup)``;
    if a name is repeated, the later entry replaces the earlier one.
    """
    return {entry[1]: entry[0].map(entry[2]) for entry in new_mappings}
        

In [8]:
def refactorMappingIds(inputDf: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``inputDf`` with three coarse-grained category columns added.

    The numeric ``discharge_disposition_id``, ``admission_source_id`` and
    ``admission_type_id`` columns are translated into a small set of labels.
    Ids listed as unmapped become NaN. The original id columns are left in
    place. The new column names (including the ``disharge`` spelling) are
    read by name by later cells, so they are kept exactly.
    """

    def labels_by_id(groups: dict, unmapped: tuple) -> dict:
        # Invert {label: ids} into {id: label}; unmapped ids are sent to NaN explicitly.
        table = {code: label for label, codes in groups.items() for code in codes}
        table.update({code: float("NaN") for code in unmapped})
        return table

    admission_type_labels = labels_by_id(
        {
            "emergency": (1,),
            "urgent": (2,),
            "elective": (3,),
            "newborn": (4,),
            "trauma-center": (7,),
        },
        unmapped=(5, 6, 8),
    )

    discharge_labels = labels_by_id(
        {
            "discharged": (1,),
            "transfer": (2, 3, 4, 5, 6, 8, 10, 15, 16, 17, 22, 23, 24, 27, 28, 29),
            "ama": (7,),
            "admitted": (9, 12),
            "expired": (11, 19, 20, 21),
            "hospice": (13, 14),
        },
        unmapped=(18, 25, 26),
    )

    admission_source_labels = labels_by_id(
        {
            "referral": (1, 2, 3),
            "transfer": (4, 5, 6, 10, 18, 19, 22, 25, 26),
            "emergency": (7,),
            "law-enforcement": (8,),
            "normal-delivery": (11, 23, 24),
            "other-delivery": (12, 13, 14),
        },
        unmapped=(9, 15, 17, 20, 21),
    )

    result = inputDf.copy()
    remapped_columns = remapIds(
        (result['discharge_disposition_id'], 'disharge_disposition_new_mapping', discharge_labels),
        (result['admission_source_id'], 'admission_source_new_mapping', admission_source_labels),
        (result['admission_type_id'], 'admission_type_id_new_mapping', admission_type_labels),
    )
    for column_name, values in remapped_columns.items():
        result[column_name] = values
    return result

## Recategorizing Diag_1, Diag_2, Diag_3

In [9]:
# Exclusive upper bound of each numeric code range paired with the letter assigned to it.
# Kept in ascending order: the first bound that exceeds the code decides the category.
_DIAG_CATEGORY_BOUNDS = (
    (140, 'A'), (240, 'B'), (280, 'C'), (290, 'D'), (320, 'E'),
    (390, 'F'), (460, 'G'), (520, 'H'), (580, 'I'), (630, 'J'),
    (680, 'K'), (710, 'L'), (740, 'M'), (760, 'N'), (780, 'O'),
    (800, 'P'), (1000, 'Q'), (2000, 'R'), (3000, 'S'),
)


def getDiagCategory(input: float) -> "str | float":
    """Map a numeric diagnosis code to a single-letter category.

    Parameters
    ----------
    input : float
        Numeric diagnosis code. ``categorizeDiag`` passes 0 for a missing
        code, 1000 for a V-prefixed code and 2000 for an E-prefixed code,
        so those land in NaN, 'R' and 'S' respectively.

    Returns
    -------
    str or float
        'A'..'S' for codes in ``[1, 3000)`` according to the bounds table,
        'Z' for codes of 3000 and above, and NaN for codes below 1 or for a
        NaN input.

    Notes
    -----
    The range boundaries presumably follow the ICD-9 code ranges linked in
    the notebook - verify against that reference before changing them.
    """
    # NaN is the only value not equal to itself. Without this check a NaN input
    # fails every "<" comparison and would be reported as category 'Z'.
    if input != input or input < 1:
        return float("NaN")
    for upper_bound, letter in _DIAG_CATEGORY_BOUNDS:
        if input < upper_bound:
            return letter
    return 'Z'
    

def categorizeDiag(diag: pd.Series) -> pd.Series:
    """Convert a Series of diagnosis codes into category letters.

    Missing codes are replaced by 0, codes starting with 'V' by 1000 and
    codes starting with 'E' by 2000. Everything is then parsed as a number
    and passed through ``getDiagCategory``. The input Series is not modified.
    """
    codes = diag.fillna(0)
    # na=False: non-string entries (the 0 placeholders) never count as a prefix match.
    codes = codes.mask(codes.str.startswith('V', na=False), 1000)
    codes = codes.mask(codes.str.startswith('E', na=False), 2000)
    return pd.to_numeric(codes).map(getDiagCategory)
    

In [10]:
def recategorizeDiags(inputDf: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``inputDf`` with a categorized column for each diagnosis column.

    ``diag_1``, ``diag_2`` and ``diag_3`` are each run through
    ``categorizeDiag`` and stored under the matching ``*_categorized`` name.
    """
    column_pairs = (
        ('diag_1', 'diag_1_categorized'),
        ('diag_2', 'diag_2_categorized'),
        ('diag_3', 'diag_3_categorized'),
    )
    result = inputDf.copy()
    for source_column, target_column in column_pairs:
        result[target_column] = categorizeDiag(result[source_column])
    return result
    

### Inserting Additional Variables

In [11]:
def addDiag(*diags):
    """Count, row by row, how many of the given Series hold a non-missing value.

    Returns the element-wise sum of the 0/1 "is present" indicators of all
    arguments, or ``None`` when called without arguments.
    """
    indicators = [series.notna().astype(int) for series in diags]
    if not indicators:
        return None
    total = indicators[0]
    for indicator in indicators[1:]:
        total = total + indicator
    return total

In [12]:
def recategorizeData(inputDf: pd.DataFrame) -> pd.DataFrame:
    """Run the full recategorization pipeline on a copy of ``inputDf``.

    Adds the remapped id columns, the categorized diagnosis columns, and
    ``total_diag``, the number of non-missing values among ``diag_1``,
    ``diag_2`` and ``diag_3`` for each row.
    """
    recategorized = recategorizeDiags(refactorMappingIds(inputDf.copy()))
    diag_columns = [recategorized[name] for name in ('diag_1', 'diag_2', 'diag_3')]
    recategorized['total_diag'] = addDiag(*diag_columns)
    return recategorized
    

In [13]:
# Build the recategorized frame; recategorizeData works on a copy, so `df` itself is not modified.
df_regategorized = recategorizeData(df)

## Missing Data

In [14]:
# Number of missing values per column in the raw data.
df.isna().sum()

encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [15]:
# Number of missing values per column after recategorization (includes the new columns).
df_regategorized.isna().sum()

encounter_id                            0
patient_nbr                             0
race                                 2273
gender                                  0
age                                     0
weight                              98569
admission_type_id                       0
discharge_disposition_id                0
admission_source_id                     0
time_in_hospital                        0
payer_code                          40256
medical_specialty                   49949
num_lab_procedures                      0
num_procedures                          0
num_medications                         0
number_outpatient                       0
number_emergency                        0
number_inpatient                        0
diag_1                                 21
diag_2                                358
diag_3                               1423
number_diagnoses                        0
max_glu_serum                           0
A1Cresult                         

## Variables Not Needed

patient_nbr <br>
encounter_id <br>
examide <br>
citoglipton <br>
weight <br>

In [16]:
# Drop the columns listed above as not needed.
# `columns=` replaces the positional axis argument (`1`), which pandas deprecated in 1.3
# and no longer accepts from 2.0 on.
df_regategorized = df_regategorized.drop(columns=['patient_nbr','encounter_id', 'examide', 'weight','citoglipton'])

## Variables Removed Because of Recategorization

discharge_disposition_id <br>
admission_source_id <br>
admission_type_id <br>
diag_1 <br>
diag_2 <br>
diag_3 <br>

In [17]:
# Drop the raw id/diagnosis columns now that their recategorized versions exist.
# `columns=` replaces the positional axis argument (`1`), which pandas deprecated in 1.3
# and no longer accepts from 2.0 on.
df_regategorized = df_regategorized.drop(
    columns=['discharge_disposition_id','admission_source_id','admission_type_id','diag_1',
             'diag_2','diag_3'])

KNN, Total Elimination, Mode/Mean/Median

https://www.verywellhealth.com/finding-icd-codes-2615311

https://www.aapc.com/codes/icd9-codes-range/

In [19]:
#for column in ['disharge_disposition_new_mapping', 'admission_source_new_mapping', 'admission_type_id_new_mapping', 'diag_1_categorized', 'diag_2_categorized', 'diag_3_categorized', 'race', 'payer_code']:
#     df2[column].fillna(df2[column].mode()[0], inplace=True)

In [20]:
#grouped_modes = df2.groupby(['admission_type_id_new_mapping'])['medical_specialty'].agg(pd.Series.mode)
#grouped_modes['trauma-center'] = grouped_modes['urgent']
#twoColumns = df2[['admission_type_id_new_mapping','medical_specialty']]


In [25]:
def imputeData(x: pd.DataFrame):
    """Fill missing categorical values and return the result as a new DataFrame.

    Two steps are applied to a copy of ``x``:

    1. Each column in the fixed list below is filled with its own most
       frequent value. A column with no non-missing values is left as-is.
    2. Missing ``medical_specialty`` values are filled with the most frequent
       specialty among rows that share the same
       ``admission_type_id_new_mapping``. The 'trauma-center' group always
       uses the value found for 'urgent' (when an 'urgent' group exists).

    Parameters
    ----------
    x : pd.DataFrame
        Frame containing ``medical_specialty`` and every column named in the
        list below.

    Returns
    -------
    pd.DataFrame
        Imputed copy; ``x`` itself is not modified.
    """
    df = x.copy()

    mode_filled_columns = [
        'disharge_disposition_new_mapping', 'admission_source_new_mapping',
        'admission_type_id_new_mapping', 'diag_1_categorized', 'diag_2_categorized',
        'diag_3_categorized', 'race', 'payer_code',
    ]
    for column in mode_filled_columns:
        modes = df[column].mode()
        if modes.empty:
            # No non-missing value to impute from.
            continue
        # Assign the result back: an in-place fillna on `df[column]` acts on a temporary
        # object and does not update `df` when pandas Copy-on-Write is active.
        df[column] = df[column].fillna(modes.iloc[0])

    def first_mode(values: pd.Series):
        # Always reduce to one scalar: ties give several modes, all-missing groups give none.
        candidates = values.mode()
        return candidates.iloc[0] if not candidates.empty else float("nan")

    grouped_modes = (
        df.groupby('admission_type_id_new_mapping')['medical_specialty'].agg(first_mode)
    )
    if 'urgent' in grouped_modes.index:
        grouped_modes['trauma-center'] = grouped_modes['urgent']

    # Vectorized replacement for the row-wise apply: look up each row's group value and
    # use it only where the specialty is missing.
    group_specialty = df['admission_type_id_new_mapping'].map(grouped_modes)
    df['medical_specialty'] = df['medical_specialty'].fillna(group_specialty)
    return df

In [26]:
# Impute on a copy; imputeData does not modify df_regategorized.
df_imputed = imputeData(df_regategorized)

In [27]:
# Display the imputed frame (the notebook shows a truncated view of it).
df_imputed

Unnamed: 0,race,gender,age,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,change,diabetesMed,readmitted,disharge_disposition_new_mapping,admission_source_new_mapping,admission_type_id_new_mapping,diag_1_categorized,diag_2_categorized,diag_3_categorized,total_diag
0,Caucasian,Female,[0-10),1,MC,Pediatrics-Endocrinology,41,0,1,0,...,No,No,NO,discharged,referral,emergency,C,G,G,1
1,Caucasian,Female,[10-20),3,MC,InternalMedicine,59,0,18,0,...,Ch,Yes,>30,discharged,emergency,emergency,C,C,C,3
2,AfricanAmerican,Female,[20-30),2,MC,InternalMedicine,11,5,13,2,...,No,Yes,NO,discharged,emergency,emergency,K,C,R,3
3,Caucasian,Male,[30-40),2,MC,InternalMedicine,44,1,16,0,...,Ch,Yes,NO,discharged,emergency,emergency,A,C,G,3
4,Caucasian,Male,[40-50),1,MC,InternalMedicine,51,0,8,0,...,Ch,Yes,NO,discharged,emergency,emergency,B,B,C,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),3,MC,InternalMedicine,51,0,16,0,...,Ch,Yes,>30,transfer,emergency,emergency,C,E,G,3
101762,AfricanAmerican,Female,[80-90),5,MC,InternalMedicine,33,3,18,0,...,No,Yes,NO,transfer,transfer,emergency,I,C,P,3
101763,Caucasian,Male,[70-80),1,MC,InternalMedicine,53,0,9,1,...,Ch,Yes,NO,discharged,emergency,emergency,A,J,E,3
101764,Caucasian,Female,[80-90),10,MC,Surgery-General,45,2,21,0,...,Ch,Yes,NO,transfer,emergency,urgent,Q,D,Q,3


In [28]:
# Persist the imputed data to the working directory.
# NOTE(review): the row index is written as well, so reloading the file yields an extra
# 'Unnamed: 0' column (it is dropped a few cells further down).
df_imputed.to_csv('diabetic_data_imputed.csv')

In [30]:
# Reload the saved file; this replaces the in-memory frame with the CSV round-tripped copy.
df_imputed = pd.read_csv('diabetic_data_imputed.csv')

In [35]:
# Inspect the reloaded frame; it now carries the saved index as an 'Unnamed: 0' column.
df_imputed

Unnamed: 0.1,Unnamed: 0,race,gender,age,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,...,change,diabetesMed,readmitted,disharge_disposition_new_mapping,admission_source_new_mapping,admission_type_id_new_mapping,diag_1_categorized,diag_2_categorized,diag_3_categorized,total_diag
0,0,Caucasian,Female,[0-10),1,MC,Pediatrics-Endocrinology,41,0,1,...,No,No,NO,discharged,referral,emergency,C,G,G,1
1,1,Caucasian,Female,[10-20),3,MC,InternalMedicine,59,0,18,...,Ch,Yes,>30,discharged,emergency,emergency,C,C,C,3
2,2,AfricanAmerican,Female,[20-30),2,MC,InternalMedicine,11,5,13,...,No,Yes,NO,discharged,emergency,emergency,K,C,R,3
3,3,Caucasian,Male,[30-40),2,MC,InternalMedicine,44,1,16,...,Ch,Yes,NO,discharged,emergency,emergency,A,C,G,3
4,4,Caucasian,Male,[40-50),1,MC,InternalMedicine,51,0,8,...,Ch,Yes,NO,discharged,emergency,emergency,B,B,C,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,101761,AfricanAmerican,Male,[70-80),3,MC,InternalMedicine,51,0,16,...,Ch,Yes,>30,transfer,emergency,emergency,C,E,G,3
101762,101762,AfricanAmerican,Female,[80-90),5,MC,InternalMedicine,33,3,18,...,No,Yes,NO,transfer,transfer,emergency,I,C,P,3
101763,101763,Caucasian,Male,[70-80),1,MC,InternalMedicine,53,0,9,...,Ch,Yes,NO,discharged,emergency,emergency,A,J,E,3
101764,101764,Caucasian,Female,[80-90),10,MC,Surgery-General,45,2,21,...,Ch,Yes,NO,transfer,emergency,urgent,Q,D,Q,3


In [36]:
# Remove the index column that the CSV round trip added.
# `columns=` replaces the positional axis argument (`1`), which pandas no longer accepts
# from 2.0 on. errors='ignore' lets this cell be re-run after the column is already gone.
df_imputed = df_imputed.drop(columns=['Unnamed: 0'], errors='ignore')

In [37]:
# Confirm the extra index column is gone.
df_imputed

Unnamed: 0,race,gender,age,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,change,diabetesMed,readmitted,disharge_disposition_new_mapping,admission_source_new_mapping,admission_type_id_new_mapping,diag_1_categorized,diag_2_categorized,diag_3_categorized,total_diag
0,Caucasian,Female,[0-10),1,MC,Pediatrics-Endocrinology,41,0,1,0,...,No,No,NO,discharged,referral,emergency,C,G,G,1
1,Caucasian,Female,[10-20),3,MC,InternalMedicine,59,0,18,0,...,Ch,Yes,>30,discharged,emergency,emergency,C,C,C,3
2,AfricanAmerican,Female,[20-30),2,MC,InternalMedicine,11,5,13,2,...,No,Yes,NO,discharged,emergency,emergency,K,C,R,3
3,Caucasian,Male,[30-40),2,MC,InternalMedicine,44,1,16,0,...,Ch,Yes,NO,discharged,emergency,emergency,A,C,G,3
4,Caucasian,Male,[40-50),1,MC,InternalMedicine,51,0,8,0,...,Ch,Yes,NO,discharged,emergency,emergency,B,B,C,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),3,MC,InternalMedicine,51,0,16,0,...,Ch,Yes,>30,transfer,emergency,emergency,C,E,G,3
101762,AfricanAmerican,Female,[80-90),5,MC,InternalMedicine,33,3,18,0,...,No,Yes,NO,transfer,transfer,emergency,I,C,P,3
101763,Caucasian,Male,[70-80),1,MC,InternalMedicine,53,0,9,1,...,Ch,Yes,NO,discharged,emergency,emergency,A,J,E,3
101764,Caucasian,Female,[80-90),10,MC,Surgery-General,45,2,21,0,...,Ch,Yes,NO,transfer,emergency,urgent,Q,D,Q,3


## Model preparation

In [39]:
def reclassifyY(series: pd.Series, condition):
    """Turn ``series`` into a 0/1 indicator of equality with ``condition``.

    Returns a new integer Series holding 1 where the element equals
    ``condition`` and 0 elsewhere; the input is left untouched.
    """
    matches = series.copy() == condition
    return matches.astype(int)

In [40]:
def prepareDataframe(df: pd.DataFrame, y_var: str, condition):
    """Split ``df`` into a feature frame and a binary target.

    The target is 1 where column ``y_var`` equals ``condition`` and 0
    elsewhere (see ``reclassifyY``). The feature frame is a copy of ``df``
    without the ``y_var`` column. Returns ``(X, y)``.
    """
    frame = df.copy()
    target = reclassifyY(frame[y_var], condition)
    features = frame.drop(columns=[y_var])
    return features, target

In [41]:
# Binary target: y is 1 where readmitted == '<30' and 0 otherwise; X holds all remaining columns.
X, y = prepareDataframe(df_imputed, 'readmitted', '<30')

In [42]:
def scale_and_split(X, y, test_split, random):
    """Split ``X``/``y`` into train and test parts, then scale and encode them.

    ``test_split`` is passed to ``train_test_split`` as ``test_size`` and
    ``random`` as ``random_state``. The four resulting pieces are handed to
    ``scale_and_encode_data``, whose return value is returned unchanged.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_split, random_state=random
    )
    return scale_and_encode_data(x_train, x_test, y_train, y_test)

In [46]:
def scale_and_encode_data(x_train, x_test, y_train, y_test):
    """Standardize numeric columns and one-hot encode the rest, fitting on the training data only.

    Parameters
    ----------
    x_train, x_test : pd.DataFrame
        Feature frames with the same columns.
    y_train, y_test
        Targets; passed through unchanged.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train, y_test)``. Each feature
        frame holds the scaled numeric columns followed by the one-hot
        columns and keeps the row index of its input.

    Notes
    -----
    The encoder is created and fitted here. The previous version read a
    global ``enc`` that no cell in the notebook defines, so the function
    failed on a fresh kernel. One-hot column names are built from the
    original column names.
    """
    # Numeric and boolean columns are scaled; every other column is treated as categorical.
    cont_vars = x_train.select_dtypes(include=['number', 'bool']).columns
    cat_vars = x_train.columns.drop(cont_vars)

    scl = StandardScaler()
    scl.fit(x_train[cont_vars])
    X_train_scaled = pd.DataFrame(scl.transform(x_train[cont_vars]), columns=cont_vars, index=x_train.index)
    # The test set reuses the training statistics (no snooping).
    X_test_scaled = pd.DataFrame(scl.transform(x_test[cont_vars]), columns=cont_vars, index=x_test.index)

    X_train_cat = x_train[cat_vars]
    X_test_cat = x_test[cat_vars]

    # handle_unknown='ignore': a category that appears only in the test split is encoded
    # as all zeros instead of raising.
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(X_train_cat)
    # get_feature_names was removed in scikit-learn 1.2; prefer its replacement when present.
    if hasattr(enc, 'get_feature_names_out'):
        encoded_names = enc.get_feature_names_out(list(cat_vars))
    else:
        encoded_names = enc.get_feature_names(list(cat_vars))

    X_train_cat_e = pd.DataFrame(enc.transform(X_train_cat).toarray(), columns=encoded_names, index=x_train.index)
    X_test_cat_e = pd.DataFrame(enc.transform(X_test_cat).toarray(), columns=encoded_names, index=x_test.index)
    X_train_scaled = pd.concat([X_train_scaled, X_train_cat_e], axis=1)
    X_test_scaled = pd.concat([X_test_scaled, X_test_cat_e], axis=1)

    return X_train_scaled, X_test_scaled, y_train, y_test

In [48]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = scale_and_split(X,y,0.3,234523233)