In [49]:
!pip install -r ../requirements.txt

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.9.0-py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 2.2 MB/s eta 0:00:01
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.9.0


## Imports

In [179]:
import matplotlib.pyplot as plt
import numpy as np
import math
import os
import pandas as pd
import time
import seaborn as sns
from sklearn import metrics
from sklearn.decomposition import PCA
from collections import Counter
from imblearn.over_sampling import SMOTE

## Data

In [180]:
datasetPath = '../datasets/diabetic_data_initial.csv'

In [181]:
data = pd.read_csv(datasetPath)

## Preprocessing

### Age

Replace ranges with means.

In [182]:
replace_age = {'[0-10)' : 5,
'[10-20)' : 15,
'[20-30)' : 25, 
'[30-40)' : 35, 
'[40-50)' : 45, 
'[50-60)' : 55,
'[60-70)' : 65, 
'[70-80)' : 75,
'[80-90)' : 85,
'[90-100)' : 95}
data['age'] = data['age'].apply(lambda x : replace_age[x])

## Patient History

Count previous visits.

In [183]:
data['prev_visits'] = data.groupby('patient_nbr').encounter_id.transform(lambda x: pd.Series(range(len(x))))

## Medical Specialty

Bucket specialities.

In [184]:
high_frequency = ['InternalMedicine', 'Family/GeneralPractice', 'Cardiology', 'Surgery-General', 'Orthopedics', 'Orthopedics-Reconstructive', 'Emergency/Trauma', 'Urology','ObstetricsandGynecology','Psychiatry','Pulmonology ','Nephrology','Radiologist']
low_frequency = ['Surgery-PlasticwithinHeadandNeck','Psychiatry-Addictive','Proctology','Dermatology','SportsMedicine','Speech','Perinatology', 'Neurophysiology','Resident','Pediatrics-Hematology-Oncology','Pediatrics-EmergencyMedicine','Dentistry','DCPTEAM','Psychiatry-Child/Adolescent', 'Pediatrics-Pulmonology','Surgery-Pediatric','AllergyandImmunology','Pediatrics-Neurology','Anesthesiology','Pathology','Cardiology-Pediatric', 'Endocrinology-Metabolism','PhysicianNotFound','Surgery-Colon&Rectal','OutreachServices', 'Surgery-Maxillofacial','Rheumatology','Anesthesiology-Pediatric','Obstetrics','Obsterics&Gynecology-GynecologicOnco']
pediatrics = ['Pediatrics','Pediatrics-CriticalCare','Pediatrics-EmergencyMedicine','Pediatrics-Endocrinology','Pediatrics-Hematology-Oncology', 'Pediatrics-Neurology','Pediatrics-Pulmonology', 'Anesthesiology-Pediatric', 'Cardiology-Pediatric', 'Surgery-Pediatric']
psychic = ['Psychiatry-Addictive', 'Psychology', 'Psychiatry',  'Psychiatry-Child/Adolescent', 'PhysicalMedicineandRehabilitation', 'Osteopath']
neurology = ['Neurology', 'Surgery-Neuro',  'Pediatrics-Neurology', 'Neurophysiology']
surgery = ['Surgeon', 'Surgery-Cardiovascular', 'Surgery-Cardiovascular/Thoracic', 'Surgery-Colon&Rectal', 'Surgery-General', 'Surgery-Maxillofacial', 'Surgery-Plastic', 'Surgery-PlasticwithinHeadandNeck',  'Surgery-Thoracic', 'Surgery-Vascular', 'SurgicalSpecialty', 'Podiatry']
ungrouped = ['Endocrinology','Gastroenterology','Gynecology','Hematology','Hematology/Oncology','Hospitalist','InfectiousDiseases', 'Oncology','Ophthalmology','Otolaryngology','Pulmonology','Radiology']
missing = ['?']

def get_specialty_type(specialty):
    if specialty in pediatrics : return 'pediatrics'
    elif specialty in psychic : return 'psychic'
    elif specialty in neurology : return 'neurology'
    elif specialty in surgery : return 'surgery'
    elif specialty in high_frequency : return 'high_freq'
    elif specialty in low_frequency : return 'low_freq'
    elif specialty in ungrouped : return 'ungrouped'
    elif specialty in missing : return 'missing'

data['medical_specialty'] = data['medical_specialty'].apply(get_specialty_type)

## Diagnoses

Read diagnoses based off of [AAPC codes](https://www.aapc.com/codes/icd9-codes-range/).

In [185]:
def get_diag(diag):
    if '?' in diag: return 'unknown'
    if 'V' in diag:  return 'health_contact'
    if 'E' in diag: return 'injury_poisoning'
    diag_no = int(float(diag))
    if diag_no in range(0, 140): return 'infectious'
    if diag_no in range(140, 240): return 'neoplasms'
    if diag_no == 250: return 'diabetes'
    if diag_no in range(251, 260): return 'endocrine'
    if diag_no in range(240, 280): return 'long_term'
    if diag_no in range(280, 290): return 'blood'
    if diag_no in range(290, 320): return 'mental'
    if diag_no in range(320, 390): return 'nervous'
    if diag_no in range(390, 460): return 'respiratory'
    if diag_no in range(520, 580): return 'digestive'
    if diag_no in range(580, 630): return 'genitourinary'
    if diag_no in range(630, 680): return 'birth'
    if diag_no in range(680, 710): return 'skin'
    if diag_no in range(710, 740): return 'musculoskeletal'
    if diag_no in range(740, 760): return 'congenital'
    if diag_no in range(760, 780): return 'prenital'
    if diag_no in range(780, 800): return 'symptoms'
    if diag_no in range(800, 1000): return 'injury_poisoning'
    return 'other'

In [186]:
data['diagnosis_1'] = data['diag_1'].apply(get_diag)
data['diagnosis_2'] = data['diag_2'].apply(get_diag)
data['diagnosis_3'] = data['diag_3'].apply(get_diag)

In [187]:
def get_diabetes_type(diag):
    if 'V' in diag or 'E' in diag or '?' in diag: return 'no_diabetes'
    diag_float = round(float(diag),1)
    if diag_float == 250.0: return 'no_complications'
    if diag_float == 250.1: return 'ketoacidosis'
    if diag_float == 250.2: return 'hyperosmolarity'
    if diag_float == 250.3: return 'other_coma'
    if diag_float == 250.4: return 'renal'
    if diag_float == 250.5: return 'ophthalmic'
    if diag_float == 250.6: return 'nuerological'
    if diag_float == 250.7: return 'peripheral'
    if diag_float == 250.8: return 'other'
    if diag_float == 250.9: return 'unspecificed'
    return 'no_diabetes'

In [188]:
data['diabetes_type_1'] = data['diag_1'].apply(get_diabetes_type)
data['diabetes_type_2'] = data['diag_2'].apply(get_diabetes_type)
data['diabetes_type_3'] = data['diag_3'].apply(get_diabetes_type)

## Readmitted

In [189]:
data['readmitted'] = data['readmitted'].apply(lambda x : 0 if (x == '>30' or x == 'NO') else 1)
data = data.reset_index()

## Treatments and Tests

In [190]:
def get_aic(r):
    if r == 'None':
        return -1
    if r == 'Norm':
        return 0
    if r == '>7' or r == '>8':
        return 1

def get_glu(r):
    if r == 'None':
        return -1
    if r == 'Norm':
        return 0
    if r == '>200' or r == '>300':
        return 1

def get_medicine(m):
    if m == 'No':
        return 0
    if m == 'Steady' or m == 'Up' or m == 'Down':
        return 1
    
data['A1Cresult'] = data['A1Cresult'].apply(get_aic)
data['max_glu_serum'] = data['max_glu_serum'].apply(get_glu)
for m in data.columns[25:48]:
    data[m] = data[m].apply(get_medicine)

## One-Hot Encoding

In [191]:
from sklearn.preprocessing import MultiLabelBinarizer

In [193]:
def multi_encode(cols):
    to_encode = data[cols].apply(list, axis=1)
    mlb = MultiLabelBinarizer()
    return pd.DataFrame(mlb.fit_transform(to_encode),
                   columns=mlb.classes_,
                   index=to_encode.index)

In [194]:
diagnosis_cols = ['diagnosis_1', 'diagnosis_2', 'diagnosis_3']
diabetes_cols = ['diabetes_type_1', 'diabetes_type_2', 'diabetes_type_3']
data = data.join(multi_encode(diagnosis_cols).add_prefix('diagnosis_'))
data = data.join(multi_encode(diabetes_cols).add_prefix('diabetes_'))

In [120]:
cat_vars = ['race', 'gender']
for var in cat_vars:
    data = data.join(pd.get_dummies(data[var], prefix=var))
    data = data.drop(var, axis=1)

In [78]:
X = data[[col for col in data.columns if col != 'readmitted']].to_numpy()
y = data['readmitted'].to_numpy()

In [79]:
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)

ValueError: could not convert string to float: 'Caucasian'