# Visualizing One Tree in the Forest with df_imputed_min

In [1]:
import pandas as pd
import numpy as np
import pandas_profiling
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from pprint import pprint

In [2]:
#import the data
directory = 'C:/Users/N1110/Desktop/7331_Project/data/'
df = pd.read_csv(directory + 'Diabetes_tmp_Cleaned.csv')
df_imputed = df
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 51 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null object
gender                      101766 non-null object
age                         101766 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
medical_specialty           101766 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            101766 non-null int64
number_diagnoses            101766 non-null int64
max_glu_serum               101766 

In [3]:
#10 age groups; Ordinal encoding
#feature density plot of age can help with answering the question like: does people older age tend to be readmitted?
#can use map/apply function to achieve this

df_imputed.age[df_imputed.age== '[0-10)'] = 1
df_imputed.age[df_imputed.age== '[10-20)'] = 2
df_imputed.age[df_imputed.age== '[20-30)'] = 3
df_imputed.age[df_imputed.age== '[30-40)'] = 4
df_imputed.age[df_imputed.age== '[40-50)'] = 5
df_imputed.age[df_imputed.age== '[50-60)'] = 6
df_imputed.age[df_imputed.age== '[60-70)'] = 7
df_imputed.age[df_imputed.age== '[70-80)'] = 8
df_imputed.age[df_imputed.age== '[80-90)'] = 9
df_imputed.age[df_imputed.age== '[90-100)'] = 10

df_imputed["age"] = df_imputed["age"].astype(int)

In [4]:
dummiesList=["discharge_disposition","admission_source", "admission_type"]
    
df_imputed_min_wDummies = pd.get_dummies(df_imputed[dummiesList])

In [5]:
NumFeatures= ["age","num_medications", "number_diagnoses", "time_in_hospital",
       "number_emergency", "num_lab_procedures", "number_inpatient", "readmitted_tf"]

df_imputed_min_wDummies= pd.concat((df_imputed_min_wDummies, df_imputed[NumFeatures]), axis=1)

In [6]:
df_imputed=df_imputed_min_wDummies

from sklearn.model_selection import ShuffleSplit

# we want to predict the X and y data as follows:
if 'readmitted_tf' in df_imputed:
    y = df_imputed['readmitted_tf'].values # get the labels we want
    del df_imputed['readmitted_tf'] # get rid of the class label
    X = df_imputed.values # use everything else to predict!

    ## X and y are now numpy matrices, by calling 'values' on the pandas data frames we
    #    have converted them into simple matrices to use with scikit learn
    
    
# to use the cross validation object in scikit learn, we need to grab an instance
#    of the object and set it up. This object will be able to split our data into 
#    training and testing splits
num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                         test_size  = 0.2)
                         
print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)


In [7]:
## Training and Testing Split
# okay, so run through the cross validation loop and set the training and testing variable for one single iteration
for train_indices, test_indices in cv_object.split(X,y): 
    # I will create new variables here so that it is more obvious what 
    # the code is doing (you can compact this syntax and avoid duplicating memory,
    # but it makes this code less readable)
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]
    
      # we want to normalize the features based upon the mean and standard deviation of each column. 
# However, we do not want to accidentally use the testing data to find out the mean and std (this would be snooping)
# to Make things easier, let's start by just using whatever was last stored in the variables:
##    X_train , y_train , X_test, y_test (they were set in a for loop above)
from sklearn.preprocessing import StandardScaler
scl_obj = StandardScaler()

scl_obj.fit(X_train)
X_test_scaled = scl_obj.transform(X_test)

X_train_scaled = scl_obj.transform(X_train) # apply to training
X_test_scaled = scl_obj.transform(X_test) 
    



In [8]:
rf_clf = RandomForestClassifier(  random_state = 2000, criterion = 'gini', min_samples_split=10,min_samples_leaf=1,
                                max_features='sqrt', max_depth=50, bootstrap = False,
                                n_estimators = 1800, verbose = False, n_jobs = 4)
rf_clf.fit(X_train_scaled,y_train)

preds = rf_clf.predict(X_test_scaled)
print('Random Forest - roc_auc_score: ', roc_auc_score(y_test, preds)) 

Random Forest - roc_auc_score:  0.6601250264605687


In [9]:
final_model_forviz=rf_clf

In [10]:
# Use sklearn to export the tree 
from sklearn.tree import export_graphviz

# Write the decision tree as a dot file
visual_tree =final_model_forviz.estimators_[12]
export_graphviz(visual_tree, out_file = 'C:/Users/N1110/Desktop/7331_Project/images/best_tree.dot', feature_names = df_imputed.columns.values, 
                precision = 2, filled = True, rounded = True, max_depth = None)

In [17]:
# Use pydot for converting to an image file
#pip3 install pydot
import pydot

# Import the dot file to a graph and then convert to a png
(graph, ) = pydot.graph_from_dot_file('C:/Users/N1110/Desktop/7331_Project/images/best_tree.dot')
graph.write_jpg('C:/Users/N1110/Desktop/7331_Project/images/best_tree.jpg')

FileNotFoundError: [WinError 2] "dot" not found in path.