In [25]:
#Imports all required packages from saved location
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import matplotlib.mlab as mlab
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
#import lightgbm as lgb


In [26]:
dataset = pd.read_csv ('/Users/charlieargent/Desktop/University of Exeter/Year 3/AI Project/dataset.csv')

dataset.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


Selecting Features
=================
Selecting features based on importance - following Ellie and Lewis' original analysis, will use uploaded functions once Lewis has put up on GitHub.




In [27]:
'''Selecting features - dropping uninteresting columns'''

def chose_features(dataset, features=dataset.columns, n_features = -1, v=1, vv =0):
    '''Return reduced dataset with only chosen columns
    - dataset: pandas dataframe of dataset to have columns chosen
    - features (optional, default = all features): list of strings matching features to keep
    - n_features (optional) - if specified, the top n features from the scaled list is chosen: 
    ['glucose', 'age', 'totChol', 'cigsPerDay', 'diaBP', 'prevalentHyp',
        'diabetes', 'BPMeds', 'male', 'BMI', 'prevalentStroke',
        'education', 'heartRate', 'currentSmoker'],
    - v (optional) - Verbose (default 1) int 0 or 1. Print no. of features kept and lost 
    - vv (optional) - Very verbose (default 0) int 0 or 1. Print list of chosen and rejected features
    '''
                
    print('Now selecting chosen features....')
    
    if n_features != -1:
        if n_features > len(dataset.columns):
            print('WARNING: chose_features has an error: n_features must be less than no. columns')
            return(-1)
        else:
            ordered_f = ['TenYearCHD','glucose', 'age', 'totChol', 'cigsPerDay', 'diaBP', 'prevalentHyp',
            'diabetes', 'BPMeds', 'male', 'BMI', 'sysBP','prevalentStroke',
            'education', 'heartRate', 'currentSmoker']
            features = ordered_f[0:n_features]

    if v == 1: 
        print('\t * Number of features: ', len(features))
        print('\t * Number of dropped features: ', len(dataset.columns) - len(features))
        
    if vv == 1:
        print('\t * Chosen features: ', features)
        print('\t * Dropped features: ',[col for col in dataset.columns if col not in features])
    print('')
    
    return dataset.copy()[features] #reduced dataset

#chose_features(dataset, features=['TenYearCHD','glucose', 'age'],vv=1)
#^Example of calling the func, pulls just the three features labeled, and vv=1 gives extra detail to the level of selection

In [28]:
#10 most important features, from Ellie's 'graph_analysis_EB', plus TenYearCHD 

features = ['sysBP', 'age', 'cigsPerDay', 'totChol', 'diaBP', 'prevalentHyp',
       'diabetes', 'BPMeds', 'male', 'BMI', 'TenYearCHD'] 
Top10 = chose_features(dataset, features=features, vv=1)
Top10.head(10)





Now selecting chosen features....
	 * Number of features:  11
	 * Number of dropped features:  5
	 * Chosen features:  ['sysBP', 'age', 'cigsPerDay', 'totChol', 'diaBP', 'prevalentHyp', 'diabetes', 'BPMeds', 'male', 'BMI', 'TenYearCHD']
	 * Dropped features:  ['education', 'currentSmoker', 'prevalentStroke', 'heartRate', 'glucose']



Unnamed: 0,sysBP,age,cigsPerDay,totChol,diaBP,prevalentHyp,diabetes,BPMeds,male,BMI,TenYearCHD
0,106.0,39,0.0,195.0,70.0,0,0,0.0,1,26.97,0
1,121.0,46,0.0,250.0,81.0,0,0,0.0,0,28.73,0
2,127.5,48,20.0,245.0,80.0,0,0,0.0,1,25.34,0
3,150.0,61,30.0,225.0,95.0,1,0,0.0,0,28.58,1
4,130.0,46,23.0,285.0,84.0,0,0,0.0,0,23.1,0
5,180.0,43,0.0,228.0,110.0,1,0,0.0,0,30.3,0
6,138.0,63,0.0,205.0,71.0,0,0,0.0,0,33.11,1
7,100.0,45,20.0,313.0,71.0,0,0,0.0,0,21.68,0
8,141.5,52,0.0,260.0,89.0,1,0,0.0,1,26.36,0
9,162.0,43,30.0,225.0,107.0,1,0,0.0,1,23.61,0


Dealing with missing values
====================

Removing missing values, now using the function created by Lewis, and then replacing missing values with the median


In [29]:
'''Dealing with missing values'''

#Method 1: Drop missing values
def drop_missing(dataset):
    '''Drop rows with any missing values and return dataset with dropped rows. Prints number and percentage of rows dropped
    - Dataset: pandas Dataframe
    '''
    print('Now dropping rows with missing values....')
    dataset2 = dataset.copy().dropna().reset_index(drop=True)
    lost = len(dataset) - len(dataset2)
    print('\t * Dropped {} rows {:.1f}%. {} rows remaining\n'.format(lost,lost/len(dataset)*100,len(dataset2)))
    return dataset2


In [30]:
#if using the entire data set
col_interestedin1 = dataset.columns
values_removed1 = drop_missing(chose_features(dataset,col_interestedin1, vv =1))

Now selecting chosen features....
	 * Number of features:  16
	 * Number of dropped features:  0
	 * Chosen features:  Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')
	 * Dropped features:  []

Now dropping rows with missing values....
	 * Dropped 582 rows 13.7%. 3658 rows remaining



In [31]:
##e.g. if using top 10 most important columns 
col_interestedin2 = ['TenYearCHD','sysBP', 'age', 'cigsPerDay', 'totChol', 'diaBP', 'prevalentHyp',
       'diabetes', 'BPMeds', 'male', 'BMI'] # this is top 10 features from Ellie's 'graph_analysis_EB'
values_removed2 = drop_missing(chose_features(dataset,col_interestedin2, vv =1))

Now selecting chosen features....
	 * Number of features:  11
	 * Number of dropped features:  5
	 * Chosen features:  ['TenYearCHD', 'sysBP', 'age', 'cigsPerDay', 'totChol', 'diaBP', 'prevalentHyp', 'diabetes', 'BPMeds', 'male', 'BMI']
	 * Dropped features:  ['education', 'currentSmoker', 'prevalentStroke', 'heartRate', 'glucose']

Now dropping rows with missing values....
	 * Dropped 149 rows 3.5%. 4091 rows remaining



In [32]:
#Method 2: Imputation (of mean or median)
def impute_missing(dataset, strategy = 'median', v=1, vv=0):
    '''Imputation - alternative to removing missing values.
    Fill all missing with column average (median or mean)
    dataset - Pandas Dataframe to be imputed
    strategy - str (optional) 'median' (default) or 'mean' to fill missing values with
    - v (optional) - Verbose (default 1) int 0 or 1. Print no. of missing and imputed values  
    - vv (optional) - Very verbose (default 0) int 0 or 1. Print list of imputed features with counts and replaced value
    '''
    from sklearn.impute import SimpleImputer
    my_imputer = SimpleImputer(strategy=strategy)
    dataset2 = pd.DataFrame(my_imputer.fit_transform(dataset),columns=dataset.columns)
    
    if v == 1: 
        print('Imputing missing values with {}....'.format(strategy))
        print('\t * Number of missing values: ', dataset.isna().sum().sum())
        print('\t * Number of imputed values: ', dataset.isna().sum().sum() - dataset2.isna().sum().sum())
        print('\n')
    if vv == 1:
        subbed = pd.DataFrame(dataset.isna().sum().sort_values(ascending=False),columns=['N_missing'])
        subbed= subbed.assign(Imputed_value=np.NaN)
        for col in subbed.index:
            if strategy == 'median':
                subbed.loc[col,'Imputed_value'] = dataset[col].median()
            elif strategy == 'mean':
                subbed.loc[[col,'Imputed_value']] = dataset[col].mean()
        print(subbed)
    
    return dataset2

In [33]:
#if using the entire data set
col_interestedin1 = dataset.columns
values_replaced1 = impute_missing(chose_features(dataset,col_interestedin1), vv=1)

Now selecting chosen features....
	 * Number of features:  16
	 * Number of dropped features:  0

Imputing missing values with median....
	 * Number of missing values:  645
	 * Number of imputed values:  645


                 N_missing  Imputed_value
glucose                388           78.0
education              105            2.0
BPMeds                  53            0.0
totChol                 50          234.0
cigsPerDay              29            0.0
BMI                     19           25.4
heartRate                1           75.0
TenYearCHD               0            0.0
diaBP                    0           82.0
sysBP                    0          128.0
diabetes                 0            0.0
prevalentHyp             0            0.0
prevalentStroke          0            0.0
currentSmoker            0            0.0
age                      0           49.0
male                     0            0.0


In [34]:
##e.g. if using top 10 most important columns 
col_interestedin2 = ['TenYearCHD','sysBP', 'age', 'cigsPerDay', 'totChol', 'diaBP', 'prevalentHyp',
       'diabetes', 'BPMeds', 'male', 'BMI'] # this is top 10 features from Ellie's 'graph_analysis_EB'
values_replaced2 = impute_missing(chose_features(dataset,col_interestedin2), vv=1)

Now selecting chosen features....
	 * Number of features:  11
	 * Number of dropped features:  5

Imputing missing values with median....
	 * Number of missing values:  151
	 * Number of imputed values:  151


              N_missing  Imputed_value
BPMeds               53            0.0
totChol              50          234.0
cigsPerDay           29            0.0
BMI                  19           25.4
male                  0            0.0
diabetes              0            0.0
prevalentHyp          0            0.0
diaBP                 0           82.0
age                   0           49.0
sysBP                 0          128.0
TenYearCHD            0            0.0


Feature Scaling
=====

Normalising the data between 0 and 1 for increased algorithm efficiency, ensures all the data sits within the same range

In [35]:
def scale_data(data, method='std'):
    '''Return dataset scaled by MinMaxScalar or StandardScalar methods from sklearn.preprocessing
    - data: pandas dataframe of data to be scaled
    - method (optional): str of either 'minmax' for MinMaxScalar or 'std' for StandardScaler (default arg)
    '''
    from sklearn import preprocessing
    
    if method == 'minmax':
        scaler_minmax = preprocessing.MinMaxScaler((0,1))
        return pd.DataFrame(scaler_minmax.fit_transform(data.copy()),columns=data.columns) 
    
    elif method == 'std':
        scaler_std = preprocessing.StandardScaler() #with_std=False
        return pd.DataFrame(scaler_std.fit_transform(dataset.copy()),columns=dataset.columns)
    
    else:
        print('\nscale_data encountered a failure!!\n')
        return(-1)



In [36]:
##e.g. scales the whole dataset
scale_1 = scale_data(dataset)
scale_1

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1.153113,-1.234283,1.981583,-0.988276,-0.755465,-0.174698,-0.077014,-0.671241,-0.162437,-0.935261,-1.196267,-1.083027,0.286614,0.342735,-0.207240,-0.423188
1,-0.867217,-0.417664,0.020160,-0.988276,-0.755465,-0.174698,-0.077014,-0.671241,-0.162437,0.298311,-0.515399,-0.159355,0.718054,1.590247,-0.248992,-0.423188
2,1.153113,-0.184345,-0.960552,1.011863,0.922240,-0.174698,-0.077014,-0.671241,-0.162437,0.186168,-0.220356,-0.243325,-0.112959,-0.073103,-0.499501,-0.423188
3,-0.867217,1.332233,1.000871,1.011863,1.761092,-0.174698,-0.077014,1.489778,-0.162437,-0.262404,0.800946,1.016227,0.681284,-0.904778,0.878299,2.363017
4,-0.867217,-0.417664,1.000871,1.011863,1.173896,-0.174698,-0.077014,-0.671241,-0.162437,1.083311,-0.106878,0.092555,-0.662065,0.758572,0.126772,-0.423188
5,-0.867217,-0.767644,0.020160,-0.988276,-0.755465,-0.174698,-0.077014,1.489778,-0.162437,-0.195118,2.162682,2.275780,1.102919,0.093232,0.711293,-0.423188
6,-0.867217,1.565553,-0.960552,-0.988276,-0.755465,-0.174698,-0.077014,-0.671241,-0.162437,-0.710975,0.256252,-0.999057,1.791753,-1.320615,0.126772,2.363017
7,-0.867217,-0.534324,0.020160,1.011863,0.922240,-0.174698,-0.077014,-0.671241,-0.162437,1.711312,-1.468614,-0.999057,-1.010159,0.259567,-0.165489,-0.423188
8,1.153113,0.282295,-0.960552,-0.988276,-0.755465,-0.174698,-0.077014,1.489778,-0.162437,0.522597,0.415121,0.512406,0.137080,0.010065,-0.123737,-0.423188
9,1.153113,-0.767644,-0.960552,1.011863,1.761092,-0.174698,-0.077014,1.489778,-0.162437,-0.262404,1.345640,2.023869,-0.537046,1.423912,0.252027,-0.423188


In [37]:
# scales just the top 10 most important comlumns
scale_2 = scale_data(col_interestedin2).head()
scale_2

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1.153113,-1.234283,1.981583,-0.988276,-0.755465,-0.174698,-0.077014,-0.671241,-0.162437,-0.935261,-1.196267,-1.083027,0.286614,0.342735,-0.20724,-0.423188
1,-0.867217,-0.417664,0.02016,-0.988276,-0.755465,-0.174698,-0.077014,-0.671241,-0.162437,0.298311,-0.515399,-0.159355,0.718054,1.590247,-0.248992,-0.423188
2,1.153113,-0.184345,-0.960552,1.011863,0.92224,-0.174698,-0.077014,-0.671241,-0.162437,0.186168,-0.220356,-0.243325,-0.112959,-0.073103,-0.499501,-0.423188
3,-0.867217,1.332233,1.000871,1.011863,1.761092,-0.174698,-0.077014,1.489778,-0.162437,-0.262404,0.800946,1.016227,0.681284,-0.904778,0.878299,2.363017
4,-0.867217,-0.417664,1.000871,1.011863,1.173896,-0.174698,-0.077014,-0.671241,-0.162437,1.083311,-0.106878,0.092555,-0.662065,0.758572,0.126772,-0.423188


In [38]:
#Scales the entire dataset where missing values have been removed
scale_3 = scale_data(values_removed1)

#Sclaes the top 10 columns from the data, where missing values have been removed
scale_4 = scale_data(values_removed2)

#Scales the entire data set, where missing values have been replaced with median
scale_5 = scale_data(values_replaced1)

#Scales the to 10 columns, where missing values have been replaced with median
scale_6 = scale_data(values_replaced2)

Splitting the Data into training and test sets
======

This fucntion splits the data into 80% training, 20% test 

In [39]:
def split_data(dataset,dep_var='TenYearCHD', test_size = 0.2, v = 1):
    '''Split the dataset, return X_train, X_test, y_train, y_test as Pandas Dataframes
    - dataset: Pandas Dataframe. Data to split into training and test data
    - dep_var (optional, default = 'TenYearCHD'): string. Name of column to be dependant variable
    - test_size (optional, default = 0.2): float (0.0-1.0). Proportion of total data to make up test set.
    '''
    from sklearn.model_selection import train_test_split
    y = dataset[dep_var]
    X = dataset.drop([dep_var], axis = 1)
    if v == 1: 
          print('Splitting data set into {}% training, {}% test dataset....'.format(100*(1-test_size),100*test_size))
        
    return train_test_split(X, y, test_size = test_size, random_state=0)


Data Set Splits 
====

Split_one
-----
This is the entire dataset, split into 80% training, 20% test

Split_two
-------
This is the top 10 most important columns from the dataset, which has been split as above, with no other processing

Split_three
-----
This is the whole dataset split as above, where any missing values have been removed

Split_four
----
This is the top 10 most important columns, where the missing values have been removed

Split_five
---
This is the whole dataset, where missing values have been replaced with their median value

Split_six
---
This is the top 10 most important columns, where any missing values have been replaced with their median value 

In [43]:
#splits the entire, initial, unprocessed data set into a training and test set
split_one = split_data(dataset)


#splits the top10 columns, but still unprocessed data
split_two = split_data(Top10)

#Splits the whole dataset, where missing values have been removed
split_three = split_data(values_removed1)

#Splits the top10 dataset, where missing values have been removed
split_four = split_data(values_removed2)

#Splits the whole data set, where missing values have been replaced by their median values
split_five = split_data(values_replaced1)

#Splits the Top 10 most important columns, where the missing values have been replaced by their median values
split_six = split_data(values_replaced2)

#Split_one normalised using scale_data
split_seven = split_data(scale_1)

#split_two normalised
split_eight = split_data(scale_2)

#split_three normalised
split_nine = split_data(scale_3)

#split_four normalised
split_ten = split_data(scale_4)

#split_five normalised
split_eleven = split_data(scale_5)

#split_six normalised
split_twelve = split_data(scale_6)




Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....
Splitting data set into 80.0% training, 20.0% test dataset....


Shows that when data is scaled, it picks up NaN values - unsure as to why

In [70]:
scale_3.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
dtype: int64

In [72]:
values_removed1.isnull().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
dtype: int64

Algorithm applied to values_removed1 - i.e. the whole data set, with any missing values removed
=====


In [45]:
labels = values_removed1.pop("TenYearCHD").values
data = values_removed1.values
#gives the 10 year CHD data the name labels, this is so it can be called specifically later on in code
#split dataset in features and target variable
feature_cols = values_removed1.columns
X = feature_cols # Features
y = labels # Target variable


In [47]:
#Creates test and training sets 
x_train, x_test, y_train, y_test = train_test_split(data, labels,
                                                    test_size=.2,
                                                    random_state=5)

In [48]:
# Create Decision Tree classifer object
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifer
clf = clf.fit(x_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(x_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8579234972677595


Algorithm applied to values_removed2 - i.e. top 10 most important columns, where missing values were removed
=====


In [49]:
labels2 = values_removed2.pop("TenYearCHD").values
data2 = values_removed2.values
#gives the 10 year CHD data the name labels, this is so it can be called specifically later on in code
#split dataset in features and target variable
feature_cols = values_removed2.columns
X = feature_cols # Features
y = labels # Target variable

In [53]:
#Creates test and training sets 
x_train2, x_test2, y_train2, y_test2 = train_test_split(data2, labels2,
                                                    test_size=.2,
                                                    random_state=5)

In [55]:
# Create Decision Tree classifer object
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifer
clf = clf.fit(x_train2,y_train2)

#Predict the response for test dataset
y_pred2 = clf.predict(x_test2)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test2, y_pred2))

Accuracy: 0.8424908424908425


Algorithm applied to values_removed1 - i.e. the whole data set, where any missing values have been replaced with the median value
====


In [56]:
labels3 = values_replaced1.pop("TenYearCHD").values
data3 = values_replaced1.values
#gives the 10 year CHD data the name labels, this is so it can be called specifically later on in code
#split dataset in features and target variable
feature_cols = values_replaced1.columns
X = feature_cols # Features
y = labels # Target variable

In [57]:
#Creates test and training sets 
x_train3, x_test3, y_train3, y_test3 = train_test_split(data3, labels3,
                                                    test_size=.2,
                                                    random_state=5)

In [58]:
# Create Decision Tree classifer object
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifer
clf = clf.fit(x_train3,y_train3)

#Predict the response for test dataset
y_pred3 = clf.predict(x_test3)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test3, y_pred3))

Accuracy: 0.8408018867924528


Algorithm applied to values_removed2 - i.e. the top 10 columns, where any missing values have been replaced with the median value
====


In [59]:
labels4 = values_replaced2.pop("TenYearCHD").values
data4 = values_replaced2.values
#gives the 10 year CHD data the name labels, this is so it can be called specifically later on in code
#split dataset in features and target variable
feature_cols = values_replaced2.columns
X = feature_cols # Features
y = labels # Target variable

In [60]:
#Creates test and training sets 
x_train4, x_test4, y_train4, y_test4 = train_test_split(data4, labels4,
                                                    test_size=.2,
                                                    random_state=5)

In [61]:
# Create Decision Tree classifer object
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifer
clf = clf.fit(x_train4,y_train4)

#Predict the response for test dataset
y_pred4 = clf.predict(x_test4)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test4, y_pred4))

Accuracy: 0.8431603773584906


In [62]:
labels5 = scale_3.pop("TenYearCHD").values
data5 = scale_3.values
#gives the 10 year CHD data the name labels, this is so it can be called specifically later on in code
#split dataset in features and target variable
feature_cols = scale_3.columns
X = feature_cols # Features
y = labels # Target variable

In [63]:
#Creates test and training sets 
x_train5, x_test5, y_train5, y_test5 = train_test_split(data5, labels5,
                                                    test_size=.2,
                                                    random_state=5)

In [64]:
# Create Decision Tree classifer object
clf = tree.DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifer
clf = clf.fit(x_train5,y_train5)

#Predict the response for test dataset
y_pred5 = clf.predict(x_test5)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test5, y_pred5))

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').