In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

import pickle
import time
import warnings
# Install a blanket 'ignore' filter: no warning category or module is given,
# so every warning raised after this line is silenced.
# NOTE(review): this also hides deprecation and version-mismatch warnings
# (e.g. from unpickling models) -- consider narrowing the filter.
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# Read the dataset from the CSV in the working directory and preview
# its first five rows.
fh_df = pd.read_csv(filepath_or_buffer='fetal_health.csv')
fh_df.head(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [3]:
# Separate the 'fetal_health' label column from the remaining predictor columns.
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept here because the train/test split cell reads it.
output = fh_df.loc[:, 'fetal_health']
input = fh_df.drop('fetal_health', axis='columns')
input.head(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,64.0,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,130.0,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,130.0,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,117.0,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,117.0,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0


In [4]:
# Fraction of rows per label value (normalised counts, most frequent first).
output.value_counts(normalize=True, sort=True)

fetal_health
1.0    0.778457
2.0    0.138758
3.0    0.082785
Name: proportion, dtype: float64

In [5]:
# Load the model stored in 'dt_ml.pickle' (presumably the decision tree,
# going by the file name -- confirm against the notebook that wrote it).
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('dt_ml.pickle', 'rb') as dt_pickle:
    dt_ml = pickle.load(dt_pickle)

In [6]:
# Load the model stored in 'rf_ml.pickle' (presumably the random forest,
# going by the file name -- confirm against the notebook that wrote it).
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('rf_ml.pickle', 'rb') as rf_pickle:
    rf_ml = pickle.load(rf_pickle)

In [7]:
# Load the model stored in 'ada_ml.pickle' (presumably the AdaBoost model,
# going by the file name -- confirm against the notebook that wrote it).
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('ada_ml.pickle', 'rb') as ada_pickle:
    ada_ml = pickle.load(ada_pickle)

In [8]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
train_X, test_X, train_y, test_y = train_test_split(
    input,
    output,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Soft-voting ensemble over the three loaded models, each given its own weight
# (listed in the same order as the estimators: dt, rf, ada).
voting_estimator = VotingClassifier(
    estimators=[
        ('dt', dt_ml),
        ('rf', rf_ml),
        ('ada', ada_ml),
    ],
    voting='soft',
    weights=[0.335, 0.350, 0.315],
)

In [13]:
# Fit the ensemble on the training split and report the elapsed time.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments the way time.time() can.
start = time.perf_counter()
voting_estimator.fit(train_X, train_y)
stop = time.perf_counter()
print(f'Time Taken: {stop - start} seconds')

Time Taken: 0.35922956466674805 seconds


In [14]:
# Predicted labels from the fitted ensemble for the held-out rows.
y_pred = voting_estimator.predict(X=test_X)

In [16]:
# Confusion matrix on the held-out rows, with rows/columns in the ensemble's
# own class order.
conf_matrix = confusion_matrix(test_y, y_pred, labels=voting_estimator.classes_)

# Named `cm_display` rather than `display` so that IPython's built-in
# display() function is not shadowed for the rest of the session.
cm_display = ConfusionMatrixDisplay(conf_matrix, display_labels=voting_estimator.classes_)

# Draw onto an explicit 5x5 figure and save through that figure's handle.
fig, ax = plt.subplots(figsize=(5, 5))
cm_display.plot(cmap='OrRd', ax=ax)
fig.savefig('vc_confusion_matrix.svg')

In [17]:
# Classification metrics as a nested dict, converted to a table and
# written to CSV.
class_report = classification_report(
    y_true=test_y,
    y_pred=y_pred,
    output_dict=True,
)
cr_df = pd.DataFrame(data=class_report)
cr_df.to_csv(path_or_buf='vc_class_report.csv')

In [22]:
# Pair each training column with the importance reported by the loaded dt model.
# NOTE(review): these values come from the loaded `dt_ml` object, not from the
# copy held inside `voting_estimator` -- confirm the two agree.
importance = dt_ml.feature_importances_
feat_imp_dt = pd.DataFrame(
    [(feature, score) for feature, score in zip(train_X.columns, importance)],
    columns=['Feature', 'Importance'],
)

feat_imp_dt

Unnamed: 0,Feature,Importance
0,baseline value,0.029682
1,accelerations,0.01808
2,fetal_movement,0.009506
3,uterine_contractions,0.038505
4,light_decelerations,0.008606
5,severe_decelerations,0.0
6,prolongued_decelerations,0.017579
7,abnormal_short_term_variability,0.142002
8,mean_value_of_short_term_variability,0.245265
9,percentage_of_time_with_abnormal_long_term_var...,0.10201


In [23]:
# Pair each training column with the importance reported by the loaded rf model.
# NOTE(review): these values come from the loaded `rf_ml` object, not from the
# copy held inside `voting_estimator` -- confirm the two agree.
importance = rf_ml.feature_importances_
feat_imp_rf = pd.DataFrame(
    [(feature, score) for feature, score in zip(train_X.columns, importance)],
    columns=['Feature', 'Importance'],
)

feat_imp_rf

Unnamed: 0,Feature,Importance
0,baseline value,0.034993
1,accelerations,0.047201
2,fetal_movement,0.019285
3,uterine_contractions,0.04113
4,light_decelerations,0.007117
5,severe_decelerations,0.000417
6,prolongued_decelerations,0.045476
7,abnormal_short_term_variability,0.143171
8,mean_value_of_short_term_variability,0.111554
9,percentage_of_time_with_abnormal_long_term_var...,0.113253


In [24]:
# Pair each training column with the importance reported by the loaded ada model.
# NOTE(review): these values come from the loaded `ada_ml` object, not from the
# copy held inside `voting_estimator` -- confirm the two agree.
importance = ada_ml.feature_importances_
feat_imp_ada = pd.DataFrame(
    [(feature, score) for feature, score in zip(train_X.columns, importance)],
    columns=['Feature', 'Importance'],
)

feat_imp_ada

Unnamed: 0,Feature,Importance
0,baseline value,0.048275
1,accelerations,0.068237
2,fetal_movement,0.010098
3,uterine_contractions,0.044274
4,light_decelerations,0.0
5,severe_decelerations,0.0
6,prolongued_decelerations,0.063987
7,abnormal_short_term_variability,0.212495
8,mean_value_of_short_term_variability,0.11433
9,percentage_of_time_with_abnormal_long_term_var...,0.148533


In [25]:
# Blend the three per-model importance columns into one score per feature,
# using the same weights the voting ensemble was configured with.
# The weights are read from `voting_estimator` (estimator order: dt, rf, ada)
# instead of being typed in a second time, so the two cannot drift apart.
# NOTE(review): the weights are used as-is (not re-normalised); this assumes
# they sum to 1, as the currently configured values do.
w_dt, w_rf, w_ada = voting_estimator.weights

# Vectorised form of the original per-row loop; the terms are combined in the
# same order (dt + rf, then + ada) so the floating-point results are unchanged.
weighted_importance = (
    feat_imp_dt['Importance'] * w_dt
    + feat_imp_rf['Importance'] * w_rf
    + feat_imp_ada['Importance'] * w_ada
)

# Keep the result as a plain list of NumPy floats, as before.
feat_imp_vc = list(weighted_importance.to_numpy())

feat_imp_vc

[np.float64(0.03739760515124674),
 np.float64(0.044071847759524205),
 np.float64(0.013115478783911064),
 np.float64(0.04124073671297952),
 np.float64(0.005373982494251297),
 np.float64(0.0001460155859804022),
 np.float64(0.04196137009711838),
 np.float64(0.16461651463294213),
 np.float64(0.15722168335676737),
 np.float64(0.1205997409701225),
 np.float64(0.028273163429986994),
 np.float64(0.020010105286949034),
 np.float64(0.02347577793956253),
 np.float64(0.017691699078524803),
 np.float64(0.013123269857054632),
 np.float64(0.002789555535090226),
 np.float64(0.06668046656654353),
 np.float64(0.1363867649449342),
 np.float64(0.038578907269103326),
 np.float64(0.022675460469864822),
 np.float64(0.00456985407754225)]

In [27]:
# Build the blended-importance table and order it from largest to smallest.
feat_imp = (
    pd.DataFrame(
        list(zip(train_X.columns, feat_imp_vc)),
        columns=['Feature', 'Importance'],
    )
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# Horizontal bar chart; two colours are supplied and matplotlib repeats them
# across the bars.
feat_imp_chart = plt.figure(figsize=(10, 5))
plt.barh(y=feat_imp['Feature'], width=feat_imp['Importance'], color=['orange', 'red'])

# Label the axes, tighten the layout and write the figure out as SVG.
plt.gca().set(
    xlabel='Importance',
    ylabel='Feature',
    title='Feature Importance for Predicting Fetal Health',
)
feat_imp_chart.tight_layout()
feat_imp_chart.savefig('vc_feature_importance.svg')

In [28]:
# Persist the fitted voting ensemble to 'vc_ml.pickle'.
# The context manager flushes and closes the file even if pickling raises,
# so a failed dump cannot leave an open handle behind.
with open('vc_ml.pickle', 'wb') as vc_pickle:
    pickle.dump(voting_estimator, vc_pickle)