In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

import pickle
import time
import warnings
# NOTE(review): with no category/module arguments this suppresses every warning
# for the rest of the kernel session — consider narrowing it to the specific
# warnings that are known to be noise.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Read the dataset from the CSV in the working directory and preview the
# first five rows.
fh_df = pd.read_csv(filepath_or_buffer='fetal_health.csv')
fh_df.head(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [7]:
# Separate the predictors from the target column ('fetal_health').
# NOTE(review): `input` shadows the Python builtin of the same name; renaming
# it would also require updating every later cell that reads `input`.
input = fh_df.drop('fetal_health', axis=1)
output = fh_df.loc[:, 'fetal_health']
input.head(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,64.0,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,130.0,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,130.0,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,117.0,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,117.0,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0


In [8]:
# Share of rows per target class (proportions, most frequent class first).
output.value_counts(normalize=True, sort=True, ascending=False)

fetal_health
1.0    0.778457
2.0    0.138758
3.0    0.082785
Name: proportion, dtype: float64

In [9]:
# Hold out 20% of the rows for evaluation. The target classes are heavily
# imbalanced (roughly 78% / 14% / 8%), so the split is stratified on the target
# to keep the same class proportions in both the training and the test set.
# random_state pins the shuffle so the split is reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    input,
    output,
    test_size=0.2,
    random_state=1,
    stratify=output,
)

In [10]:
# Random forest classifier left at the library defaults; only the seed is
# fixed so repeated runs build the same forest.
rf_estimator = RandomForestClassifier(
    random_state=1,
)

In [11]:
# Fit the forest on the training split and report how long the fit took.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the fit is running.
start = time.perf_counter()
rf_estimator.fit(train_X, train_y)
stop = time.perf_counter()
print(f'Time Taken: {stop - start} seconds')

Time Taken: 0.2646324634552002 seconds


In [13]:
# Class predictions of the fitted forest for the held-out rows.
y_pred = rf_estimator.predict(X=test_X)

In [15]:
# Confusion matrix on the test split, with rows/columns ordered by the class
# labels the fitted estimator exposes.
conf_matrix = confusion_matrix(test_y, y_pred, labels=rf_estimator.classes_)

# Named `cm_display` rather than `display` so that IPython's built-in
# display() helper is not shadowed for the rest of the notebook session.
cm_display = ConfusionMatrixDisplay(conf_matrix, display_labels=rf_estimator.classes_)

# Draw onto an explicit figure/axes pair and save through that same figure
# handle instead of relying on pyplot's implicit "current figure".
fig, ax = plt.subplots(figsize=(5, 5))
cm_display.plot(cmap='PuBu', ax=ax)
fig.savefig('rf_confusion_matrix.svg')

In [16]:
# Build the classification report as a dict, tabulate it, and write the
# table to CSV.
class_report = classification_report(
    y_true=test_y,
    y_pred=y_pred,
    output_dict=True,
)
cr_df = pd.DataFrame(data=class_report)
cr_df.to_csv(path_or_buf='rf_class_report.csv')

In [18]:
# Importance score of each predictor, taken from the fitted forest.
importance = rf_estimator.feature_importances_

# Pair every training column with its score and order the table from the
# highest score to the lowest.
feat_imp = (
    pd.DataFrame({'Feature': train_X.columns, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
    .reset_index(drop=True)
)

# Horizontal bar chart of the ranked importances, saved as an SVG.
feat_imp_chart = plt.figure(figsize=(10, 5))
plt.barh(feat_imp['Feature'], feat_imp['Importance'], color=['Purple', 'Blue'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance for Predicting Fetal Health')
plt.tight_layout()
plt.savefig('rf_feature_importance.svg')

In [19]:
# Serialize the fitted estimator to disk. The context manager closes the file
# even if pickle.dump raises, so the handle is never leaked and a failed write
# does not leave the file open.
with open('rf_ml.pickle', 'wb') as rf_pickle:
    pickle.dump(rf_estimator, rf_pickle)