In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyClassifier


In [None]:

  
# Columns kept for modelling: the CTG measurement columns plus the NSP
# diagnosis code, which is the prediction target.
selected_cols = [
    'LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP', 'ASTV', 'MSTV', 'ALTV',
    'MLTV', 'Width', 'Min', 'Max', 'Nmax', 'Nzeros', 'Mode', 'Mean',
    'Median', 'Variance', 'Tendency', 'NSP',
]

# Load the CSV (skipping its first row), keep only the selected columns
# and discard every row that has a missing value in any of them.
data = (
    pd.read_csv("CTG.csv", skiprows=1)
    [selected_cols]
    .dropna()
)

# Reorder all rows randomly; the fixed seed makes the order reproducible.
data_shuffled = data.sample(frac=1.0, random_state=0)

# Input part X: every selected column except the NSP target.
X = data_shuffled.drop(columns='NSP')

def to_label(y):
    """Map a numeric NSP diagnosis code to a human-readable label.

    Parameters
    ----------
    y : int or float
        Diagnosis code. It is converted with ``int()`` before the lookup,
        so ``2.0`` is treated the same as ``2``.

    Returns
    -------
    str
        ``'normal'`` for 1, ``'suspect'`` for 2, ``'pathologic'`` for 3.

    Raises
    ------
    ValueError
        If the code is not 1, 2 or 3. A plain list lookup would silently
        return ``None`` for 0 and wrap around for negative codes
        (e.g. -1 -> 'pathologic'), which would corrupt the target labels.
    """
    labels = {1: 'normal', 2: 'suspect', 3: 'pathologic'}
    code = int(y)
    if code not in labels:
        raise ValueError(
            f"unexpected NSP code {y!r}; expected one of {sorted(labels)}"
        )
    return labels[code]

# Output part Y: the NSP code of every shuffled row, converted to its label.
nsp_codes = data_shuffled['NSP']
Y = nsp_codes.apply(to_label)

# Hold out 20% of the rows as the test set; the fixed seed keeps the
# partition reproducible between runs.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

The code blocks below try out different classifiers and report their cross-validated accuracy on the training data prepared above.

In [None]:
# Baseline model: always predicts the most frequent label it saw in training.
clf = DummyClassifier(strategy='most_frequent')

# Cross-validate on the training split and average the per-fold scores.
fold_scores = cross_val_score(clf, Xtrain, Ytrain)
score_dummy = fold_scores.mean()
score_dummy

0.7805882352941176

DummyClassifier: Always predicts the most common label in the training set; used as a baseline. <br>
Score = 0.781

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A single decision tree; the seed is fixed so the result is repeatable.
clf = DecisionTreeClassifier(random_state=0)

# Mean cross-validation score on the training split.
fold_scores = cross_val_score(clf, Xtrain, Ytrain)
score_decisionTree = fold_scores.mean()
score_decisionTree

0.9241176470588235

DecisionTree: Splits the data into homogeneous subgroups based on the features <br>
Score = 0.924

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default settings and a fixed seed.
clf = RandomForestClassifier(random_state=0)

# Mean cross-validation score on the training split.
fold_scores = cross_val_score(clf, Xtrain, Ytrain)
score_randomForest = fold_scores.mean()
score_randomForest

0.9429411764705883

RandomForest: An ensemble of DecisionTree classifiers that uses averaging to improve the predictive accuracy <br>
Score: 0.9429

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees with library-default settings and a fixed seed.
clf = GradientBoostingClassifier(random_state=0)

# Mean cross-validation score on the training split.
fold_scores = cross_val_score(clf, Xtrain, Ytrain)
score_gradientBoosting = fold_scores.mean()
score_gradientBoosting

0.9494117647058824

GradientBoosting: Builds an additive model in a forward stage-wise fashion, which allows the optimization of arbitrary differentiable loss functions <br>
Score: 0.949

In [None]:
from sklearn.linear_model import Perceptron

# Linear perceptron; `tol` sets the stopping tolerance and the seed is fixed.
# NOTE(review): the features are passed in unscaled here; linear models are
# often sensitive to feature scale, so standardizing first may be worth trying.
clf = Perceptron(tol=1e-3, random_state=0)

# Mean cross-validation score on the training split.
fold_scores = cross_val_score(clf, Xtrain, Ytrain)
score_perceptron = fold_scores.mean()
score_perceptron

0.825294117647059

Perceptron: A linear classifier which separates the classes by a hyperplane in the space of the features.

Score: 0.825

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron allowed up to 300 training iterations.
# Note that this cell seeds with 1, unlike the other classifier cells (0).
# NOTE(review): the features are passed in unscaled here; neural networks are
# often sensitive to feature scale, so standardizing first may be worth trying.
clf = MLPClassifier(random_state=1, max_iter=300)

# Mean cross-validation score on the training split.
fold_scores = cross_val_score(clf, Xtrain, Ytrain)
score_MLP = fold_scores.mean()
score_MLP

0.8558823529411764

MLP: Multi-layer perceptron classifier. Optimizes the log-loss function using LBFGS or stochastic gradient descent <br>
Score : 0.856

### Final evaluation

In [None]:
from sklearn.metrics import accuracy_score

# Refit the selected model on the whole training split, then predict the
# held-out test rows that were not used during cross-validation.
clf = RandomForestClassifier(random_state=0)
Yguess = clf.fit(Xtrain, Ytrain).predict(Xtest)

# Fraction of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(Ytest, Yguess)
print(test_accuracy)

0.9272300469483568


The random forest classifier gives a high cross-validation score and is therefore used in step 4, the final evaluation. By default, a random forest classifier uses 100 decision trees, each of which considers a random subset of the features when making predictions on the data. After the decision trees have been created, the majority decision of the trees determines the predicted output.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=42461cb6-6f72-410b-bb93-ced1edaf8704' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>