In [1]:
# import required libraries
import pickle
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn import ensemble
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset from the CSV file 'parkinsons.data' (path relative to the
# notebook's working directory) into the DataFrame `data`.
data = pd.read_csv('parkinsons.data')

In [3]:
# Preview the first rows of the dataset.
data.head()

In [4]:
# Summary statistics of the dataset columns.
data.describe()

In [5]:
# Column overview of the dataset (names, dtypes, non-null counts).
data.info()

In [6]:
# Separate the features from the target and create a stratified
# train / validation / test split (roughly 60% / 20% / 20% of the rows).

# The target column 'status' and the 'name' column are excluded from the features.
X = data.drop(columns=['name', 'status'])
y = data['status']

# Step 1: hold out 20% of all rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7, stratify=y
)

# Step 2: 25% of the remaining 80% (about 20% of all rows) becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=7, stratify=y_train_val
)

In [7]:
# Standardize the features.
#
# Two scalers are fitted on purpose:
#   * selection_scaler is fitted on the training split only and is used for the
#     train / validation arrays that drive hyperparameter selection, so the
#     validation rows never influence the scaling statistics (no leakage).
#   * scaler is fitted on train + validation, i.e. the same rows the selected
#     model is refitted on, and is the one applied to the test split and
#     pickled next to the model.
#
# NOTE: the arrays are overwritten in place, so re-run the split cell before
# re-running this one; otherwise already-scaled data gets scaled again.
selection_scaler = StandardScaler()
X_train = selection_scaler.fit_transform(X_train)
X_val = selection_scaler.transform(X_val)

scaler = StandardScaler()
X_train_val = scaler.fit_transform(X_train_val)
X_test = scaler.transform(X_test)

In [8]:
# Ensemble (bagging) of KNeighborsClassifier models.
# Grid search over the bagging hyperparameters: every candidate is fitted on the
# training split and scored by accuracy on the validation split.

# Base estimator hyperparameters come from a previous script, which found
# n_neighbors=3, weights=uniform, metric=minkowski to work best.
base_estimator = KNeighborsClassifier(n_neighbors=3, metric='minkowski', weights='uniform')

n_estimators = [5, 10, 50, 100, 200]
max_samples = [0.5, 0.7, 0.9, 1.0]
max_features = [0.5, 0.7, 0.9, 1.0]
# NOTE(review): bootst is never passed to BaggingClassifier below, so the
# classifier's own default bootstrap setting applies -- confirm the intent.
bootst = False

# Fixed seed for the bagging sample / feature draws. Without it the selected
# hyperparameters (and the later refit) change from run to run.
bagging_seed = 7

best_accuracy = 0
best_model = None
best_n_estimators = None
best_max_samples = None
best_max_features = None
# Kept for compatibility; the search below never updates these two.
best_bootstrap = None
best_boot_feat = None

for n_est in n_estimators:
    for sample in max_samples:
        for feat in max_features:
            model_ensamble_knn = ensemble.BaggingClassifier(
                estimator=base_estimator,
                n_estimators=n_est,
                max_samples=sample,
                max_features=feat,
                random_state=bagging_seed,
            )
            model_ensamble_knn.fit(X_train, y_train)
            accuracy = model_ensamble_knn.score(X_val, y_val)

            # Strict '>' keeps the first candidate that reaches the best score.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_model = model_ensamble_knn
                best_n_estimators = n_est
                best_max_samples = sample
                best_max_features = feat

print(f"Najbolji model: n_estimators={best_n_estimators}, max_samples={best_max_samples}, max_features={best_max_features}, Accuracy={best_accuracy}")


In [9]:
# Refit the selected model on the combined train + validation data
# (X_train_val / y_train_val) before it is evaluated on the test split.

best_model.fit(X_train_val,y_train_val)

In [10]:
# Evaluate the selected ensemble on the held-out test split.
y_predicted = best_model.predict(X_test)

acc = accuracy_score(y_test, y_predicted)
conf_mat = confusion_matrix(y_test, y_predicted)
class_report = classification_report(y_test, y_predicted)

print(f"Accuracy: {acc:.2f}")
# The matrix and the report are multi-line. Printing them below their label
# (sep="\n") keeps rows and column headers aligned; on the same line as the
# label the first line is shifted right and the columns no longer line up.
print("Confusion matrix:", conf_mat, sep="\n")
print("Classification report:", class_report, sep="\n")

In [11]:
# Save the selected model and the scaler that was fitted on train + validation.
# NOTE: pickle files should only ever be loaded from trusted sources.
models_dir = Path('../models')
# Create the output directory when it is missing; open(..., 'wb') does not
# create parent directories and would fail with FileNotFoundError.
models_dir.mkdir(parents=True, exist_ok=True)

with open(models_dir / 'ensamble_k_neighbors_classifier.model.pickle', 'wb') as model_file:
    pickle.dump(best_model, model_file)

with open(models_dir / 'ensamble_k_neighbors_classifier.scaler.pickle', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)