In [1]:
# import required libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# import data
data = pd.read_csv('parkinsons.data')

In [3]:
# split data to X and y
X = data.drop(['name','status'],axis=1)
# Check unique values in the 'status' column
y = data['status']

In [4]:
# split the dataset into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [5]:
# scale the numerical features

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

In [6]:
# Define hyperparameter grid for tuning

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Create GridSearchCV instance
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_


In [7]:
# Predict on the validation set
y_val_pred = best_model.predict(X_val)

# Evaluate the model on validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print("Validation Confusion Matrix:\n", val_conf_matrix)
print("Validation Classification Report:\n", val_class_report)

Validation Accuracy: 0.93
Validation Confusion Matrix:
 [[ 2  1]
 [ 1 26]]
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.96      0.96      0.96        27

    accuracy                           0.93        30
   macro avg       0.81      0.81      0.81        30
weighted avg       0.93      0.93      0.93        30



In [8]:
# Predict on the test set
y_test_pred = best_model.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_class_report = classification_report(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print("Test Confusion Matrix:\n", test_conf_matrix)
print("Test Classification Report:\n", test_class_report)

Test Accuracy: 0.86
Test Confusion Matrix:
 [[ 9  3]
 [ 1 16]]
Test Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.75      0.82        12
           1       0.84      0.94      0.89        17

    accuracy                           0.86        29
   macro avg       0.87      0.85      0.85        29
weighted avg       0.87      0.86      0.86        29



In [9]:
# creating and training model

# DecisionTreeClassifier does not support the eval_set parameter, it can be done by setting other 
# parameters: max_depth=10,min_samples_split=5,min_samples_leaf=2

model = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2
)

model.fit(X_train, y_train)



In [10]:
# Predict on the validation set
y_val_pred = model.predict(X_val)

# Evaluate the model on validation set
val_accuracy = accuracy_score(y_val, y_val_pred)
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.2f}")
print("Validation Confusion Matrix:\n", val_conf_matrix)
print("Validation Classification Report:\n", val_class_report)

Validation Accuracy: 0.77
Validation Confusion Matrix:
 [[ 1  2]
 [ 5 22]]
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.17      0.33      0.22         3
           1       0.92      0.81      0.86        27

    accuracy                           0.77        30
   macro avg       0.54      0.57      0.54        30
weighted avg       0.84      0.77      0.80        30



In [11]:
# Predict on the test set
y_test_pred = model.predict(X_test)

# Evaluate the model on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_class_report = classification_report(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.2f}")
print("Test Confusion Matrix:\n", test_conf_matrix)
print("Test Classification Report:\n", test_class_report)

Test Accuracy: 0.93
Test Confusion Matrix:
 [[10  2]
 [ 0 17]]
Test Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.83      0.91        12
           1       0.89      1.00      0.94        17

    accuracy                           0.93        29
   macro avg       0.95      0.92      0.93        29
weighted avg       0.94      0.93      0.93        29

