In [1]:
# import required libraries

import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc

from matplotlib import pyplot as plt
import numpy as np

In [2]:
# Load the dataset
data = pd.read_csv('parkinsons.data')  # Replace 'your_dataset.csv' with the actual file name

In [3]:
# Separate features (X) and target (y)
X = data.drop(['name', 'status'], axis=1)  # Remove 'name' column and keep all features except 'status'
y = data['status']  # Target variable

In [4]:
# Split the dataset into training and testing sets
# First, split the data into train_val and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=7,stratify=y)

# Next, split the train_val set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=7,stratify=y_train_val)


In [5]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

In [31]:
clf = [
      LogisticRegression(solver='lbfgs',penalty='l2',max_iter=1000),
      LogisticRegression(solver='lbfgs',penalty=None,max_iter=1000),
      LogisticRegression(solver='liblinear',penalty='l1',max_iter=1000),
      LogisticRegression(solver='liblinear',penalty='l2',max_iter=1000),
      LogisticRegression(solver='newton-cg',penalty=None,max_iter=1000),
      LogisticRegression(solver='newton-cg',penalty='l2',max_iter=1000),
      LogisticRegression(solver='sag',penalty=None,max_iter=1000),
      LogisticRegression(solver='sag',penalty='l2',max_iter=1000),
      LogisticRegression(solver='saga',penalty=None,max_iter=1000),
      LogisticRegression(solver='saga',penalty='l1',max_iter=1000),
      LogisticRegression(solver='saga',penalty='l2',max_iter=1000),
#     LogisticRegression(solver='sag',penalty=None),
#     LogisticRegression(solver='saga',penalty=None)
    ]
clf_columns = []
clf_compare = pd.DataFrame(columns = clf_columns)

row_index = 0
for alg in clf:
        
    predicted = alg.fit(X_train, y_train).predict(X_val)
    fp, tp, th = roc_curve(y_val, predicted)
    clf_name = alg.__class__.__name__
    clf_compare.loc[row_index, 'Train Accuracy'] = round(alg.score(X_train, y_train), 5)
    clf_compare.loc[row_index, 'Validation Accuracy'] = round(alg.score(X_val, y_val), 5)
    clf_compare.loc[row_index, 'Precision'] = round(precision_score(y_val, predicted),5)
    clf_compare.loc[row_index, 'Recall'] = round(recall_score(y_val, predicted),5)
    clf_compare.loc[row_index, 'AUC'] = round(auc(fp, tp),5)

    row_index+=1
    
clf_compare.sort_values(by = ['Validation Accuracy'], ascending = False, inplace = True)    
clf_compare



Unnamed: 0,Train Accuracy,Validation Accuracy,Precision,Recall,AUC
6,0.90196,0.76471,0.82143,0.88462,0.62981
8,0.91176,0.76471,0.82143,0.88462,0.62981
4,0.95098,0.73529,0.81481,0.84615,0.61058
0,0.89216,0.70588,0.76667,0.88462,0.50481
1,0.95098,0.70588,0.78571,0.84615,0.54808
2,0.89216,0.70588,0.76667,0.88462,0.50481
3,0.88235,0.70588,0.76667,0.88462,0.50481
5,0.89216,0.70588,0.76667,0.88462,0.50481
7,0.89216,0.70588,0.76667,0.88462,0.50481
9,0.88235,0.70588,0.76667,0.88462,0.50481


In [19]:
# Print the best hyperparameters
print("Best Hyperparameters:", best_params)
print("Best estimator:", best_model)

NameError: name 'best_params' is not defined

In [None]:
# Model coef_
best_model.coef_

In [None]:
N = len(data.keys())
print(N)
print(list(data.keys()))

In [None]:
# Interpretation of intercept and coef values
N = len(list(X.keys()))
values = best_model.coef_[0]
plt.figure(figsize=(10, 5))
plt.bar(np.arange(0, N), values)
plt.xticks(np.arange(0, N), list(X.keys()), rotation='vertical')

plt.show()

In [None]:
# Predict on the test set
y_pred = best_model.predict(X_test_scaled)

In [None]:
# Evaluate the model
accuracy_lbfgs = accuracy_score(y_test, y_pred)
precision_score_lbfgs = precision_score(y_test, y_pred)
recall_score_lbfgs = recall_score(y_test, y_pred)
f1_score_lbfgs = f1_score(y_test, y_pred)
conf_matrix_lbfgs = confusion_matrix(y_test, y_pred)
class_report_lbfgs = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy_lbfgs:.2f}")
print(f"Precision score: {precision_score_lbfgs:.2f}")
print(f"Recall score: {recall_score_lbfgs:.2f}")
print(f"F1 score: {f1_score_lbfgs:.2f}")
print("Confusion Matrix:\n", conf_matrix_lbfgs)
print("Classification Report:\n", class_report_lbfgs)


In [None]:
# 'liblinear'good for very binary classification and high-dimensional datasets with a small number of samples. 

model_liblinear = LogisticRegression(solver='liblinear')
model_liblinear.fit(X_train_scaled, y_train)




In [None]:
# Model intercept_
model_liblinear.intercept_

In [None]:
# Model coef_
model_liblinear.coef_

In [None]:
# Interpretation of intercept and coef values for liblinear solver
N = len(list(X.keys()))
values = model_liblinear.coef_[0]
plt.figure(figsize=(10, 5))
plt.bar(np.arange(0, N), values)
plt.xticks(np.arange(0, N), list(X.keys()), rotation='vertical')

plt.show()

In [None]:
# Predict on the test set
y_pred = model_liblinear.predict(X_test_scaled)

In [None]:
# Evaluate the model for liblinear solver
accuracy_liblinear = accuracy_score(y_test, y_pred)
precision_score_liblinear = precision_score(y_test, y_pred)
recall_score_liblinear = recall_score(y_test, y_pred)
f1_score_liblinear = f1_score(y_test, y_pred)
conf_matrix_liblinear = confusion_matrix(y_test, y_pred)
class_report_liblinear = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy_liblinear:.2f}")
print(f"Precision score: {precision_score_liblinear:.2f}")
print(f"Recall score: {recall_score_liblinear:.2f}")
print(f"F1 score: {f1_score_liblinear:.2f}")
print("Confusion Matrix:\n", conf_matrix_liblinear)
print("Classification Report:\n", class_report_liblinear)


In [None]:
# Save model
import pickle

with open('../models/log_reg_lbfgs_classifier.model.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)
    
with open('../models/log_reg_lbfgs_classifier.scaler.pickle', 'wb') as model_file:
    pickle.dump(scaler, model_file)