Learned:
- use pipeline to streamline data manipulation
- cross validation as proxy for preformance
- grid search to find best model based on hyperparametrs

In [22]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

data

In [23]:
df = pd.read_csv("heart.csv")
X = df[['trtbps','chol']]
Y = df["output"]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

logistic Regression

In [27]:
logit_pipeline = Pipeline([
    ('scaler', StandardScaler()),    # Step 1: Standardize the data
    ('classifier', LogisticRegression())  # Step 2: Train a classifier
])

logit_pipeline.fit(X_train, y_train)
# Make predictions
y_pred = logit_pipeline.predict(X_test)
results = cross_validate(logit_pipeline, X_train, y_train, cv=5, return_train_score=True)

# Evaluate the model
print("sklearn accuracy", accuracy_score(y_test, y_pred))
print("Training scores:", results['train_score'])
print("Validation scores:", results['test_score'])
print("Mean training score:", results['train_score'].mean())
print("Mean validation score:", results['test_score'].mean())

sklearn accuracy 0.7049180327868853
Training scores: [0.55440415 0.58031088 0.55670103 0.58762887 0.56701031]
Validation scores: [0.6122449  0.51020408 0.5625     0.5        0.5625    ]
Mean training score: 0.5692110464184605
Mean validation score: 0.5494897959183673


SVM

In [31]:
SVM_pipeline = Pipeline([
    ('scaler', StandardScaler()),    # Standardize the data
    ('svm', SVC())                   # Support Vector Machine
])

# Define the parameter grid
param_grid = {
    'svm__C': [0.1, 1, 10, 100],     # Regularization parameter
    'svm__gamma': [1, 0.1, 0.01, 0.001],  # Kernel coefficient
    'svm__kernel': ['rbf', 'poly', 'sigmoid']  # Kernel type
}

# Setup GridSearchCV
grid_search = GridSearchCV(SVM_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best model from grid search
best_model = grid_search.best_estimator_

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   0.0s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   0.0s
[CV] END ..........svm__C=0.1, svm__gamma=1, svm__kernel=rbf; total time=   0.0s
[CV] END .........svm__C=0.1, svm__gamma=1, svm__kernel=poly; total time=   0.0s
[CV] END .........svm__C=0.1, svm__gamma=1, svm__kernel=poly; total time=   0.0s
[CV] END ......svm__C=0.1, svm__gamma=1, svm__kernel=sigmoid; total time=   0.0s
[CV] END ......svm__C=0.1, svm__gamma=1, svm__kernel=sigmoid; total time=   0.0s
[CV] END ......svm__C=0.1, svm__gamma=1, svm__kernel=sigmoid; total time=   0.0s
[CV] END ......svm__C=0.1, svm__gamma=1, svm__kernel=sigmoid; total time=   0.0s
[CV] END ......svm__C=0.1, svm__gamma=1, svm__kernel=sigmoid; total time=   0.0s
[CV] END .........svm__C=0.1, svm__gamma=1, svm__kernel=poly; total time=   0.0s
[CV] END .........svm__C=0.1, svm__gamma=1, svm

Results:
- logistic regression preforms much better
- over complicating relationship between features and output