## Using Model Pipelines

https://www.datacamp.com/tutorial/k-nearest-neighbor-classification-scikit-learn

https://www.datacamp.com/blog/classification-machine-learning

In [1]:
# Read the raw heart-disease CSV into a DataFrame and preview its first rows
import pandas as pd
heart_csv_path = "../TeamProject/data/raw/heart.csv"
heart_df = pd.read_csv(heart_csv_path)
heart_df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [2]:
# Separate the predictor columns (X) from the label column (Y)
target_column = 'HeartDisease'
Y = heart_df[target_column]
X = heart_df.drop(columns=[target_column])

In [3]:
from sklearn.model_selection import train_test_split

# Scorer names handed to GridSearchCV; one result column is produced per entry
scoring = [
    # error-based scorers
    'neg_mean_squared_error',
    'neg_mean_absolute_error',
    'max_error',
    # classification scorers
    'accuracy',
    'precision',
    'recall',
]

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

## Preprocessing steps (scaling, encoding, transformations)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression

In [5]:
# Preview the first five rows again as a reference for the column names used below
heart_df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [6]:
# Column groups consumed by the preprocessing ColumnTransformers.
categorical = ['ChestPainType', 'RestingECG', 'ST_Slope', 'Sex', 'ExerciseAngina', 'FastingBS']
numeric = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']

# Numeric columns that get a PowerTransformer instead of a StandardScaler.
numeric_power = ['MaxHR', 'Oldpeak']

# Every other numeric column, kept in the order it appears in `numeric`.
# A set difference would yield an arbitrary order that can change between
# kernel sessions (string hashing is randomised), making the transformed
# column layout non-reproducible.
numeric_standard = [col for col in numeric if col not in numeric_power]

In [7]:
# Preprocessor 1: standard-scale all numeric columns, one-hot encode the categorical ones.
# handle_unknown='ignore' lets the encoder accept categories it did not see while fitting.
preproc1 = ColumnTransformer(
    transformers=[
        ('scale_numeric', StandardScaler(), numeric),
        ('encode_cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ]
)
preproc1

In [8]:
# Preprocessor 2: like preproc1, except the columns in `numeric_power` go through
# a PowerTransformer while the remaining numeric columns are standard-scaled.
preproc2 = ColumnTransformer(
    transformers=[
        ('scale_numeric', StandardScaler(), numeric_standard),
        ('non_linear_scale_numeric', PowerTransformer(), numeric_power),
        ('encode_cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ]
)
preproc2

### Using KNN Classifier to predict 'Heart Disease' or 'Normal'

In [9]:
# Pipeline A: preproc1 followed by a KNeighborsClassifier with default settings
pipeline_A = Pipeline(
    steps=[
        ('preprocessing', preproc1),
        ('model', KNeighborsClassifier()),
    ]
)
pipeline_A

In [10]:
# Pipeline B: preproc2 followed by a KNeighborsClassifier with default settings
pipeline_B = Pipeline(
    steps=[
        ('preprocessing', preproc2),
        ('model', KNeighborsClassifier()),
    ]
)
pipeline_B

### Logistic Regression

https://www.ibm.com/think/topics/logistic-regression
https://www.datacamp.com/tutorial/understanding-logistic-regression-python

Logistic Regression is a supervised machine learning algorithm used to estimate the relationship between a categorical dependent variable and one or more independent variables. Unlike linear regression, which predicts continuous values, logistic regression classifies data into categories by computing the probability of an event's occurrence.

In [11]:
# Pipeline C: preproc1 followed by LogisticRegression (random_state fixed at 16)
pipeline_C = Pipeline(
    steps=[
        ('preprocessing', preproc1),
        ('model', LogisticRegression(random_state=16)),
    ]
)
pipeline_C

In [12]:
# Pipeline D: preproc2 followed by LogisticRegression (random_state fixed at 16)
pipeline_D = Pipeline(
    steps=[
        ('preprocessing', preproc2),
        ('model', LogisticRegression(random_state=16)),
    ]
)
pipeline_D

# Tune Hyperparams

### Grid for KNN Classifier (get the best value of K) - dictionary of hyperparameters and values for tuning

In [13]:
# Hyperparameter grid shared by the KNN pipelines (A and B).
# The 'model__' prefix routes each setting to the pipeline step named 'model'.
param_grid_knn = {
    # Number of nearest neighbours: the odd values 3, 5, ..., 21
    'model__n_neighbors': list(range(3, 22, 2)),
    # Weighting scheme: "uniform" (equal influence) vs "distance" (closer points matter more)
    'model__weights': ['uniform', 'distance'],
    # Distance metric: 1 = Manhattan (sum of absolute differences), 2 = Euclidean (straight-line distance)
    'model__p': [1, 2],
}

#### Pipeline A

In [14]:
# Pipeline A: exhaustive search over the KNN grid with 10-fold cross-validation.
# Every metric listed in `scoring` is recorded for each candidate; `refit`
# names the metric (accuracy) used to pick the final model that is refitted.
grid_knn_A = GridSearchCV(
    estimator=pipeline_A,
    param_grid=param_grid_knn,
    scoring=scoring,
    cv=10,
    n_jobs=-1,  # run the cross-validation fits in parallel on all available CPU cores
    refit="accuracy"
)
grid_knn_A.fit(X_train, Y_train)

#### Pipeline B

In [15]:
# Pipeline B: exhaustive search over the KNN grid with 10-fold cross-validation.
# Every metric listed in `scoring` is recorded for each candidate; `refit`
# names the metric (accuracy) used to pick the final model that is refitted.
grid_knn_B = GridSearchCV(
    estimator=pipeline_B,
    param_grid=param_grid_knn,
    scoring=scoring,
    cv=10,
    n_jobs=-1,  # run the cross-validation fits in parallel on all available CPU cores
    refit="accuracy"
)
grid_knn_B.fit(X_train, Y_train)

### Grid for Logistic Regression

In [16]:
# Hyperparameter grid shared by the Logistic Regression pipelines (C and D).
# The 'model__' prefix routes each setting to the pipeline step named 'model'.
param_grid_logreg = dict(
    # Regularization strength - lower (strong): simpler model, less overfitting |
    # higher (weaker): more flexibility, but may overfit
    model__C=[0.01, 0.1, 1, 10, 100],
    # Optimization solver - liblinear: small datasets | lbfgs: default / large datasets
    model__solver=['liblinear', 'lbfgs'],
)

#### Pipeline C

In [17]:
# Pipeline C: exhaustive search over the Logistic Regression grid with
# 10-fold cross-validation, recording every metric listed in `scoring`.
grid_logreg_C = GridSearchCV(
    estimator=pipeline_C,
    param_grid=param_grid_logreg,
    scoring=scoring,
    cv=10,
    n_jobs=-1,  # run the cross-validation fits in parallel on all available CPU cores
    verbose=1,  # controls how much output GridSearchCV prints during training; 1 shows basic progress
    refit='accuracy'  # with multiple metrics in `scoring`, refit decides which one selects the final best model
)

grid_logreg_C.fit(X_train, Y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [18]:
# Pipeline D: grid search over the Logistic Regression grid.
grid_logreg_D = GridSearchCV(
    pipeline_D,
    param_grid_logreg,
    scoring=scoring,   # record every metric listed in `scoring`
    refit='accuracy',  # metric that selects the final model
    cv=10,             # 10-fold cross-validation
    n_jobs=-1,         # parallelise across all available CPU cores
    verbose=1,         # print basic progress
)

grid_logreg_D.fit(X_train, Y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


# Evaluate

## Attributes of GridSearchCV
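- **best_score_**: The best mean score obtained from the cross-validation process. This score is based on the evaluation metric specified in the `scoring` parameter; when several metrics are passed (as in this notebook), it is the score for the metric named in `refit`.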

- **best_params_**: The hyperparameters that gave the best score.

- **best_estimator_**: The estimator (model) that gave the best score using the hyperparameters specified in best_params_

- **cv_results_**: A dictionary containing cross-validation results for all parameter combinations tested during the grid search. This includes a variety of evaluation metrics. Key entries in cv_results_ include (when several metrics are scored, the `_score` suffix is replaced by the metric name, e.g. `mean_test_accuracy`):
    - mean_test_score: The mean score obtained for each parameter set across the folds.
    - std_test_score: The standard deviation of the test scores for each parameter set.
    - mean_train_score: The mean training score (only present when `return_train_score=True`).
    - std_train_score: The standard deviation of the training scores (only present when `return_train_score=True`).
    - params: The parameter combinations tested.

Specific columns of cv_results_ produced by the metrics used in this notebook. Each one provides results for every combination of hyperparameters tested during the grid search:
- 'mean_test_neg_mean_squared_error'
- 'mean_test_neg_mean_absolute_error'
- 'mean_test_max_error'
- 'mean_test_accuracy'
- 'mean_test_precision'
- 'mean_test_recall'




In [19]:
import pandas as pd

# Summary table: one row per tuned pipeline, with the winning hyperparameters
# and the best cross-validated score of the refit metric (accuracy).
results_df = pd.DataFrame({
    "Pipeline": ["KNN A", "KNN B", "LogReg C", "LogReg D"],
    "Best Params": [
        grid_knn_A.best_params_,
        grid_knn_B.best_params_,
        grid_logreg_C.best_params_,
        grid_logreg_D.best_params_
    ],
    "Best Score": [
        grid_knn_A.best_score_,
        grid_knn_B.best_score_,
        grid_logreg_C.best_score_,
        grid_logreg_D.best_score_
    ],
})

# Display sorted results (best performing first). The sort is applied to the
# displayed copy only, so `results_df` keeps its original row order.
results_df.sort_values("Best Score", ascending=False)


Unnamed: 0,Pipeline,Best Params,Best Score
0,KNN A,"{'model__n_neighbors': 21, 'model__p': 1, 'mod...",0.880007
1,KNN B,"{'model__n_neighbors': 15, 'model__p': 1, 'mod...",0.875916
2,LogReg C,"{'model__C': 0.1, 'model__solver': 'liblinear'}",0.869086
3,LogReg D,"{'model__C': 1, 'model__solver': 'liblinear'}",0.859534
