In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv(r"C:\Users\22213\OneDrive\Desktop\csv\heart_disease_data.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [5]:
# distributionn of target variables
df['target'].value_counts()

target
1    165
0    138
Name: count, dtype: int64

1 --> defective heart

0 --> healthy heart

In [6]:
x = df.drop(columns=['target'], axis=1)
y = df['target']

In [7]:
x = np.asarray(x)
y = np.asarray(y)

# Model selection

**1. Comparing the models with default hyper-parameter values using cross validation**

In [8]:
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier(random_state=0)]

In [9]:
def compare_models_cross_validation():
    for model in models:
        cv_score = cross_val_score(model, x, y, cv=5)
        mean_accuracy = 100*(round(cv_score.mean(), 4))
        
        print(f'Cross validation mean accuracy of {model} = {mean_accuracy}')
        print('----------------------------------------------------------------------------')

In [10]:
compare_models_cross_validation()

Cross validation mean accuracy of LogisticRegression(max_iter=1000) = 82.83
----------------------------------------------------------------------------
Cross validation mean accuracy of SVC(kernel='linear') = 82.83
----------------------------------------------------------------------------
Cross validation mean accuracy of KNeighborsClassifier() = 64.39
----------------------------------------------------------------------------
Cross validation mean accuracy of RandomForestClassifier(random_state=0) = 83.81
----------------------------------------------------------------------------


**2. Comparing the models with different hyper-parameter values using grid search**

In [23]:
models_list = [LogisticRegression(max_iter=10000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier(random_state=0)]

In [24]:
# hyper-parameters for the models
model_hyperparameters = {
    'lr':{
        'C':[1,5,10,20]
    },
    'svc':{
        'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
        'C':[1,5,10,20]
    },
    'knc':{
        'n_neighbors':[3,5,10]
    },
    'rfc':{
        'n_estimators':[10,20,50,100]
    }
}

In [32]:
model_keys = list(model_hyperparameters.keys())

In [33]:
print(model_keys)
print(models_list)

['lr', 'svc', 'knc', 'rfc']
[LogisticRegression(max_iter=10000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier(random_state=0)]


***applying the grid search***

In [34]:
def ModelSelection(list_of_models, hyperparameter_dict, mk):
    result = []
    i = 0
    for model in list_of_models:
        key = model_keys[i]
        param = hyperparameter_dict[key]
        i+=1
        print(model)
        print(param)
        print('----------------------------------------------------')

        classifier = GridSearchCV(model, param, cv=5)

        # fitting the data to classifier
        classifier.fit(x, y)

        result.append({
            'model':model,
            'best_hyperparameter':classifier.best_params_,
            'accuracy_sccore':classifier.best_score_
        })
    result_dataframe = pd.DataFrame(result)
    return result_dataframe

In [35]:
ModelSelection(list_of_models=models, hyperparameter_dict=model_hyperparameters, mk=model_keys)

LogisticRegression(max_iter=1000)
{'C': [1, 5, 10, 20]}
----------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


SVC(kernel='linear')
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}
----------------------------------------------------
KNeighborsClassifier()
{'n_neighbors': [3, 5, 10]}
----------------------------------------------------
RandomForestClassifier(random_state=0)
{'n_estimators': [10, 20, 50, 100]}
----------------------------------------------------


Unnamed: 0,model,best_hyperparameter,accuracy_sccore
0,LogisticRegression(max_iter=1000),{'C': 5},0.831585
1,SVC(kernel='linear'),"{'C': 1, 'kernel': 'linear'}",0.828306
2,KNeighborsClassifier(),{'n_neighbors': 5},0.64388
3,RandomForestClassifier(random_state=0),{'n_estimators': 100},0.838087
