# Import Library

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [4]:
import os

# Import Dataset

In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("johnsmith88/heart-disease-dataset")

print("Path to dataset files:", path)
for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Path to dataset files: /kaggle/input/heart-disease-dataset
/kaggle/input/heart-disease-dataset/heart.csv


In [6]:
df = pd.read_csv(path+'/heart.csv')

In [18]:
df.shape

(1025, 14)

In [7]:
df.sample()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
294,56,0,0,200,288,1,0,133,1,4.0,0,2,3,0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [25]:
df.isnull().sum().sum()

np.int64(0)

# Train Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.2, random_state=42)

# Train Various Model with Default

In [13]:
lr = LogisticRegression(max_iter=5000)
gb = GradientBoostingClassifier()
rf = RandomForestClassifier()
svc = SVC()

In [14]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

0.7951219512195122

In [15]:
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
accuracy_score(y_test, y_pred)

0.9317073170731708

In [16]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9853658536585366

In [17]:
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.6829268292682927

# Grid Search CV

> Python
```python
class sklearn.model_selection.GridSearchCV(
  estimator, param_grid, *, scoring=None,
   n_jobs=None, refit=True, cv=None,
   verbose=0, pre_dispatch='2*n_jobs', error_score=nan,
   return_train_score=False
)


```

In [19]:
params = {
    'n_estimators': [50, 200, 500],
    'max_depth': [3, 8, None],
    'max_features': ['auto', 'log2'],
    'bootstrap': [True, False],
    'max_samples' : [0.5, 0.75, 1.0]
}

In [20]:
rf = RandomForestClassifier()

In [21]:
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv=5,
    n_jobs=-1
)

In [None]:
%%time

grid.fit(X_train, y_train)

In [26]:
grid.best_params_

{'bootstrap': True,
 'max_depth': None,
 'max_features': 'log2',
 'max_samples': 0.75,
 'n_estimators': 500}

In [29]:
grid.best_score_

np.float64(0.9817073170731707)

In [30]:
grid.best_estimator_

In [27]:
dtx = RandomForestClassifier(
    bootstrap=True, max_depth=None,
    max_features='log2',
    max_samples=0.75,
    n_estimators=500
    )

In [28]:
dtx.fit(X_train, y_train)
y_pred = dtx.predict(X_test)
accuracy_score(y_test, y_pred)

0.9853658536585366

# RandomizedSearchCV

>Python

```python

class sklearn.model_selection.RandomizedSearchCV(
  estimator, param_distributions, *, n_iter=10,
   scoring=None, n_jobs=None, refit=True,
   cv=None, verbose=0, pre_dispatch='2*n_jobs',
    random_state=None, error_score=nan, return_train_score=False
)


```

In [31]:
params2 = {
    'n_estimators': [50, 200, 500],
    'max_depth': [3, 8, None],
    'max_features': ['auto', 'log2', 0.25],
    'bootstrap': [True, False],
    'max_samples' : [0.5, 0.75, 1.0],
    'min_samples_split' : [2, 5]
}

In [32]:
rscv = RandomizedSearchCV(
    estimator = rf,
    param_distributions = params2,
    cv = 10,

    verbose = 2,
    n_jobs = -1,
    n_iter = 10
)

In [34]:
%%time
rscv.fit(X_train, y_train)
y_pred = rscv.predict(X_test)
accuracy_score(y_test, y_pred)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


60 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/ensemble/_forest.py", line 431, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample

CPU times: user 1.31 s, sys: 38.9 ms, total: 1.35 s
Wall time: 31.9 s


0.9853658536585366

In [35]:
rscv.best_estimator_

In [36]:
rscv.best_score_

np.float64(0.9780487804878047)

In [37]:
rscv.best_params_

{'n_estimators': 500,
 'min_samples_split': 5,
 'max_samples': 0.75,
 'max_features': 'log2',
 'max_depth': None,
 'bootstrap': True}