In [21]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [7]:
df = pd.read_csv("heart.csv")

In [8]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [9]:
df.shape

(303, 14)

In [10]:
X = df.iloc[:,0:-1]
y= df.iloc[:,-1]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=43)

In [12]:
X_train.shape

(242, 13)

In [26]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.8852459016393442


In [22]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.8688524590163934


In [27]:
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.6721311475409836


In [29]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.8852459016393442


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
rf = RandomForestClassifier(max_samples=0.2, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.9180327868852459


# GridSearchCV

In [35]:
n_estimators = [20,60,100,120,200]

max_features = [0.2,0.5,0.7,1.0]

max_depth = [2,8,None]

max_samples = [0.2,0.5,0.7,1.0]

In [36]:
parameters = {
    'n_estimators' : n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_samples': max_samples
}

In [37]:
from sklearn.model_selection import GridSearchCV

In [43]:
gs = GridSearchCV(RandomForestClassifier(), parameters, cv=10, n_jobs=-1, verbose=2)
gs.fit(X_train,y_train)
y_pred = gs.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Fitting 10 folds for each of 240 candidates, totalling 2400 fits
Accuracy:  0.8688524590163934


In [44]:
gs.best_estimator_

In [46]:
print(gs.best_index_)
print(gs.best_score_)
print(gs.best_params_)

9
0.8433333333333334
{'max_depth': 2, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 200}


In [47]:
from sklearn.model_selection import RandomizedSearchCV

In [48]:
parameters_grid = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "max_samples": max_samples,
    "bootstrap": [True, False],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [2, 1],
}

In [49]:
rf_rscv = RandomizedSearchCV(RandomForestClassifier(), param_distributions=parameters_grid, cv=5, verbose=5, n_jobs=-1)

In [50]:
rf_rscv.fit(X_train, y_train)
y_pred = rf_rscv.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Accuracy:  0.8360655737704918


15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nithi\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\nithi\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\nithi\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\ensemble\_forest.py", line 402, in fit
    raise ValueError(
ValueError: `ma

In [51]:
parameters = {
    "n_estimators": n_estimators,
    "max_features": max_features,
    "max_depth": max_depth,
    "max_samples": max_samples,
    'oob_score': [False, True]   
}

In [52]:
gs = GridSearchCV(RandomForestClassifier(), parameters, cv=10, n_jobs=-1, verbose=5)
gs.fit(X_train, y_train)
y_pred = gs.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Fitting 10 folds for each of 480 candidates, totalling 4800 fits
Accuracy:  0.8688524590163934


In [53]:
gs.best_params_


{'max_depth': None,
 'max_features': 0.5,
 'max_samples': 0.5,
 'n_estimators': 20,
 'oob_score': True}

In [54]:
gs.best_estimator_.oob_score_

0.7975206611570248