In [1]:
# In this, how to apply 'Random Forest' in any problem, let's see through the help of the code.
# And, this is a 'Classification-problem'.

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier 
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_csv('heart_disease_uci.csv') 

In [4]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
df.shape          # means there are total 303 patients having 'heart-disease-related-issue'.

(303, 14)

In [6]:
# Extraxt 'rows' and 'columns':-

X = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [7]:
# Apply train test split operation

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [8]:
# create an object for every algo. to check where "Random_Forest" stands,and find their accuracy also to compare them.


rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svc = SVC()
lor = LogisticRegression()

In [9]:
# train the differnt model and find their Accuracy also:-

rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

accuracy_score(y_test,y_pred) 

0.8360655737704918

In [10]:
gb.fit(X_train,y_train)
y_pred = gb.predict(X_test)

accuracy_score(y_test,y_pred) 

0.7704918032786885

In [11]:
svc.fit(X_train,y_train)
y_pred = svc.predict(X_test)

accuracy_score(y_test,y_pred) 

0.7049180327868853

In [12]:
lor.fit(X_train,y_train)
y_pred = lor.predict(X_test)

accuracy_score(y_test,y_pred) 

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8852459016393442

In [13]:
# So, we can see that Accuracy of 'R.F' is approximately much better than other algorithms.

In [14]:
# Also,through 'hyperparameter tuning', we can improve our model's performnace.

rf = RandomForestClassifier(max_samples=0.75,random_state=42)    # 'max_samples' is for 'rows'  (i.e., 75% taken). 
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)

accuracy_score(y_test,y_pred)

# So, we can see our model's performance increases from previous one.

0.9016393442622951

In [15]:
# let's do the 'cross-validation' 10 times for "Random-Forest" :-

from sklearn.model_selection import cross_val_score

np.mean(cross_val_score(RandomForestClassifier(max_samples=0.75),X,y,cv=10,scoring='accuracy'))

# so,we can see that this is our approx accuracy.

0.8313978494623656

In [16]:
# In "Random_forest", there is around 25 hyperparameters,then how we will check them their's best value. 
# so,we have only one solution for this to solve this issue, i.e., 'hyperparameter tuning'.
# And,for finding hyperparameter,there are various techniques and 'GridSearchCV' is one of them.
# ON that,hm apne hyperparamter ko alg-alg value de dete hai,jinke liye hme test krna hai.

<h2> GridSearchCV </h2>

In [17]:
# Number of trees in Random_Forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# So,yha pe hrek particular combination k liye (1  R.F) train hoga.
# then, total = (4*3*3*3) = 108 combination create honge -> so, we train 108 different 'random-forest'.

In [18]:
# then,we create a dictionary named "parameter_grid" where we give 'name and value' of parameters.

param_grid = {'n_estimators':n_estimators,
              'max_features':max_features,
              'max_depth':max_depth,
              'max_samples':max_samples
             }

print(param_grid)     # prints the dictionary

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [19]:
# create an object of random_forest_classifier

rf = RandomForestClassifier()

In [20]:
# import it

from sklearn.model_selection import GridSearchCV

rf_grid = GridSearchCV(estimator=rf,            # on which algo. we are going to run it.
                       param_grid=param_grid,   # kya-kya options ke upr hme train krna hai.
                       cv = 5,                  # hrek R.F ko 5 times train krna hai (108*5)=540 times.
                       verbose = 2,             # isse hmara o/p dikhega during the process.
                       n_jobs = -1)             # it fastens the process.

In [21]:
# train the model

rf_grid.fit(X_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


0,1,2
,estimator,RandomForestClassifier()
,param_grid,"{'max_depth': [2, 8, ...], 'max_features': [0.2, 0.6, ...], 'max_samples': [0.5, 0.75, ...], 'n_estimators': [20, 60, ...]}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,60
,criterion,'gini'
,max_depth,8
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.2
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
# for finding the best parameter's values:-

rf_grid.best_params_

{'max_depth': 8, 'max_features': 0.2, 'max_samples': 1.0, 'n_estimators': 60}

In [23]:
# for finding the best score :-

rf_grid.best_score_

0.8346938775510205

<h2> RandomSearchCV</h2>

In [24]:
# also called "randomized-SearchCV'.
# so,for higher dataset,there is too many hyperparamters....... where 'GridSearchCV' becomes very slow(b/c it takes more time to train the model). 
# in 'RSCV',from 108 options,it randomly choose only 10-12 options and performs randomly according to it.

In [25]:
# so we add here some extra hyperparameters:- 

# Number of trees in random forest
n_estimators = [20,60,100,120]

# Number of features to consider at every split
max_features = [0.2,0.6,1.0]

# Maximum number of levels in tree
max_depth = [2,8,None]

# Number of samples
max_samples = [0.5,0.75,1.0]

# Bootstrap samples   
bootstrap = [True,False]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2]

In [26]:
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
              'max_samples':max_samples,
              'bootstrap':bootstrap,
              'min_samples_split':min_samples_split,
              'min_samples_leaf':min_samples_leaf
             }
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2]}


In [27]:
# Now, import RSCV :-

from sklearn.model_selection import RandomizedSearchCV

rf_grid = RandomizedSearchCV(estimator = rf, 
                       param_distributions = param_grid, 
                       cv = 5, 
                       verbose=2, 
                       n_jobs = -1)

In [28]:
rf_grid.fit(X_train,y_train)

# so,we can see that it picks only 10 candidates,and left rest all
# that's why,this process is much faster.

Fitting 5 folds for each of 10 candidates, totalling 50 fits


5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\munna\anaconda3\envs\profiling310\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\munna\anaconda3\envs\profiling310\lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\munna\anaconda3\envs\profiling310\lib\site-packages\sklearn\ensemble\_forest.py", line 430, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `

0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'bootstrap': [True, False], 'max_depth': [2, 8, ...], 'max_features': [0.2, 0.6, ...], 'max_samples': [0.5, 0.75, ...], ...}"
,n_iter,10
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,0.6
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [29]:
# for best parameters:-

rf_grid.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_samples': 0.5,
 'max_features': 0.6,
 'max_depth': 2,
 'bootstrap': True}

In [30]:
# for best score:- 

rf_grid.best_score_

0.8264455782312925

<h2> Summary :- </h2>