# Random Forest Classification with RandomizedSearchCV

In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [51]:
data=pd.read_csv("advertising.csv")
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [52]:
X=data.iloc[:,[1,2,3]]
Y=data.iloc[:,4]

In [53]:
X.head()

Unnamed: 0,Gender,Age,EstimatedSalary
0,Male,19,19000
1,Male,35,20000
2,Female,26,43000
3,Female,27,57000
4,Male,19,76000


In [54]:
Y.head()

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

In [55]:
gender=pd.get_dummies(X['Gender'],drop_first=True)

In [56]:
gender.head()

Unnamed: 0,Male
0,1
1,1
2,0
3,0
4,1


In [57]:
X=X.drop(['Gender'],axis=1)

In [58]:
X=pd.concat([X,gender],axis=1)

In [59]:
X.head()

Unnamed: 0,Age,EstimatedSalary,Male
0,19,19000,1
1,35,20000,1
2,26,43000,0
3,27,57000,0
4,19,76000,1


In [60]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size = 0.2,random_state=10)


In [61]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

In [62]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier(n_estimators=10,criterion='gini',random_state=40)
classifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=40, verbose=0,
                       warm_start=False)

In [63]:
y_pred=classifier.predict(x_test)

In [64]:
from sklearn.metrics import confusion_matrix
conmat=confusion_matrix(y_test,y_pred)

In [65]:
conmat

array([[47,  5],
       [ 0, 28]])

In [66]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred)

In [67]:
accuracy

0.9375

In [68]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


In [69]:
est = RandomForestClassifier(n_jobs=-1)
rf_p_dist={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200,300,400,500],
              'max_features':randint(1,3),
               'criterion':['gini','entropy'],
               'bootstrap':[True,False],
               'min_samples_leaf':randint(1,4),
              }

In [70]:
def hypertuning_rscv(est, p_distr, nbr_iter,X,Y):
    rdmsearch = RandomizedSearchCV(est, param_distributions=p_distr,
                                  n_jobs=-1, n_iter=nbr_iter, cv=9)
    #CV = Cross-Validation ( here using Stratified KFold CV)
    rdmsearch.fit(X,Y)
    ht_params = rdmsearch.best_params_
    ht_score = rdmsearch.best_score_
    return ht_params, ht_score

In [71]:
rf_parameters, rf_ht_score = hypertuning_rscv(est, rf_p_dist, 40, X, Y)
claasifier=RandomForestClassifier(n_jobs=-1, n_estimators=300,bootstrap= True,criterion='entropy',max_depth=3,max_features=2,min_samples_leaf= 3)


In [72]:
# Predicting the Test set results
y_pred = classifier.predict(x_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix,accuracy_score
cm = confusion_matrix(y_test, y_pred)

accuracy_score=accuracy_score(y_test,y_pred)

#claasifier=RandomForestClassifier(n_jobs=-1, n_estimators=300,bootstrap= True,criterion='entropy',max_depth=3,max_features=2,min_samples_leaf= 3)



In [73]:
accuracy_score

0.9375

In [74]:
## Cross Validation good for selecting models
from sklearn.model_selection import cross_val_score


cross_val=cross_val_score(claasifier,X,Y,cv=10,scoring='accuracy').mean()


In [75]:
cross_val

0.89