In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('mobilePrice.csv')
df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [6]:
df.isnull().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [7]:
## we are seperating depended and idepended features
X = df.drop('price_range',axis=1)
y = df['price_range']

In [10]:
## splitting to train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=2)

In [14]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

RandomForestClassifier()

In [15]:
y_pred = rf.predict(X_test)


In [17]:
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))

[[95  3  0  0]
 [ 9 93  7  0]
 [ 0  5 83  6]
 [ 0  0  7 92]]
0.9075


In [34]:
## Manual hyperparameter tuning
rf = RandomForestClassifier(criterion='entropy',n_estimators=300,max_features='sqrt',min_samples_leaf=2,random_state=100)
rf.fit(X_train,y_train)

RandomForestClassifier(criterion='entropy', max_features='sqrt',
                       min_samples_leaf=2, n_estimators=300, random_state=100)

In [35]:
y_pred = rf.predict(X_test)
accuracy_score(y_test, y_pred)

0.91

In [43]:
n_estimators = [int(i) for i in np.linspace(start=100,stop=3000,num=20)]
criterion = ['gini','entropy','log_loss']
max_depth = [int(i) for i in np.linspace(start=5,stop=1500,num=15)]
min_samples_split = [1,2,3,4,5,6]
min_samples_leaf = [i for i in range(11)]
random_state = [int(i) for i in np.linspace(1,100,3)]

random_grid = {
    'n_estimators' : n_estimators,
    'criterion': criterion,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'random_state': random_state
    
}
print(random_grid)

{'n_estimators': [100, 252, 405, 557, 710, 863, 1015, 1168, 1321, 1473, 1626, 1778, 1931, 2084, 2236, 2389, 2542, 2694, 2847, 3000], 'criterion': ['gini', 'entropy', 'log_loss'], 'max_depth': [5, 111, 218, 325, 432, 538, 645, 752, 859, 966, 1072, 1179, 1286, 1393, 1500], 'min_samples_split': [1, 2, 3, 4, 5, 6], 'min_samples_leaf': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'random_state': [1, 50, 100]}


In [44]:
rf = RandomForestClassifier()
rf_search_cv = RandomizedSearchCV(estimator=rf,param_distributions=random_grid,n_iter=100,cv=5,verbose=3,random_state=99,n_jobs=-1)
rf_search_cv.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END criterion=log_loss, max_depth=538, min_samples_leaf=4, min_samples_split=2, n_estimators=405, random_state=50.5;, score=nan total time=   0.0s
[CV 2/5] END criterion=log_loss, max_depth=538, min_samples_leaf=4, min_samples_split=2, n_estimators=405, random_state=50.5;, score=nan total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=1393, min_samples_leaf=0, min_samples_split=3, n_estimators=710, random_state=100.0;, score=nan total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=1393, min_samples_leaf=0, min_samples_split=3, n_estimators=710, random_state=100.0;, score=nan total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=1393, min_samples_leaf=0, min_samples_split=3, n_estimators=710, random_state=100.0;, score=nan total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=1393, min_samples_leaf=0, min_samples_split=3, n_estimators=710, random_state=100.0;, score=nan total time=   0.

[CV 1/5] END criterion=gini, max_depth=1179, min_samples_leaf=4, min_samples_split=1, n_estimators=1626, random_state=100.0;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=1179, min_samples_leaf=4, min_samples_split=1, n_estimators=1626, random_state=100.0;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=1179, min_samples_leaf=4, min_samples_split=1, n_estimators=1626, random_state=100.0;, score=nan total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=1179, min_samples_leaf=4, min_samples_split=1, n_estimators=1626, random_state=100.0;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=645, min_samples_leaf=7, min_samples_split=4, n_estimators=100, random_state=100.0;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=645, min_samples_leaf=7, min_samples_split=4, n_estimators=100, random_state=100.0;, score=nan total time=   0.0s
[CV 4/5] END criterion=gini, max_depth=645, min_samples_leaf=7, min_sample

[CV 5/5] END criterion=gini, max_depth=325, min_samples_leaf=0, min_samples_split=2, n_estimators=3000, random_state=1.0;, score=nan total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=752, min_samples_leaf=2, min_samples_split=2, n_estimators=1168, random_state=1.0;, score=nan total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=752, min_samples_leaf=2, min_samples_split=2, n_estimators=1168, random_state=1.0;, score=nan total time=   0.0s
[CV 3/5] END criterion=entropy, max_depth=752, min_samples_leaf=2, min_samples_split=2, n_estimators=1168, random_state=1.0;, score=nan total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=752, min_samples_leaf=2, min_samples_split=2, n_estimators=1168, random_state=1.0;, score=nan total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=752, min_samples_leaf=2, min_samples_split=2, n_estimators=1168, random_state=1.0;, score=nan total time=   0.0s
[CV 1/5] END criterion=entropy, max_depth=966, min_samples_leaf=1, min_sa

[CV 1/5] END criterion=entropy, max_depth=1286, min_samples_leaf=2, min_samples_split=6, n_estimators=2084, random_state=50.5;, score=nan total time=   0.0s
[CV 2/5] END criterion=entropy, max_depth=1286, min_samples_leaf=2, min_samples_split=6, n_estimators=2084, random_state=50.5;, score=nan total time=   0.0s
[CV 4/5] END criterion=entropy, max_depth=5, min_samples_leaf=7, min_samples_split=3, n_estimators=1778, random_state=50.5;, score=nan total time=   0.0s
[CV 5/5] END criterion=entropy, max_depth=5, min_samples_leaf=7, min_samples_split=3, n_estimators=1778, random_state=50.5;, score=nan total time=   0.0s
[CV 1/5] END criterion=gini, max_depth=966, min_samples_leaf=1, min_samples_split=5, n_estimators=1015, random_state=50.5;, score=nan total time=   0.0s
[CV 2/5] END criterion=gini, max_depth=966, min_samples_leaf=1, min_samples_split=5, n_estimators=1015, random_state=50.5;, score=nan total time=   0.0s
[CV 3/5] END criterion=gini, max_depth=966, min_samples_leaf=1, min_samp

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy',
                                                      'log_loss'],
                                        'max_depth': [5, 111, 218, 325, 432,
                                                      538, 645, 752, 859, 966,
                                                      1072, 1179, 1286, 1393,
                                                      1500],
                                        'min_samples_leaf': [0, 1, 2, 3, 4, 5,
                                                             6, 7, 8, 9, 10],
                                        'min_samples_split': [1, 2, 3, 4, 5, 6],
                                        'n_estimators': [100, 252, 405, 557,
                                                         710, 863, 1015, 1168,
                                                         1321, 1473,

In [45]:
y_pred = rf_search_cv.pred(X_test)
accuracy_score()

RandomForestClassifier()

In [47]:
rf_search_cv.best_params_

{'random_state': 1,
 'n_estimators': 1931,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_depth': 218,
 'criterion': 'gini'}

In [50]:
best_estimator = rf_search_cv.best_estimator_

In [51]:
rf_search_cv.best_estimator_

RandomForestClassifier(max_depth=218, min_samples_split=3, n_estimators=1931,
                       random_state=1)

In [52]:
y_pred = best_estimator.predict(X_test)
accuracy_score(y_test,y_pred)

0.915

[CV 4/5] END criterion=entropy, max_depth=1393, min_samples_leaf=6, min_samples_split=3, n_estimators=557, random_state=100;, score=0.831 total time=   2.7s
[CV 3/5] END criterion=log_loss, max_depth=1072, min_samples_leaf=9, min_samples_split=1, n_estimators=2542, random_state=100;, score=nan total time=   1.4s
[CV 1/5] END criterion=log_loss, max_depth=5, min_samples_leaf=1, min_samples_split=6, n_estimators=710, random_state=100;, score=nan total time=   0.4s
[CV 3/5] END criterion=log_loss, max_depth=5, min_samples_leaf=1, min_samples_split=6, n_estimators=710, random_state=100;, score=nan total time=   0.4s
[CV 5/5] END criterion=log_loss, max_depth=5, min_samples_leaf=1, min_samples_split=6, n_estimators=710, random_state=100;, score=nan total time=   0.4s
[CV 3/5] END criterion=gini, max_depth=218, min_samples_leaf=7, min_samples_split=3, n_estimators=1626, random_state=100;, score=0.866 total time=   7.1s
[CV 1/5] END criterion=log_loss, max_depth=111, min_samples_leaf=5, min_s

[CV 2/5] END criterion=entropy, max_depth=752, min_samples_leaf=2, min_samples_split=2, n_estimators=1168, random_state=1;, score=0.856 total time=   7.0s
[CV 5/5] END criterion=log_loss, max_depth=859, min_samples_leaf=0, min_samples_split=3, n_estimators=2694, random_state=1;, score=nan total time=   1.7s
[CV 1/5] END criterion=gini, max_depth=538, min_samples_leaf=6, min_samples_split=1, n_estimators=1473, random_state=1;, score=nan total time=   0.8s
[CV 4/5] END criterion=gini, max_depth=538, min_samples_leaf=6, min_samples_split=1, n_estimators=1473, random_state=1;, score=nan total time=   0.8s
[CV 2/5] END criterion=gini, max_depth=432, min_samples_leaf=4, min_samples_split=4, n_estimators=1931, random_state=1;, score=0.856 total time=   9.2s
[CV 4/5] END criterion=gini, max_depth=432, min_samples_leaf=4, min_samples_split=4, n_estimators=1931, random_state=1;, score=0.838 total time=   9.0s
[CV 3/5] END criterion=entropy, max_depth=325, min_samples_leaf=5, min_samples_split=3,

[CV 2/5] END criterion=entropy, max_depth=1179, min_samples_leaf=4, min_samples_split=5, n_estimators=100, random_state=1;, score=0.875 total time=   0.5s
[CV 5/5] END criterion=entropy, max_depth=1179, min_samples_leaf=4, min_samples_split=5, n_estimators=100, random_state=1;, score=0.847 total time=   0.6s
[CV 3/5] END criterion=gini, max_depth=752, min_samples_leaf=4, min_samples_split=3, n_estimators=2236, random_state=100;, score=0.872 total time=  11.4s
[CV 2/5] END criterion=log_loss, max_depth=645, min_samples_leaf=5, min_samples_split=2, n_estimators=1168, random_state=100;, score=nan total time=   0.7s
[CV 4/5] END criterion=log_loss, max_depth=645, min_samples_leaf=5, min_samples_split=2, n_estimators=1168, random_state=100;, score=nan total time=   0.7s
[CV 1/5] END criterion=entropy, max_depth=432, min_samples_leaf=9, min_samples_split=3, n_estimators=1626, random_state=50;, score=0.884 total time=   8.2s
[CV 4/5] END criterion=gini, max_depth=5, min_samples_leaf=7, min_sa

[CV 3/5] END criterion=entropy, max_depth=1286, min_samples_leaf=2, min_samples_split=6, n_estimators=2084, random_state=50;, score=0.891 total time=  13.5s
[CV 5/5] END criterion=gini, max_depth=111, min_samples_leaf=4, min_samples_split=3, n_estimators=3000, random_state=1;, score=0.859 total time=  14.5s
[CV 1/5] END criterion=gini, max_depth=645, min_samples_leaf=3, min_samples_split=4, n_estimators=100, random_state=50;, score=0.875 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=645, min_samples_leaf=3, min_samples_split=4, n_estimators=100, random_state=50;, score=0.822 total time=   0.6s
[CV 2/5] END criterion=gini, max_depth=966, min_samples_leaf=2, min_samples_split=1, n_estimators=252, random_state=1;, score=nan total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=966, min_samples_leaf=2, min_samples_split=1, n_estimators=252, random_state=1;, score=nan total time=   0.2s
[CV 2/5] END criterion=gini, max_depth=111, min_samples_leaf=0, min_samples_split=5, n_es

In [54]:
param_grid = {
    'criterion': [rf_search_cv.best_params_['criterion']],
    'max_depth': [rf_search_cv.best_params_['max_depth']],
    'min_samples_leaf': [rf_search_cv.best_params_['min_samples_leaf'], 
                         rf_search_cv.best_params_['min_samples_leaf']+2, 
                         rf_search_cv.best_params_['min_samples_leaf'] + 4],
    'min_samples_split': [rf_search_cv.best_params_['min_samples_split'] - 2,
                          rf_search_cv.best_params_['min_samples_split'] - 1,
                          rf_search_cv.best_params_['min_samples_split'], 
                          rf_search_cv.best_params_['min_samples_split'] +1,
                          rf_search_cv.best_params_['min_samples_split'] + 2],
    'n_estimators': [rf_search_cv.best_params_['n_estimators'] - 200, rf_search_cv.best_params_['n_estimators'] - 100, 
                     rf_search_cv.best_params_['n_estimators'], 
                     rf_search_cv.best_params_['n_estimators'] + 100, rf_search_cv.best_params_['n_estimators'] + 200]
}

print(param_grid)

{'criterion': ['gini'], 'max_depth': [218], 'min_samples_leaf': [1, 3, 5], 'min_samples_split': [1, 2, 3, 4, 5], 'n_estimators': [1731, 1831, 1931, 2031, 2131]}


In [None]:
rf = RandomForestClassifier()
rf_Grid_cv = GridSearchCV(estimator=rf, param_grid=param_grid,cv=15,n_jobs=-1,verbose=2)
rf_Grid_cv.fit(X_train,y_train)

Fitting 15 folds for each of 75 candidates, totalling 1125 fits
