# Load dataset

In [1]:
import pandas as pd
df = pd.read_csv('Smoking Data.csv')

In [2]:
df.head()

Unnamed: 0,region,age,sex,bmi,children,smoker
0,southwest,19,female,27.9,0,yes
1,southeast,18,male,33.77,1,no
2,southeast,28,male,33.0,3,no
3,northwest,33,male,22.705,0,no
4,northwest,32,male,28.88,0,no


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   region    1338 non-null   object 
 1   age       1338 non-null   int64  
 2   sex       1338 non-null   object 
 3   bmi       1338 non-null   float64
 4   children  1338 non-null   int64  
 5   smoker    1338 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 62.8+ KB


# String to number using LebelEncoder

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
le = LabelEncoder()
df.region = le.fit_transform(df['region'])
df.sex = le.fit_transform(df['sex'])
df.smoker = le.fit_transform(df['smoker'])

In [6]:
df.head()

Unnamed: 0,region,age,sex,bmi,children,smoker
0,3,19,0,27.9,0,1
1,2,18,1,33.77,1,0
2,2,28,1,33.0,3,0
3,1,33,1,22.705,0,0
4,1,32,1,28.88,0,0


In [7]:
x = df.drop('smoker', axis=1)

In [8]:
y = df['smoker']

# train_test_split

In [9]:
from sklearn.model_selection import train_test_split as tts
xtrain, xtest, ytrain, ytest = tts(x, y, test_size=0.30, random_state= 42)

In [10]:
xtrain.shape

(936, 5)

In [11]:
xtest.shape

(402, 5)

In [12]:
ytrain.shape

(936,)

In [13]:
ytest.shape

(402,)

In [14]:
xtest

Unnamed: 0,region,age,sex,bmi,children
764,0,45,0,25.175,2
887,1,36,0,30.020,0
890,1,64,0,26.885,0
1293,1,46,1,25.745,3
259,1,19,1,31.920,0
...,...,...,...,...,...
701,0,50,0,44.745,0
672,2,36,1,29.700,0
1163,0,18,0,28.215,0
1103,2,58,1,36.080,0


In [15]:
ytest

764     0
887     0
890     1
1293    0
259     1
       ..
701     0
672     0
1163    0
1103    0
1295    0
Name: smoker, Length: 402, dtype: int32

# Applying the RandomForestClassifier model

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
clf = RandomForestClassifier()

In [18]:
clf.fit(xtrain, ytrain)

RandomForestClassifier()

In [19]:
clf.score(xtest, ytest)

0.7786069651741293

# Hyper Parameter Tuning

In [20]:
import numpy as np
trees = np.random.randint(25,200, 25)
trees

array([ 35, 126, 114,  75,  83,  82, 176, 139, 105,  69, 117,  33, 172,
       127, 198,  44,  97, 129, 103,  39, 153, 121, 164,  87,  37])

# RF_GridSearchCV

In [21]:
criterion = ['gini', 'entropy','log_loss']
max_depth = np.random.randint(1,8,25)
min_samples_split = [2,3,4]
max_features = ['auto', 'sqrt', 'log2']

In [22]:
ids = {
'n_estimators': trees,
'criterion' : criterion,
'min_samples_split': min_samples_split,
'max_depth': max_depth,
'max_features': max_features
    
}

In [23]:
print(ids)

{'n_estimators': array([ 35, 126, 114,  75,  83,  82, 176, 139, 105,  69, 117,  33, 172,
       127, 198,  44,  97, 129, 103,  39, 153, 121, 164,  87,  37]), 'criterion': ['gini', 'entropy', 'log_loss'], 'min_samples_split': [2, 3, 4], 'max_depth': array([5, 2, 6, 7, 6, 4, 3, 6, 6, 2, 1, 1, 5, 5, 2, 6, 6, 3, 5, 7, 2, 7,
       2, 6, 7]), 'max_features': ['auto', 'sqrt', 'log2']}


# GridSearchCV

In [24]:
from sklearn.model_selection import GridSearchCV

In [25]:
rgcv = GridSearchCV(clf, ids, n_jobs=5, cv=3)

In [26]:
rgcv.fit(xtrain, ytrain)



GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=5,
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': array([5, 2, 6, 7, 6, 4, 3, 6, 6, 2, 1, 1, 5, 5, 2, 6, 6, 3, 5, 7, 2, 7,
       2, 6, 7]),
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_split': [2, 3, 4],
                         'n_estimators': array([ 35, 126, 114,  75,  83,  82, 176, 139, 105,  69, 117,  33, 172,
       127, 198,  44,  97, 129, 103,  39, 153, 121, 164,  87,  37])})

In [27]:
rgcv.cv_results_

{'mean_fit_time': array([0.06769252, 0.23432088, 0.18224835, ..., 0.0833145 , 0.04165729,
        0.01562103]),
 'std_fit_time': array([0.00736423, 0.02209203, 0.0194824 , ..., 0.01472785, 0.00736412,
        0.01275416]),
 'mean_score_time': array([0.00520722, 0.01562015, 0.01041436, ..., 0.        , 0.        ,
        0.        ]),
 'std_score_time': array([7.36412216e-03, 1.80877156e-06, 7.36406597e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', ..., 'log_loss', 'log_loss',
                    'log_loss'],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[5, 5, 5, ..., 7, 7, 7],
              mask=[False, False, False, ..., False, False, False],
        fill_value='?',
             dtype=object),
 'param_max_features': masked_array(data=['auto', 'auto', 'auto', ..., 'log2', 'log2', 'log2'],
  

In [28]:
cv_results = pd.DataFrame(rgcv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.067693,0.007364,0.005207,0.007364,gini,5,auto,2,35,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.791667,0.782051,0.791667,0.788462,0.004533,7321
1,0.234321,0.022092,0.015620,0.000002,gini,5,auto,2,126,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.791667,0.788462,0.791667,0.790598,0.001511,4238
2,0.182248,0.019482,0.010414,0.007364,gini,5,auto,2,114,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.791667,0.785256,0.791667,0.789530,0.003022,5294
3,0.124970,0.012755,0.010414,0.007364,gini,5,auto,2,75,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.791667,0.782051,0.791667,0.788462,0.004533,7321
4,0.130178,0.007364,0.010414,0.007364,gini,5,auto,2,83,"{'criterion': 'gini', 'max_depth': 5, 'max_fea...",0.791667,0.788462,0.791667,0.790598,0.001511,4238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16870,0.072899,0.019482,0.000000,0.000000,log_loss,7,log2,4,153,"{'criterion': 'log_loss', 'max_depth': 7, 'max...",,,,,,13121
16871,0.046864,0.000002,0.000000,0.000000,log_loss,7,log2,4,121,"{'criterion': 'log_loss', 'max_depth': 7, 'max...",,,,,,13120
16872,0.083314,0.014728,0.000000,0.000000,log_loss,7,log2,4,164,"{'criterion': 'log_loss', 'max_depth': 7, 'max...",,,,,,13119
16873,0.041657,0.007364,0.000000,0.000000,log_loss,7,log2,4,87,"{'criterion': 'log_loss', 'max_depth': 7, 'max...",,,,,,15004


In [29]:
rgcv.best_score_

0.7927350427350427

In [30]:
rgcv.best_params_

{'criterion': 'entropy',
 'max_depth': 5,
 'max_features': 'log2',
 'min_samples_split': 3,
 'n_estimators': 35}

In [31]:
clf2 = RandomForestClassifier(criterion = 'gini',
 max_depth = 4,
 max_features = 'log2',
 min_samples_split = 3,
 n_estimators = 35)

In [32]:
print(clf2)

RandomForestClassifier(max_depth=4, max_features='log2', min_samples_split=3,
                       n_estimators=35)


In [33]:
rgcv.best_estimator_

RandomForestClassifier(criterion='entropy', max_depth=5, max_features='log2',
                       min_samples_split=3, n_estimators=35)

In [34]:
clf2.fit(xtrain, ytrain)

RandomForestClassifier(max_depth=4, max_features='log2', min_samples_split=3,
                       n_estimators=35)

# Accuracy 

In [35]:
clf2.score(xtest, ytest)

0.8034825870646766