In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score

In [2]:
dataset_url = "/content/breast_cancer.csv"
dataset = pd.read_csv(dataset_url)

In [3]:
dataset.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
# Distribution of y-variable
dataset.diagnosis.value_counts()

Unnamed: 0_level_0,count
diagnosis,Unnamed: 1_level_1
B,357
M,212


In [5]:
dataset.diagnosis.value_counts()/len(dataset)*100

Unnamed: 0_level_0,count
diagnosis,Unnamed: 1_level_1
B,62.741652
M,37.258348


In [6]:
dataset.shape

(569, 33)

In [7]:
dataset['diagnosis'] = dataset['diagnosis'].map({'M': 1, 'B': 0})

In [8]:
dataset.head(5)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [9]:
dataset.diagnosis.value_counts()

Unnamed: 0_level_0,count
diagnosis,Unnamed: 1_level_1
0,357
1,212


In [10]:
# X & y

X = dataset.iloc[:, [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]]
y = dataset.diagnosis.values

#### Train Test Split

In [11]:
# Train-Test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
len(X_train)

455

In [13]:
len(X_test)

114

#### Feature Scaling

In [14]:
# Standardization
# Normalization
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#### Model Building (RF)

In [15]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)

In [16]:
# Predicting the test set values

y_pred = model.predict(X_test)

In [17]:
# Metrics

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
result = accuracy_score(y_test, y_pred)
print("Accuracy of the base RF model is: ", round(result*100,2))

Accuracy of the base RF model is:  94.74


In [18]:
cm = confusion_matrix(y_test, y_pred)
print("CM is: ")
print(cm)

CM is: 
[[63  4]
 [ 2 45]]


In [19]:
cr = classification_report(y_test, y_pred)
print("Classification Report is: ")
print(cr)

Classification Report is: 
              precision    recall  f1-score   support

           0       0.97      0.94      0.95        67
           1       0.92      0.96      0.94        47

    accuracy                           0.95       114
   macro avg       0.94      0.95      0.95       114
weighted avg       0.95      0.95      0.95       114



### Manual HPO

In [20]:
n_estimators_list = [1,2,3,10,50,100,200]

for estim_list in n_estimators_list:
  model = RandomForestClassifier(n_estimators=estim_list)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  result = accuracy_score(y_test, y_pred)
  print("\n Estimator Value", estim_list)
  print("Accuracy is", result)


 Estimator Value 1
Accuracy is 0.8859649122807017

 Estimator Value 2
Accuracy is 0.8421052631578947

 Estimator Value 3
Accuracy is 0.9473684210526315

 Estimator Value 10
Accuracy is 0.9385964912280702

 Estimator Value 50
Accuracy is 0.9385964912280702

 Estimator Value 100
Accuracy is 0.9385964912280702

 Estimator Value 200
Accuracy is 0.9473684210526315


In [21]:
leaf_size = [1,2,3,4,5,10]

for i in leaf_size:
  model = RandomForestClassifier(n_estimators=10, min_samples_leaf=i)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  result = accuracy_score(y_test, y_pred)
  print("\n Estimator Value", i)
  print("Accuracy is", result)



 Estimator Value 1
Accuracy is 0.9122807017543859

 Estimator Value 2
Accuracy is 0.9473684210526315

 Estimator Value 3
Accuracy is 0.956140350877193

 Estimator Value 4
Accuracy is 0.9298245614035088

 Estimator Value 5
Accuracy is 0.9473684210526315

 Estimator Value 10
Accuracy is 0.9298245614035088


### Random Search CV

In [22]:
n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
max_depth = [int(x) for x in np.linspace(start=10, stop=110, num=11)]

print(n_estimators)
print(max_depth)

[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110]


In [23]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=10)]
max_depth = [int(x) for x in np.linspace(start=10, stop=110, num=11)]
min_samples_leaf = [1,2,4,10,20,50,100]
min_samples_split = [2,3,4,5,8,10,20,50,100,200]
bootstrap = [True, False]

# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap,
               'min_samples_leaf': min_samples_leaf}


Altogether, there are 15,400 combinations.

In [24]:
# Use the random grid to search for best hyper parameters
# First create the base model to tune

rf = RandomForestClassifier()
# Random search of parameters using 3 fold cross validations
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=100, cv=3, n_jobs=-1)

# Fit the random search model
rf_random.fit(X_train, y_train)

In [25]:
def evaluate(model, test_features, test_labels):
  predictions = model.predict(test_features)
  accuracy = accuracy_score(test_labels, predictions)
  print('Model Performance')
  print('Accuracy = {:0.2f}%.'.format(accuracy))
  return accuracy

In [26]:
base_model = RandomForestClassifier(n_estimators=5, random_state=42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model,X_test,y_test)

Model Performance
Accuracy = 0.93%.


In [27]:
best_random = rf_random.best_estimator_
print(best_random)

RandomForestClassifier(max_depth=40, min_samples_split=3, n_estimators=700)


In [28]:
random_accuracy = evaluate(best_random,X_test,y_test)

Model Performance
Accuracy = 0.95%.


In [29]:
print('Improvement of {:0.2f}%.'.format(100*(random_accuracy-base_accuracy)/base_accuracy))

Improvement of 1.89%.


### Grid Search CV

In [30]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [75, 80, 85, 90, 95, 100],
    'max_features': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [200, 250, 270, 300, 350, 400, 450, 500]
}
# Create a based model
rf_gd = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf_gd, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)

In [31]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


{'bootstrap': True,
 'max_depth': 80,
 'max_features': 3,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 350}

In [32]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

Model Performance
Accuracy = 0.94%.


In [33]:
print('Improvement of {:0.2f}%.'.format(100*(grid_accuracy-base_accuracy)/base_accuracy))

Improvement of 0.94%.


In [34]:
best_grid