# Importing packages 

In [17]:
# Data wrangling 
import pandas as pd 

# Array math
import numpy as np 

# Ploting 
import seaborn as sns
import matplotlib.pyplot as plt

# List iteration tracking
from tqdm import tqdm

# Importing the custom written class 
from DecisionTree import Node 

# Importing the custom regression tree 
from RandomForest import RandomForestClassifier, RandomForestTree

# Time tracking
import time

# Precision metrics 
from sklearn.metrics import precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

# Reading data 

The data regards telecom churn. 

The objective is to create a model that predicts whether a customer will quit using the features available.

In [18]:
d = pd.read_csv('data.csv')

In [19]:
print(f"Data shape: {d.shape}")

Data shape: (569, 33)


In [20]:
d.isna().sum() # finds the sum of colomuns with missing val
d=d.dropna(axis=1) # drops the last col with empty val
d.shape

(569, 32)

In [21]:
d.head(-10)

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.990,10.38,122.80,1001.0,0.11840,0.27760,0.300100,0.14710,...,25.38,17.33,184.60,2019.0,0.1622,0.66560,0.71190,0.26540,0.4601,0.11890
1,842517,M,20.570,17.77,132.90,1326.0,0.08474,0.07864,0.086900,0.07017,...,24.99,23.41,158.80,1956.0,0.1238,0.18660,0.24160,0.18600,0.2750,0.08902
2,84300903,M,19.690,21.25,130.00,1203.0,0.10960,0.15990,0.197400,0.12790,...,23.57,25.53,152.50,1709.0,0.1444,0.42450,0.45040,0.24300,0.3613,0.08758
3,84348301,M,11.420,20.38,77.58,386.1,0.14250,0.28390,0.241400,0.10520,...,14.91,26.50,98.87,567.7,0.2098,0.86630,0.68690,0.25750,0.6638,0.17300
4,84358402,M,20.290,14.34,135.10,1297.0,0.10030,0.13280,0.198000,0.10430,...,22.54,16.67,152.20,1575.0,0.1374,0.20500,0.40000,0.16250,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
554,924632,B,12.880,28.92,82.50,514.3,0.08123,0.05824,0.061950,0.02343,...,13.89,35.74,88.84,595.7,0.1227,0.16200,0.24390,0.06493,0.2372,0.07242
555,924934,B,10.290,27.61,65.67,321.4,0.09030,0.07658,0.059990,0.02738,...,10.84,34.91,69.57,357.6,0.1384,0.17100,0.20000,0.09127,0.2226,0.08283
556,924964,B,10.160,19.59,64.73,311.7,0.10030,0.07504,0.005025,0.01116,...,10.65,22.88,67.88,347.3,0.1265,0.12000,0.01005,0.02232,0.2262,0.06742
557,925236,B,9.423,27.88,59.26,271.3,0.08123,0.04971,0.000000,0.00000,...,10.49,34.24,66.50,330.6,0.1073,0.07158,0.00000,0.00000,0.2475,0.06969


In [22]:
labelencoder_Y= LabelEncoder()
d.iloc[:,1]=labelencoder_Y.fit_transform(d.iloc[:,1].values)
d.iloc[:,1]

0      1
1      1
2      1
3      1
4      1
      ..
564    1
565    1
566    1
567    1
568    0
Name: diagnosis, Length: 569, dtype: int32

In [23]:
d.dtypes

id                           int64
diagnosis                    int32
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst     

In [24]:
# Distribution of target in data 
d.groupby('diagnosis').size()

diagnosis
0    357
1    212
dtype: int64

# Random forest - quick theory review

The classifier which will be created is a random forest classifier. 

Lets denote it as **rf()**.  

Given a set of input matrix $\mathbb{X}_{nxp}$ the classifier **rf()** outputs either 1 or 0.

$$rf: \mathbb{X} \rightarrow \{1, 0\}$$

The algorithm of the random forest grows **k** decision trees. 

The final prediction of the **rf()** classifier is a majority vote: the input matrix $\mathbb{X}$ is used with each of the **k** trees, and then the class with the most outputs wins. 

In the notebook about decision trees it is clear that with the same input and the same hyperparameters, the same output and the same rules will be learnt by a decision tree. So why grow **k** of them? 

## Data bootstrapping

The random in the random forest starts at the data sample creation for each of the decision trees. The technique used in creating **k** datasamples is bootstrapping

Given a dataset of n rows and p features: we sample the rows from the original dataset with replacement. For every new decision tree *i*, a new bootsrapped dataset is created: $\mathbb{X_{b}^{i}}$.

For example, lets assume that the whole dataset has 10 rows of data:

In [25]:
# Lets imagine this the whole dataset
dsubset = d.sample(10).copy()[['id',
'diagnosis',
'radius_mean',
'texture_mean',
'perimeter_mean',
'area_mean',
'smoothness_mean',
'compactness_mean',
'concavity_mean',
'concave points_mean',
'symmetry_mean',
'fractal_dimension_mean',
'radius_se',
'texture_se',
'perimeter_se',
'area_se',
'smoothness_se',
'compactness_se',
'concavity_se',
'concave points_se',
'symmetry_se',
'fractal_dimension_se',
'radius_worst',
'texture_worst',
'perimeter_worst',
'area_worst',
'smoothness_worst',
'compactness_worst',
'concavity_worst',
'concave points_worst',
'symmetry_worst',
'fractal_dimension_worst']]
dsubset.reset_index(inplace=True, drop=True)

print(dsubset)

          id  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0  901034301          0        9.436         18.32           59.82      278.6   
1      86208          1       20.260         23.03          132.40     1264.0   
2   88203002          0       11.220         33.81           70.79      386.8   
3     911150          0       14.530         19.34           94.25      659.7   
4   91903902          0       13.680         16.33           87.76      575.5   
5   91903901          0       11.670         20.02           75.21      416.2   
6      87127          0       10.800          9.71           68.77      357.6   
7     892399          0       10.510         23.09           66.85      334.2   
8    8610404          1       16.070         19.65          104.10      817.7   
9    8911670          1       18.810         19.98          120.90     1102.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.10090           0.

To create 3 more random bootsrapped samples we use the pandas function **sample(replace=True)**. The key concept is that the sampling is done *with replacement*: the same rows might appear several times in our sample. 

In [26]:
for i, _ in enumerate(range(3)):
    print("----- \n")
    print(f"Boostrapped sample: {i + 1} \n")
    print(dsubset.sample(frac=1.0, replace=True))
    print("----- \n")

----- 

Boostrapped sample: 1 

          id  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
2   88203002          0       11.220         33.81           70.79      386.8   
8    8610404          1       16.070         19.65          104.10      817.7   
3     911150          0       14.530         19.34           94.25      659.7   
4   91903902          0       13.680         16.33           87.76      575.5   
3     911150          0       14.530         19.34           94.25      659.7   
0  901034301          0        9.436         18.32           59.82      278.6   
5   91903901          0       11.670         20.02           75.21      416.2   
1      86208          1       20.260         23.03          132.40     1264.0   
5   91903901          0       11.670         20.02           75.21      416.2   
9    8911670          1       18.810         19.98          120.90     1102.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \

For each of the **k** trees grown in random forest, we create **k** bootstrapped data samples. 

## Feature selection at each split 

Now that we have a dataset $\mathbb{X_{b}^{i}}$ for each of the **k** trees the final part is to determine the splitting criterion for the creation of the nodes. 

In the classification case, the gini gain criterion is the same as in the simple decision tree case. The difference is that at each node splitting, a random subsample of collumns are select to find the "best split". 

For example, if we have 10 collumns as features and we select the hyperparameter of **X_features_fraction = 0.8** then at each node where the best split is beeing calculated, we would select 8 random features (10 * 0.8 = 8).  

# Features to use 

The bellow feature list will be used in the creation of the random forest. 

In [11]:
# Defining the feature list used in the growth of the tree
features = [
   'id',
'radius_mean',
'texture_mean',
'perimeter_mean',
'area_mean',
'smoothness_mean',
'compactness_mean',
'concavity_mean',
'concave points_mean',
'symmetry_mean',
'fractal_dimension_mean',
'radius_se',
'texture_se',
'perimeter_se',
'area_se',
'smoothness_se',
'compactness_se',
'concavity_se',
'concave points_se',
'symmetry_se',
'fractal_dimension_se',
'radius_worst',
'texture_worst',
'perimeter_worst',
'area_worst',
'smoothness_worst',
'compactness_worst',
'concavity_worst',
'concave points_worst',
'symmetry_worst',
'fractal_dimension_worst'
]

In [12]:
d[features + ['diagnosis']].sample(10)

Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
50,857343,11.76,21.6,74.72,427.9,0.08637,0.04966,0.01657,0.01115,0.1495,...,25.72,82.98,516.5,0.1085,0.08615,0.05523,0.03715,0.2433,0.06563,0
373,901288,20.64,17.35,134.8,1335.0,0.09446,0.1076,0.1527,0.08941,0.1571,...,23.17,166.8,1946.0,0.1562,0.3055,0.4159,0.2112,0.2689,0.07055,1
542,921644,14.74,25.42,94.7,668.6,0.08275,0.07214,0.04105,0.03027,0.184,...,32.29,107.4,826.4,0.106,0.1376,0.1611,0.1095,0.2722,0.06956,0
268,8910506,12.87,16.21,82.38,512.2,0.09425,0.06219,0.039,0.01615,0.201,...,23.64,89.27,597.5,0.1256,0.1808,0.1992,0.0578,0.3604,0.07062,0
64,85922302,12.68,23.84,82.69,499.0,0.1122,0.1262,0.1128,0.06873,0.1905,...,33.47,111.8,888.3,0.1851,0.4061,0.4024,0.1716,0.3383,0.1031,1
147,86973701,14.95,18.77,97.84,689.5,0.08138,0.1167,0.0905,0.03562,0.1744,...,25.47,107.1,809.7,0.0997,0.2521,0.25,0.08405,0.2852,0.09218,0
485,913063,12.45,16.41,82.85,476.7,0.09514,0.1511,0.1544,0.04846,0.2082,...,21.03,97.82,580.6,0.1175,0.4061,0.4896,0.1342,0.3231,0.1034,0
267,8910499,13.59,21.84,87.16,561.0,0.07956,0.08259,0.04072,0.02142,0.1635,...,30.04,97.66,661.5,0.1005,0.173,0.1453,0.06189,0.2446,0.07024,0
88,861597,12.36,21.8,79.78,466.1,0.08772,0.09445,0.06015,0.03745,0.193,...,30.5,91.46,574.7,0.1304,0.2463,0.2434,0.1205,0.2972,0.09261,0
135,868202,12.77,22.47,81.72,506.3,0.09055,0.05761,0.04711,0.02704,0.1585,...,33.37,92.04,653.6,0.1419,0.1523,0.2177,0.09331,0.2829,0.08067,1


# Creating the train and test sets 

In [13]:
# Fraction of rows in the training set 
train_share = 0.75

# Creating the train and test sets
train = d.sample(frac=train_share)
test = d[~d.index.isin(train.index)].copy()

print(f"Total rows in the dataset: {d.shape[0]}")
print(f"Rows in training set: {train.shape[0]}")
print(f"Rows in test set: {test.shape[0]}")

Total rows in the dataset: 569
Rows in training set: 427
Rows in test set: 142


# Training the random forest 

In [14]:
# Initiating the random forest object 
rf = RandomForestClassifier(
    Y=train['diagnosis'], 
    X=train[features],
    min_samples_split=5,
    max_depth=3,
    n_trees=30, # Number of trees grown
    X_features_fraction=0.75
    )

# Growing the random forest 
rf.grow_random_forest()

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [10:18<00:00, 20.62s/it]


In [15]:
# Printing out the trees
if rf.n_trees < 10:
    rf.print_trees()

# Predictions

In [16]:
yhat = rf.predict(test[features])
test['yhat'] = yhat

print(f"Total target in test set: {test['diagnosis'].sum()}")
print(f"Total predicted target in test set: {test['yhat'].sum()}")

print(f"Precision: {round(precision_score(test['diagnosis'], test['yhat']), 2) * 100} %")
print(f"Recall: {round(recall_score(test['diagnosis'], test['yhat']), 2) * 100} %")

Total target in test set: 49
Total predicted target in test set: 47
Precision: 96.0 %
Recall: 92.0 %
