# Random Forest

Consists of lots of different trees and it's randomness means that we add some randomness in to the equations, when creating trees.

We start by creating given number of trees and then we get their votes (of what class label should be for this data point) and determine majority vote.

We sample dataset into subsets that are randomly created.

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier # just for testing
from sklearn.decomposition import PCA
from evaluation import *
from feature_scaler import *

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data Pre-processing

In [4]:
df = pd.read_csv('./data/breast-cancer.csv')

encoder = LabelEncoder()
df['diagnosis'] = encoder.fit_transform(df['diagnosis'])
df = df.drop(columns=["id"], axis=1)

In [5]:
# Data split and scaling
X = df.drop(columns=["diagnosis"])
y = df['diagnosis']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_ss, X_test_ss = standard_scaler(X_train, X_test) # Standard Scaler
X_train_mms, X_test_mms = min_max_scaler(X_train, X_test) # Min-Max Scaler

### Removing features when the correlation between them exceeds a certain threshold

In [6]:
X_corr = X.corr(numeric_only=True).round(3)
correlation_threshold = 0.9
upper_matrix = X_corr.where(np.triu(np.ones(X_corr.shape), k=1).astype(bool))
features_to_drop = [x for x in upper_matrix.columns if any(upper_matrix[x] > correlation_threshold)]
X_corr = X.drop(X[features_to_drop], axis=1)

X_corr_train, X_corr_test = train_test_split(X_corr, test_size=0.2, random_state=42)
X_corr_train_ss, X_corr_test_ss = standard_scaler(X_corr_train, X_corr_test) # Standard Scaler
X_corr_train_mms, X_corr_test_mms = min_max_scaler(X_corr_train, X_corr_test) # Min-Max Scaler

Steps:
- Training:
    - Get a subset of the dataset randomly
    - Create a decision tree with random subset
    - Repeat as many trees there are
- Testing:
    - With given data point get the prediction from each tree
    - Hold a majority vote (Classification)

## Implementation from Scratch

In [55]:
class RandomForestClassifierScratch:
    def __init__(self, n_estimators=10, max_depth=5, min_samples_split=2, max_features=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []

    def _get_sample(self, X, y):
        n_samples = len(X)
        i = np.random.choice(n_samples, n_samples, replace=True)
        return X[i], y[i]

    def fit(self, X, y):
        for _ in range(self.n_estimators):
            tree = DecisionTreeClassifier(max_depth=self.max_depth,
                                          max_features=self.max_features)
            X_sample, y_sample = self._get_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def _majority_vote(self, votes):
        return mode(votes)

    def predict(self, X):
        tree_preds = [tree.predict(X) for tree in self.trees]
        return [self._majority_vote(pred) for pred in tree_preds]
    
    def accuracy(self, y_true, y_pred):
        return np.mean(y_true == y_pred)

In [54]:
model = RandomForestClassifierScratch()        
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print("Accuracy: ", model.accuracy(y_test, predictions))

KeyError: "None of [Index([407, 289, 343,  14, 308,  61,  25, 428, 168, 428,\n       ...\n       376,  42, 266,  60, 126, 374, 113, 362, 219, 268],\n      dtype='int32', length=455)] are in the [columns]"

## Built-In

### Whitout Feature Scaling

### Min-Max Scaling

In [33]:
score = []
model = RandomForestClassifier(criterion='gini')
model.fit(X_train_ss, y_train)
y_pred = model.predict(X_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

model = RandomForestClassifier(criterion='entropy')
model.fit(X_train_ss, y_train)
y_pred = model.predict(X_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

model = RandomForestClassifier(criterion='log_loss')
model.fit(X_train_ss, y_train)
y_pred = model.predict(X_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

In [34]:
model = RandomForestClassifier(criterion='gini')
model.fit(X_corr_train_ss, y_train)
y_pred = model.predict(X_corr_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

model = RandomForestClassifier(criterion='entropy')
model.fit(X_corr_train_ss, y_train)
y_pred = model.predict(X_corr_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

model = RandomForestClassifier(criterion='log_loss')
model.fit(X_corr_train_ss, y_train)
y_pred = model.predict(X_corr_test_ss)
score.append(np.array(evaluate(y_test, y_pred, False)))

### PCA

In [35]:
tmp_pca = [0,0,0,0,0,0]
tmp_pca2 = [0,0,0,0,0,0]

import warnings
for n in range(2, 20):
    pca = PCA(n_components=n)
    pca.fit(X_train)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_train_mms)
    X_test_pca = pca.transform(X_test_mms)

    model = RandomForestClassifier()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca[0] < tmp[0]:
        tmp_pca = tmp

    pca = PCA(n_components=n)
    pca.fit(X_corr_train_ss)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_corr_train_mms)
    X_test_pca = pca.transform(X_corr_test_mms)

    model = RandomForestClassifier()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca2[0] < tmp[0]:
        tmp_pca2 = tmp

score.append(tmp_pca)
score.append(tmp_pca2)

In [36]:
tmp_pca = [0,0,0,0,0,0]
tmp_pca2 = [0,0,0,0,0,0]

import warnings
for n in range(2, 20):
    pca = PCA(n_components=n)
    pca.fit(X_train)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_train_ss)
    X_test_pca = pca.transform(X_test_ss)

    model = RandomForestClassifier()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca[0] < tmp[0]:
        tmp_pca = tmp

    pca = PCA(n_components=n)
    pca.fit(X_corr_train_ss)

    warnings.filterwarnings("ignore")
    X_train_pca = pca.transform(X_corr_train_ss)
    X_test_pca = pca.transform(X_corr_test_ss)

    model = RandomForestClassifier()
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    tmp = evaluate(y_test, y_pred, False)
    if tmp_pca2[0] < tmp[0]:
        tmp_pca2 = tmp

score.append(tmp_pca)
score.append(tmp_pca2)

In [38]:
pd.DataFrame(score,
    index=['Gini Standard Scaler', 'Entopy Standard Scaler Corr', 'Log Loss Standard Scaler', 'Gini Standard Scaler Corr', 'Entopy Standard Scaler', 'Log Loss Standard Scaler Corr', 'PCA Min-Max Scaler', 'PCA Min-MAx Scaler Corr', 'PCA Standard Scaler', 'PCA Standard Scaler Corr'], 
    columns=['Accuracy', 'F1', 'Recall', 'Precision', 'AUC', 'Confusion Matrix'])

Unnamed: 0,Accuracy,F1,Recall,Precision,AUC,Confusion Matrix
Gini Standard Scaler,0.964912,0.964738,0.964912,0.965205,0.958074,"[[70, 1], [3, 40]]"
Entopy Standard Scaler Corr,0.964912,0.964738,0.964912,0.965205,0.958074,"[[70, 1], [3, 40]]"
Log Loss Standard Scaler,0.964912,0.964738,0.964912,0.965205,0.958074,"[[70, 1], [3, 40]]"
Gini Standard Scaler Corr,0.964912,0.964912,0.964912,0.964912,0.96266,"[[69, 2], [2, 41]]"
Entopy Standard Scaler,0.95614,0.956036,0.95614,0.956088,0.951032,"[[69, 2], [3, 40]]"
Log Loss Standard Scaler Corr,0.964912,0.964912,0.964912,0.964912,0.96266,"[[69, 2], [2, 41]]"
PCA Min-Max Scaler,0.973684,0.973621,0.973684,0.973719,0.969702,"[[70, 1], [2, 41]]"
PCA Min-MAx Scaler Corr,0.938596,0.93845,0.938596,0.938457,0.932362,"[[68, 3], [4, 39]]"
PCA Standard Scaler,0.982456,0.982369,0.982456,0.982937,0.976744,"[[71, 0], [2, 41]]"
PCA Standard Scaler Corr,0.938596,0.93845,0.938596,0.938457,0.932362,"[[68, 3], [4, 39]]"


### Standard Scaling