In [None]:
# import the necessary packages
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
from plotnine import *


from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.model_selection import train_test_split

from sklearn import metrics 
from sklearn.preprocessing import StandardScaler #Z-score variables

from sklearn.model_selection import train_test_split # simple TT split cv
from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv
from sklearn.model_selection import cross_val_score # cross validation metrics
from sklearn.model_selection import cross_val_predict # cross validation metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix

# 1. The Ensemble
A common theme in applied Machine Learning is *The Ensemble Method*. Ensemble methods use multiple machine learning models (these models can be the same type or different algorithms entirely). The idea is that using ensembles improves predictive performance, because even though our models are sometimes incorrect, it's unlikely that a MAJORITY of the models in our ensemble will all be incorrect in the exact same way each time. Therefore in aggregate, we will get a more accurate model.

Each model gets a "vote" about what category a data point should be in (ensemble methods also work for continuous outcomes, but here we'll focus on categorical ones). Whichever category gets the most "votes" is the category we choose for that data point. 

To combat overfitting and reduce potential *over-reliance* on a small number of features, we can use the two following techniques when creating models for our ensemble:

* **Bagging (Bootstrap Aggregating)**: Instead of using all of our training data to train each model in our ensemble, we use **bootstrapping** to choose the samples we will include.
    * **Bootstrapping** is when you randomly sample data points *with replacement*, meaning that a data point can be included in your bootstrapped sample *more* than once, OR not at all.
    

 ## Bootstrapping (before we start)
 The code block below gives an example of how we can use `np.random.choice()` to bootstrap samples from a dataframe.

In [None]:
# Bootstrapping demo: resample the rows of a small dataframe WITH replacement

# fix the seed so both the random ages and the bootstrap draw are repeatable
np.random.seed(1234)

names = [
    "Alex", "Charlie", "Addison", "James", "Blake", "Greg", "Daniel", "Susan",
    "Erik", "Georgia", "Kayne", "Lydia", "Peter", "Jane", "Jasper", "Link",
    "Rhett", "John", "Miranda", "Luke", "Leia", "Janet", "Jung", "Anthony",
    "Mark", "Torrence", "Bonnie", "Rudy", "Lisa", "Bart", "Tina", "Marie",
]

# toy data: one row per name, each with a random integer age from 17 to 26
names_df = pd.DataFrame(
    {
        "name": names,
        "age": np.random.randint(17, 27, len(names)),
    }
)
names_df

# draw row positions: np.random.choice(candidates, how many to draw, with replacement?)
# we draw exactly as many positions as there are rows in the dataframe
names_index = np.random.choice(
    np.arange(len(names_df)),
    size=len(names_df),
    replace=True,
)
names_boot = names_df.iloc[names_index]

# sampling with replacement means a row (e.g. Lisa) can show up more than once
names_boot

Unnamed: 0,name,age
14,Jasper,26
19,Luke,19
7,Susan,24
28,Lisa,20
10,Kayne,25
11,Lydia,17
14,Jasper,26
28,Lisa,20
17,John,17
23,Anthony,17


* **Random Feature Selection**: Instead of using all the available features/predictors in our dataset for every model, for each model we randomly choose a different subset of features to use when training. This helps our ensemble generalize, because it doesn't become overly reliant on one feature (since that feature might not appear in every model).

While ensemble methods take a lot of computational power (you're training MANY models instead of just one), in practice they're often really useful. An incredibly popular ensemble method is the **Random Forest** which is an ensemble method that uses a bunch of decision trees along with Bagging and Random Feature selection to generate the ensemble.

## 1.1 Building a Random Forest

Let's build a tiny random forest function of our own! The `Forest()` function takes in 5 arguments:

* `n_samples` (**integer**): number of bootstrapped samples to use to train each decision tree.
* `n_features` (**integer**): number of randomly selected features from your data set to use when training.
* `n_trees` (**integer**): how many decision trees to create for the ensemble.
* `X` (**data frame**): the *already* z-scored predictor data to be used.
* `y` (**data frame**): the outcome data to be used (`X` and `y` are the same length, and the $i^{th}$ element of `X` corresponds to the $i^{th}$ element of `y`)

The function should:

1. (DONE FOR YOU) use a for loop to create `n_trees` models and store them in a list called `forest` (yes! You can store fitted decision trees in a list!)
2. (DONE FOR YOU) For each model, use bootstrapping to sample `n_samples` data points to train the model. Remember that bootstrapping means sampling WITH replacement (hint: try using [`np.random.choice`](https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html) to randomly select (*with replacement*) which row numbers/indices to use).
3. (DONE FOR YOU) For each model, randomly select `n_features` features to use to train your model (hint: try using [`np.random.choice`](https://numpy.org/doc/stable/reference/random/generated/numpy.random.choice.html) to randomly select (*withOUT replacement*) which predictor indices to use).
4. For each model, train the model (no need to use model validation, and assume X is already z-scored).
5. (DONE FOR YOU) Return a list (`forest`) of dictionaries that look like this (where `tree` is your trained model and `features_index` is an array of indices for the features/predictors you selected):
 ```{"tree": tree, "feats": features_index}```


In [None]:
def Forest(X, y, n_samples = 1000, n_features = 5, n_trees = 100):
    """Build a tiny random forest of decision trees.

    Each tree is trained on a bootstrapped sample of rows (drawn WITH
    replacement) and on a random subset of feature columns (drawn WITHOUT
    replacement).

    Parameters
    ----------
    X : pandas DataFrame
        Predictor data; rows and columns are selected by position with `.iloc`.
    y : pandas DataFrame or Series
        Outcome data; row i of `y` must correspond to row i of `X`.
    n_samples : int
        Number of bootstrapped rows used to train each tree.
    n_features : int
        Number of feature columns used by each tree. If this is at least the
        number of columns in `X`, every column is used.
    n_trees : int
        Number of trees in the ensemble.

    Returns
    -------
    list of dict
        One ``{"tree": fitted_tree, "feats": column_positions}`` entry per
        tree; `column_positions` is the array of column positions of `X` the
        tree was trained on (and must be given again at prediction time).
    """
    forest = []
    n_rows, n_cols = X.shape

    # 1. create models for the ensemble
    for _ in range(n_trees):

        # 2. randomly bootstrap datapoints (row positions, with replacement)
        samples_index = np.random.choice(n_rows, n_samples, replace = True)

        # 3. randomly choose features (column positions, without replacement)
        if n_features >= n_cols:
            # asked for at least as many features as exist: use every column
            features_index = np.arange(n_cols)
        else:
            features_index = np.random.choice(n_cols, n_features, replace = False)

        # 4. select only the rows and features that were randomly selected above and train tree

        ### YOUR CODE HERE ##################################################################

        # bootstrapped rows AND selected feature columns of X; same bootstrapped rows of y
        X_boot = X.iloc[samples_index, features_index]
        y_boot = y.iloc[samples_index]

        # create a DecisionTree model and fit it on the *subsetted* data
        tree = DecisionTreeClassifier()
        tree.fit(X_boot, y_boot)

        ### /YOUR CODE HERE ##################################################################

        # 5. add tree to forest, remembering which columns this tree expects
        forest.append({"tree": tree, "feats": features_index})
    return forest

## 1.2 Use `Forest()`
Using `X1` and `y1` as your training set, call `Forest()` to build an ensemble model.

In [None]:
# load the training data from the course repository:
# X1 holds the predictors and y1 holds the outcomes
X1 = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/X_cols_df.csv")
y1 = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/y_df.csv")

# preview the first rows of the predictors
X1.head()

In [None]:
# preview the first rows of the outcome data
y1.head()

The cell below uses the `Forest()` function that you helped create above to build a random forest using `X1` and `y1`. The `n_features` argument shows that we will randomly select 5 features per tree to use. Our forest will contain 100 trees (as shown by the `n_trees` argument).

In [None]:
### YOUR CODE HERE ##################################################################

# train the ensemble on the training set: 100 trees, each using 5 randomly chosen features
my_forest = Forest(X = X1, y = y1, n_features = 5, n_trees = 100)

### /YOUR CODE HERE ##################################################################

## 1.3 Comparing Ensemble to an Individual Model

- Use the `ForestPredictor()` function below (which takes in the ensemble created by `Forest()` and data) to generate predictions (basically like a `.predict()` function for our custom `Forest()` model) for `X2`, our *test* set.
- Use the `ForestPredictor()` function below (which takes in the ensemble created by `Forest()` and data) to generate predictions for `X1`, our *train* set.
- calculate the accuracy of your ensemble.
- calculate the accuracy for ONE of your ensemble models by using `oneModel = my_forest[0]` to grab the first model of your ensemble. 

### 1.3.1
In this example, does an ensemble method do *better* (in terms of train accuracy) than an individual decision tree? Explain how you figured this out.

<img src="https://drive.google.com/uc?export=view&id=1ghyQPx1N8dmU3MV4TrANvqNhGwnLni72" width = 200px />


## `ForestPredictor()` function
The code below is fully functional, no need to change anything. It takes in two arguments:

- `forest` which should be a trained Random forest made by the `Forest()` function.
- `X` which is the data you want to use to predict.

In [None]:
# NO NEED TO CHANGE ANYTHING HERE
def ForestPredictor(forest, X):
    """Predict a label for every row of `X` by majority vote over the forest.

    Parameters
    ----------
    forest : list of dict
        Ensemble in the format returned by `Forest()`: each entry holds a
        fitted model under "tree" and, under "feats", the column positions of
        `X` that model was trained on.
    X : pandas DataFrame
        Data to predict on; columns are picked by position with `.iloc`, so
        they must be in the same order as in the training data.

    Returns
    -------
    list
        One predicted label per row of `X` (an empty list for an empty forest).
        When several labels tie for the most votes, the smallest one is
        returned, because `Series.mode()` yields the modes in sorted order.
    """
    # one row per tree, one column per data point
    votes = pd.DataFrame(
        [member["tree"].predict(X.iloc[:, member["feats"]]) for member in forest]
    )

    # the most common vote in each column is the ensemble's prediction for that data point
    return [votes.iloc[:, point].mode()[0] for point in range(votes.shape[1])]

Import our "test set", `X2` and `y2`, and use `ForestPredictor()` to get predictions for `X2` using the Random Forest we built above, `my_forest`.

In [None]:
# Import our Test Set: X2 holds the predictors and y2 holds the outcomes
X2 = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/X_cols_df2.csv")
y2 = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/y_df2.csv")

# ForestPredictor() takes the ensemble and returns its majority-vote prediction for each row of the test set
ensemble_predictions =  ForestPredictor(my_forest, X2) # call ForestPredictor using my_forest and the test predictors X2

Now we will calculate the accuracy of our Random Forest as a whole and compare it to a single tree FROM our forest (the first one, `my_forest[0]`).

In [None]:
# test-set accuracy of the ensemble's majority-vote predictions
acc_e = accuracy_score(y_true = y2, y_pred = ensemble_predictions)
print("Test Acc (Ensemble) is: ", acc_e)

In [None]:
# grab the first tree of the ensemble and score it on its own
oneModel = my_forest[0]

# the tree only knows the features it was trained on, so subset X2 to those columns
acc_o = accuracy_score(
    y_true = y2,
    y_pred = oneModel["tree"].predict(X2.iloc[:, oneModel["feats"]]),
)
print("Test Acc (OneModel) is: ", acc_o)

In [None]:
# predictions for the TRAINING set (X1); stored under their own name so the
# test-set predictions held in `ensemble_predictions` are not overwritten
train_ensemble_predictions = ForestPredictor(my_forest, X1)
print("Train Acc (Ensemble) is: ", accuracy_score(y1, train_ensemble_predictions))

# the single tree only sees the feature columns it was trained on
print("Train Acc (OneModel) is: ", accuracy_score(y1, oneModel["tree"].predict(X1.iloc[:,oneModel["feats"]])))

## 1.4 Comparing Ensemble to Individual Models

- put the accuracy from your ENSEMBLE model in the code below
- run the cell to see a histogram of the individual tree accuracies, and the (dashed line) ensemble accuracy.

### 1.4.1
Write down your thoughts about this graph. What patterns do you see between individual tree accuracies and ensemble accuracies?

<img src="https://drive.google.com/uc?export=view&id=1ghyQPx1N8dmU3MV4TrANvqNhGwnLni72" width = 200px />

In [None]:
### YOUR CODE HERE ##################################################################
# test-set accuracy of the whole ensemble (majority vote over every tree in my_forest)
ensemble_acc = accuracy_score(y2, ForestPredictor(my_forest, X2))
### /YOUR CODE HERE ##################################################################

# calculates the test accuracy of each model in the ensemble on its own,
# giving every tree only the feature columns it was trained on
allAcc = [
    accuracy_score(y2, member["tree"].predict(X2.iloc[:, member["feats"]]))
    for member in my_forest
]

# plot individual model accuracies (blue hist) and ensemble accuracy (dashed black line)
df = pd.DataFrame({"acc": allAcc, "no": range(0,len(my_forest))})
(ggplot(df, aes(x = "acc")) +
 geom_histogram(color = "black", fill = "lightblue", binwidth = 0.025) +
 xlim([0,1]) + theme_minimal() + geom_vline(xintercept = ensemble_acc, linetype = "dashed", size = 3) +
 labs(title = "Ensemble Accuracy vs. Individual Model Accuracy",
      x = "Accuracy",
      y = "Count"))


### 1.4.2
How does the difference between individual tree accuracies and ensemble accuracies change when you change the number of predictors used in each tree?
<img src="https://drive.google.com/uc?export=view&id=1ghyQPx1N8dmU3MV4TrANvqNhGwnLni72" width = 200px />

In [None]:
### YOUR CODE HERE ##################################################################
n_feat = 249
### /YOUR CODE HERE ##################################################################

# train a fresh forest in which every tree is given n_feat features
my_forest2 = Forest(X1, y1, n_features = n_feat)

# test accuracy of the ensemble's majority vote
ensemble_acc2 = accuracy_score(y2, ForestPredictor(my_forest2, X2))

# test accuracy of every tree on its own, using only that tree's feature columns
allAcc2 = [
    accuracy_score(y2, member["tree"].predict(X2.iloc[:, member["feats"]]))
    for member in my_forest2
]

# histogram of the per-tree accuracies, with the ensemble accuracy as a dashed vertical line
df = pd.DataFrame({"acc": allAcc2, "no": range(0, len(my_forest2))})
(
    ggplot(df, aes(x = "acc"))
    + geom_histogram(color = "black", fill = "lightblue", binwidth = 0.025)
    + xlim([0,1])
    + theme_minimal()
    + geom_vline(xintercept = ensemble_acc2, linetype = "dashed", size = 3)
    + labs(
        title = "Ensemble Accuracy vs. Individual Model Accuracy",
        x = "Accuracy",
        y = "Count",
    )
)


In [None]:
### YOUR CODE HERE ##################################################################
n_feat = 2
### /YOUR CODE HERE ##################################################################

# build a new model with n_feat features
my_forest2 = Forest(X1, y1, n_features = n_feat)
ensemble_acc2 = accuracy_score(y2, ForestPredictor(my_forest2, X2))


# calculates the accuracy of each model in the ensemble
allAcc2 = [accuracy_score(y2,my_forest2[mod]["tree"].predict(X2.iloc[:,my_forest2[mod]["feats"]])) for mod in range(0,len(my_forest2))]


# plot individual model accuracies (blue hist) and ensemble accuracy (black line)
df = pd.DataFrame({"acc": allAcc2, "no": range(0,len(my_forest2))})
(ggplot(df, aes(x = "acc")) +
 geom_histogram(color = "black", fill = "lightblue", binwidth = 0.025) +
 xlim([0,1]) + theme_minimal() + geom_vline(xintercept = ensemble_acc2, linetype = "dashed", size = 3) +
labs(title = "Ensemble Accuracy vs. Individual Model Accuracy",
    x = "Accuracy",
    y = "Count"))
