## <center> From Decision Trees to Random Forests

## <center> Growing Supervised Machine Learning Models

<center> <b>Contact Lens Type Determination</b> <br><br>
<center><img src='decision_tree.png' height=700 width=700>

## <center>Two algorithms

### <center> ID3
<center>Iterative Dichotomiser 3 (Classification only) 

### <center> CART   </center>
<center>Classification and Regression Trees </center>

## ID3

### <center> Entropy

<center><img src='entropy.png'>

<center><img src="entropy_formula.png">

In [None]:
## numpy is imported here because this cell runs before the notebook's main import cell
import numpy as np
## Entropy of a 50/50 split; np.log is the natural log, so the result is ln(2)
coin_flip_entropy = -((1/2)*np.log(1/2)+(1/2)*np.log(1/2))
coin_flip_entropy

### <center> Information Gain

<center>The reduction of uncertainty about Y given an additional piece of information X about Y.

<center><img src='ig_formula.png'>

## <center> Building a Decision Tree From Scratch (ID3)

In [None]:
import pandas as pd
import numpy as np
## Toy loan-approval dataset: three features and the boolean target 'Approved'
df = pd.DataFrame({
    'Older Than 30': [False, True, True, False, True],
    'Occupation': ['Doctor', 'Lawyer', 'Truck Driver', 'Doctor', 'Truck Driver'],
    'Income Over 80000': [True, True, False, False, True],
    'Approved': [True, True, False, False, False],
})
df

In [None]:
## Split the rows by the target so the class counts can be reused below
approved = df[df['Approved']==True]
not_approved = df[df['Approved']==False]
total = len(df)
## Number approved/not approved (kept as the last expression so the cell displays it)
df['Approved'].value_counts()

In [None]:
## Entropy of parent node: -sum(p * ln(p)) over the two classes of 'Approved'
p_approved = len(approved)/total
p_not_approved = len(not_approved)/total
entropy_parent = -(p_approved*np.log(p_approved)+
                   p_not_approved*np.log(p_not_approved))
entropy_parent

In [None]:
## Entropy and Information Gain of Age
## Split the full dataset on the boolean 'Older Than 30' feature
older = df[df['Older Than 30']==True]
younger = df[df['Older Than 30']==False]

## Each child's entropy is weighted by the share of rows it receives
weight_older = len(older)/total
weight_younger = len(younger)/total

older_approved = older[older['Approved']==True]
older_not_approved = older[older['Approved']==False]
younger_approved = younger[younger['Approved']==True]
## Must be filtered from `younger`; filtering `older` here corrupts entropy_younger
younger_not_approved = younger[younger['Approved']==False]

## NOTE: these inline formulas assume both classes occur in each child (np.log(0) would give nan)
entropy_older = -((len(older_approved)/len(older))*np.log(len(older_approved)/len(older))
                  +(len(older_not_approved)/len(older))*np.log(len(older_not_approved)/len(older)))

entropy_younger = -((len(younger_approved)/len(younger))*np.log(len(younger_approved)/len(younger))
                  +(len(younger_not_approved)/len(younger))*np.log(len(younger_not_approved)/len(younger)))

## Information gain = parent entropy minus the weighted entropy of the children
entropy_age = entropy_older*weight_older + entropy_younger*weight_younger
ig_age = entropy_parent - entropy_age
print('Information Gain (Age):', ig_age)

In [None]:
def entropy(count_true, count_false, count_total):
    """Return the two-class entropy (natural log) of a node.

    count_true / count_false are the number of rows in each class and
    count_total is the number of rows in the node. A class with a count
    of zero contributes 0 instead of evaluating log(0).
    """
    def _p_log_p(count):
        # Contribution p*ln(p) of one class; empty classes are skipped.
        if count > 0:
            return (count/count_total)*np.log(count/count_total)
        return 0

    return -(_p_log_p(count_true) + _p_log_p(count_false))

In [None]:
## Entropy and Information Gain of Occupation
## One sub-frame per occupation, plus the approved / not-approved rows inside each
occupations = list(df['Occupation'].value_counts().index)
occupation_dfs = [df[df['Occupation']==job] for job in occupations]
occupation_weights = [len(rows)/total for rows in occupation_dfs]
occupation_approved = [rows[rows['Approved']==True] for rows in occupation_dfs]
occupation_not_approved = [rows[rows['Approved']==False] for rows in occupation_dfs]

## Entropy of every child node, then the weighted sum across children
occupation_entropies = [entropy(len(yes), len(no), len(rows)) for yes, no, rows
                        in zip(occupation_approved, occupation_not_approved, occupation_dfs)]
entropy_occupation = np.sum([child_entropy*weight for child_entropy, weight
                             in zip(occupation_entropies, occupation_weights)])

ig_occupation = entropy_parent - entropy_occupation
print('Information Gain (Occupation):', ig_occupation)

In [None]:
## Entropy and Information Gain of Income
## One sub-frame per income value, plus the approved / not-approved rows inside each
incomes = list(df['Income Over 80000'].value_counts().index)
income_dfs = [df[df['Income Over 80000']==level] for level in incomes]
income_weights = [len(rows)/total for rows in income_dfs]
income_approved = [rows[rows['Approved']==True] for rows in income_dfs]
income_not_approved = [rows[rows['Approved']==False] for rows in income_dfs]

## Entropy of every child node, then the weighted sum across children
income_entropies = [entropy(len(yes), len(no), len(rows)) for yes, no, rows
                    in zip(income_approved, income_not_approved, income_dfs)]
entropy_income = np.sum([child_entropy*weight for child_entropy, weight
                         in zip(income_entropies, income_weights)])

ig_income = entropy_parent - entropy_income
print('Information Gain (Income):', ig_income)

In [None]:
## Side-by-side summary of the root-level information gains
print('Information Gains')
for feature, gain in zip(['Age','Occupation','Income'],[ig_age, ig_occupation, ig_income]):
    print(feature,":",gain)

In [None]:
## Second level of the tree: inside each Occupation branch, compare the
## information gain of splitting on Age versus Income.
for occupation in occupations:
    ## calculate parent entropy (the "parent" is now this occupation's node, not the root)
    o_df = df[df['Occupation']==occupation]
    o_total = len(o_df)
    true = len(o_df[o_df['Approved']==True])
    false = len(o_df[o_df['Approved']==False])
    p_entropy = entropy(true,false,o_total)
    ## calculate children entropy for each remaining feature
    ## (node-local names are used so the root-level ig_age / ig_income / incomes are not overwritten)
    node_gains = {}
    for feature, column in [('Age', 'Older Than 30'), ('Income', 'Income Over 80000')]:
        weighted_entropy = 0
        for value in list(o_df[column].value_counts().index):
            child = o_df[o_df[column]==value]
            child_true = len(child[child['Approved']==True])
            child_false = len(child[child['Approved']==False])
            weighted_entropy += entropy(child_true, child_false, len(child))*(len(child)/o_total)
        ## gain is measured against this node's own entropy, not the root's
        node_gains[feature] = p_entropy - weighted_entropy
    ## ties (e.g. a node that is already pure) fall through to 'Income'
    print(occupation, '--', 'Age' if node_gains['Age']>node_gains['Income'] else 'Income')

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
## convert features to integer category codes in a separate frame,
## so `df` keeps its original values and earlier cells can still be re-run
df_encoded = df.apply(lambda column: column.astype('category').cat.codes)
## fit model (random_state makes the fitted tree reproducible across runs)
clf = DecisionTreeClassifier(criterion='entropy', random_state=47)
clf.fit(df_encoded.drop('Approved',axis=1), df_encoded['Approved'])

In [None]:
import matplotlib.pyplot as plt
## class_names must follow ascending class order: the target was encoded as
## category codes, so class 0 is False (not approved) and class 1 is True (approved).
## label='none' is the documented string option for hiding node labels, and
## feature_names is passed as a plain list for compatibility across scikit-learn versions.
plot_tree(clf,node_ids=False,
              impurity=False,
              label='none',
              filled=True,
              feature_names=list(df.drop('Approved',axis=1).columns),
              class_names=['Not Approved','Approved'])
plt.show()

## <center> Activity

<center>Using the data in data_scientists.csv, build a decision tree from scratch to predict whether or not someone is a Data Scientist.

<center>Then, build the tree in Python and compare.

Steps:

- Find the entropy of the target.
- Find the weighted entropy for each feature.
- Calculate the information gain for each feature.
- Select the feature with the highest IG as the root node.
- Repeat the process for each child node: find the node's entropy, then the weighted entropy of each remaining feature within that node, and split on the feature with the highest IG.

### <center>CART

<center>Similar to ID3, except for the formula used in determining branches. <br>
Newer, more common algorithm, default in DecisionTreeClassifier/DecisionTreeRegressor.

<center><img src="gini_formula.png">

### Decision Trees for Regression

In [None]:
from sklearn.datasets import make_regression, make_classification
## Tiny synthetic regression problem: 10 samples, 2 features, both informative
X, y = make_regression(
    n_samples=10,
    n_features=2,
    n_informative=2,
    random_state=47,
)

In [None]:
## Tabular view of the synthetic data: features named X1, X2, ... plus the target
reg_df = pd.DataFrame(
    X, columns=['X'+str(n) for n in range(1, X.shape[1]+1)]
).assign(target=y)
reg_df

In [None]:
from sklearn.tree import DecisionTreeRegressor
## Fully grown regression tree (no depth limit); random_state keeps the fitted tree reproducible
reg_clf = DecisionTreeRegressor(random_state=47).fit(X,y)

In [None]:
## Predict over a 100 x 100 grid covering [-3, 3] in both features
x_vals = np.linspace(-3,3,100)
y_vals = np.linspace(-3,3,100)
## All (x, y) pairs with x varying slowest, i.e. the order of a nested loop
## over x_vals (outer) and y_vals (inner)
grid = np.column_stack([np.repeat(x_vals, len(y_vals)), np.tile(y_vals, len(x_vals))])
## One vectorised predict call instead of 10,000 single-row calls
preds = list(reg_clf.predict(grid))

In [None]:
## Do the distinct predicted values coincide with the distinct training targets?
set(preds) == set(y)

### <center> In regression, a fully grown decision tree can only predict target values seen in the training set (each leaf predicts the mean of its training targets).

## <center> Tuning Decision Trees

<center> <b>Parameters</b><br><br>
<center> <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html">DecisionTreeClassifier Documentation</a>
<center> <a href="https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html">DecisionTreeRegressor Documentation</a> 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
## make dataset
X,y = make_regression(n_samples=1000,n_features=10, n_informative=8, random_state=47)
## train-test split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15, random_state=47)
## fit base model (default hyperparameters; random_state keeps the fitted tree reproducible)
base_model = DecisionTreeRegressor(random_state=47).fit(X_train,y_train)
## train/test error: predict once per split and reuse for both metrics
base_train_preds = base_model.predict(X_train)
base_test_preds = base_model.predict(X_test)
print('Train MSE:', mean_squared_error(y_train, base_train_preds))
print('Train R2:', r2_score(y_train, base_train_preds));print('')
print('Test MSE:', mean_squared_error(y_test, base_test_preds))
print('Test R2:', r2_score(y_test, base_test_preds))

In [None]:
from sklearn.model_selection import GridSearchCV
## Hyperparameter grid for the regression tree.
## - min_samples_split: floats are fractions of the training samples, ints are absolute counts
## - max_features: 'auto' was removed in scikit-learn 1.3 and meant "all features" for
##   regressors (same as None), so 'sqrt' is used to give the search a real alternative
params = {'max_depth':[5,10,20], 
              'min_samples_split':[0.5,1.0,2,3],
              'max_features':['sqrt',None],
              'min_impurity_decrease':[0,0.5,0.75]
             }
## random_state keeps every candidate tree reproducible across runs
dtr = DecisionTreeRegressor(random_state=47)
grid_search_model = GridSearchCV(dtr, params,scoring='r2', verbose=1)
grid_search_model.fit(X_train, y_train)

In [None]:
## Parameter combination selected by the grid search
grid_search_model.best_params_

In [None]:
## Evaluate the tuned tree on both splits, predicting once per split
best_model = grid_search_model.best_estimator_
best_train_preds = best_model.predict(X_train)
best_test_preds = best_model.predict(X_test)
print('Train MSE:', mean_squared_error(y_train, best_train_preds))
print('Train R2:', r2_score(y_train, best_train_preds))
print('')
print('Test MSE:', mean_squared_error(y_test, best_test_preds))
print('Test R2:', r2_score(y_test, best_test_preds))

## <center> Activity

- Use the data in disease_prediction.csv to build a model to predict whether or not a person is likely to have a disease.<br><br>
- Choose the appropriate measures of model success and scoring metrics to fit the best model - 
 <a href="https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter">Scoring metrics</a> <br><br>
- Utilize GridSearchCV to tune your decision tree model.