### Classification-tree
- Sequence of if-else questions about individual features.
- infer class labels
- able to capture non-linear relationships between features and labels
- don't require feature scaling

In [None]:
# Classification tree in scikit-learn: fit a shallow tree and score it on held-out data
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Stratify on y so the train and test parts keep the class proportions of the labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Depth-2 tree; fit() returns the estimator itself, so predict() can be chained
dt = DecisionTreeClassifier(max_depth=2, random_state=1)
y_pred = dt.fit(X_train, y_train).predict(X_test)

# Test-set accuracy, left as the last expression so the notebook displays it
accuracy_score(y_test, y_pred)

In [None]:
# Classification tree that uses entropy as its split criterion
from sklearn.tree import DecisionTreeClassifier

# One keyword per line makes the non-default settings easy to scan
dt_entropy = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=8,
    random_state=1,
)

# Train on X_train / y_train, which must already exist from a previous split
dt_entropy.fit(X_train, y_train)

In [None]:
# Regression tree in scikit-learn: fit a CART regressor and print its test-set RMSE
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE

# Hold out 20% of the data; the arrays to split (X, y) must be passed explicitly
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bind the regressor to 'dt' so the fit/predict calls below use this model
# instead of whatever object an earlier cell left under that name
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.1, random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# RMSE is the square root of the test-set MSE
mse_dt = MSE(y_test, y_pred)
rmse_dt = mse_dt ** (1/2)
print(rmse_dt)

### Diagnose Bias Problem
- If f^ suffers from high bias: CV error of f^ ≈ training set error of f^ >> desired error.
- f^ is said to underfit the training set. To remedy underfitting:
    - increase model complexity
    - for ex: increase max depth, decrease min samples per leaf,...
    - gather more relevant features 

In [None]:
# K-Fold CV in sklearn on the Auto Dataset
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score

# Set seed for reproducibility
SEED = 123

# Split data into 70% train and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Instantiate decision tree regressor and assign it to 'dt'
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.14, random_state=42)

# Evaluate the list of MSE obtained by 10-fold CV.
# cross_val_score is the function that takes a 'scoring' argument and returns one
# score per fold; with 'neg_mean_squared_error' the scores are negated MSEs, hence
# the leading minus sign.
# Set n_jobs to -1 in order to exploit all CPU cores in computation
MSE_CV = -cross_val_score(dt, X_train, y_train, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit 'dt' to the training set
dt.fit(X_train, y_train)

# Predict the labels of training set
y_predict_train = dt.predict(X_train)

# Predict the labels of test set
y_predict_test = dt.predict(X_test)

# CV MSE (mean over the 10 folds)
print('CV MSE: {:.2f}'.format(MSE_CV.mean()))

# Training set MSE
print('Train MSE: {:.2f}'.format(MSE(y_train, y_predict_train)))

# Test set MSE
print('Test MSE: {:.2f}'.format(MSE(y_test, y_predict_test)))


### Advantages of CARTs
- Simple to understand
- Simple to interpret
- Easy to use
- Flexibility: ability to describe non-linear dependencies
- Preprocessing: no need to standardize features,...

### Limitations of CARTs
- Classification: can only produce orthogonal decision boundaries.
- Sensitive to small variations in the training set.
- High variance: unconstrained CARTs may overfit the training set
- Solution: ensemble learning

### Ensemble Learning
- Train different models on the same dataset
- Let each model make its predictions
- Meta-model: aggregates predictions of individual models
- Final prediction: more robust and less prone to errors.
- Best results: models are skillful in different ways

In [None]:
# Voting Classifier in sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier

SEED = 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Individual models that will later be combined by the voting ensemble
lr = LogisticRegression(random_state=SEED)
knn = KNN()
dt = DecisionTreeClassifier(random_state=SEED)
classifiers = [('Logistic Regression', lr),
               ('K Nearest Neighbours', knn),
               ('Classification Tree', dt)]

# Iterate over the defined list of tuples containing the classifiers and
# report the test-set accuracy of each one on its own
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print('{:s} : {:.3f}'.format(clf_name, accuracy_score(y_test, y_pred)))

# Instantiate a VotingClassifier 'vc' from the same (name, estimator) pairs
vc = VotingClassifier(estimators=classifiers)

vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)

# The format spec needs the leading colon ('{:.3f}'); '{.3f}' is parsed as an
# attribute lookup on the argument and raises instead of formatting the number
print('Voting Classifier: {:.3f}'.format(accuracy_score(y_test, y_pred)))

In [None]:
# Bagging Classifier in sklearn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

SEED = 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)

# Instantiate a classification-tree 'dt'
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)

# Instantiate a BaggingClassifier 'bc'.
# The base model is passed as 'estimator'; the older 'base_estimator' keyword is
# no longer accepted by recent scikit-learn releases.
bc = BaggingClassifier(estimator=dt, n_estimators=300, n_jobs=-1)

# Fit 'bc' to the training set
bc.fit(X_train, y_train)

# Predict test set labels
y_pred = bc.predict(X_test)

# Evaluate and print test-set accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy of Bagging Classifier: {:.3}'.format(accuracy))

In [None]:
# OOB Evaluation in sklearn
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

SEED = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)

# Base learner for the bagging ensemble
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)

# The base model keyword is 'estimator' ('base_eliminator' is not a valid argument
# and raises TypeError). oob_score=True asks the ensemble to also score itself on
# the out-of-bag samples, exposed afterwards as bc.oob_score_.
bc = BaggingClassifier(estimator=dt, n_estimators=300, oob_score=True, n_jobs=-1)

bc.fit(X_train, y_train)

y_pred = bc.predict(X_test)

# Compare the held-out test accuracy with the out-of-bag estimate
test_accuracy = accuracy_score(y_test, y_pred)
oob_accuracy = bc.oob_score_

print('Test set accuracy: {:.3f}'.format(test_accuracy))

print('OOB accuracy: {:.3f}'.format(oob_accuracy))

### Further Diversity with Random Forests
- Base estimator
- Each estimator is trained on a different bootstrap sample having the same size as the training set
- RF introduces further randomization in the training of individual trees
- d features are sampled at each node without replacement

In [None]:
# Random Forests Regressor in sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
# MSE must alias mean_squared_error so that its square root below is a real RMSE
from sklearn.metrics import mean_squared_error as MSE

SEED = 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Use the regressor variant of the forest: this cell reports an RMSE, which is a
# regression metric
rf = RandomForestRegressor(n_estimators=40, min_samples_leaf=0.12, random_state=SEED)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# RMSE = square root of the test-set mean squared error
rmse_test = MSE(y_test, y_pred)**(1/2)

print('Test set RMSE of rf: {:.2f}'.format(rmse_test))


In [None]:
# Feature importance in sklearn: horizontal bar chart of the forest's importances
import matplotlib.pyplot as plt
import pandas as pd

# Pair each importance value of the fitted 'rf' with its column name from X
importances_rf = pd.Series(rf.feature_importances_, index=X.columns)

# Sort so the bars appear in order of importance
sorted_importances_rf = importances_rf.sort_values()

# Draw the chart, then render it
sorted_importances_rf.plot(kind='barh', color='lightgreen')
plt.show()

### Adaboost
- Stands for Adaptive Boosting
- Each predictor pays more attention to the instances wrongly predicted by its predecessor.
- Achieved by changing the weights of training instances
- Each predictor is assigned a coefficient α
- α depends on the predictor's training error

In [None]:
# AdaBoost Classification in sklearn
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

SEED = 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Decision stump (depth-1 tree) used as the weak learner
dt = DecisionTreeClassifier(max_depth=1, random_state=SEED)

# The weak learner is passed as 'estimator'; the older 'base_estimator' keyword is
# no longer accepted by recent scikit-learn releases.
adb_clf = AdaBoostClassifier(estimator=dt, n_estimators=100)

adb_clf.fit(X_train, y_train)

# Keep only the second probability column, i.e. the score for classes_[1]
y_pred_proba = adb_clf.predict_proba(X_test)[:,1]

# Evaluate test-set roc_auc_score
adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)

print('ROC AUC score: {:.2f}'.format(adb_clf_roc_auc_score))

### Gradient Boosted Trees
- Sequential correction of predecessor's errors
- Does not tweak the weights of training instances
- Each predictor is trained using its predecessor's residual errors as labels
- Gradient Boosted Trees: a CART is used as a base learner

In [None]:
# Gradient Boosting in sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
# MSE must alias mean_squared_error so that its square root below is a real RMSE
from sklearn.metrics import mean_squared_error as MSE

SEED = 1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# 300 boosting stages, each a depth-1 tree
gbt = GradientBoostingRegressor(n_estimators=300, max_depth=1, random_state=SEED)

gbt.fit(X_train, y_train)

y_pred = gbt.predict(X_test)

# RMSE = square root of the test-set mean squared error
rmse_test = MSE(y_test, y_pred) ** (1/2)

print('Test set RMSE: {:.2f}'.format(rmse_test))

### Stochastic Gradient Boosting
- Each tree is trained on a random subset of rows of the training data
- Instances (40%-80% of the training set) are sampled without replacement
- Features are sampled (without replacement) when choosing split points
- Result: further ensemble diversity
- Effect: adding further variance to the ensemble of trees

In [None]:
# Stochastic Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split

SEED = 1

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# subsample / max_features below 1.0 are what make the boosting "stochastic":
# rows and features are sub-sampled while the trees are grown
sgbt = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=1,
    subsample=0.8,
    max_features=0.2,
    random_state=SEED,
)

# fit() returns the estimator itself, so predict() can be chained onto it
y_pred = sgbt.fit(X_train, y_train).predict(X_test)

# Square root of the test-set MSE
rmse_test = MSE(y_test, y_pred) ** 0.5

print('Test set RMSE: {:.2f}'.format(rmse_test))


In [2]:
# Inspecting the hyperparameters of a CART in sklearn
from sklearn.tree import DecisionTreeClassifier

SEED = 1

# Unfitted tree; only the seed is set explicitly
dt = DecisionTreeClassifier(random_state=SEED)

# get_params() returns the hyperparameter names and their current values
cart_hyperparams = dt.get_params()
print(cart_hyperparams)

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': 1, 'splitter': 'best'}


In [3]:
# Grid search over CART hyperparameters with 10-fold cross-validation
from sklearn.model_selection import GridSearchCV

# Candidate values to try for each hyperparameter of 'dt'
params_dt = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [0.04, 0.06, 0.08],
    'max_features': [0.2, 0.4, 0.6, 0.8]
}

# 'accuracy' is the scorer name; a misspelled name is rejected by scikit-learn
grid_dt = GridSearchCV(estimator=dt, param_grid=params_dt, scoring='accuracy', cv=10, n_jobs=-1)

# Relies on dt, X_train and y_train defined by earlier cells
grid_dt.fit(X_train, y_train)

best_hyperparams = grid_dt.best_params_
print('Best hyperparameters: \n', best_hyperparams)

# The format string needs a placeholder, otherwise the score is never printed
best_CV_score = grid_dt.best_score_
print('Best CV accuracy: {:.3f}'.format(best_CV_score))

# best_estimator_ is the model refit with the best parameters found
best_model = grid_dt.best_estimator_
test_acc = best_model.score(X_test, y_test)
print('Test set accuracy of best model: {:.3f}'.format(test_acc))

In [None]:
# Inspecting RF hyperparameters in sklearn
from sklearn.ensemble import RandomForestRegressor

SEED = 1

# Unfitted forest; only the seed is set explicitly
rf = RandomForestRegressor(random_state=SEED)

# Left as the last expression so the notebook displays the parameter dict
rf.get_params(deep=True)

In [None]:
# Grid search over random-forest hyperparameters, scored by (negated) MSE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import GridSearchCV

# Candidate values to try for each hyperparameter of 'rf'
params_rf = {
    'n_estimators': [300, 400, 500],
    'max_depth': [4, 6, 8],
    'min_samples_leaf': [0.1, 0.2],
    'max_features': ['log2', 'sqrt'],
}

# 3-fold CV over every combination; relies on rf, X_train and y_train from earlier cells
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=params_rf,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)

best_hyperparams = grid_rf.best_params_
print('Best hyperparameters: ', best_hyperparams)

# Evaluate the refit best model on the held-out test set
best_model = grid_rf.best_estimator_
y_pred = best_model.predict(X_test)

# Square root of the test-set MSE
rmse_test = MSE(y_test, y_pred) ** 0.5
print('Test set RMSE of rf: {:.2f}'.format(rmse_test))