## 1) Classification and Regression Trees

In [2]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Hold out 20% of the samples as a test set, stratifying the split on the labels y.
# NOTE(review): X and y are not defined in the visible cells -- presumably loaded
# by an earlier cell; confirm before running on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

In [None]:
# Instantiate a shallow classification tree (depth capped at 2)
dt = DecisionTreeClassifier(max_depth=2, random_state=1)

# Fit dt to the training set
dt.fit(X_train, y_train)

# Predict the test set labels
y_pred = dt.predict(X_test)

# Evaluate the test-set accuracy. The value is stored and printed because a bare
# expression in the middle of a cell is computed and then silently discarded.
acc_dt = accuracy_score(y_test, y_pred)
print('Test set accuracy: {:.3f}'.format(acc_dt))

# Compute test-set MSE (MSE is the alias of sklearn.metrics.mean_squared_error)
# NOTE(review): MSE/RMSE are regression metrics; on a classifier's predictions
# they only work if the labels are numeric -- confirm this is intended.
mse_dt = MSE(y_test, y_pred)

# Compute test-set RMSE as the square root of the MSE
rmse_dt = mse_dt ** 0.5

In [5]:
# Bring in the logistic regression classifier
from sklearn.linear_model import LogisticRegression

# Build logreg and train it on the training set in one step
# (fit() returns the estimator itself, so chaining keeps logreg bound to the fitted model)
logreg = LogisticRegression(random_state=1).fit(X_train, y_train)

# Collect the two classifiers to compare: the logistic regression and the tree dt
clfs = [logreg, dt]

# Draw the decision regions of both classifiers on the test set
# NOTE(review): plot_labeled_decision_regions is not defined or imported in the
# visible cells -- presumably a course-provided helper; confirm it is available.
plot_labeled_decision_regions(X_test, y_test, clfs)

## 2) The Bias Variance Tradeoff

In [None]:
#bias: error term that tells you, on average, how much y ≠ y hat
#variance: tells you how much y hat is inconsistent over different training sets.

#### K-Fold CV in sklearn on the Auto Dataset

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import cross_val_score

# Set seed for reproducibility.
# (Was misspelled `SEEN`, which left `SEED` undefined for the two calls below.)
SEED = 123

# Split data into 70% train and 30% test.
# The test features are named X_test (capital X) to match the rest of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Instantiate decision tree regressor and assign it to 'dt'.
# min_samples_leaf is given as a float, which scikit-learn interprets as a
# fraction of the training samples rather than an absolute count.
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.14, random_state=SEED)

In [None]:
# Evaluate the list of MSEs obtained by 10-fold CV of dt on the training set.
# The scorer 'neg_mean_squared_error' yields negated MSEs, so the result is
# negated to recover one MSE per fold.
# n_jobs=-1 requests all CPU cores for the computation.
MSE_CV = -cross_val_score(
    estimator=dt,
    X=X_train,
    y=y_train,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

#### Ensemble Learning

In [None]:
#import functions to compute accuracy and split data
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#import models, including votingclassifier meta-model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import VotingClassifier

# Set seed for reproducibility
SEED = 1

# Split data into 70% train and 30% test.
# train_test_split returns the pieces grouped per input array, i.e.
# (X_train, X_test, y_train, y_test) -- unpacking them in any other order
# silently swaps features and labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

# Instantiate individual classifiers
lr = LogisticRegression(random_state=SEED)
knn = KNN()
dt = DecisionTreeClassifier(random_state=SEED)

# Define a list called classifiers that contains (name, estimator) tuples
classifiers = [('Logistic Regression', lr),
               ('K Nearest Neighbours', knn),
               ('Classification Tree', dt)]

# Iterate over the defined list of tuples containing the classifiers
for clf_name, clf in classifiers:
    # Fit clf to the training set
    clf.fit(X_train, y_train)

    # Predict the labels of the test set
    y_pred = clf.predict(X_test)

    # Evaluate the accuracy of clf on the test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy_score(y_test, y_pred)))

# Instantiate a VotingClassifier 'vc' over the same (name, estimator) tuples
vc = VotingClassifier(estimators=classifiers)

# Fit 'vc' to the training set and predict test set labels
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)

# Evaluate the test-set accuracy of 'vc'
# (the format spec needs the leading colon: '{:.3f}', not '{.3f}')
print('Voting Classifier: {:.3f}'.format(accuracy_score(y_test, y_pred)))

## 3) Bagging and Random Forests

#### Feature Importance in sklearn

In [None]:
# Wrap the model's feature importances in a pd.Series labelled by feature name
# NOTE(review): rf, pd and plt are not defined in the visible cells -- presumably
# a fitted forest plus the pandas / matplotlib.pyplot imports from elsewhere; confirm.
importances_rf = pd.Series(data=rf.feature_importances_, index=X.columns)

# Order the importances with sort_values() (pandas' default, ascending order)
sorted_importances_rf = importances_rf.sort_values()

# Draw them as a horizontal bar chart and render the figure
sorted_importances_rf.plot.barh(color='lightgreen')
plt.show()

## 4) Boosting

In [None]:
# Instantiate an AdaBoost classifier with 100 boosting rounds, using dt as the
# base learner. The base learner is passed positionally because its keyword was
# renamed from `base_estimator` to `estimator` in newer scikit-learn releases;
# the first positional slot is the base learner under both names.
adb_clf = AdaBoostClassifier(dt, n_estimators=100)

# Fit adb_clf to the training set
adb_clf.fit(X_train, y_train)

# Take the predicted probabilities from the second column of predict_proba
# NOTE(review): assumes a binary problem where column 1 is the positive class -- confirm.
y_pred_proba = adb_clf.predict_proba(X_test)[:, 1]

# Evaluate the test-set ROC AUC score
adb_clf_roc_auc_score = roc_auc_score(y_test, y_pred_proba)

## 5) Model Tuning

In [None]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the grid of hyperparameters 'params_dt'.
# (The dict was previously bound to `params_df` while the search below read
# `params_dt`, which raised NameError.)
params_dt = {
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [0.04, 0.06, 0.08],
    'max_features': [0.2, 0.4, 0.6, 0.8],
}

# Instantiate a 10-fold CV grid search object 'grid_dt' that ranks every
# combination in params_dt by accuracy, using all CPU cores (n_jobs=-1)
grid_dt = GridSearchCV(estimator=dt,
                       param_grid=params_dt,
                       scoring='accuracy',
                       cv=10,
                       n_jobs=-1)

# Fit 'grid_dt' to the training data
grid_dt.fit(X_train, y_train)

In [None]:
# Extract the best hyperparameter combination found by the grid search
best_hyperparams = grid_dt.best_params_

# Extract the cross-validated score achieved by that combination
best_CV_score = grid_dt.best_score_

# Extract the estimator configured with the best hyperparameters
best_model = grid_dt.best_estimator_

# Score the best model on the held-out test set
# (method name was misspelled `socre`, which raised AttributeError)
test_acc = best_model.score(X_test, y_test)