# Coding Tutorial 6: Supervised Learning, Cross Validation and Hyperparameter Tuning

In this coding tutorial we cover the following topics:

- Decision Trees
- Random Forests
- Hyperparameter Tuning with k-fold Cross Validation

## Decision Trees

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so that repeated runs draw the same numbers
SEED = 634
np.random.seed(SEED)

In [None]:
# Load the survey extract used throughout this tutorial (fetched over the network)
link = 'http://github.com/muhark/dpir-intro-python/raw/master/Week2/data/bes_data_subset_week2.feather'
df = pd.read_feather(link)

# Recode e01 (left-right self-placement): keep the number at the start of answers that
# begin with a digit; every other answer, including missing values, becomes NaN.
# Plain column assignment replaces the column outright, whereas `df.loc[:, 'e01'] = ...`
# writes into the existing column in place on pandas >= 2.0 and may keep its old dtype.
df['e01'] = pd.to_numeric(
    df['e01'].astype(str).str.extract(r'^([0-9]+)', expand=False),
    errors='coerce'
)

In [None]:
# Survey variable code -> readable label.
# Used further down to label the printed decision tree and the feature importance table.
col_dict = {
    "region": "Region",
    "Age": "Age",
    "a02": "Which party is best able to handle this issue?",
    "a03": "How interested are you in politics?",
    "e01": "Left-Right Self-Placement",
    "k01": "Attention to Politics",
    "k02": "Reads politics news",
    "k11": "Contacted by canvasser",
    "k06": "Uses Twitter",
    "k08": "Uses Facebook",
    "y01": "Income bracket",
    "y03": "Housing type",
    "y06": "Religion",
    "y08": "Trade Union Membership",
    "y09": "Gender",
    "y11": "Ethnicity",
    "y17": "Employment type",
}

In [None]:
# Let's predict 'a02' (the party seen as best able to handle the issue) as a function of the rest
labels = 'a02'
features = [
    'region', 'Age', 'a03', 'e01',
    'k01', 'k02', 'k11', 'k06',
    'k08', 'y01', 'y03', 'y06',
    'y08', 'y09', 'y11', 'y17',
]

In [None]:
# Task: Prepare X. Keep the feature labels in a separate variable, called feature_names
# Restrict the frame to the modelling columns and drop every row with a missing value
df = df.loc[:, [*features, labels]].dropna()
# Target vector: the a02 answer of each remaining row
y = df.loc[:, labels].values

In [None]:
# One-hot encode the non-numeric survey answers. Asking for float dummies keeps the
# encoded frame purely numeric: pandas >= 2.0 creates boolean dummies by default, and
# mixing those with numeric columns would turn `.values` into an object-dtype array.
temp = pd.get_dummies(df[features], dtype=float)
# Encoded columns are named '<feature>_<answer>'; columns left untouched keep their name
feature_names = temp.columns.tolist()
X = temp.values

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

In [None]:
# Entropy-based classification tree, capped at 10 levels, fitted on the full data set
dt = DecisionTreeClassifier(max_depth=10, criterion='entropy').fit(X, y)

In [None]:
# Swap the variable code at the start of each encoded column name for its readable label.
# partition() touches only the leading code; str.replace() would also rewrite any later
# occurrence of the code inside the answer text.
feature_names_long = [
    col_dict[code] + sep + answer
    for code, sep, answer in (f.partition('_') for f in feature_names)
]
# Text rendering of the fitted tree's decision rules
r = export_text(dt, feature_names=feature_names_long)
print(r)

## Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with scikit-learn's default settings, fitted on the full data set
rf = RandomForestClassifier().fit(X, y)
rf

In [None]:
# One row per encoded column, tagged with the readable label of the variable it came from
feature_importances = pd.DataFrame({
    "features": [col_dict[name.split('_')[0]] for name in feature_names],
    "importance": rf.feature_importances_,
})

# Add up the importances of all columns that share a label, smallest total first
feature_importances = (
    feature_importances
    .groupby(['features'])[['importance']]
    .sum()
    .sort_values('importance')
    .reset_index()
)
feature_importances

In [None]:
# Task: Plot the feature importances

In [None]:
# Horizontal bar chart of the summed importance per survey variable
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
sns.barplot(x='importance', y='features', data=feature_importances, ax=ax)
ax.set_title("Sum Feature Importance for Predicting Preferred Party to Solve Biggest Issue");

## Training-Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Set a quarter of the rows aside as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Features and labels should have the same number of rows within each split
print(f"{X_train.shape} {y_train.shape}")
print(f"{X_test.shape} {y_test.shape}")

In [None]:
def correct_prediction_rate(model, X_test, y_test):
    """Return the share of samples in ``X_test`` that ``model`` labels correctly.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``, which returns one label per sample.
    X_test : array-like
        Feature rows passed to ``model.predict``.
    y_test : array-like
        True labels, one per row of ``X_test``.

    Returns
    -------
    float
        Fraction of predictions equal to the true label (between 0 and 1).

    Raises
    ------
    ValueError
        If the predictions and ``y_test`` do not have the same shape.
    ZeroDivisionError
        If ``y_test`` is empty.
    """
    # Convert both sides to arrays so lists, Series and categoricals compare element-wise
    y_true = np.asarray(y_test)
    preds = np.asarray(model.predict(X_test))
    if preds.shape != y_true.shape:
        # Without this guard a length mismatch would silently broadcast or compare as a scalar
        raise ValueError(
            "predictions have shape {} but y_test has shape {}".format(preds.shape, y_true.shape)
        )
    correct_preds = int(np.count_nonzero(preds == y_true))
    # Plain int / int: yields a Python float and keeps ZeroDivisionError for empty input
    return correct_preds / y_true.size

In [None]:
# Second forest with default settings, fitted on the training split only
rf_OOS = RandomForestClassifier().fit(X_train, y_train)
rf_OOS

In [None]:
# Out-of-sample accuracy: the forest fitted on the training split, scored on the test split
oos_rate = correct_prediction_rate(rf_OOS, X_test, y_test)
print(oos_rate)

In [None]:
# Check: why do both of these come out at 1?
# Hint: look at which rows each forest was fitted on before it is scored here.
print(
    correct_prediction_rate(rf, X_test, y_test),
    correct_prediction_rate(rf_OOS, X_train, y_train),
    sep='\n'
)

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# List the forest's hyperparameters together with the values they are currently set to
rf.get_params()

In [None]:
# Candidate values for each random forest hyperparameter; the randomized search below
# samples combinations from these lists.
# 'auto' is no longer accepted for max_features by current scikit-learn forests (for
# classifiers it meant the same as 'sqrt'), so the list offers 'sqrt' and 'log2' instead.
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
}

In [None]:
# This may take some time to complete: 100 sampled settings x 5 folds = 500 forest fits.
# Fixing random_state on both the forest and the search makes the result repeatable; the
# np.random.seed call at the top of the notebook is not guaranteed to reach the parallel
# workers started by n_jobs=-1.
rf = RandomForestClassifier(random_state=634)

rf_tuning = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=100,       # number of hyperparameter combinations sampled from param_grid
    cv=5,             # 5-fold cross validation for every sampled combination
    n_jobs=-1,        # use all available CPU cores
    random_state=634  # fixes which combinations are sampled
)

rf_tuning.fit(X_train, y_train)

In [None]:
# Hyperparameter combination that the randomized search selected as best
rf_tuning.best_params_

In [None]:
# Out-of-sample accuracy of the best model found by the search
correct_prediction_rate(rf_tuning.best_estimator_, X_test, y_test)

In [None]:
# Compare OOS accuracy: tuned forest vs. the default forest, both scored on the test split
tuned_accuracy = correct_prediction_rate(rf_tuning.best_estimator_, X_test, y_test)
default_accuracy = correct_prediction_rate(rf_OOS, X_test, y_test)
accuracy_gain = 100 * (tuned_accuracy - default_accuracy)

# Only report an improvement when there is one; tuning can also leave accuracy unchanged
# or lower it on this particular test split.
"{:.3f} percentage point".format(accuracy_gain) + (
    " prediction accuracy improvement achieved!" if accuracy_gain > 0
    else " prediction accuracy change: tuning did not beat the default forest."
)