In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

# About the Data

The dataset for this challenge has been obtained from the European Social Survey. Our objective for this challenge is to determine what variables we can use to predict if a person has a partner or not, and how significant each variable is for predicting the outcome.

# Exercise

From our initial decision tree, we were able to predict whether someone has a partner or not with an error rate of 6.258% for false positives, and 18.528% for false negatives. The challenge here is to reduce those error rates through modifying the features and the tree.

In [2]:
# Read the ESS practice CSV straight from GitHub and keep only the rows that
# have no missing value in any column.
df = pd.read_csv(
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/ESS_practice_data/ESSdata_Thinkful.csv"
).dropna()

In [3]:
def _error_rates(actual, predicted):
    """Return the (Type I, Type II) error rates as fractions of all rows.

    A Type I error is a row labelled 0 that was predicted as 1; a Type II
    error is a row labelled 1 that was predicted as 0. Counting directly on
    the arrays (instead of indexing a crosstab) keeps this working when one
    of the classes never appears among the labels or the predictions.
    """
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)
    type_i = float(np.mean((actual == 0) & (predicted == 1)))
    type_ii = float(np.mean((actual == 1) & (predicted == 0)))
    return type_i, type_ii


def booster(X, y, iterations, loss, depth, train_fraction=0.8, random_state=None):
    """Fit a gradient-boosted classifier and print its Type I / Type II error rates.

    The rows are split in their existing order (no shuffling): the leading
    ``train_fraction`` of them train the model, the rest are held out as the
    test set. Error rates are printed for both sets; nothing is returned.

    Parameters
    ----------
    X : sliceable 2-D feature container exposing ``.shape`` (e.g. a DataFrame)
    y : sliceable label sequence aligned with ``X``; the error rates are
        computed for the labels 0 and 1.
    iterations : passed as ``n_estimators`` (number of boosting stages).
    loss : passed unchanged as the classifier's ``loss``.
        NOTE(review): scikit-learn renamed 'deviance' to 'log_loss' in 1.1 and
        removed the old name in 1.3 -- confirm against the installed version.
    depth : passed as ``max_depth`` of the individual trees.
    train_fraction : share of the rows used for training (default 0.8, the
        value that was previously hard-coded).
    random_state : forwarded to the classifier; ``None`` keeps its default.

    Raises
    ------
    ValueError
        If ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError('train_fraction must be strictly between 0 and 1')

    # Index of the first held-out row.
    offset = int(X.shape[0] * train_fraction)

    # Leading rows train the model; the remaining rows form the test set.
    X_train, y_train = X[:offset], y[:offset]
    X_test, y_test = X[offset:], y[offset:]

    # Hyper-parameters come from the caller rather than being fixed here.
    params = {'n_estimators': iterations,
              'max_depth': depth,
              'loss': loss,
              'random_state': random_state}

    # Initialize and fit the model.
    clf = ensemble.GradientBoostingClassifier(**params)
    clf.fit(X_train, y_train)

    train_tI_errors, train_tII_errors = _error_rates(y_train, clf.predict(X_train))
    test_tI_errors, test_tII_errors = _error_rates(y_test, clf.predict(X_test))

    print((
        'Training set accuracy:\n'
        'Percent Type I errors: {}\n'
        'Percent Type II errors: {}\n\n'
        'Test set accuracy:\n'
        'Percent Type I errors: {}\n'
        'Percent Type II errors: {}'
    ).format(train_tI_errors, train_tII_errors, test_tI_errors, test_tII_errors))

In [4]:
# Define the outcome: 'partner' shifted down by one.
# Presumably the raw column is coded 1/2, which gives the 0/1 labels that
# booster() reports on -- verify against the data dictionary.
y = df['partner'].sub(1)

# Predictors: every column except 'partner', 'cntry' and 'idno', followed by
# one indicator (dummy) column per distinct value of the categorical 'cntry'.
X1 = pd.concat(
    [
        df.loc[:, [col for col in df.columns if col not in ('partner', 'cntry', 'idno')]],
        pd.get_dummies(df['cntry']),
    ],
    axis=1,
)

# Baseline model.
# NOTE(review): the 'deviance' loss name was renamed 'log_loss' in
# scikit-learn 1.1 and removed in 1.3 -- confirm against the installed version.
booster(X1, y, iterations=500, loss='deviance', depth=2)

Training set accuracy:
Percent Type I errors: 0.04603345097437471
Percent Type II errors: 0.1752340033757864

Test set accuracy:
Percent Type I errors: 0.09693251533742331
Percent Type II errors: 0.1558282208588957


In [5]:
# Baseline settings with the loss switched to 'exponential'.
# In the recorded run the error rates are marginally higher overall than the
# baseline's on both the training and the test set.
booster(X1, y, iterations=500, loss='exponential', depth=2)

Training set accuracy:
Percent Type I errors: 0.0469541199938622
Percent Type II errors: 0.17784256559766765

Test set accuracy:
Percent Type I errors: 0.10429447852760736
Percent Type II errors: 0.15521472392638036


In [6]:
# Baseline settings with twice as many iterations (1000).
# In the recorded run the training errors drop; on the test set Type I errors
# drop but Type II errors rise, so the combined test error is not better.
booster(X1, y, iterations=1000, loss='deviance', depth=2)

Training set accuracy:
Percent Type I errors: 0.04342488875249348
Percent Type II errors: 0.1677152063833052

Test set accuracy:
Percent Type I errors: 0.08282208588957055
Percent Type II errors: 0.17484662576687116


In [7]:
# Baseline settings with far more iterations (10000).
# In the recorded run the training errors drop further, but test Type II
# errors rise well above the baseline's -- the gain is on the training set only.
booster(X1, y, iterations=10000, loss='deviance', depth=2)

Training set accuracy:
Percent Type I errors: 0.03206997084548105
Percent Type II errors: 0.1310418904403867

Test set accuracy:
Percent Type I errors: 0.04171779141104295
Percent Type II errors: 0.22576687116564417


In [8]:
# Baseline settings with the tree depth doubled to 4.
# In the recorded run the training errors drop sharply while the test errors
# stay close to the baseline's.
booster(X1, y, iterations=500, loss='deviance', depth=4)

Training set accuracy:
Percent Type I errors: 0.016572042350774897
Percent Type II errors: 0.11124750652140555

Test set accuracy:
Percent Type I errors: 0.09263803680981596
Percent Type II errors: 0.1588957055214724


In [9]:
# Element-wise transforms of the predictor frame: squares and square roots.
X2 = X1.pow(2)
X3 = np.sqrt(X1)

In [10]:
# NOTE(review): X1 + X2 is an element-wise sum (x + x**2 per cell), not a
# concatenation of the two feature sets -- confirm this is what was intended.
# The recorded output is identical to the baseline run.
booster(X1.add(X2), y, iterations=500, loss='deviance', depth=2)

Training set accuracy:
Percent Type I errors: 0.04603345097437471
Percent Type II errors: 0.1752340033757864

Test set accuracy:
Percent Type I errors: 0.09693251533742331
Percent Type II errors: 0.1558282208588957


In [11]:
# Baseline settings on the square-rooted predictors.
# The recorded output is identical to the baseline run; presumably because the
# transform preserves the ordering of each feature's values -- verify.
booster(X3, y, iterations=500, loss='deviance', depth=2)

Training set accuracy:
Percent Type I errors: 0.04603345097437471
Percent Type II errors: 0.1752340033757864

Test set accuracy:
Percent Type I errors: 0.09693251533742331
Percent Type II errors: 0.1558282208588957


In [12]:
# Re-run of the baseline configuration on the untransformed predictors,
# for side-by-side comparison with the transformed-feature runs.
booster(X1, y, iterations=500, loss='deviance', depth=2)

Training set accuracy:
Percent Type I errors: 0.04603345097437471
Percent Type II errors: 0.1752340033757864

Test set accuracy:
Percent Type I errors: 0.09693251533742331
Percent Type II errors: 0.1558282208588957
