In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nose.tools import *

np.random.seed(24680)

Write your imports in the cell below.

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import f1_score

# Ensemble Models and Support Vector Machines Lab
## Training and comparing different algorithms

Once again, we'll work with the bank dataset. This time, the data preprocessing steps have been done for you.

The goal is to try and improve our predictions (if they can be improved at all) using different types of algorithms.

### 1. Read the data (1 point)
This time you only need to read the data. The indicator variables have been separated out for you.

Read the dataset and save it in the variable `bank_data`. The target column is `y`. Use the variables `bank_attributes` and `bank_labels` to save the attributes (explanatory variables, features, predictors), and labels (`y`).

##### read and split the data set to attr and labels

In [4]:
bank_data = pd.read_csv('data/bank.csv')
bank_attributes = bank_data.drop('y', axis=1)
bank_labels = bank_data['y']

##### check the data set

In [5]:
bank_data.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
0,30,1787,19,79,1,-1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,33,4789,11,220,1,339,4,0,0,0,...,0,1,0,0,0,1,0,0,0,0
2,35,1350,16,185,1,330,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,30,1476,3,199,4,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,59,0,5,226,1,-1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0


In [6]:
bank_data.shape

(4521, 52)

In [7]:
bank_data.dtypes

age                    int64
balance                int64
day                    int64
duration               int64
campaign               int64
pdays                  int64
previous               int64
job_admin.             int64
job_blue-collar        int64
job_entrepreneur       int64
job_housemaid          int64
job_management         int64
job_retired            int64
job_self-employed      int64
job_services           int64
job_student            int64
job_technician         int64
job_unemployed         int64
job_unknown            int64
marital_divorced       int64
marital_married        int64
marital_single         int64
education_primary      int64
education_secondary    int64
education_tertiary     int64
education_unknown      int64
default_no             int64
default_yes            int64
housing_no             int64
housing_yes            int64
loan_no                int64
loan_yes               int64
contact_cellular       int64
contact_telephone      int64
contact_unknow

##### check the data attr

In [8]:
bank_attributes.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,19,79,1,-1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,33,4789,11,220,1,339,4,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,35,1350,16,185,1,330,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,30,1476,3,199,4,-1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
4,59,0,5,226,1,-1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


In [9]:
bank_attributes.shape

(4521, 51)

##### check the data labels

In [10]:
bank_labels

0       0
1       0
2       0
3       0
4       0
       ..
4516    0
4517    0
4518    0
4519    0
4520    0
Name: y, Length: 4521, dtype: int64

In [11]:
bank_labels.shape

(4521,)

In [12]:
assert_is_not_none(bank_data)
assert_is_not_none(bank_attributes)
assert_is_not_none(bank_labels)

### 2. Normalize the data (1 point)
Because both forests and SVMs are sensitive to non-scaled data, we need to normalize our dataset first.

Rescale all columns in `bank_attributes` so they have mean = 0 and variance = 1. You can either look at the `sklearn` docs or do this yourself. When you're ready, overwrite the `bank_attributes` column. Make sure that you don't lose the column names in the process.

##### create a var with normalized vals from data attr

In [13]:
scaler = StandardScaler(with_mean=False)
# scaler = StandardScaler()
# scaler = MinMaxScaler((-1, 1))
bank_attributes_scaled = scaler.fit_transform(bank_attributes)
bank_attributes_scaled

array([[2.83686838, 0.59382477, 2.30393648, ..., 0.        , 0.        ,
        2.60013155],
       [3.12055521, 1.5913972 , 1.33385796, ..., 0.        , 0.        ,
        0.        ],
       [3.30967977, 0.44860853, 1.94015704, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [5.39004991, 0.09802927, 2.30393648, ..., 0.        , 0.        ,
        2.60013155],
       [2.64774382, 0.37782807, 0.72755889, ..., 4.89844855, 0.        ,
        0.        ],
       [4.16074028, 0.37749577, 0.36377944, ..., 4.89844855, 0.        ,
        0.        ]])

##### overwrite the data attr with the same columns name

In [14]:
bank_attributes = pd.DataFrame(
    bank_attributes_scaled, 
    columns=bank_attributes.columns,
)
bank_attributes.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,2.836868,0.593825,2.303936,0.304047,0.321599,-0.009989,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,7.584887,0.0,0.0,0.0,0.0,2.600132
1,3.120555,1.591397,1.333858,0.846714,0.321599,3.386273,2.362147,0.0,0.0,0.0,...,0.0,0.0,2.163689,0.0,0.0,0.0,3.216845,0.0,0.0,0.0
2,3.30968,0.448609,1.940157,0.71201,0.321599,3.296372,0.590537,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.216845,0.0,0.0,0.0
3,2.836868,0.490479,0.363779,0.765892,1.286396,-0.009989,0.0,0.0,0.0,0.0,...,3.105995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.600132
4,5.579174,0.0,0.606299,0.869807,0.321599,-0.009989,0.0,0.0,2.458391,0.0,...,0.0,0.0,2.163689,0.0,0.0,0.0,0.0,0.0,0.0,2.600132


In [15]:
assert_is_not_none(bank_attributes)

### 3. Split the data (1 point)
Use the standard 70% / 30% split. Since this is a classification problem, be sure to stratify the split according to the `bank_labels`.

In [16]:
bank_attributes_train, bank_attributes_test, bank_labels_train, bank_labels_test = train_test_split(
                                                                                        bank_attributes_scaled,
                                                                                        bank_labels,
                                                                                        train_size=0.7,
                                                                                        stratify=bank_labels,
                                                                                    )

##### check the splited data shapes

In [17]:
bank_attributes_train.shape, bank_attributes_test.shape

((3164, 51), (1357, 51))

In [18]:
bank_labels_train.shape, bank_labels_test.shape

((3164,), (1357,))

In [19]:
assert_is_not_none(bank_attributes_train)
assert_is_not_none(bank_labels_train)

assert_is_not_none(bank_attributes_test)
assert_is_not_none(bank_labels_test)

### 4. Prepare the cross-validation folds (1 point)
Use a stratified k-fold cross-validation split, with $k = 5$. Fit it to the train data. Save the trained cross-validator to the variable `k_fold`.

The data should already be shuffled. There's no need to shuffle it again.

##### create stratified k-fold cross-validation generator witn n_splits=5

In [20]:
k_fold = StratifiedKFold(n_splits=5)

In [21]:
assert_is_not_none(k_fold)

### 5. Decision Tree (2 points)
Use cross-validation to train and optimize the hyperparameters for a decision tree classifier.

Use grid search with the following grid:
* `max_depth`: 1, 5, 7, 15, 20
* `min_samples_leaf`: 2, 5, 10, 12
* `max_leaf_nodes`: 5, 10, 20

Use the most appropriate scoring metric (remember that accuracy doesn't work in this case because the data is highly imbalanced; we need something which combines precision and recall). Use the cross-validation splits you just created.

Save the grid results in `grid_search`. Save the best classifier in `tree_classifier`.

Optionally, you can print and / or visualize the cross-validation results and the best chosen parameters.

##### create decision tree model and use cross-validation to fit it on the train set with params (hyperparameters) as per assignment
##### and
##### set the best estimator to tree_classifier

In [22]:
param_grid = {
    'max_depth': [1, 5, 7, 15, 20],
    'min_samples_leaf': [2, 5, 10, 12],
    'max_leaf_nodes': [5, 10, 20],
}
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=param_grid,
    cv=k_fold,
    scoring='f1',
)
grid_search.fit(
    bank_attributes_train, 
    bank_labels_train,
)
tree_classifier = grid_search.best_estimator_
print("Decision tree; best score:", grid_search.best_score_)

Decision tree; best score: 0.5090152112059995


In [23]:
assert_is_not_none(grid_search)
assert_is_not_none(tree_classifier)

##### create random forest classifier model and use cross-validation to fit it on the train set with params (hyperparameters) as per assignment
##### and
##### set the best estimator to the forest_classifier

### 5. Random Forest (1 point)
Use cross-validation to train and optimize the hyperparameters for a random forest classifier. Use the same technique as before.

Use the following grid:
* `n_estimators`: 100, 200, 300 
* `max_depth`: 20, 50, 100

Note that this grid is on the small side but this is mainly due to performance reasons. Also note that the training will take some time.

Save the grid results in `grid_search`. Save the best classifier in `forest_classifier`.

Optionally, you can print and / or visualize the cross-validation results and the best chosen parameters.

Due to the relatively slow training, we've chosen low values for the parameters. The performance of the random forest will be worse than the decision tree. This is not necessarily the case in general, it's due to the parameters we've chosen to try.

In [24]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [20, 50, 100],
}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=k_fold,
    scoring='f1',
)
grid_search.fit(
    bank_attributes_train, 
    bank_labels_train,
)
forest_classifier = grid_search.best_estimator_
print("Random forest; best score:", grid_search.best_score_)

Random forest; best score: 0.3897323976283862


In [25]:
assert_is_not_none(grid_search)
assert_is_not_none(forest_classifier)

##### create linear svc model and use cross-validation to fit it on the train set with params (hyperparameters) as per assignment
##### and
##### set the best estimator to the linear_svm_classifier

### 6. Linear SVM (1 point)
Use cross-validation to train and optimize the hyperparameters for a linear support vector machine. Use the same technique as before.

Use the following grid:
* `C`: 0.1, 0.5, 0.8, 1, 1.5, 2, 6, 10, 15, 20

Note that we're choosing relatively small values for `C`. This is allowed because our data is normalized.

Save the grid results in `grid_search`. Save the best classifier in `linear_svm_classifier`. There are many ways to create a linear SVM classifier. Look up the `sklearn` docs to choose the fastest one (in terms of performance).

Optionally, you can print and / or visualize the cross-validation results and the best chosen parameters.

In [26]:
param_grid = {
    'C': [0.1, 0.5, 0.8, 1, 1.5, 2, 6, 10, 15, 20],
}
grid_search = GridSearchCV(
    estimator=LinearSVC(dual=False),
    param_grid=param_grid,
    cv=k_fold,
    scoring='f1',
)
grid_search.fit(bank_attributes_train, bank_labels_train)
linear_svm_classifier = grid_search.best_estimator_
print("Linear SVM; best score:", grid_search.best_score_)

Linear SVM; best score: 0.38163164713187037


In [27]:
assert_is_not_none(grid_search)
assert_is_not_none(linear_svm_classifier)

##### create gaussian svc model and use cross-validation to fit it on the train set with params (hyperparameters) as per assignment
##### and
##### set the best estimator to the gaussian_svm_classifier

### 7. Gaussian SVM (1 point)
Use cross-validation to train and optimize the hyperparameters for an SVM with a Gaussian kernel. Use the same technique as before.

Use the following grid:
* `C`: 10, 15, 20, 50, 200
* `gamma`: 0.001, 0.01, 0.1, 0.2

Note that this time we give larger values of `C` because the governing parameter here is `gamma`.

Save the grid results in `grid_search`. Save the best classifier in `gaussian_svm_classifier`.

Optionally, you can print and / or visualize the cross-validation results and the best chosen parameters.

In [28]:
param_grid = {
    'C': [10, 15, 20, 50, 200],
    'gamma': [0.001, 0.01, 0.1, 0.2],
}
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=k_fold,
    scoring='f1',
)
grid_search.fit(bank_attributes_train, bank_labels_train)
gaussian_svm_classifier = grid_search.best_estimator_
print("Gaussian SVM; best score:", grid_search.best_score_)

Gaussian SVM; best score: 0.45760736147055053


In [29]:
assert_is_not_none(grid_search)
assert_is_not_none(gaussian_svm_classifier)

### 9. Compare performance on the testing data (1 point)
Now that you've trained all your models, you've got to select the best one. This should be done on the testing data.

Use the appropriate scoring metric to get the testing scores for all your models. Don't forget to pass the **testing**, not the training data. Save all scores.

Choose the best classifier, based on these scores (the one with the highest test score). Of course, this is not enough. We need to look at ROC curves, track performance through other measures, debug the sources of variance in testing results, try more hyperparameters, etc. However, this is enough for an introductory lab :).

Optionally, you can think of combining them into a boosted model but this is out of the scope of this lab.

##### comapre best scores with F1 scoring metric and see which is the best

In [30]:
tree_classifier_predict = tree_classifier.predict(bank_attributes_test)
tree_classifier_score = f1_score(bank_labels_test, tree_classifier_predict)

forest_classifier_predict = forest_classifier.predict(bank_attributes_test)
forest_classifier_score = f1_score(bank_labels_test, forest_classifier_predict)

linear_svm_classifier_predict = linear_svm_classifier.predict(bank_attributes_test)
linear_svm_classifier_score = f1_score(bank_labels_test, linear_svm_classifier_predict)

gaussian_svm_classifier_predict = gaussian_svm_classifier.predict(bank_attributes_test)
gaussian_svm_classifier_score = f1_score(bank_labels_test, gaussian_svm_classifier_predict)

print("Testing scores:")
print("Decision tree:", tree_classifier_score)
print("Random forest:", forest_classifier_score)
print("Linear SVM:", linear_svm_classifier_score)
print("Gaussian SVM:", gaussian_svm_classifier_score)

Testing scores:
Decision tree: 0.46212121212121204
Random forest: 0.3518518518518518
Linear SVM: 0.4035087719298245
Gaussian SVM: 0.4386617100371747


In [31]:
best_classifier = "tree"

In [32]:
assert_not_equal(best_classifier, "")