##### imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from nose.tools import *
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, fbeta_score, precision_recall_fscore_support

##### random numbers prediction

In [2]:
np.random.seed(1234)

# Model Training and Improvement Lab
## Comparing and selecting models

### 1. Read the data (1 point)
Like in the previous lab, you need to read the Portuguese bank dataset [here](https://archive.ics.uci.edu/ml/machine-learning-databases/00222/). It has been provided for you in the `data` folder.

Read the dataset using `pandas` (you can use the library with the alias `pd`). Save it in the `bank_data` variable.

##### read and visualize the dataSet

In [3]:
bank_data = pd.read_csv('data/bank.csv', sep=';')
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


##### check the shape of the dataSet

In [4]:
bank_data.shape

(4521, 17)

##### check the dtype of the dataSet features

In [5]:
bank_data.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [6]:
# From now on, all test cells might contain hidden tests. If you follow the instructions correctly, 
# your solution will be graded with maximum points
assert_is_not_none(bank_data)

### 2. Preprocess the data (1 point)
Separate explanatory features from labels. Save all features (16 columns total) in the variable `bank_features`. Save the labels (corresponding to the `y` column) in the `bank_labels` variable. Rewrite the labels to be `0` and `1` instead of `no` and `yes`: `bank_labels` should be a numeric column.

##### separate features and labels from the dataSet

In [7]:
bank_features = bank_data.drop('y', axis=1)
bank_labels = bank_data['y']

##### visualize the bank features

In [8]:
bank_features.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown


##### check the bank features shape

In [9]:
bank_features.shape

(4521, 16)

##### visualize the bank labels

In [10]:
bank_labels.head()

0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object

##### check the bank labels shape

In [11]:
bank_labels.shape

(4521,)

##### rewrite the values of bank labels and change dtype from obj to int64

In [12]:
bank_labels.replace({'no': 0, 'yes': 1}, inplace=True)
bank_labels.astype('int64')
bank_labels.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [13]:
assert_is_not_none(bank_features)
assert_is_not_none(bank_labels)

### 3. Get indicator variables (1 point)
Get indicator (dummy) variables for all categorical columns in `bank_features`. Overwrite the `bank_features` variable to store the new data.

##### apply one-hot encoding to all categorical features in bank features

In [14]:
bank_features = pd.get_dummies(bank_features)

##### check the new bank features shape

In [15]:
bank_features.shape

(4521, 51)

In [16]:
assert_equal(bank_features.shape, (4521, 51))

### 4. Split the data (1 point)
Split the data into training and testing set, with 70% of the data for training. Because the output labels are not equaly distributed, use stratification based on the `bank_labels`.

##### split the data into training and test sets

In [17]:
bank_features_train, bank_features_test, bank_labels_train, bank_labels_test = train_test_split(
                                                                                    bank_features, 
                                                                                    bank_labels, 
                                                                                    train_size=0.7, 
                                                                                    stratify=bank_labels
                                                                                )

##### check the training and test sets shape

In [18]:
bank_features_train.shape, bank_features_test.shape, bank_labels_train.shape, bank_labels_test.shape

((3164, 51), (1357, 51), (3164,), (1357,))

In [19]:
assert_is_not_none(bank_features_train)
assert_is_not_none(bank_labels_train)
assert_is_not_none(bank_features_test)
assert_is_not_none(bank_labels_test)

### 5. Train a baseline algorithm (1 point)
Train a logistic regression using the training data. Use 1 000 000 (`1e6`) as the value of C. Score it using the testing data. Save the score in the `baseline_score` variable. You should see a fairly high score.

##### create and train model with train sets

In [20]:
model = LogisticRegression(C=1e6, solver = "liblinear", max_iter=1000)
model.fit(bank_features_train, bank_labels_train)

LogisticRegression(C=1000000.0, max_iter=1000, solver='liblinear')

##### visualize the model coef

In [21]:
model.coef_

array([[-1.63864322e-03,  2.29588383e-06,  1.22201676e-02,
         4.15680271e-03, -8.01365940e-02, -1.85141702e-03,
        -5.91890231e-03,  8.12781329e-02, -2.49197720e-01,
        -5.69474911e-02, -1.00387498e-01, -1.96844870e-02,
         6.76548928e-01, -2.56299599e-01, -1.82452363e-01,
         2.37575617e-01, -9.69323714e-03, -6.80792155e-01,
        -8.49658632e-02,  1.85594222e-01, -4.48076248e-01,
        -3.82535708e-01, -8.29774234e-02, -1.96176383e-02,
         1.85831751e-01, -7.28254424e-01, -6.87328872e-01,
         4.23111368e-02, -1.62511415e-01, -4.82506320e-01,
        -3.82775408e-04, -6.44634960e-01,  3.28837490e-01,
         3.33769973e-01, -1.30762520e+00, -1.80303260e-01,
        -4.95262028e-01, -8.37126115e-02, -1.02147134e-01,
        -1.37073184e+00, -8.53335796e-01,  6.08087292e-01,
         1.00677205e+00, -3.84847095e-01, -9.87551269e-01,
         1.48136005e+00,  7.16653909e-01, -7.13024749e-01,
        -1.83272510e-01,  1.52861544e+00, -1.27733591e+0

##### visualize the model intercept

In [22]:
model.intercept_

array([-0.64501773])

##### check the score with test sets

In [23]:
baseline_score = model.score(bank_features_test, bank_labels_test)
baseline_score

0.899042004421518

In [24]:
assert_is_not_none(model)
assert_greater(baseline_score, 0.7)

### 6. Select a better score (2 points)
As you alrady saw, the positive examples are very few. If you aren't convinced, just check the counts.

We know that the default scoring (accuracy) isn't correct in this case. Better measures would be precision and recall. However, we only want one number. Evaluate the algorithm once again, using a standard scoring method which combines precision and recall. Overwrite the `baseline_score` variable.

Don't forget to score the model on the testing data only.

##### evaluate with standart scoring method which combines precision and recall

In [25]:
bank_labels_predicted = model.predict(bank_features_test)
# baseline_score = precision_recall_fscore_support(bank_labels_test, bank_labels_predicted, average='binary')[2]
# baseline_score = fbeta_score(bank_labels_test, bank_labels_predicted, beta=1)
baseline_score = f1_score(bank_labels_test, bank_labels_predicted)
print(baseline_score)

0.41201716738197425


In [26]:
assert_less(baseline_score, 0.7)

### 7. Tune your model (2 points)
Fine-tune the `C` and `max_iter` parameters.

Use full grid search with the following values:
* `C`: 0.0001, 0.01, 0.1, 1, 10, 100, 10000
* `max_iter`: 50, 100, 300, 1000
* `fit_itercept`: True, False

Save the grid search result in the `grid_search` variable. Don't forget to use the better scoring model that you obtained in the previous task.

In [27]:
params = {
    'C': [0.0001, 0.01, 0.1, 1, 10, 100, 10000],
    'max_iter': [50, 100, 300, 1000],
    'fit_intercept': [True, False],
    'solver': ['liblinear']
}
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=params, scoring='f1')
grid_search.fit(bank_features_train, bank_labels_train)

GridSearchCV(estimator=LogisticRegression(),
             param_grid={'C': [0.0001, 0.01, 0.1, 1, 10, 100, 10000],
                         'fit_intercept': [True, False],
                         'max_iter': [50, 100, 300, 1000],
                         'solver': ['liblinear']},
             scoring='f1')

In [28]:
assert_is_not_none(grid_search)
assert_is_not_none(grid_search.best_estimator_)

### 8. Compare scores (1 point)
Use the best estimator from your grid search. Score it using the function from problem 6. Save your answer in `tuned_score`.

In [29]:
tuned_score = None
# YOUR CODE HERE
raise NotImplementedError()

NotImplementedError: 

In [None]:
print(tuned_score)

In [None]:
print(baseline_score - tuned_score)

Hmmmm, it seems we have not obtained a better algorithm, even the opposite (the difference is marginal and depends on the random initialization of the cross-validation datasets).

We can, of course, do a lot more things to improve our model's performance, such as normalizing the data, feature selection and feature engineering, trying out different aspects, e.g. polynomial terms, RANSAC; even boosting (we'll talk about this later). However, we'll stop at this point.

What can we conclude? It seems that this is close to the best performance we can get out of this algorithm, given these data points.

We can try improving (cleaning) our dataset, selecting features, etc. but we most likely need a better algorithm. In the next labs, we're going to explore that.