# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import VotingClassifier, BaggingClassifier, StackingClassifier

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
# Feature table (not scaled) produced by the previous exercise.
df = pd.read_csv(
    '../data/day-of-week-not-scaled.csv',
)
# The target lives in a separate file; only its `dayofweek` column is read.
df_dayofweek = pd.read_csv(
    '../data/dayofweek.csv',
    usecols=['dayofweek'],
)

In [3]:
# Attach the target as a new `dayofweek` column of the feature table.
# NOTE(review): the assignment aligns on the row index, so this presumably relies on
# both CSV files listing the same rows in the same order — confirm.
df['dayofweek'] = df_dayofweek
# Quick visual check of the combined frame.
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,...,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [4]:
# Target: the weekday column; features: every other column of `df`.
y = df['dayofweek']
X = df.drop(columns=['dayofweek'])

In [5]:
# Hold out 20% of the rows as the test set; `stratify=y` keeps the weekday
# proportions equal in both parts and the fixed seed makes the split repeatable.
# NOTE(review): the task text also asks for an X_valid / y_valid split carved out
# of X_train. This notebook never creates one, so every score computed in the
# later cells is measured on X_test / y_test — confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=21,
)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [6]:
def metrics_models(models, params):
    """Fit each classifier and print its accuracy, precision and recall.

    Every class in ``models`` is instantiated with the keyword arguments found
    at the same position in ``params``, fitted on the notebook-level
    ``X_train`` / ``y_train`` and scored on ``X_test`` / ``y_test``.
    Precision and recall are computed with ``average="weighted"``.

    Parameters
    ----------
    models : sequence
        Estimator classes (not instances).
    params : sequence of dict
        Constructor keyword arguments, one dict per entry of ``models``.

    Returns
    -------
    None
        Results are printed. If the two sequences differ in length, a message
        is printed and nothing is trained.
    """
    if len(models) != len(params):
        print('The number of models must match the number of parameters')
        return

    for model, param in zip(models, params):
        m = model(**param)
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        print(f'{model}')
        # Double quotes inside the single-quoted f-strings: reusing the outer
        # quote character in a replacement field only parses on Python >= 3.12.
        print(f'accuracy is {accuracy_score(y_test, y_pred):.5f}')
        print(f'precision is {precision_score(y_test, y_pred, average="weighted"):.5f}')
        print(f'recall is {recall_score(y_test, y_pred, average="weighted"):.5f}')
        print('')

In [7]:
# Estimator classes paired position-by-position with their constructor arguments.
models = [SVC, DecisionTreeClassifier, RandomForestClassifier]
params_models = [
    # SVC; probability=True is set so the model can later take part in soft voting.
    {
        'C': 10,
        'class_weight': None,
        'gamma': 'auto',
        'kernel': 'rbf',
        'random_state': 21,
        'probability': True,
    },
    # Decision tree
    {
        'class_weight': 'balanced',
        'criterion': 'gini',
        'max_depth': 22,
        'random_state': 21,
    },
    # Random forest
    {
        'class_weight': None,
        'criterion': 'gini',
        'max_depth': 28,
        'n_estimators': 50,
        'random_state': 21,
    },
]
metrics_models(models, params_models)

<class 'sklearn.svm._classes.SVC'>
accureacy is 0.88757
precision is 0.89267
recall is 0.88757

<class 'sklearn.tree._classes.DecisionTreeClassifier'>
accureacy is 0.89053
precision is 0.89262
recall is 0.89053

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
accureacy is 0.92899
precision is 0.93009
recall is 0.92899



## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [8]:
# Unfitted base estimators shared by the ensembles below, built from the same
# keyword arguments that were used for the individual classifiers.
svc_kwargs, tree_kwargs, rf_kwargs = params_models
svc = SVC(**svc_kwargs)
tree = DecisionTreeClassifier(**tree_kwargs)
rf = RandomForestClassifier(**rf_kwargs)

In [9]:
# Voting ensemble of the three base models with the library's default settings.
voting_clf = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('tree', tree),
        ('rf', rf),
    ],
)
y_pred = voting_clf.fit(X_train, y_train).predict(X_test)

In [10]:
def metrics(y_true=None, y_hat=None):
    """Print accuracy, weighted precision and weighted recall.

    Parameters
    ----------
    y_true : array-like, optional
        Ground-truth labels. Defaults to the notebook-level ``y_test``.
    y_hat : array-like, optional
        Predicted labels. Defaults to the notebook-level ``y_pred``.

    Returns
    -------
    None
        The three scores are printed with five decimals.
    """
    # Fall back to the notebook globals so existing `metrics()` calls keep working.
    if y_true is None:
        y_true = y_test
    if y_hat is None:
        y_hat = y_pred
    # Double quotes inside the single-quoted f-strings: reusing the outer quote
    # character in a replacement field only parses on Python >= 3.12.
    print(f'accuracy is {accuracy_score(y_true, y_hat):.5f}')
    print(f'precision is {precision_score(y_true, y_hat, average="weighted"):.5f}')
    print(f'recall is {recall_score(y_true, y_hat, average="weighted"):.5f}')

In [11]:
# Scores of the voting ensemble's predictions: metrics() compares the current
# y_pred with y_test.
metrics()

accuracy is 0.92308
precision is 0.92399
recall is 0.92308


In [12]:
# Search space for the voting ensemble: voting mode and per-model weights,
# the latter given in the order (svc, tree, rf).
params_voting = {
    'voting': ['hard', 'soft'],
    'weights': [
        [1, 2, 3],
        [2, 1, 3],
        [2, 3, 1],
        [3, 2, 1],
        [1, 3, 2],
    ],
}

In [13]:
# Cross-validated grid search over the voting mode and weights, fitted on the
# training split; the cell displays the best estimator found.
clf = GridSearchCV(estimator=voting_clf, param_grid=params_voting)
clf.fit(X_train, y_train)
clf.best_estimator_

In [14]:
# Rebuild the voting ensemble with soft voting and weights [2, 1, 3]
# (order: svc, tree, rf), then predict the test split.
voting_clf = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('tree', tree),
        ('rf', rf),
    ],
    voting='soft',
    weights=[2, 1, 3],
)
y_pred = voting_clf.fit(X_train, y_train).predict(X_test)

In [16]:
# Scores of the weighted soft-voting ensemble: metrics() compares the current
# y_pred with y_test.
metrics()

accuracy is 0.92308
precision is 0.92518
recall is 0.92308


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [17]:
# Bagging ensemble built on the SVC base model, plus the ensemble sizes to try.
bagging = BaggingClassifier(
    estimator=svc,
    random_state=21,
)
params_bagging = {'n_estimators': (10, 50, 100)}

In [18]:
# Cross-validated grid search over the number of bagged estimators, fitted on
# the training split; the cell displays the best estimator found.
clf = GridSearchCV(estimator=bagging, param_grid=params_bagging)
clf.fit(X_train, y_train)
clf.best_estimator_

In [19]:
# Refit the bagging ensemble with 50 SVC estimators and predict the test split.
bagging = BaggingClassifier(
    estimator=svc,
    n_estimators=50,
    random_state=21,
)
y_pred = bagging.fit(X_train, y_train).predict(X_test)

In [20]:
# Scores of the 50-estimator bagging ensemble: metrics() compares the current
# y_pred with y_test.
metrics()

accuracy is 0.90828
precision is 0.91091
recall is 0.90828


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set. Try different values of `n_splits` (`[2, 3, 4, 5, 6, 7]`) in the cross-validation generator and of the `passthrough` parameter in the classifier itself.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [21]:
# Manual grid over the number of CV folds (2..7) and the `passthrough` flag of
# the stacking classifier. One record per combination is collected in `results`.
# NOTE(review): each combination is scored on X_test / y_test, so the
# configuration picked from `results` is selected on the same data that is
# later used to report its quality — confirm this is acceptable.
results = []
base_estimators = [('svc', svc), ('tree', tree), ('rf', rf)]

for n in range(2, 8):
    for passthrough in (True, False):
        # Shuffled stratified folds with a fixed seed keep the run reproducible.
        skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=21)
        stack = StackingClassifier(
            estimators=base_estimators,
            final_estimator=LogisticRegression(solver='liblinear'),
            cv=skf,
            passthrough=passthrough,
        )
        y_pred = stack.fit(X_train, y_train).predict(X_test)

        results.append(dict(
            n_splits=n,
            passthrough=passthrough,
            accuracy=accuracy_score(y_test, y_pred),
            precision=precision_score(y_test, y_pred, average='weighted'),
            recall=recall_score(y_test, y_pred, average='weighted'),
        ))

In [22]:
# Rank the stacking configurations: accuracy first, then precision and recall
# as tie-breakers, all descending. The top row is the chosen configuration.
results_df = (
    pd.DataFrame(results)
    .sort_values(by=['accuracy', 'precision', 'recall'], ascending=False)
)
best_params = results_df.iloc[0]
best_params

n_splits              3
passthrough        True
accuracy       0.934911
precision      0.937109
recall         0.934911
Name: 2, dtype: object

In [23]:
# Show the whole ranked grid so that near-ties between configurations are visible.
results_df

Unnamed: 0,n_splits,passthrough,accuracy,precision,recall
2,3,True,0.934911,0.937109,0.934911
0,2,True,0.931953,0.934093,0.931953
1,2,False,0.931953,0.933659,0.931953
6,5,True,0.928994,0.931517,0.928994
9,6,False,0.928994,0.931282,0.928994
3,3,False,0.928994,0.930887,0.928994
10,7,True,0.928994,0.930765,0.928994
11,7,False,0.926036,0.92781,0.926036
5,4,False,0.926036,0.927295,0.926036
4,4,True,0.926036,0.927091,0.926036


In [24]:
# Final stacking model: 3 shuffled stratified folds with passthrough enabled.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=21)
best_model = StackingClassifier(
    estimators=[
        ('svc', svc),
        ('tree', tree),
        ('rf', rf),
    ],
    final_estimator=LogisticRegression(solver='liblinear'),
    passthrough=True,
    cv=skf,
)

y_pred = best_model.fit(X_train, y_train).predict(X_test)

# Report the test-split scores in the format required by the exercise.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
print(f"accuracy is {acc:.5f}")
print(f"precision is {prec:.5f}")
print(f"recall is {rec:.5f}")

accuracy is 0.93491
precision is 0.93711
recall is 0.93491


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [25]:
# Confusion matrix of the current y_pred against y_test. When the cells are run
# in order, y_pred holds the predictions of best_model (the stacking classifier).
cm = confusion_matrix(y_test, y_pred)

In [26]:
# Per-class error rate: row totals of the confusion matrix minus its diagonal
# (the correctly classified samples), as a percentage of the row total.
# NOTE(review): "Class i" is the row position in `cm`; it presumably equals the
# weekday number only if every weekday 0..6 occurs in the data — confirm. The
# percentages are relative to the test split behind `cm`, whereas the task
# text asks for the share of each class in the full dataset.
class_sum = cm.sum(axis=1)
errors_class = class_sum - cm.diagonal()
error_rates = errors_class / class_sum * 100
for position in range(len(error_rates)):
    print(f'Class {position}: count errors = {error_rates[position]:.1f}%')

Class 0: count errors = 18.5%
Class 1: count errors = 9.1%
Class 2: count errors = 6.7%
Class 3: count errors = 3.8%
Class 4: count errors = 9.5%
Class 5: count errors = 7.4%
Class 6: count errors = 1.4%


In [27]:
def errors_feature(feature):
    """Find the column of a feature group with the most misclassified rows.

    Selects the columns of the notebook-level ``X_test`` whose name matches
    ``feature``, keeps only the rows where ``y_pred`` differs from ``y_test``
    and sums each selected column over those rows.

    Parameters
    ----------
    feature : str
        Pattern passed to ``str.contains`` on the column names,
        e.g. ``'labname'`` or ``'uid'``.

    Returns
    -------
    tuple
        ``(column_name, total)`` for the column with the largest sum.
    """
    matching_cols = X_test.columns[X_test.columns.str.contains(feature)]
    misclassified = y_pred != y_test
    error_counts = (
        X_test[matching_cols][misclassified]
        .sum()
        .sort_values(ascending=False)
    )
    return error_counts.index[0], error_counts.iloc[0]

In [28]:
# Lab and user whose one-hot columns sum highest over the misclassified test rows.
errors_feature('labname'), errors_feature('uid')

(('labname_project1', 9.0), ('uid_user_19', 4.0))

In [29]:
# Persist the fitted stacking classifier next to the data files.
joblib.dump(best_model, '../data/model_ex03.pkl')

['../data/model_ex03.pkl']