# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
enrichment = pd.read_csv('../data/dayofweek.csv')
df['dayofweek'] = enrichment['dayofweek']
df.shape

(1686, 44)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('dayofweek', axis=1), df['dayofweek'], test_size=0.2, random_state=21, stratify=df['dayofweek'])

In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=21, stratify=y_train)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [5]:
def metrics_on_valid_set(model, X_train, y_train, X_valid, y_valid, **kwargs) -> None:
    if kwargs:
        model.set_params(**kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    acc = accuracy_score(y_valid, y_pred)
    prec = precision_score(y_valid, y_pred, average='weighted')
    rcl = recall_score(y_valid, y_pred, average='weighted')

    print(f"accuracy is {acc:.5f}")
    print(f"precision is {prec:.5f}")
    print(f"recall is {rcl:.5f}")

In [6]:
SVM_model = SVC(random_state=21, probability=True, C=10, class_weight=None, gamma='auto', kernel= 'rbf')
metrics_on_valid_set(SVM_model, X_train, y_train, X_valid, y_valid)

accuracy is 0.87778
precision is 0.88162
recall is 0.87778


In [7]:
tree_model = DecisionTreeClassifier(random_state=21, max_depth=21, criterion='gini', class_weight='balanced')
metrics_on_valid_set(tree_model, X_train, y_train, X_valid, y_valid)

accuracy is 0.86667
precision is 0.87170
recall is 0.86667


In [8]:
forest_model = RandomForestClassifier(random_state=21, n_estimators=100, max_depth=24, criterion='entropy', class_weight='balanced')
metrics_on_valid_set(forest_model, X_train, y_train, X_valid, y_valid)

accuracy is 0.89630
precision is 0.89698
recall is 0.89630


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [9]:
clf = VotingClassifier(estimators=[('svc', SVM_model), ('dtc', tree_model), ('rf', forest_model)], voting='hard')
metrics_on_valid_set(clf, X_train, y_train, X_valid, y_valid)

accuracy is 0.90000
precision is 0.89993
recall is 0.90000


In [10]:
clf = VotingClassifier(estimators=[('svc', SVM_model), ('dtc', tree_model), ('rf', forest_model)], voting='soft')
metrics_on_valid_set(clf, X_train, y_train, X_valid, y_valid)

accuracy is 0.88519
precision is 0.88840
recall is 0.88519


In [None]:
best_acc = 0
best_prec = 0
best_weights = ()
best_model = None
param_grid = {
    'weights_combination' : [
    (1,1,1),
    (2,1,1),
    (1,2,1),
    (1,1,2),
    (3,2,1),
    (1,2,3),
    (4,1,4)
    ],
    'voting' : ['soft', 'hard']
}

for weights in param_grid['weights_combination']:
    for voting in param_grid['voting']:
        clf = VotingClassifier(estimators=[('svc', SVM_model), ('dtc', tree_model), ('rf', forest_model)], voting=voting, weights=weights)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_valid)

        acc = accuracy_score(y_valid, y_pred)
        prec = precision_score(y_valid, y_pred, average='weighted')

        if ((acc > best_acc) or (acc==best_acc and prec > best_prec)):
            best_acc = acc
            best_weights = weights
            best_model = clf
            best_params = {
                'weights_combination': weights,
                'voting': voting
            }


print(f"Best Parameters: {best_params}")
print(f"Best accuracy is {best_acc:.5f} with weights {best_weights}")

Best Parameters: {'weights_combination': (4, 1, 4), 'voting': 'soft'}
Best accuracy is 0.91111 with weights (4, 1, 4)


In [12]:
clf = VotingClassifier(estimators=[('svc', SVM_model), ('dtc', tree_model), ('rf', forest_model)], voting='soft', weights=best_weights)
metrics_on_valid_set(clf, X_train, y_train, X_test, y_test)

accuracy is 0.90533
precision is 0.90881
recall is 0.90533


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [13]:
SVM_model = SVC(random_state=21, probability=True, C=10, class_weight=None, gamma='auto', kernel= 'rbf')
clf = BaggingClassifier(SVM_model, n_estimators=20, random_state=21)
metrics_on_valid_set(clf, X_train, y_train, X_valid, y_valid)

accuracy is 0.88519
precision is 0.89258
recall is 0.88519


In [23]:
best_acc = 0
best_n = 0
best_prec = 0
for n in (5,10,20,50,100):
    clf = BaggingClassifier(SVM_model, n_estimators=n, random_state=21)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_valid)
    
    acc = accuracy_score(y_valid, y_pred)
    prec = precision_score(y_valid, y_pred, average='weighted')

    if ((acc > best_acc) or (acc==best_acc and prec > best_prec)):
        best_acc = acc
        best_n = n
        best_prec = prec
    print(f"Accuracy is {acc} and precision is {prec} with {n} estimators")

print(f"Best accuracy is {best_acc:.5f} with {best_n} estimators")

Accuracy is 0.8740740740740741 and precision is 0.8842291933481172 with 5 estimators
Accuracy is 0.8851851851851852 and precision is 0.8942694727247573 with 10 estimators
Accuracy is 0.8851851851851852 and precision is 0.8925755778956984 with 20 estimators
Accuracy is 0.8814814814814815 and precision is 0.8903537908921018 with 50 estimators
Accuracy is 0.8851851851851852 and precision is 0.8939608307494487 with 100 estimators
Best accuracy is 0.88519 with 10 estimators


In [24]:
clf = BaggingClassifier(SVM_model, n_estimators=best_n, random_state=21)
metrics_on_valid_set(clf, X_train, y_train, X_test, y_test)

accuracy is 0.86391
precision is 0.86966
recall is 0.86391


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [16]:
estimators = [
    ('svc', SVM_model),
    ('dtc', tree_model),
    ('rf', forest_model)
]

final_estimator = LogisticRegression(solver='liblinear', random_state=21)

In [17]:
best_params = {}
best_accuracy = 0
best_precision = 0
best_model = None

param_grid = {
    'n_splits': [2, 3, 4, 5, 6, 7],
    'passthrough': [False, True]
}

for n_splits in param_grid['n_splits']:
    for passthrough in param_grid['passthrough']:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
        
        stack_clf = StackingClassifier(
            estimators=estimators,
            final_estimator=final_estimator,
            cv=cv,
            passthrough=passthrough
        )
        
        stack_clf.fit(X_train, y_train)
        y_pred = stack_clf.predict(X_valid)
        
        acc = accuracy_score(y_valid, y_pred)
        prec = precision_score(y_valid, y_pred, average='weighted', zero_division=0)
        rec = recall_score(y_valid, y_pred, average='weighted', zero_division=0)
        

        if (acc > best_accuracy) or (acc == best_accuracy and prec > best_precision):
            best_accuracy = acc
            best_precision = prec
            best_params = {
                'n_splits': n_splits,
                'passthrough': passthrough
            }
            best_model = stack_clf

print(f"Best Parameters: {best_params}")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Best Precision: {best_precision:.4f}")

Best Parameters: {'n_splits': 4, 'passthrough': True}
Best Accuracy: 0.9111
Best Precision: 0.9133


In [18]:
metrics_on_valid_set(best_model, X_train, y_train, X_test, y_test)

accuracy is 0.90533
precision is 0.90844
recall is 0.90533


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [19]:
best_clf = VotingClassifier(estimators=[('svc', SVM_model), ('dtc', tree_model), ('rf', forest_model)], voting='hard', weights=(4, 1, 4))

In [20]:
X_full = df.drop('dayofweek', axis=1)
y_full = df['dayofweek']

X_new_train = pd.concat([X_train, X_valid])
y_new_train = pd.concat([y_train, y_valid])

In [21]:
best_clf.fit(X_new_train, y_new_train)
y_pred = best_clf.predict(X_full)

test_data = X_full.copy()
test_data['true_value'] = y_full
test_data['predicted_value'] = y_pred
test_data['is_error'] = (y_full != y_pred).astype(int)

error_rates_by_weekday = test_data.groupby('true_value')['is_error'].mean() * 100
print("Error Rates by Weekday (%):")
print(error_rates_by_weekday.sort_values(ascending=False))

Error Rates by Weekday (%):
true_value
0    5.147059
4    2.884615
1    2.189781
5    1.476015
2    1.342282
3    1.010101
6    0.561798
Name: is_error, dtype: float64


In [22]:
joblib.dump(best_clf,'best_model_voting_classifier.joblib')

['best_model_voting_classifier.joblib']