# Day 09. Exercise 02
# Metrics

## 0. Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from tqdm.autonotebook import tqdm
import joblib

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
enrichment = pd.read_csv('../data/dayofweek.csv')
df['dayofweek'] = enrichment['dayofweek']
df.shape

(1686, 44)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('dayofweek', axis=1), df['dayofweek'], test_size=0.2, random_state=21, stratify=df['dayofweek'])

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

In [4]:
def metrics_calculation(model, X_train, X_test, y_train, y_test, **kwargs) -> None:
    if kwargs:
        model.set_params(**kwargs)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)

    classes = model.classes_
    n_classes = len(classes)
    
    # попарная классификация
    roc_auc_scores = []
    class_counts = [] 
    
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            # отбираем значения принадлежащие к одному из классов
            mask = (y_test == classes[i]) | (y_test == classes[j])  #boolean array
            y_true_ij = y_test[mask]
            y_proba_ij = y_pred_proba[mask]
            
            # высчитываем бинарно
            roc_ij = roc_auc_score(
                (y_true_ij == classes[i]).astype(int),
                y_proba_ij[:, i],  # вероятность класса i
            )
            roc_auc_scores.append(roc_ij)
            
            # считаем значения
            count_ij = mask.sum()
            class_counts.append(count_ij)
    
    # вычисляем средневзвешенное
    weighted_roc_auc = np.average(roc_auc_scores, weights=class_counts)

    acc = accuracy_score (y_test, y_pred)
    prec = precision_score (y_test, y_pred, average='weighted')
    rcl = recall_score (y_test, y_pred, average='weighted')

    if acc:
        print(f"accuracy is {acc:.5f}")
        print(f"precision is {prec:.5f}")
        print(f"recall is {rcl:.5f}")
        print(f"roc is {weighted_roc_auc:.5f}")


In [5]:
SVM_model = SVC(random_state=21, probability=True, C=10, class_weight=None, gamma='auto', kernel= 'rbf')

In [6]:
metrics_calculation(SVM_model, X_train, X_test, y_train, y_test)

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc is 0.97965


## 3. Decision tree

1. The same task for decision tree

In [7]:
tree_model = DecisionTreeClassifier(random_state=21, max_depth=21, criterion='gini', class_weight='balanced')

In [8]:
metrics_calculation(tree_model, X_train, X_test, y_train, y_test)

accuracy is 0.88462
precision is 0.88765
recall is 0.88462
roc is 0.93261


## 4. Random forest

1. The same task for random forest.

In [9]:
forest_model = RandomForestClassifier(random_state=21, n_estimators=100, max_depth=24, criterion='entropy', class_weight='balanced')

In [10]:
metrics_calculation(forest_model, X_train, X_test, y_train, y_test)

accuracy is 0.92604
precision is 0.92754
recall is 0.92604
roc is 0.98762


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1686 entries, 0 to 1685
Data columns (total 44 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   numTrials         1686 non-null   int64  
 1   hour              1686 non-null   int64  
 2   uid_user_0        1686 non-null   float64
 3   uid_user_1        1686 non-null   float64
 4   uid_user_10       1686 non-null   float64
 5   uid_user_11       1686 non-null   float64
 6   uid_user_12       1686 non-null   float64
 7   uid_user_13       1686 non-null   float64
 8   uid_user_14       1686 non-null   float64
 9   uid_user_15       1686 non-null   float64
 10  uid_user_16       1686 non-null   float64
 11  uid_user_17       1686 non-null   float64
 12  uid_user_18       1686 non-null   float64
 13  uid_user_19       1686 non-null   float64
 14  uid_user_2        1686 non-null   float64
 15  uid_user_20       1686 non-null   float64
 16  uid_user_21       1686 non-null   float64


In [12]:
total = y_test.sum() # 1135
day_counts = df['dayofweek'].value_counts().sort_index()
day_counts

0    136
1    274
2    149
3    396
4    104
5    271
6    356
Name: dayofweek, dtype: int64

In [13]:
forest_model.fit(X_train, y_train)
y_pred = forest_model.predict(X_test)

test_data = X_test.copy()
test_data['true_label'] = y_test
test_data['predicted_label'] = y_pred
test_data['is_error'] = (y_test != y_pred).astype(int)
#test_data
error_rates_by_weekday = test_data.groupby('true_label')['is_error'].mean() * 100
print("Error Rates by Weekday (%):")
print(error_rates_by_weekday.sort_values(ascending=False))

Error Rates by Weekday (%):
true_label
0    22.222222
4    14.285714
5     9.259259
1     7.272727
2     6.666667
3     3.750000
6     2.816901
Name: is_error, dtype: float64


In [14]:
joblib.dump(forest_model, 'best_model_random_forest.joblib')

['best_model_random_forest.joblib']

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.

In [15]:
def get_all_metrics(model_list, X_train, y_train, X_test, y_test, param_dict=None) -> dict:
    results = {}
    for model in model_list:
        model_name = model.__class__.__name__
        if param_dict:
            model.set_params(**param_dict)

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)
        metrics = {}

        classes = model.classes_
        n_classes = len(classes)

        # попарная классификация
        roc_auc_scores = []
        class_counts = [] 

        for i in range(n_classes):
            for j in range(i + 1, n_classes):
                # отбираем значения принадлежащие к одному из классов
                mask = (y_test == classes[i]) | (y_test == classes[j])  #boolean array
                y_true_ij = y_test[mask]
                y_proba_ij = y_pred_proba[mask]

                # высчитываем бинарно
                roc_ij = roc_auc_score(
                    (y_true_ij == classes[i]).astype(int),
                    y_proba_ij[:, i],  # вероятность класса i
                )
                roc_auc_scores.append(roc_ij)

                # считаем значения
                count_ij = mask.sum()
                class_counts.append(count_ij)

        # вычисляем средневзвешенное
        weighted_roc_auc = np.average(roc_auc_scores, weights=class_counts)

        acc = accuracy_score (y_test, y_pred)
        prec = precision_score (y_test, y_pred, average='weighted', zero_division=0)
        rcl = recall_score (y_test, y_pred, average='weighted')

        if acc:
            metrics['accuracy:'] = round(acc, 5)
            metrics['precision:'] = round(prec, 5)
            metrics ['recall:'] = round(rcl, 5)
            metrics['roc_auc:'] = round(weighted_roc_auc,5)

        results[model_name] = metrics
    return results

In [16]:
models = [
    forest_model,
    SVM_model
]

metrics = get_all_metrics(
    model_list=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

In [17]:
metrics

{'RandomForestClassifier': {'accuracy:': 0.92604,
  'precision:': 0.92754,
  'recall:': 0.92604,
  'roc_auc:': 0.98762},
 'SVC': {'accuracy:': 0.88757,
  'precision:': 0.89267,
  'recall:': 0.88757,
  'roc_auc:': 0.97965}}