# Day 09. Exercise 02
# Metrics

## 0. Imports

In [33]:
import pip
def install(package):
    if hasattr(pip, 'main'):
        pip.main(['install', package])
    else:
        pip._internal.main(['install', package])
install('seaborn')

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,5,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1682,6,20,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1683,7,20,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1684,8,20,3,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [3]:
X = df.drop('dayofweek', axis=1)
y = df.dayofweek
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=21)

## 2. SVM

1. Use the best parameters from the previous exercise and train the model of SVM.
2. You need to calculate `accuracy`, `precision`, `recall`, `ROC AUC`.

 - `precision` and `recall` should be calculated for each class (use `average='weighted'`)
 - `ROC AUC` should be calculated for each class against any other class (all possible pairwise combinations) and then weighted average should be applied for the final metric
 - the code in the cell should display the result as below:

```
accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878
```

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}

In [11]:
svc = SVC(kernel='rbf', probability=True, random_state=21, gamma='auto', C=10)
svc.fit(X_train, y_train)
y_predicted = svc.predict(X_test)
y_predicted_proba = svc.predict_proba(X_test)

In [24]:
ac = accuracy_score(y_predicted, y_test)
pred = precision_score(y_test, y_predicted, average='weighted')
recall = recall_score(y_test, y_predicted, average='weighted')
roc = roc_auc_score(y_test, y_predicted_proba, multi_class='ovo', average='weighted')
print(f'\
accuracy is {ac:0.5f}\n\
precision is {pred:0.5f}\n\
recall is {recall:0.5f}\n\
roc_auc is {roc:0.5f}')

accuracy is 0.88757
precision is 0.89267
recall is 0.88757
roc_auc is 0.97878


## 3. Decision tree

1. The same task for decision tree

Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22}

In [26]:
tD = tree.DecisionTreeClassifier(random_state=21, class_weight='balanced', criterion='gini', max_depth=22)
tD.fit(X_train, y_train)
y_predicted = tD.predict(X_test)
y_predicted_proba = tD.predict_proba(X_test)

In [28]:
ac = accuracy_score(y_predicted, y_test)
pred = precision_score(y_test, y_predicted, average='weighted')
recall = recall_score(y_test, y_predicted, average='weighted')
roc = roc_auc_score(y_test, y_predicted_proba, multi_class='ovo', average='weighted')
print(f'\
accuracy is {ac:0.5f}\n\
precision is {pred:0.5f}\n\
recall is {recall:0.5f}\n\
roc_auc is {roc:0.5f}')

accuracy is 0.89053
precision is 0.89262
recall is 0.89053
roc_auc is 0.93664


## 4. Random forest

1. The same task for random forest.

Best params: {'class_weight': None,
 'criterion': 'gini',
 'max_depth': 28,
 'n_estimators': 50}

In [47]:
forest = RandomForestClassifier(random_state=21, criterion='gini', max_depth=28, n_estimators=50)
forest.fit(X_train, y_train)
y_predicted = forest.predict(X_test)
y_predicted_proba = forest.predict_proba(X_test)

In [30]:
ac = accuracy_score(y_predicted, y_test)
pred = precision_score(y_test, y_predicted, average='weighted')
recall = recall_score(y_test, y_predicted, average='weighted')
roc = roc_auc_score(y_test, y_predicted_proba, multi_class='ovo', average='weighted')
print(f'\
accuracy is {ac:0.5f}\n\
precision is {pred:0.5f}\n\
recall is {recall:0.5f}\n\
roc_auc is {roc:0.5f}')

accuracy is 0.92899
precision is 0.93009
recall is 0.92899
roc_auc is 0.99033


## 5. Predictions

1. Choose the best model.
2. Analyze: for which `weekday` your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which `labname` and for which `users`.
3. Save the model.

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
def plot_roc_curve(fper, tper):
    plt.plot(fper, tper, color='red', label='ROC')
    plt.plot([0, 1], [0, 1], color='green', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic Curve')
    plt.legend()
    plt.show()
data_X, cls_lab = make_classification(n_samples=2100, n_classes=2, weights=[1,1], random_state=2)
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=21)
model = RandomForestClassifier()
model.fit(X_train, y_train)
prob = model.predict_proba(X_train)
prob = prob[:, 1]
fper, tper, thresholds = roc_curve(y_train, prob)
plot_roc_curve(fper, tper)

ValueError: multiclass format is not supported

## 6. Function

1. Write a function that takes a list of different models and a corresponding list of parameters (dicts) and returns a dict that contains all the 4 metrics for each model.