# Day 09. Exercise 00
# Regularization

## 0. Imports

In [2]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [3]:
df = pd.read_csv('../data/dayofweek.csv')

In [4]:
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,-0.533442,0.945382,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,-0.629151,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,-0.597248,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,-0.565345,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [5]:
X = df.drop(columns=['dayofweek'])
y = df['dayofweek']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

In [7]:
np.bincount(y_train) / np.bincount(y) * 100, np.bincount(y_test) / np.bincount(y) * 100

(array([80.14705882, 79.9270073 , 79.86577181, 79.7979798 , 79.80769231,
        80.07380074, 80.05617978]),
 array([19.85294118, 20.0729927 , 20.13422819, 20.2020202 , 20.19230769,
        19.92619926, 19.94382022]))

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [8]:
logreg = LogisticRegression(random_state=21, fit_intercept=False)

In [9]:
def crossval(n_splits, X, y, model):
    skf = StratifiedKFold(n_splits=n_splits)
    valid_scores = []
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        model.fit(X_train, y_train)

        train_score = model.score(X_train, y_train)
        valid_score = model.score(X_valid, y_valid)
        valid_scores.append(valid_score)
        
        print(f'train - {train_score:.5f} | valid - {valid_score:.5f}')
        mean_valid = np.mean(valid_scores)
        
    print(f'Average accurecy on crossval is {mean_valid:.5f}')
    print(f'Std if {np.std(valid_scores):.5f}')

In [10]:
%%time
crossval(10, X_train, y_train, logreg)

train - 0.62819 | valid - 0.59259
train - 0.64716 | valid - 0.62963
train - 0.63479 | valid - 0.57037
train - 0.65540 | valid - 0.61481
train - 0.63314 | valid - 0.57778
train - 0.64056 | valid - 0.59259
train - 0.64221 | valid - 0.65926
train - 0.65952 | valid - 0.56296
train - 0.64333 | valid - 0.59701
train - 0.63591 | valid - 0.62687
Average accurecy on crossval is 0.60239
Std if 0.02852
CPU times: user 1.18 s, sys: 190 ms, total: 1.37 s
Wall time: 368 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [11]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, solver='newton-cg')
crossval(10, X_train, y_train, logreg)

train - 0.66694 | valid - 0.63704
train - 0.65787 | valid - 0.65926
train - 0.66612 | valid - 0.57778
train - 0.66529 | valid - 0.62963
train - 0.66694 | valid - 0.62222
train - 0.65952 | valid - 0.57778
train - 0.65045 | valid - 0.69630
train - 0.68425 | valid - 0.61481
train - 0.66474 | valid - 0.62687
train - 0.65651 | valid - 0.60448
Average accurecy on crossval is 0.62462
Std if 0.03379
CPU times: user 3.07 s, sys: 271 ms, total: 3.34 s
Wall time: 839 ms


In [12]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False, penalty='l1', solver='liblinear')
crossval(10, X_train, y_train, logreg)

train - 0.61830 | valid - 0.54815
train - 0.62737 | valid - 0.62222
train - 0.60511 | valid - 0.54074
train - 0.63644 | valid - 0.62222
train - 0.62407 | valid - 0.55556
train - 0.62325 | valid - 0.58519
train - 0.61253 | valid - 0.63704
train - 0.64716 | valid - 0.58519
train - 0.63015 | valid - 0.59701
train - 0.61367 | valid - 0.59701
Average accurecy on crossval is 0.58903
Std if 0.03129
CPU times: user 802 ms, sys: 105 ms, total: 907 ms
Wall time: 233 ms


In [13]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False, penalty='l2')
crossval(10, X_train, y_train, logreg)

train - 0.62819 | valid - 0.59259
train - 0.64716 | valid - 0.62963
train - 0.63479 | valid - 0.57037
train - 0.65540 | valid - 0.61481
train - 0.63314 | valid - 0.57778
train - 0.64056 | valid - 0.59259
train - 0.64221 | valid - 0.65926
train - 0.65952 | valid - 0.56296
train - 0.64333 | valid - 0.59701
train - 0.63591 | valid - 0.62687
Average accurecy on crossval is 0.60239
Std if 0.02852
CPU times: user 1.26 s, sys: 168 ms, total: 1.43 s
Wall time: 359 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [14]:
%%time
svc = SVC(probability=True, kernel='linear', random_state=21)
crossval(10, X_train, y_train, svc)

train - 0.70486 | valid - 0.65926
train - 0.69662 | valid - 0.75556
train - 0.69415 | valid - 0.62222
train - 0.70239 | valid - 0.65185
train - 0.69085 | valid - 0.65185
train - 0.68920 | valid - 0.64444
train - 0.69250 | valid - 0.72593
train - 0.70074 | valid - 0.62222
train - 0.69605 | valid - 0.61940
train - 0.71087 | valid - 0.63433
Average accurecy on crossval is 0.65871
Std if 0.04359
CPU times: user 5.4 s, sys: 21.1 ms, total: 5.42 s
Wall time: 5.43 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [15]:
%%time
svc = SVC(probability=True, kernel='linear', random_state=21, C=0.01)
crossval(10, X_train, y_train, svc)

train - 0.37923 | valid - 0.40000
train - 0.37923 | valid - 0.40000
train - 0.38417 | valid - 0.35556
train - 0.35449 | valid - 0.36296
train - 0.38252 | valid - 0.37037
train - 0.38087 | valid - 0.38519
train - 0.37923 | valid - 0.40000
train - 0.38252 | valid - 0.37037
train - 0.38468 | valid - 0.35075
train - 0.38386 | valid - 0.35821
Average accurecy on crossval is 0.37534
Std if 0.01848
CPU times: user 6.46 s, sys: 0 ns, total: 6.46 s
Wall time: 6.48 s


In [16]:
%%time
svc = SVC(probability=True, kernel='linear', random_state=21, C=1)
crossval(10, X_train, y_train, svc)

train - 0.70486 | valid - 0.65926
train - 0.69662 | valid - 0.75556
train - 0.69415 | valid - 0.62222
train - 0.70239 | valid - 0.65185
train - 0.69085 | valid - 0.65185
train - 0.68920 | valid - 0.64444
train - 0.69250 | valid - 0.72593
train - 0.70074 | valid - 0.62222
train - 0.69605 | valid - 0.61940
train - 0.71087 | valid - 0.63433
Average accurecy on crossval is 0.65871
Std if 0.04359
CPU times: user 4.87 s, sys: 0 ns, total: 4.87 s
Wall time: 4.86 s


In [17]:
%%time
svc = SVC(probability=True, kernel='linear', random_state=21, C=10)
crossval(10, X_train, y_train, svc)

train - 0.75021 | valid - 0.72593
train - 0.77741 | valid - 0.82963
train - 0.78566 | valid - 0.68148
train - 0.76834 | valid - 0.73333
train - 0.75185 | valid - 0.77778
train - 0.75598 | valid - 0.68889
train - 0.76257 | valid - 0.74074
train - 0.77411 | valid - 0.68889
train - 0.78254 | valid - 0.71642
train - 0.78418 | valid - 0.69403
Average accurecy on crossval is 0.72771
Std if 0.04417
CPU times: user 8.73 s, sys: 25.8 ms, total: 8.76 s
Wall time: 11.2 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [18]:
%%time
tree = DecisionTreeClassifier(max_depth=10, random_state=21)
crossval(10, X_train, y_train, tree)

train - 0.81039 | valid - 0.74074
train - 0.77741 | valid - 0.74074
train - 0.83347 | valid - 0.70370
train - 0.79720 | valid - 0.76296
train - 0.82440 | valid - 0.75556
train - 0.80379 | valid - 0.68889
train - 0.80709 | valid - 0.76296
train - 0.80132 | valid - 0.65926
train - 0.80807 | valid - 0.75373
train - 0.80478 | valid - 0.68657
Average accurecy on crossval is 0.72551
Std if 0.03562
CPU times: user 176 ms, sys: 4.14 ms, total: 180 ms
Wall time: 305 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [19]:
%%time
tree = DecisionTreeClassifier(max_depth=2, random_state=21)
crossval(10, X_train, y_train, tree)

train - 0.43034 | valid - 0.47407


train - 0.43281 | valid - 0.45185
train - 0.44023 | valid - 0.38519
train - 0.43034 | valid - 0.47407
train - 0.43776 | valid - 0.40741
train - 0.43364 | valid - 0.44444
train - 0.43364 | valid - 0.44444
train - 0.43776 | valid - 0.40741
train - 0.41763 | valid - 0.38806
train - 0.43657 | valid - 0.41791
Average accurecy on crossval is 0.42949
Std if 0.03116
CPU times: user 128 ms, sys: 2.84 ms, total: 130 ms
Wall time: 251 ms


In [20]:
%%time
tree = DecisionTreeClassifier(max_depth=5, random_state=21)
crossval(10, X_train, y_train, tree)

train - 0.59522 | valid - 0.53333
train - 0.56307 | valid - 0.53333
train - 0.60181 | valid - 0.55556
train - 0.59604 | valid - 0.57037
train - 0.60264 | valid - 0.57778
train - 0.57955 | valid - 0.53333
train - 0.58368 | valid - 0.54815
train - 0.59275 | valid - 0.51111
train - 0.58237 | valid - 0.56716
train - 0.60132 | valid - 0.50000
Average accurecy on crossval is 0.54301
Std if 0.02423
CPU times: user 144 ms, sys: 173 μs, total: 144 ms
Wall time: 286 ms


In [21]:
%%time
tree = DecisionTreeClassifier(max_depth=10, random_state=21)
crossval(10, X_train, y_train, tree)

train - 0.81039 | valid - 0.74074
train - 0.77741 | valid - 0.74074


train - 0.83347 | valid - 0.70370
train - 0.79720 | valid - 0.76296
train - 0.82440 | valid - 0.75556
train - 0.80379 | valid - 0.68889
train - 0.80709 | valid - 0.76296
train - 0.80132 | valid - 0.65926
train - 0.80807 | valid - 0.75373
train - 0.80478 | valid - 0.68657
Average accurecy on crossval is 0.72551
Std if 0.03562
CPU times: user 155 ms, sys: 0 ns, total: 155 ms
Wall time: 342 ms


In [22]:
%%time
tree = DecisionTreeClassifier(max_depth=15, random_state=21)
crossval(10, X_train, y_train, tree)

train - 0.95796 | valid - 0.82222
train - 0.93075 | valid - 0.83704
train - 0.95631 | valid - 0.83704
train - 0.95301 | valid - 0.86667
train - 0.95136 | valid - 0.88889
train - 0.94724 | valid - 0.82222
train - 0.95466 | valid - 0.90370
train - 0.94971 | valid - 0.87407
train - 0.95305 | valid - 0.83582
train - 0.94316 | valid - 0.85821
Average accurecy on crossval is 0.85459
Std if 0.02682
CPU times: user 167 ms, sys: 902 μs, total: 168 ms
Wall time: 359 ms


In [23]:
%%time
tree = DecisionTreeClassifier(max_depth=20, random_state=21)
crossval(10, X_train, y_train, tree)

train - 0.98846 | valid - 0.86667
train - 0.99011 | valid - 0.91111
train - 0.98681 | valid - 0.85926
train - 0.98763 | valid - 0.91111
train - 0.98928 | valid - 0.88148
train - 0.98186 | valid - 0.85926
train - 0.98846 | valid - 0.91852
train - 0.99176 | valid - 0.89630
train - 0.99094 | valid - 0.88060
train - 0.98847 | valid - 0.88060
Average accurecy on crossval is 0.88649
Std if 0.02075
CPU times: user 165 ms, sys: 2.81 ms, total: 168 ms
Wall time: 302 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [24]:
%%time
rf = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
crossval(10, X_train, y_train, rf)

train - 0.96455 | valid - 0.88148
train - 0.96208 | valid - 0.91852
train - 0.96785 | valid - 0.86667
train - 0.96455 | valid - 0.89630
train - 0.96538 | valid - 0.91111
train - 0.96538 | valid - 0.88148
train - 0.97115 | valid - 0.91852
train - 0.96867 | valid - 0.85185
train - 0.97364 | valid - 0.88060
train - 0.97941 | valid - 0.86567
Average accurecy on crossval is 0.88722
Std if 0.02204
CPU times: user 2.03 s, sys: 14.4 ms, total: 2.04 s
Wall time: 3.46 s


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [25]:
for n_estimators in (20, 30, 40, 50):
    for max_depth in (5, 10, 15, 20):
        print(f'n_estiatrix(y_test, y_pred)mators = {n_estimators}, max_depth = {max_depth}')
        rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=21)
        crossval(10, X_train, y_train, rf)

n_estiatrix(y_test, y_pred)mators = 20, max_depth = 5
train - 0.60264 | valid - 0.60741
train - 0.55153 | valid - 0.49630
train - 0.59357 | valid - 0.53333
train - 0.59275 | valid - 0.57778
train - 0.57873 | valid - 0.54074
train - 0.60758 | valid - 0.56296
train - 0.59687 | valid - 0.57037
train - 0.60429 | valid - 0.56296
train - 0.60297 | valid - 0.57463
train - 0.58649 | valid - 0.55224
Average accurecy on crossval is 0.55787
Std if 0.02839
n_estiatrix(y_test, y_pred)mators = 20, max_depth = 10
train - 0.82358 | valid - 0.77037
train - 0.84996 | valid - 0.82963
train - 0.86645 | valid - 0.80741
train - 0.88788 | valid - 0.80741
train - 0.86645 | valid - 0.85926
train - 0.84666 | valid - 0.74074
train - 0.85326 | valid - 0.82222
train - 0.84996 | valid - 0.75556
train - 0.88221 | valid - 0.79104
train - 0.87068 | valid - 0.76119
Average accurecy on crossval is 0.79448
Std if 0.03555
n_estiatrix(y_test, y_pred)mators = 20, max_depth = 15
train - 0.96785 | valid - 0.88889
train - 0.95

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [26]:
best_model = RandomForestClassifier(n_estimators=40, max_depth=20, random_state=21)

In [27]:
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
best_model.score(X_test, y_test)

0.9289940828402367

In [28]:
cm = confusion_matrix(y_test, y_pred)

In [29]:
errors_class = cm.sum(axis=1) - np.diag(cm)
for i, errors in enumerate(errors_class):
    print(f'Class {i}: count errors = {errors}')

Class 0: count errors = 7
Class 1: count errors = 6
Class 2: count errors = 2
Class 3: count errors = 2
Class 4: count errors = 2
Class 5: count errors = 4
Class 6: count errors = 1


In [30]:
cm

array([[20,  1,  0,  2,  0,  0,  4],
       [ 1, 49,  0,  3,  0,  2,  0],
       [ 0,  0, 28,  2,  0,  0,  0],
       [ 0,  0,  1, 78,  0,  0,  1],
       [ 0,  0,  0,  0, 19,  2,  0],
       [ 0,  0,  0,  2,  0, 50,  2],
       [ 0,  0,  0,  0,  0,  1, 70]])

In [31]:
joblib.dump(best_model, '../data/model_ex_00.pkl')

['../data/model_ex_00.pkl']