# Day 08. Exercise 03
# Overfitting

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder, StandardScaler, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split, cross_val_score
import joblib

## 1. Preprocessing

1. Read the file `dayofweek.csv` to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`.
3. Using, for example, `value_counts()` to check if the distribution of classes is similar in train and test.
4. Use the additional parameter `stratify=` and check the distribution again, now it should be more or less similar in both datasets.

In [2]:
df = pd.read_csv("../data/dayofweek.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,numTrials,hour,dayofweek,user_0,user_1,user_10,user_11,user_12,user_13,...,lab02,lab03,lab03s,lab05s,laba04,laba04s,laba05,laba06,laba06s,project1
0,0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('dayofweek', axis=1), df['dayofweek'], test_size=0.2, random_state=21)

In [4]:
y_train.value_counts() 

3    313
6    287
1    222
5    216
2    125
0    105
4     80
Name: dayofweek, dtype: int64

In [5]:
y_test.value_counts()

3    83
6    69
5    55
1    52
0    31
4    24
2    24
Name: dayofweek, dtype: int64

In [6]:
y_test.value_counts() / y_train.value_counts()

0    0.295238
1    0.234234
2    0.192000
3    0.265176
4    0.300000
5    0.254630
6    0.240418
Name: dayofweek, dtype: float64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('dayofweek', axis=1), df['dayofweek'], test_size=0.2, random_state=21, stratify=df['dayofweek'])

In [8]:
y_train.value_counts()

3    316
6    285
1    219
5    217
2    119
0    109
4     83
Name: dayofweek, dtype: int64

In [9]:
y_test.value_counts()

3    80
6    71
1    55
5    54
2    30
0    27
4    21
Name: dayofweek, dtype: int64

In [10]:
y_test.value_counts() / y_train.value_counts()

3    0.253165
6    0.249123
1    0.251142
5    0.248848
2    0.252101
0    0.247706
4    0.253012
Name: dayofweek, dtype: float64

## 2. Baseline models

1. Train exactly the same baseline models from the previous exercise and calculate the accuracies using the test dataset with stratification.
2. Did all the models show the similar values of the metric? Which one has the largest difference comparing the current exercise and the previous? Put the answer to the markdown cell in the end of the section.

### a. Logreg

In [11]:
logreg = LogisticRegression(fit_intercept=False, random_state=21, solver='liblinear')
logreg.fit(X_train, y_train)
accuracy_score(y_test, logreg.predict(X_test))

0.6272189349112426

### b. SVM

In [12]:
my_svm = svm.SVC(random_state = 21, probability=True, kernel='linear')
my_svm.fit(X_train, y_train)
accuracy_score(y_test, my_svm.predict(X_test))

0.7071005917159763

### c. Decision tree

In [13]:
dtc_model_main = DecisionTreeClassifier(max_depth = 4, random_state = 21)
dtc_model_main.fit(X_train, y_train)
accuracy_score(y_test, dtc_model_main.predict(X_test))

0.6331360946745562

### d. Random forest

In [14]:
rf = RandomForestClassifier(n_estimators = 100, max_depth = 25, random_state = 21)
rf.fit(X_train, y_train)
accuracy_score(y_test, rf.predict(X_test))

0.9733727810650887

Нет, у моделей разные показатели.
По сравнению с предыдущим упражением наибольшая разница у random forest

## 3. Crossvalidation

We could play with parameters of the model trying to achive a better accuracy on the test dataset, but it is a bad practice. It leads us again to overfitting. Test dataset is only for checking quality of a final model.

But there is another way of solving the problem – crossvalidation. It does not use test dataset, but creates one more split of train dataset. Again, there are different ways of doing it, but the common thing is that there is a validation dataset that is used for hyperparameters optimization.

1. Using `cross_val_score` with `cv=10` calculate the mean accuracy and standard deviation for every model that you used before (logreg with `solver='liblinear'`, SVC, decision tree, random forest).

### a. Logreg

In [15]:
logreg = LogisticRegression(fit_intercept=False, random_state=21, solver='liblinear')
cvs_logreg = cross_val_score(logreg, df.drop('dayofweek', axis=1), df['dayofweek'], cv=10)
cvs_logreg.mean(), cvs_logreg.std()

(0.47317906452521835, 0.15998503635647593)

### b. SVM

In [16]:
cvs_my_svm = cross_val_score(my_svm, df.drop('dayofweek', axis=1), df['dayofweek'], cv=10)
cvs_my_svm.mean(), cvs_my_svm.std()

(0.5443434770357847, 0.13945203630329392)

### c. Decision tree

In [17]:
cvs_dtc_model = cross_val_score(dtc_model_main, df.drop('dayofweek', axis=1), df['dayofweek'], cv=10)
cvs_dtc_model.mean(), cvs_dtc_model.std()

(0.5185087348548887, 0.2344796471424826)

### d. Random forest

In [18]:
cvs_rf = cross_val_score(rf, df.drop('dayofweek', axis=1), df['dayofweek'], cv=10)
cvs_rf.mean(), cvs_rf.std()

(0.753353057199211, 0.20547952459387447)

## 4. Optimization

1. Choose the best model and play a little bit with the parameters on cross-validation, find a good enough parameter or a combination of the parameters.
2. Calculate the accuracy for the final model on the test dataset.
3. Draw a plot that displays the top-10 most  important features for that model.
4. Save the model using `joblib`.
5. Load the model, make predictions for the test dataset and calculate the accuracy.

In [19]:
cvs_rf_1 = cross_val_score(rf_1, df.drop('target', axis=1), df['target'], cv=5, n_jobs =-1, scoring = 'f1_weighted')
cvs_rf_1.mean()

NameError: name 'rf_1' is not defined

In [None]:
rf_1= RandomForestClassifier(n_estimators = 100, max_depth = 25, random_state = 21)
rf_1.fit(X_train, y_train)
accuracy_score(y_test, rf_1.predict(X_test))

In [None]:
def plot_barh(coef, names, top_n, sums= True):
    if sums:
        mas = np.abs(coef).sum(axis=0)
        top_feat = sorted(np.abs(mas), reverse=True)[:top_n]
    else:
        mas = np.abs(coef)
        top_feat = sorted(np.abs(coef), reverse=True)[:top_n]
        
    index = mas.argsort()[-top_n:][::-1]
    df = pd.DataFrame({'lab': names[index], 'val': top_feat})
    ax = df.plot.barh(x='lab', y='val', figsize=(15,8), colormap='PiYG')

In [None]:
plot_barh(rf_1.feature_importances_, X_train.columns, 10, False)

In [None]:
file_name = '../data/finalized_model.sav'
joblib.dump(rf_1, file_name)

In [None]:
model = joblib.load(file_name)

In [None]:
accuracy_score(y_test, model.predict(X_test))