# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split 
from tqdm.autonotebook import tqdm
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import joblib
from sklearn.pipeline import Pipeline

## 1. Preprocessing pipeline

Create three custom transformers, the first two out of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts `weekday` from `timestamp` (numbers).
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [2]:
class FeatureExtractor:
    def __init__(self, filename='../data/checker_submits.csv'):
        self.filename = filename

    def fit (self, X=None, y=None):
        return self
    
    def transform(self, X=None, y=None):
        df = pd.read_csv(self.filename, parse_dates=['timestamp'], usecols=['uid', 'labname', 'numTrials', 'timestamp'])
        df['hour'] = df['timestamp'].dt.hour
        df['dayofweek'] = df['timestamp'].dt.weekday
        df = df.drop('timestamp', axis=1)
        return df

In [3]:
extractor = FeatureExtractor('../data/checker_submits.csv')
extractor.fit()
dr = extractor.transform()
dr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1686 entries, 0 to 1685
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   uid        1686 non-null   object
 1   labname    1686 non-null   object
 2   numTrials  1686 non-null   int64 
 3   hour       1686 non-null   int64 
 4   dayofweek  1686 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 66.0+ KB


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.

In [4]:
class MyOneHotEncoder():
    def __init__(self, target_col_name='dayofweek'):
        self.target_col_name = target_col_name
        self.categorical_cols = None

    def fit(self, X, y=None):
        self.categorical_cols = X.select_dtypes(include=['object', 'category']).columns
        self.categorical_cols = [col for col in self.categorical_cols if col != self.target_col_name]
        return self


    def transform(self, X, y=None):
        target_col = X[self.target_col_name]
        if len(self.categorical_cols) > 0:
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
            encoded_data = encoder.fit_transform(X[self.categorical_cols])
            encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names(self.categorical_cols), index=X.index)
            X = pd.concat([X.drop(self.categorical_cols, axis=1), encoded_df], axis=1)

        return X, target_col

In [5]:
encoder = MyOneHotEncoder()
encoder.fit(dr)
new_df, y = encoder.transform(dr)
new_df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
y

0       4
1       4
2       4
3       4
4       4
       ..
1681    3
1682    3
1683    3
1684    3
1685    3
Name: dayofweek, Length: 1686, dtype: int64

3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).

In [7]:
class TrainValidationTest():
    def __init__(self, test_size=0.2, random_state=21):
        self.test_size = test_size
        self.random_state = random_state
        
    def split(self, X, y):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, 
            test_size=self.test_size, 
            random_state=self.random_state, 
            stratify=y
        )
        
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_train, y_train,
            test_size=self.test_size,
            random_state=self.random_state,
            stratify=y_train
        )
        
        return X_train, X_valid, X_test, y_train, y_valid, y_test


In [8]:
trainer = TrainValidationTest()
X_train, X_valid, X_test, y_train, y_valid, y_test = trainer.split(new_df, y)

In [9]:
X_train

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
862,5,13,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
812,3,14,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
830,19,22,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
482,5,13,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
651,12,10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1639,1,17,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1028,10,7,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
492,4,14,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
143,24,17,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [10]:
class ModelSelection():
    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict
        self.results = []


    def choose(self, X_train, y_train, X_valid, y_valid):
        best_score = 0
        best_name = ""
        tqdm.pandas()
        
        for idx, grid in enumerate(self.grids):
            model_name = self.grid_dict[idx]
            print(f"\nEstimator: {model_name}")

            pbar = tqdm(total=1)
            grid.fit(X_train, y_train)

            pbar.update(1)
            pbar.close()
            
            #param_counts = [len(v) for v in grid.param_grid.values()]
            #total_combinations = np.prod(param_counts) * grid.cv
            #
            #with tqdm(total=total_combinations) as pbar:
            #    grid.fit(X_train, y_train)
            #    pbar.update(total_combinations)
            
           
            best_params = grid.best_params_
            best_train_score = grid.best_score_
            
            
            best_estimator = grid.best_estimator_
            valid_score = accuracy_score(y_valid, best_estimator.predict(X_valid))
            
            
            self.results.append({
                'model': model_name,
                'params': best_params,
                'valid_score': valid_score,
                'train_score': best_train_score
            })
            
            
            print(f"Best params: {best_params}")
            print(f"Best training accuracy: {best_train_score:.3f}")
            print(f"Validation set accuracy score for best params: {valid_score:.3f}\n")
            
            
            if valid_score > best_score:
                best_score = valid_score
                best_name = model_name
        
        print(f"Classifier with best validation set accuracy: {best_name}")
        return best_name
    
    def best_results(self):
        df = pd.DataFrame(self.results)
        return df[['model', 'params', 'valid_score']]


In [11]:
svm_params = [
    {'kernel':('linear', 'rbf', 'sigmoid'), 
     'C':[0.01, 0.1, 1, 1.5, 5, 10], 
     'gamma': ['scale', 'auto'], 
     'class_weight':('balanced', None)
     }]

tree_params = {
    'max_depth': range(1, 50),
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', None]
}

rf_params = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': range(1, 50),
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', None]
}

svm = SVC(random_state=21, probability=True)
tree = DecisionTreeClassifier(random_state=21)
rf = RandomForestClassifier(random_state=21)

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=-1)
gs_tree = GridSearchCV(estimator=tree, param_grid=tree_params, scoring='accuracy', cv=2, n_jobs=-1)
gs_rf =GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=2, n_jobs=-1)

grids = [gs_svm, gs_tree, gs_rf]
grid_dict = {0: 'SVM', 1: 'Decision Tree', 2: 'Random Forest'}

In [12]:
selectioner = ModelSelection(grids, grid_dict)
best_model_name = selectioner.choose(X_train, y_train, X_valid, y_valid)
results_df = selectioner.best_results()

  0%|          | 0/1 [00:00<?, ?it/s]


Estimator: SVM


100%|██████████| 1/1 [00:25<00:00, 25.85s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Best params: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}
Best training accuracy: 1.000
Validation set accuracy score for best params: 1.000


Estimator: Decision Tree


100%|██████████| 1/1 [00:01<00:00,  1.41s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 3}
Best training accuracy: 1.000
Validation set accuracy score for best params: 0.604


Estimator: Random Forest


100%|██████████| 1/1 [00:46<00:00, 46.88s/it]

Best params: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 14, 'n_estimators': 50}
Best training accuracy: 0.981
Validation set accuracy score for best params: 1.000

Classifier with best validation set accuracy: SVM





## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [13]:
class Finalize():
    def __init__(self, estimator):
        self.estimator = estimator

    def final_score(self, X_train, y_train, X_test, y_test):

        self.estimator.fit(X_train, y_train)
        y_pred = self.estimator.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy of the final model is {acc:.15f}")
        return acc
    
    def save_model(self, path):
        joblib.dump(self.estimator, path)
        print(f"Model was successfully saved to {path}")

In [14]:
model = RandomForestClassifier(random_state=21)
final = Finalize(model)
accuracy = final.final_score(X_train, y_train, X_test, y_test)
final.save_model('final_model.joblib')

Accuracy of the final model is 0.985207100591716
Model was successfully saved to final_model.joblib


## 4. Main program

1. Load the data from the file (****name of file****).
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. create an instance of `Finalize()` with your best model, use method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [15]:
df = pd.read_csv('../data/checker_submits.csv') 

In [16]:
preprocessing = Pipeline([
    ('feature_extractor', FeatureExtractor()),
    ('onehot_encoder', MyOneHotEncoder(target_col_name='dayofweek'))
])

In [17]:
data, target = preprocessing.fit_transform(df)

In [18]:
splitter = TrainValidationTest()
X_train, X_valid, X_test, y_train, y_valid, y_test = splitter.split(data, target)

In [19]:
svm_params = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None]
}

tree_params = {
    'max_depth': [3, 5, 7, None],
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 5]
}

rf_params = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10, None],
    'min_samples_leaf': [1, 2]
}

models = [
    ('SVM', SVC(random_state=21, probability=True), svm_params),
    ('DecisionTree', DecisionTreeClassifier(random_state=21), tree_params),
    ('RandomForest', RandomForestClassifier(random_state=21), rf_params)
]

grids = []
grid_dict = {}
for idx, (name, model, params) in enumerate(models):
    grids.append(GridSearchCV(
        estimator=model,
        param_grid=params,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    ))
    grid_dict[idx] = name

In [20]:
model_selector = ModelSelection(grids, grid_dict)
best_model_name = model_selector.choose(X_train, y_train, X_valid, y_valid)
results_df = model_selector.best_results()

  0%|          | 0/1 [00:00<?, ?it/s]


Estimator: SVM
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   19.7s finished
100%|██████████| 1/1 [00:22<00:00, 22.23s/it]
  0%|          | 0/1 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Best params: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}
Best training accuracy: 1.000
Validation set accuracy score for best params: 1.000


Estimator: DecisionTree
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    0.2s finished
100%|██████████| 1/1 [00:00<00:00,  4.12it/s]
  0%|          | 0/1 [00:00<?, ?it/s][Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Best params: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 2}
Best training accuracy: 1.000
Validation set accuracy score for best params: 1.000


Estimator: RandomForest
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    1.9s finished
100%|██████████| 1/1 [00:02<00:00,  2.06s/it]

Best params: {'max_depth': None, 'min_samples_leaf': 1, 'n_estimators': 50}
Best training accuracy: 0.975
Validation set accuracy score for best params: 0.989

Classifier with best validation set accuracy: SVM





In [21]:
best_model = None
for grid, name in zip(grids, grid_dict.values()):
    if name == best_model_name:
        best_model = grid.best_estimator_
        break

In [22]:
X_train_full = pd.concat([X_train, X_valid])
y_train_full = pd.concat([y_train, y_valid])
best_model.fit(X_train_full, y_train_full)

SVC(C=0.1, class_weight='balanced', kernel='linear', probability=True,
    random_state=21)

In [23]:
final = Finalize(best_model)
accuracy = final.final_score(X_train_full, y_train_full, X_test, y_test)

Accuracy of the final model is 1.000000000000000


In [24]:
model_filename = f"{best_model_name.lower()}_{accuracy:.4f}.sav"
final.save_model(model_filename)

Model was successfully saved to svm_1.0000.sav


In [25]:
print(results_df)

          model                                             params  \
0           SVM  {'C': 0.1, 'class_weight': 'balanced', 'gamma'...   
1  DecisionTree  {'criterion': 'gini', 'max_depth': 5, 'min_sam...   
2  RandomForest  {'max_depth': None, 'min_samples_leaf': 1, 'n_...   

   valid_score  
0     1.000000  
1     1.000000  
2     0.988889  
