[View in Colaboratory](https://colab.research.google.com/github/orico/UnitTestAndDecoratorLogger/blob/master/Unit_Test_and_Logger.ipynb)

There are many definitions of what the scope of a data scientist's job is. On one hand, the norm in many companies is to concentrate only on model creation, which is understandable because your time is expensive and should be used mainly for your field of expertise. However, in other companies, an end-to-end definition is used, in which the data scientist is also responsible for engineering tasks such as unit testing and logging.

The following is a small example of unit testing and logging for a 10-class supervised problem using the MNIST dataset. The Python code for the unit tests and the logger was written by Corey Schafer and is available [here](https://github.com/CoreyMSchafer/code_snippets/tree/master/Python-Unit-Testing) and [here](https://github.com/CoreyMSchafer/code_snippets/tree/master/Decorators).


Additional asserts are available [here](https://docs.python.org/3/library/unittest.html#unittest.TestCase.debug)

In [0]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import unittest
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import check_random_state
from sklearn.datasets import load_digits
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler 
from sklearn.linear_model import LogisticRegression

np.random.seed(31337)

In [0]:
# Decorators
from functools import wraps


def my_logger(orig_func):
    """Decorate ``orig_func`` so that every call is recorded in ``<name>.log``.

    A dedicated logger with its own file handler is created per decorated
    function. ``logging.basicConfig`` is deliberately not used: it only
    configures the root logger the first time it takes effect, so with
    several decorated functions every record would end up in the first
    function's file (or in no file at all when the root logger is already
    configured).

    Args:
        orig_func: The callable to wrap.

    Returns:
        A wrapper that logs the positional and keyword arguments at INFO
        level and then returns ``orig_func(*args, **kwargs)``.
    """
    import logging
    import os

    log_path = '{}.log'.format(orig_func.__name__)
    logger = logging.getLogger('my_logger.{}'.format(orig_func.__name__))
    logger.setLevel(logging.INFO)
    # Keep call records in the per-function file only; do not forward them
    # to whatever handlers the root logger may have.
    logger.propagate = False

    # Decorating a function with the same name again (e.g. re-running a
    # notebook cell) must not attach a second handler for the same file,
    # otherwise each call would be written twice.
    target = os.path.abspath(log_path)
    already_attached = any(
        isinstance(handler, logging.FileHandler)
        and handler.baseFilename == target
        for handler in logger.handlers
    )
    if not already_attached:
        file_handler = logging.FileHandler(log_path)
        file_handler.setFormatter(
            logging.Formatter('%(levelname)s:%(name)s:%(message)s'))
        logger.addHandler(file_handler)

    @wraps(orig_func)
    def wrapper(*args, **kwargs):
        # Lazy %-formatting: the message is only built if the record is emitted.
        logger.info('Ran with args: %s, and kwargs: %s', args, kwargs)
        return orig_func(*args, **kwargs)

    return wrapper


def my_timer(orig_func):
    """Decorate ``orig_func`` so that each call prints how long it took.

    The duration is measured with ``time.perf_counter()``, a monotonic
    high-resolution clock, so the reported value is not affected by system
    clock adjustments that happen while the function is running.

    Args:
        orig_func: The callable to wrap.

    Returns:
        A wrapper that calls ``orig_func``, prints
        ``"<name> ran in: <seconds> sec"`` and returns the original result.
        Nothing is printed if ``orig_func`` raises.
    """
    import time

    @wraps(orig_func)
    def wrapper(*args, **kwargs):
        started = time.perf_counter()
        result = orig_func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print('{} ran in: {} sec'.format(orig_func.__name__, elapsed))
        return result

    return wrapper

In [0]:
def download():
    """Fetch the 'MNIST original' dataset and return it as ``(X, y)``.

    Returns:
        A 2-tuple: the dataset's ``data`` cast to ``float64`` and its
        ``target`` exactly as delivered by ``fetch_mldata``.
    """
    # NOTE(review): fetch_mldata is no longer shipped with recent
    # scikit-learn releases (fetch_openml is the documented replacement).
    # Verify against the installed version, and check label dtype and row
    # order before switching, since the tests pin exact accuracies.
    dataset = fetch_mldata('MNIST original')
    features = dataset.data.astype('float64')
    labels = dataset.target
    return features, labels

In [0]:
class Normalize(object):
    """Min-max scale a train/test pair and undo that scaling again."""

    def normalize(self, X_train, X_test):
        """Fit a ``MinMaxScaler`` on ``X_train`` and scale both sets with it.

        The fitted scaler is kept on ``self.scaler`` so that ``inverse`` can
        reuse it. The scaler is fitted on the training data only; the test
        data is transformed with the training statistics.

        Returns:
            ``(scaled_train, scaled_test)``.
        """
        self.scaler = scaler = MinMaxScaler()
        scaled_train = scaler.fit_transform(X_train)
        scaled_test = scaler.transform(X_test)
        return scaled_train, scaled_test

    def inverse(self, X_train, X_val, X_test):
        """Map scaled data back to the original feature range.

        Requires a prior call to ``normalize`` (it reads ``self.scaler``).
        ``X_val`` is accepted but not used.

        Returns:
            ``(restored_train, restored_test)``.
        """
        restored_train = self.scaler.inverse_transform(X_train)
        restored_test = self.scaler.inverse_transform(X_test)
        return restored_train, restored_test

In [0]:
def split(X, y, splitRatio):
    """Split ``X`` and ``y`` into a leading train part and a trailing test part.

    No shuffling is performed: the first rows become the training set and
    the remaining rows the test set.

    Args:
        X: Sliceable sequence/array of samples.
        y: Sliceable sequence/array of labels, aligned with ``X``.
        splitRatio: Either an integer row index at which to cut (the
            original behaviour, e.g. ``60000``), or a float strictly between
            0 and 1 giving the fraction of ``X`` that goes into the training
            part (e.g. ``0.8``).

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    cut = splitRatio
    # A float in (0, 1) is a fraction of the data; previously such a value
    # failed with a TypeError because floats cannot be used as slice bounds.
    if isinstance(splitRatio, float) and 0.0 < splitRatio < 1.0:
        cut = int(round(len(X) * splitRatio))
    return (X[:cut], y[:cut], X[cut:], y[cut:])

In [0]:
class TheAlgorithm(object):
    """Min-max scaling followed by a logistic-regression classifier.

    ``fit`` scales the stored data, trains the classifier and records the
    training metrics; ``predict`` evaluates the trained classifier on the
    stored test set. Every method is wrapped with ``my_logger`` and
    ``my_timer``.
    """

    @my_logger
    @my_timer
    def __init__(self, X_train, y_train, X_test, y_test):
        """Store the train/test features and labels on the instance."""
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test

    @staticmethod
    def _percent_match(predicted, expected):
        """Return the percentage of positions where both arrays agree."""
        return np.mean(predicted.ravel() == expected.ravel()) * 100

    @my_logger
    @my_timer
    def fit(self):
        """Scale the data, train the classifier and return train accuracy.

        Replaces ``self.X_train`` / ``self.X_test`` with their scaled
        versions and sets ``classifier``, ``train_y_predicted``,
        ``train_accuracy`` and ``train_confusion_matrix``.

        Returns:
            Training accuracy in percent.
        """
        scaled_train, scaled_test = Normalize().normalize(self.X_train, self.X_test)
        self.X_train = scaled_train
        self.X_test = scaled_test

        n_train = self.X_train.shape[0]
        model = LogisticRegression(
            C=50. / n_train,
            multi_class='multinomial',
            penalty='l1',
            solver='saga',
            tol=0.1,
            class_weight='balanced',
        )
        self.classifier = model
        model.fit(self.X_train, self.y_train)

        self.train_y_predicted = model.predict(self.X_train)
        self.train_accuracy = self._percent_match(self.train_y_predicted, self.y_train)
        self.train_confusion_matrix = confusion_matrix(self.y_train, self.train_y_predicted)
        return self.train_accuracy

    @my_logger
    @my_timer
    def predict(self):
        """Evaluate the trained classifier on the test set.

        Requires a prior call to ``fit`` (it reads ``self.classifier``).
        Sets ``test_y_predicted``, ``test_accuracy``,
        ``test_confusion_matrix`` and ``report``, and prints the
        classification report.

        Returns:
            Test accuracy in percent.
        """
        predictions = self.classifier.predict(self.X_test)
        self.test_y_predicted = predictions
        self.test_accuracy = self._percent_match(predictions, self.y_test)
        self.test_confusion_matrix = confusion_matrix(self.y_test, predictions)
        self.report = classification_report(self.y_test, predictions)
        print("Classification report for classifier:\n %s\n" % (self.report))
        return self.test_accuracy

In [7]:
class TestInput(unittest.TestCase):
    """Regression tests pinning the metrics ``TheAlgorithm`` produces on MNIST.

    The expected accuracies and confusion matrices are the values of a
    reference run with NumPy's global seed set to 31337.
    """

    @classmethod
    def setUpClass(cls):
        """Load and split the dataset once for all tests in this class."""
        # Loading ~70k x 784 float64 samples per test is wasteful; the arrays
        # are shared instead. NOTE(review): this relies on TheAlgorithm.fit
        # rebinding its attributes to scaled copies rather than modifying the
        # input arrays in place - confirm if the scaler settings change.
        X, y = download()
        splitRatio = 60000
        cls.X_train, cls.y_train, cls.X_test, cls.y_test = split(X, y, splitRatio)

    @classmethod
    def tearDownClass(cls):
        """Nothing to clean up after the class."""
        pass

    def setUp(self):
        """Define the reference metrics each test compares against."""
        print('setUp')
        self.train_accuracy = 72.92166666666667
        self.train_confusion_matrix = np.array([[5447,   5,  40,  31,  49,  16, 198,  50,  81,   6],
                                                [   3,6440, 127,  54,   3,  29,  25,  36,  24,   1],
                                                [ 297, 420,3824, 163, 256,  19, 622, 186, 121,  50],
                                                [ 124, 221, 255,4566,  54, 251,  97, 129, 275, 159],
                                                [ 104, 128,  26,  54,4546, 342, 206, 133,  96, 207],
                                                [ 399, 200, 109,1081, 416,2227, 289, 363, 228, 109],
                                                [ 173,  89, 112,  55, 156, 229,5034,  25,  45,   0],
                                                [ 213, 192, 205,  39, 160,  17,  26,5058,  60, 295],
                                                [  67, 690, 202, 677,  73, 188, 347,  39,3437, 131],
                                                [ 164, 162,  63, 290, 669, 279, 122, 735, 291,3174]])
        self.test_accuracy = 73.4
        self.test_confusion_matrix = np.array([[ 923,   1,   2,   3,   3,   1,  35,   3,   9,   0],
                                               [   0,1084,  23,  11,   0,   0,   5,   4,   8,   0],
                                               [  63,  78, 669,  27,  38,   2,  97,  28,  24,   6],
                                               [  20,  27,  35, 770,   8,  42,  18,  27,  45,  18],
                                               [  15,  21,   3,   8, 750,  60,  45,  23,  18,  39],
                                               [  56,  24,  15, 193,  73, 362,  56,  58,  38,  17],
                                               [  35,  10,  18,  11,  28,  42, 799,   6,   8,   1],
                                               [  23,  40,  52,   6,  21,   4,   7, 821,   8,  46],
                                               [  14,  90,  29,  99,  10,  33,  66,   7, 598,  28],
                                               [  21,  27,  10,  37, 133,  42,  27, 100,  48, 564]])

    def tearDown(self):
        """Nothing to clean up after each test."""
        pass

    def test_fit(self):
        """fit() reproduces the reference train accuracy and confusion matrix."""
        np.random.seed(31337)
        self.ta = TheAlgorithm(self.X_train, self.y_train, self.X_test, self.y_test)
        # Accuracies are floats: compare with a tolerance, not bit-for-bit.
        self.assertAlmostEqual(self.ta.fit(), self.train_accuracy)
        self.assertEqual(self.ta.train_confusion_matrix.tolist(), self.train_confusion_matrix.tolist())

    def test_predict(self):
        """predict() reproduces the reference test accuracy and confusion matrix."""
        np.random.seed(31337)
        self.ta = TheAlgorithm(self.X_train, self.y_train, self.X_test, self.y_test)
        self.ta.fit()
        self.assertAlmostEqual(self.ta.predict(), self.test_accuracy)
        # Check the *test* confusion matrix here; previously this line
        # re-checked the train matrix and the test matrix was never verified.
        self.assertEqual(self.ta.test_confusion_matrix.tolist(), self.test_confusion_matrix.tolist())
      
if __name__ == '__main__':
  
    # Run the tests in-process. The first argv entry stands in for the
    # program name and is ignored by unittest; passing it explicitly
    # presumably keeps unittest from parsing the notebook kernel's own
    # command-line arguments. exit=False stops unittest.main() from calling
    # sys.exit() once the run has finished.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)
    

setUp
__init__ ran in: 2.384185791015625e-06 sec


.

fit ran in: 16.511127471923828 sec
setUp
__init__ ran in: 2.86102294921875e-06 sec


.

fit ran in: 16.124633073806763 sec
Classification report for classifier:
              precision    recall  f1-score   support

        0.0       0.79      0.94      0.86       980
        1.0       0.77      0.96      0.85      1135
        2.0       0.78      0.65      0.71      1032
        3.0       0.66      0.76      0.71      1010
        4.0       0.70      0.76      0.73       982
        5.0       0.62      0.41      0.49       892
        6.0       0.69      0.83      0.76       958
        7.0       0.76      0.80      0.78      1028
        8.0       0.74      0.61      0.67       974
        9.0       0.78      0.56      0.65      1009

avg / total       0.73      0.73      0.73     10000


predict ran in: 0.05926704406738281 sec



----------------------------------------------------------------------
Ran 2 tests in 33.171s

OK


In [8]:
#The solution
if __name__ == '__main__': 
  
  # Load the full dataset and report its dimensions.
  X,y = download()
  print ('MNIST:', X.shape, y.shape)
  
  # The first 60000 rows are used for training, the remaining rows for testing.
  splitRatio = 60000
  X_train, y_train, X_test, y_test = split(X,y,splitRatio) 

  # Seed NumPy's global RNG before training; presumably this is what makes
  # the run reproducible, since the classifier is created without an explicit
  # random_state - TODO confirm.
  np.random.seed(31337)
  ta = TheAlgorithm(X_train, y_train, X_test, y_test)
  # fit() scales the data, trains the classifier and returns train accuracy (%).
  train_accuracy = ta.fit()
  print()
  print('Train Accuracy:', train_accuracy,'\n') 
  print("Train confusion matrix:\n%s\n" % ta.train_confusion_matrix)
  
  # predict() evaluates on the test set, prints the classification report and
  # returns test accuracy (%).
  test_accuracy = ta.predict()
  print()
  print('Test Accuracy:', test_accuracy,'\n') 
  print("Test confusion matrix:\n%s\n" % ta.test_confusion_matrix)

MNIST: (70000, 784) (70000,)
__init__ ran in: 3.337860107421875e-06 sec
fit ran in: 15.876034498214722 sec

Train Accuracy: 72.92166666666667 

Train confusion matrix:
[[5447    5   40   31   49   16  198   50   81    6]
 [   3 6440  127   54    3   29   25   36   24    1]
 [ 297  420 3824  163  256   19  622  186  121   50]
 [ 124  221  255 4566   54  251   97  129  275  159]
 [ 104  128   26   54 4546  342  206  133   96  207]
 [ 399  200  109 1081  416 2227  289  363  228  109]
 [ 173   89  112   55  156  229 5034   25   45    0]
 [ 213  192  205   39  160   17   26 5058   60  295]
 [  67  690  202  677   73  188  347   39 3437  131]
 [ 164  162   63  290  669  279  122  735  291 3174]]

Classification report for classifier:
              precision    recall  f1-score   support

        0.0       0.79      0.94      0.86       980
        1.0       0.77      0.96      0.85      1135
        2.0       0.78      0.65      0.71      1032
        3.0       0.66      0.76      0.71      