# Setup

In [1]:
from sklearn import datasets
import pandas as pd
import numpy as np

# Load the iris bunch once, then build a frame whose columns are the feature
# names plus a trailing 'target' column holding the class labels.
iris = datasets.load_iris()
iris_frame = pd.DataFrame(
    data=iris.data,
    columns=iris.feature_names,
).assign(target=iris.target)

In [2]:
# Preview the first rows of the frame to sanity-check columns and values.
iris_frame.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


# Pipeline

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [4]:
class SimplePipeline:
    """Minimal end-to-end pipeline: load iris, split it, fit a classifier, score it.

    Attributes:
        frame: DataFrame with the feature columns plus a 'target' column.
        feature_names: Feature column names with the ' (cm)' unit suffix removed.
        X_train, X_test, y_train, y_test: Train/test split produced by
            ``load_dataset``.
        model: Fitted estimator, or ``None`` until ``train`` has been called.
    """

    def __init__(self):
        self.frame = None
        self.feature_names = None
        # All four split attributes use the same lowercase-y spelling that
        # `load_dataset` and `get_accuracy` rely on (no stray `Y_test`).
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.model = None
        self.load_dataset()

    def load_dataset(self):
        """Load the dataset and perform train test split."""
        dataset = datasets.load_iris()

        # Remove the unit suffix ' (cm)' from variable names. Checking for the
        # suffix (rather than blindly slicing 5 characters) keeps names intact
        # if a feature ever comes without the unit.
        unit_suffix = ' (cm)'
        self.feature_names = [
            fn[:-len(unit_suffix)] if fn.endswith(unit_suffix) else fn
            for fn in dataset.feature_names
        ]
        self.frame = pd.DataFrame(dataset.data, columns=self.feature_names)
        self.frame['target'] = dataset.target

        # 65% of the rows are held out for testing; the fixed seed keeps the
        # split reproducible between runs.
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.frame[self.feature_names], self.frame.target, test_size=0.65, random_state=42)

    def train(self, algorithm=LogisticRegression):
        """Instantiate ``algorithm`` and fit it on the training split.

        Args:
            algorithm: Estimator class. It is called with ``solver='lbfgs'`` and
                ``multi_class='auto'``, so it must accept those keyword arguments.
        """
        # NOTE(review): recent scikit-learn releases deprecate `multi_class` --
        # confirm against the installed version before upgrading.
        self.model = algorithm(solver='lbfgs', multi_class='auto')
        self.model.fit(self.X_train, self.y_train)

    def predict(self, input_data):
        """Return the fitted model's predictions for ``input_data``.

        ``train`` must have been called first; otherwise ``self.model`` is None.
        """
        return self.model.predict(input_data)

    def get_accuracy(self):
        """Return the fitted model's ``score`` on the held-out test split."""
        return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Helper method to run multiple pipeline methods with one call."""
        self.load_dataset()
        self.train()

In [6]:
# Build the pipeline, (re)load the data and fit the model in one call.
pipeline = SimplePipeline()
pipeline.run_pipeline()
# Score of the fitted model on the held-out test split.
accuracy_score = pipeline.get_accuracy()

print(f'current model accuracy is: {accuracy_score:.3f}')

current model accuracy is: 0.969


# Test input
- schema 확인
    - 여기서는 간단하게 dictionary로 한다. 나중에 라이브러리를 사용해보자.

In [7]:
# Input schema for the iris features: allowed value range and expected dtype.
# The min/max bounds were determined by looking at the dataframe's
# .describe() output.
iris_schema = {
    feature: {
        'range': {'min': lower, 'max': upper},
        'dtype': float,
    }
    for feature, lower, upper in (
        ('sepal length', 4.0, 8.0),
        ('sepal width', 1.0, 5.0),
        ('petal length', 1.0, 7.0),
        ('petal width', 0.1, 3.0),
    )
}

In [8]:
import unittest
import sys

In [11]:
class TestIrisInputData(unittest.TestCase):
    """Validate the pipeline's input frame against ``iris_schema``."""

    def setUp(self):
        # `setUp` will be run before each test
        self.pipeline = SimplePipeline()
        self.pipeline.run_pipeline()

    def test_input_data_ranges(self):
        """Every feature's observed min/max must lie within its schema range."""
        max_value = self.pipeline.frame.max()
        min_value = self.pipeline.frame.min()

        for feat in self.pipeline.feature_names:
            # subTest reports each failing feature separately instead of
            # stopping at the first one.
            with self.subTest(feature=feat):
                bounds = iris_schema[feat]['range']
                # The dedicated comparison assertions show both operands on
                # failure, unlike assertTrue(a <= b).
                self.assertLessEqual(max_value[feat], bounds['max'])
                self.assertGreaterEqual(min_value[feat], bounds['min'])

    def test_input_data_types(self):
        """Every feature column must have the dtype declared in the schema."""
        data_types = self.pipeline.frame.dtypes

        for feat in self.pipeline.feature_names:
            with self.subTest(feature=feat):
                self.assertEqual(data_types[feat], iris_schema[feat]['dtype'])

In [12]:
# Collect the input-data tests and run them, reporting to stderr.
loader = unittest.TestLoader()
suite = loader.loadTestsFromTestCase(TestIrisInputData)
runner = unittest.TextTestRunner(stream=sys.stderr, verbosity=1)
runner.run(suite)

..
----------------------------------------------------------------------
Ran 2 tests in 0.047s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>