In [2]:
# For virtual environment: uncomment the lines below
# %cd ..
# %env PYTHONPATH=.

# For Docker
# %cd /home/binary_clf_device_change

# Import

In [1]:
import argparse
import numpy 
import pandas as pd
import re
from typing import Text
import yaml

# import testing tools 
import pytest

In [2]:
# Set common vars 

# For Docker 
# BASE_PATH = "/home/binary_clf_device_change"

# For venv
BASE_PATH = "./config/"

CONFIG_PATH = f'{BASE_PATH}/params.yaml'
CONFIG_PATH 

'./config//params.yaml'

In [3]:
# Parse the pipeline parameters; the context manager closes the file handle
# as soon as parsing is done instead of leaving it to the garbage collector.
with open(CONFIG_PATH) as config_file:
    config = yaml.safe_load(config_file)
config

{'base': {'project_dir': '.', 'random_state': 42, 'log_level': 'DEBUG'},
 'data_load': {'target': 'data/raw/target.feather',
  'dataset': 'data/raw/user_features.feather',
  'target_processed': 'data/processed/target.feather',
  'dataset_processed': 'data/processed/user_features.feather'},
 'featurize': {'features_path': 'data/processed/features.feather',
  'categories': ['feature_17',
   'feature_21',
   'feature_11',
   'feature_16',
   'feature_22']},
 'data_split': {'split_oos': True,
  'test_size': 1,
  'train_index_path': 'data/processed/train_index.csv',
  'test_index_path': 'data/processed/test_index.csv'},
 'train': {'catboost_params': {'iterations': 20,
   'thread_count': 20,
   'has_time': True,
   'allow_writing_files': False},
  'top_K_coef': 0.05,
  'model_path': 'models/model.joblib',
  'train_metrics': 'reports/train_metrics.json',
  'train_metrics_path': 'reports/train_metrics.json',
  'train_metrics_png': 'reports/train_metrics.png',
  'train_plots_path': 'reports/tra

# Check if code works 


## Manual eye sanity checks

In [14]:
# Greet every kind of test mentioned in this notebook
list_of_tests = ['manual', 'print', 'unit', 'integration']

joined_test_names = ', '.join(list_of_tests)
print(f'Hello {joined_test_names}!')

Hello manual, print, unit, integration!


## Defensive programming: assertions 

In [15]:
# Assertion example

def hello_tests(list_of_tests):
    """Print a greeting that names every entry of ``list_of_tests``.

    Args:
        list_of_tests: sequence of strings to join into the greeting.

    Raises:
        AssertionError: with the message 'Test list is empty' when
            ``list_of_tests`` has no elements.
    """
    # Defensive check: fail fast on an empty input
    assert len(list_of_tests) > 0, 'Test list is empty'

    joined_names = ', '.join(list_of_tests)
    print(f'Hello {joined_names}!')

    
hello_tests(['manual', 'print', 'unit', 'integration'])

Hello manual, print, unit, integration!


In [16]:
hello_tests([])

AssertionError: Test list is empty

## Test example

In [22]:
# content of test_sample.py
def inc(x):
    """Return ``x`` increased by one."""
    incremented = x + 1
    return incremented


def test_answer():
    """Check that ``inc(3)`` yields 4 and does not yield 5."""
    result = inc(3)
    assert result == 4
    assert result != 5

In [23]:
test_answer()

# Tools review

## Working with Pandas DataFrames

In [4]:
import pandas as pd
import numpy as np

df = pd.DataFrame({'test': ['unit', 'print', 'unit', 'integration'],
                   'count': [50, 0, 10, 3],
                   'times': [100, 2050, 2050, 3232]})
df

Unnamed: 0,test,count,times
0,unit,50,100
1,print,0,2050
2,unit,10,2050
3,integration,3,3232


### Checking for duplicates and missing values

In [31]:
# Checking missing values: three equivalent ways to assert "no nulls anywhere"

assert df.notnull().all().all()
# Use `not` for logical negation: `~` is a bitwise operator and only behaves
# as intended while the operand is a numpy bool (`~True` on a Python bool is -2,
# which is truthy and would make the assertion pass unconditionally).
assert not df.isnull().any().any()
assert df.isnull().sum().sum() == 0

In [48]:
df.isnull()

Unnamed: 0,test,count,times
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [47]:
df.notna().all()

test     True
count    True
times    True
dtype: bool

In [52]:
df.duplicated() # Return boolean Series denoting duplicate rows.

0    False
1    False
2    False
3    False
dtype: bool

In [49]:
# duplicated() - Checking for duplicates
# any() - return whether any element is True, potentially over an axis.

# `not` (logical) rather than `~` (bitwise): `~` is only correct while
# `.any()` returns a numpy bool; on a Python bool it yields a truthy integer.
assert not df.duplicated().any()

In [5]:
if df.duplicated(subset=['test']).any():
    raise ValueError('Duplicate records found for "test" attribute')

ValueError: Duplicate records found for "test" attribute

### Pandas built in testing utilities 
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.testing.assert_frame_equal.html 

In [58]:
from pandas.testing import assert_frame_equal
from pandas.testing import assert_index_equal
from pandas.testing import assert_series_equal

In [63]:
# Import from the public `pandas.testing` namespace; `pandas._testing` is a
# private module with no stability guarantees.
from pandas.testing import assert_frame_equal

# Two frames with equal values: column 'b' holds integers in df1 ...
df1 = pd.DataFrame({'a': [1, 2], 
                    'b': [3, 4]})

# ... and floats in df2, so only the dtype of 'b' differs
df2 = pd.DataFrame({'a': [1, 2], 
                    'b': [3.0, 4.0]})

In [64]:
# df1 equals itself

assert_frame_equal(df1, df1)

In [65]:
# df1 differs from df2 as column ‘b’ is of a different type

assert_frame_equal(df1, df2)

AssertionError: Attributes of DataFrame.iloc[:, 1] (column name="b") are different

Attribute "dtype" are different
[left]:  int64
[right]: float64

In [66]:
# Ignore differing dtypes in columns with check_dtype.

assert_frame_equal(df1, df2, check_dtype=False)

## Pytest

Features:

- All test functions are prefixed with `test_`
- All modules containing tests are prefixed with `test_`


https://docs.pytest.org/en/stable/index.html

Move the following code to `test/test_sample.py` and run the `pytest` command in a terminal

```python
def inc(x):
    return x + 1


def test_answer():
    assert inc(3) == 5
```

Run tests in terminal: 
    
```bash 
pytest
```

In [34]:
! pytest

platform darwin -- Python 3.7.6, pytest-6.2.2, py-1.10.0, pluggy-0.13.1
rootdir: /Users/antongusarov/ML_REPA/github/predict-device-change
plugins: hypothesis-6.1.1
collected 2 items                                                              [0m

test/test_demo.py [31mF[0m[32m.[0m[31m                                                     [100%][0m

[31m[1m________________________________ test_answer_1 _________________________________[0m

    [94mdef[39;49;00m [92mtest_answer_1[39;49;00m():
>       [94massert[39;49;00m inc([94m3[39;49;00m) == [94m5[39;49;00m
[1m[31mE       assert 4 == 5[0m
[1m[31mE        +  where 4 = inc(3)[0m

[1m[31mtest/test_demo.py[0m:6: AssertionError
FAILED test/test_demo.py::test_answer_1 - assert 4 == 5


## Hypothesis
**Property-based** testing

With Hypothesis https://hypothesis.readthedocs.io/en/latest/

In [32]:
# Hypothesis - Automatic data generation for property based testing
from hypothesis import strategies as st

print('Examples of integers:')
print(st.integers().example())
print(st.integers().example())
print(st.integers().example())

Examples of integers:
0
-13
32573


In [91]:
st.text().example()

'0'

**Create Hypothesis test example**

- Create file: notebook/tests/demo_hypothesis.py 
- Add code: demo_hypothesis.py 
Example source: https://github.com/jesford/testing-in-data-science/blob/master/intro-to-testing-presentation.ipynb 

In [98]:
# By default, `pytest` only collects files named `test_*.py` or `*_test.py`

!pytest

platform darwin -- Python 3.7.6, pytest-6.2.2, py-1.10.0, pluggy-0.13.1
rootdir: /Users/antongusarov/ML_REPA/github/predict-device-change
plugins: hypothesis-6.1.1
collected 4 items                                                              [0m

test/test_demo.py [32m.[0m[31mF[0m[32m.[0m[31m                                                    [ 75%][0m
test/test_hypothesis_demo.py [32m.[0m[31m                                           [100%][0m

[31m[1m________________________________ test_inc_type _________________________________[0m

get_test_data = [(0, 1), (-2, -1), (-1, 0), (3, 4), (-1.0, 0.0)]

    [94mdef[39;49;00m [92mtest_inc_type[39;49;00m(get_test_data):
        [94mfor[39;49;00m data [95min[39;49;00m get_test_data:
            num = data[[94m0[39;49;00m]
            expected = data[[94m1[39;49;00m]
>           [94massert[39;49;00m [96misinstance[39;49;00m(inc(num), [96mint[39;49;00m)
[1m[31mE           assert False[0m
[1m[31mE        

In [95]:
# Specify test file name to run a specific test file 

!pytest test/test_hypothesis_demo.py

platform darwin -- Python 3.7.6, pytest-6.2.2, py-1.10.0, pluggy-0.13.1
rootdir: /Users/antongusarov/ML_REPA/github/predict-device-change
plugins: hypothesis-6.1.1
collected 1 item                                                               [0m

test/test_hypothesis_demo.py [32m.[0m[32m                                           [100%][0m



In [99]:
# Use `--hypothesis-show-statistics` to show details 

!pytest test/test_hypothesis_demo.py --hypothesis-show-statistics

platform darwin -- Python 3.7.6, pytest-6.2.2, py-1.10.0, pluggy-0.13.1
rootdir: /Users/antongusarov/ML_REPA/github/predict-device-change
plugins: hypothesis-6.1.1
collected 1 item                                                               [0m

test/test_hypothesis_demo.py [32m.[0m[32m                                           [100%][0m

test/test_hypothesis_demo.py::test_backwards_all_caps:

  - during reuse phase (0.00 seconds):
    - Typical runtimes: < 1ms, ~ 51% in data generation
    - 2 passing examples, 0 failing examples, 0 invalid examples

  - during generate phase (0.20 seconds):
    - Typical runtimes: 0-1 ms, ~ 71% in data generation
    - 98 passing examples, 0 failing examples, 2 invalid examples

  - Stopped because settings.max_examples=100




## Hypothesis + Pandas
https://hypothesis.readthedocs.io/en/latest/numpy.html#pandas

https://hypothesis.readthedocs.io/en/latest/numpy.html#numpy

In [102]:
from hypothesis import strategies as st
from hypothesis.extra.pandas import data_frames, column, range_indexes, series


# Generate example of sample scoring results
# Provides a strategy for producing a pandas.DataFrame:
data_frames([column('user_id',
                    elements=st.integers(min_value=0, max_value=100_000),
                    dtype=int, 
                    unique=True),
             column('prob_score',
                    elements=st.floats(min_value=0, max_value=1),
                    unique=True)]
           ).example()

Unnamed: 0,user_id,prob_score
0,56626,0.723226
1,49808,0.031927
2,83502,0.445731


In [103]:
# Example for features sample
data_frames(index=range_indexes(min_size=5, max_size=10),
            columns = [column('user_id', 
                               elements=st.integers(min_value=1, max_value=100000), 
                               dtype=int, 
                               unique=True),
                       column('month', 
                               elements=st.datetimes(
                                   min_value=pd.Timestamp(2020, 4, 30),
                                   max_value=pd.Timestamp(2020, 8, 31)),
                               unique=True),
                       column('feature_21',
                               elements=st.text(), 
                               unique=True)
            ]).example()


Unnamed: 0,user_id,month,feature_21
0,60692,2020-06-27 20:06:13.703764,ñ񜬅AÕ
1,37560,2020-06-23 03:35:27.974779,񖖭Û
2,4248,2020-06-01 22:08:57.506080,}􍔣¶§ü
3,78847,2020-06-03 01:25:22.491984,N򃏆ní³
4,85410,2020-06-07 04:27:12.981496,9
5,78726,2020-05-20 23:25:38.249477,z×
6,41674,2020-05-23 04:25:12.624769,


# Unit Testing

## Load feature data

In [104]:
# Data for the project
target_csv = 'data/raw/target.feather'
user_features_raw = 'data/raw/user_features.feather'

# Features
categories = ['feature_17', 'feature_21', 'feature_11', 'feature_16', 'feature_22']  

num_features = ['feature_1', 'feature_2', 'feature_3', 'feature_4',
                'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9',
                'feature_10', 'feature_12', 'feature_13', 'feature_14', 'feature_15',
                'feature_18', 'feature_19', 'feature_20', 'feature_23', 'feature_24',
                'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29',
                'feature_30']

In [110]:
target_df = pd.read_feather(target_csv)
target_df.head()

Unnamed: 0,user_id,month,target
0,0,2020-04-30,1.0
1,0,2020-05-31,0.0
2,0,2020-06-30,1.0
3,0,2020-07-31,0.0
4,0,2020-08-31,1.0


In [113]:
user_features_df = pd.read_feather(user_features_raw)
user_features_df = user_features_df.loc[user_features_df.user_id.isin(target_df.user_id),]
user_features_df.head()

Unnamed: 0,user_id,month,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30
0,0,2020-04-30,0.993121,-15,2.274309,18,2868,-1.305588,-0.097643,0.617778,...,H4V75OQHSRBLA,AOKOISPPQLWGKK,-3.146305,-1.655508,25,3.032537,-29960,-1.86466,-0.212668,-3.245333
1,0,2020-05-31,-1.416912,-145,-1.087891,-8,-1763,-1.322007,3.143865,-0.272231,...,AUEOMIKY5CRWBNWO7S,OORLO7PMHCZFEMSR,2.365636,-2.632201,-11,-2.373654,-173398,0.45899,1.376687,-0.46015
2,0,2020-06-30,0.673564,23,0.016666,-7,-4092,-0.936663,-1.909813,0.715618,...,4VPOKVAQSMMTDZQ,NZFU27MWLPZRTX4G5D,-2.220931,1.930994,-10,3.301401,-46619,0.26982,0.900846,0.315063
3,0,2020-07-31,-2.124908,-15,1.234815,18,9348,0.233297,-0.108647,-1.83146,...,RAGXKIMJHFFGKA,KQBIGTVRDJZJLQRRPPY,1.64178,-5.166544,-23,0.531913,-131224,-0.341103,0.208832,1.869453
4,0,2020-08-31,-2.122264,-50,-0.584464,-8,1894,-0.092315,-0.54748,-0.065323,...,RAGXKIMJHFFGKA,KQBIGTVRDJZJLQRRPPY,1.852084,-0.761511,-24,-1.080867,68577,-1.328331,-0.681723,0.431699


In [117]:
[user_features_df[feature].nunique() for feature in categories]

[42, 58, 43, 8, 44]

## Testing the feature engineering

### New feature extraction feature

- What can go wrong?
- Will it work in a production environment?
- How to test it?

In [118]:
def add_feature31(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``feature31`` column: the first character of each ``feature_21`` value.

    The column is written onto ``df`` itself, and that same object is returned.

    Raises:
        IndexError: if a ``feature_21`` value is an empty string.
    """
    device_codes = df.copy()['feature_21']
    df['feature31'] = device_codes.apply(lambda code: code[0])
    return df

df2 = add_feature31(user_features_df)
df2[['user_id', 'month', 'feature_21', 'feature31']].head()

Unnamed: 0,user_id,month,feature_21,feature31
0,0,2020-04-30,H4V75OQHSRBLA,H
1,0,2020-05-31,AUEOMIKY5CRWBNWO7S,A
2,0,2020-06-30,4VPOKVAQSMMTDZQ,4
3,0,2020-07-31,RAGXKIMJHFFGKA,R
4,0,2020-08-31,RAGXKIMJHFFGKA,R


In [119]:
df2.feature31.value_counts()

H    114859
R     72707
J     70910
A     56506
7     52357
D     52002
W     46512
N     46070
L     43986
M     34397
Q     27852
K     27698
6     23481
X     18273
O     14149
4     11053
E     10735
G      7666
V      7618
F      3746
Z      3683
3      2660
I      2530
U       663
C         8
B         4
5         2
T         1
Name: feature31, dtype: int64

In [120]:
# Test case 1: Null values in source data 
# ----------------------------------------
# Register '' as a new category. `add_categories(..., inplace=True)` was
# deprecated in pandas 1.3 and removed in 2.0, so assign the result back.
user_features_df['feature_21'] = user_features_df['feature_21'].cat.add_categories([''])

# Replace the first 4 rows by ''. A single positional indexer on the frame
# avoids the chained assignment `user_features_df.feature_21.iloc[:4] = ''`,
# which may write to a temporary copy.
feature_21_position = user_features_df.columns.get_loc('feature_21')
user_features_df.iloc[:4, feature_21_position] = ''

user_features_df.head().T # rotate for a better view

Unnamed: 0,0,1,2,3,4
user_id,0,0,0,0,0
month,2020-04-30 00:00:00,2020-05-31 00:00:00,2020-06-30 00:00:00,2020-07-31 00:00:00,2020-08-31 00:00:00
feature_1,0.993121,-1.41691,0.673564,-2.12491,-2.12226
feature_2,-15,-145,23,-15,-50
feature_3,2.27431,-1.08789,0.0166665,1.23482,-0.584464
feature_4,18,-8,-7,18,-8
feature_5,2868,-1763,-4092,9348,1894
feature_6,-1.30559,-1.32201,-0.936663,0.233297,-0.0923148
feature_7,-0.0976427,3.14386,-1.90981,-0.108647,-0.54748
feature_8,0.617778,-0.272231,0.715618,-1.83146,-0.0653226


In [121]:
# Try to add features 
df2 = add_feature31(user_features_df)
df2.head()

IndexError: string index out of range

In [122]:
# Reproduce error 
s = ''
s[0]

IndexError: string index out of range

### Generate test data

In [123]:
# Extract sample data from dataset 
test_data = user_features_df[['user_id', 'month', 'feature_21']][:5].to_dict(orient='list')
test_data

{'user_id': [0, 0, 0, 0, 0],
 'month': [Timestamp('2020-04-30 00:00:00'),
  Timestamp('2020-05-31 00:00:00'),
  Timestamp('2020-06-30 00:00:00'),
  Timestamp('2020-07-31 00:00:00'),
  Timestamp('2020-08-31 00:00:00')],
 'feature_21': ['', '', '', '', 'RAGXKIMJHFFGKA']}

In [124]:
# Generate test data
test_data = {
    'user_id': [0, 0, 0, 0, 0],
    'month': [
        pd.Timestamp('2020-04-30 00:00:00'),
        pd.Timestamp('2020-05-31 00:00:00'),
        pd.Timestamp('2020-06-30 00:00:00'),
        pd.Timestamp('2020-07-31 00:00:00'),
        pd.Timestamp('2020-08-31 00:00:00')],
     'feature_21': ['RAGXKIMJHFFGKA', '2322341', '!wersrqqw', None, '']
}

test_df = pd.DataFrame(test_data)
test_df.head()

Unnamed: 0,user_id,month,feature_21
0,0,2020-04-30,RAGXKIMJHFFGKA
1,0,2020-05-31,2322341
2,0,2020-06-30,!wersrqqw
3,0,2020-07-31,
4,0,2020-08-31,


### Update function

In [125]:
# Modify add_feature31()

def add_feature31(df): 
    """Add ``feature_31``: the first word character of the device code in ``feature_21``.

    The new column is written onto ``df`` itself, and that same object is
    returned. The placeholder string ``'None'`` is stored whenever no character
    can be extracted, i.e. when the value

    - is not a string (``None``, ``NaN``, ...),
    - is an empty string, or
    - contains no word character at all (e.g. ``'!!!'``).

    Args:
        df: DataFrame with a ``feature_21`` column of device codes.

    Returns:
        ``df`` with the ``feature_31`` column added.
    """

    def _first_word_char(code):
        # Missing values can arrive as None or as float NaN; neither can be
        # passed to the regex engine.
        if not isinstance(code, str):
            return 'None'
        # re.search yields None (instead of raising) when nothing matches,
        # which also covers the empty string.
        match = re.search(r'\w', code)
        return match.group(0) if match else 'None'

    df['feature_31'] = df.feature_21.apply(_first_word_char)
    return df
 
add_feature31(test_df)

Unnamed: 0,user_id,month,feature_21,feature_31
0,0,2020-04-30,RAGXKIMJHFFGKA,R
1,0,2020-05-31,2322341,2
2,0,2020-06-30,!wersrqqw,w
3,0,2020-07-31,,
4,0,2020-08-31,,


### Add test function

In [126]:
def test_add_feature31(test_df):
    """Compare the output of ``add_feature31(test_df)`` with a hand-built frame.

    The expected frame covers a regular code, a numeric code, a code starting
    with a non-word character, ``None`` and the empty string.
    """
    month_stamps = (
        '2020-04-30 00:00:00',
        '2020-05-31 00:00:00',
        '2020-06-30 00:00:00',
        '2020-07-31 00:00:00',
        '2020-08-31 00:00:00',
    )
    expected_df = pd.DataFrame({
        'user_id': [0, 0, 0, 0, 0],
        'month': [pd.Timestamp(stamp) for stamp in month_stamps],
        'feature_21': ['RAGXKIMJHFFGKA', '2322341', '!wersrqqw', None, ''],
        'feature_31': ['R', '2', 'w', 'None', 'None'],
    })

    calculated_df = add_feature31(test_df)
    assert calculated_df is not None

    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.testing.assert_frame_equal.html?highlight=assert
    comparison_options = dict(
        check_dtype=True,
        check_index_type='equiv',
        check_column_type='equiv',
        check_names=True,
        check_exact=True,
        check_categorical=True,
    )
    pd.testing.assert_frame_equal(
        left=calculated_df,
        right=expected_df,
        **comparison_options,
    )
    
    
test_add_feature31(test_df) 

### Add new function to src/data/features.py and run feature calculation pipeline 
- add function 
- add tests 

In [127]:
from src.pipelines.featurize import featurize

featurize(CONFIG_PATH)

2021-02-10 14:34:31,842 — FEATURIZE — INFO — Load dataset
2021-02-10 14:34:32,331 — FEATURIZE — INFO — Process dataset
2021-02-10 14:34:32,558 — FEATURIZE — INFO — Add target column
2021-02-10 14:34:33,425 — FEATURIZE — INFO — Process nulls
2021-02-10 14:34:33,907 — FEATURIZE — INFO — Save features
2021-02-10 14:34:35,860 — FEATURIZE — DEBUG — Features path: data/processed/features.feather


In [128]:
df2.feature31.value_counts()

H    114859
R     72707
J     70910
A     56506
7     52357
D     52002
W     46512
N     46070
L     43986
M     34397
Q     27852
K     27698
6     23481
X     18273
O     14149
4     11053
E     10735
G      7666
V      7618
F      3746
Z      3683
3      2660
I      2530
U       663
C         8
B         4
5         2
T         1
Name: feature31, dtype: int64

## Testing featurized data properties

- All features have same data type 
- Numeric features are scaled in range between 0 and 1
- No missing data 
- Missing data is replaced by some values
- Data distributions meet expectations
- No outliers


In [129]:
# Generate sample scoring results 
df = data_frames(index=range_indexes(min_size=5, max_size=10),
                 columns = 
                [column('user_id',
                    elements=st.integers(min_value=0, max_value=100_000),
                    dtype=int, 
                    unique=True),
                 column('prob_score',
                    elements=st.floats(min_value=0, max_value=1),
                    unique=True)
                ]).example()

df.head()

Unnamed: 0,user_id,prob_score
0,78213,0.678828
1,22804,0.247912
2,87087,0.091005
3,82303,0.480749
4,88798,0.673258


### Test values range 

In [130]:
def test_prob_score_range(df):
    """Test that score values range in [0, 1]
    """
    assert df.prob_score.between(0, 1, inclusive=True).all() 

In [131]:
test_prob_score_range(df)

### Test data type

In [132]:
def test_prob_score_dtype(df):
    """test dtype is float
    """ 
    
    # most robust and native way to achieve dtype recognition
    assert pd.api.types.is_float_dtype(df.prob_score) 
    
    # test score dtype for a sigle value
    assert isinstance(df.prob_score[0], float)

test_prob_score_dtype(df)

### Test no missing and duplicates 

In [133]:
def test_prob_score_na(df):
    """Test no missing and duplicates
    """
    assert df.notnull().all().all()
    assert ~df.duplicated().any()

test_prob_score_na(df)

## Test inputs 

### Generate data schema 

Best practice is to use a schema - a specification of rules or data properties expected for a set of fields. 

Example schema for Iris dataset: 
```python
iris_schema = {
    'sepal length': {
        'range': {
            'min': 4.0, 
            'max': 8.0
        },
        'dtype': float,
    },
    'sepal width': {
        'range': {
            'min': 1.0,
            'max': 5.0
        },
        'dtype': float,
    },
    'petal length': {
        'range': {
            'min': 1.0,
            'max': 7.0
        },
        'dtype': float,
    },
    'petal width': {
        'range': {
            'min': 0.1,
            'max': 3.0
        },
        'dtype': float,
    }
}
```

In [134]:
import pprint

def gen_category_schema(df, categories, num_features):
    """Generate a data schema for category and numeric features.

    Args:
        df: DataFrame containing all columns named in ``categories`` and
            ``num_features``.
        categories: names of the categorical feature columns.
        num_features: names of the numeric feature columns.

    Returns:
        Dict mapping each feature name to a dict of its properties:

        - categorical features: the entries reported by ``Series.describe()``
          (count, top, freq for non-numeric columns), with ``'unique'``
          replaced by the list of distinct values, plus ``'nunique'`` and
          ``'dtype'`` (always the string ``'category'``);
        - numeric features: the entries reported by ``Series.describe()``
          (count, mean, std, min, quartiles, max) plus ``'dtype'`` as a string.
    """

    schema = {}

    # Build the schema one column at a time. Applying a list-returning function
    # over the whole frame collapses into a DataFrame whenever every feature
    # has the same number of distinct values, which corrupts the 'unique' entry.
    for feature in categories:
        column = df[feature]
        properties = column.describe().to_dict()
        properties['unique'] = column.unique().tolist()
        properties['nunique'] = column.nunique()
        properties['dtype'] = 'category'
        schema[feature] = properties

    # `DataFrame.append` was removed in pandas 2.0; adding the dtype to the
    # per-column dict needs no frame concatenation at all.
    for feature in num_features:
        column = df[feature]
        properties = column.describe().to_dict()
        properties['dtype'] = str(column.dtype)
        schema[feature] = properties

    return schema 

dschema = gen_category_schema(user_features_df, categories, num_features)
pprint.pprint(dschema)

{'feature_1': {'25%': -1.1430581298920903,
               '50%': 0.03319521301712278,
               '75%': 1.2199566293957183,
               'count': 752128.0,
               'dtype': 'float64',
               'max': 8.71482774873068,
               'mean': 0.04383947463088342,
               'min': -8.908541643807217,
               'std': 1.774547872522771},
 'feature_10': {'25%': -1.0304815357099264,
                '50%': 0.18618795439381358,
                '75%': 1.4703034668456332,
                'count': 752128.0,
                'dtype': 'float64',
                'max': 10.148381364385477,
                'mean': 0.23737338661451513,
                'min': -9.170694699579633,
                'std': 1.9071496456722656},
 'feature_11': {'count': 752128,
                'dtype': 'category',
                'freq': 69896,
                'nunique': 43,
                'top': 'MHQITDLH6CZQ',
                'unique': ['3NWLPIR2TFFUM62',
                           'JODPUBMS7GRNB

In [135]:
s  = set(df.columns) - set(['count'])
s

{'prob_score', 'user_id'}

In [136]:
for i in s: print(i)

prob_score
user_id


### Test input data types

In [137]:
def test_input_data_types(df, dschema):

    for feature in dschema.keys():       
        assert df[feature].dtype.__str__() == dschema[feature]['dtype'], f'Data type test failed for {feature}'
            
test_input_data_types(user_features_df, dschema)

- Break the test example:

In [138]:
# Create new DF
user_features_df2 = user_features_df.copy()

# Change dtype for 'feature_1'
user_features_df2['feature_1'] = user_features_df2['feature_1'].astype('str')

# Run test
test_input_data_types(user_features_df2, dschema)

AssertionError: Data type test failed for feature_1

### Test numeric data ranges (min, max)

In [139]:
def test_numeric_data_ranges(df, dschema, num_features):

    for feature in num_features:
        # use assertions to ensure the max/min values found in the dataset
        assert df[feature].max() <= dschema[feature]['max'], f'Max value test failed for {feature}'
        assert df[feature].min() >= dschema[feature]['min'], f'Min value test failed for {feature}'

test_numeric_data_ranges(user_features_df, dschema, num_features)

- Try to break the test:

In [140]:
# Create a new DF
user_features_df2 = user_features_df.copy()

# Check max value
user_features_df2['feature_2'].max()
print(f"Old max value: {user_features_df2['feature_2'].max()}")

# Simulate max value change
user_features_df2.loc[:0, 'feature_2'] = user_features_df2['feature_2'].max() * 10
print(f"New max value: {user_features_df2['feature_2'].max()}")

Old max value: 450
New max value: 4500


In [141]:
# Run test for new data

test_numeric_data_ranges(user_features_df2, dschema, num_features)

AssertionError: Max value test failed for feature_2

### Test category data values

In [142]:
def test_category_data_values(df, dschema, categories):

    for feature in categories:
        set_a = df.loc[:, feature].unique().tolist()

        # use assertions to ensure the feature categories exist in schema
        cat_dif = list(set(set_a) - set(dschema[feature]['unique']))
        assert len(cat_dif) == 0, f'DF has categories not in shema: {cat_dif}'

test_category_data_values(user_features_df, dschema, categories)

- Break this test example:

In [143]:
# Add a new value 'Unknown' that the schema does not list for 'feature_17'
feature_17_categories = user_features_df2['feature_17'].tolist()
feature_17_categories[0] = 'Unknown'

# Assigning the plain list replaces the whole column (its dtype becomes object).
# The intermediate `pd.Categorical(...)` assignment was dropped: it was
# overwritten by this line immediately and had no effect on the result.
user_features_df2['feature_17'] = feature_17_categories
user_features_df2['feature_17'].head()


0             Unknown
1    OUH6V7W7UIPZ2AZI
2    OUH6V7W7UIPZ2AZI
3        E62S2GPTI3CU
4        E62S2GPTI3CU
Name: feature_17, dtype: object

In [144]:
# Run test 

test_category_data_values(user_features_df2, dschema, categories)

AssertionError: DF has categories not in shema: ['Unknown']

## Testing pipeline config 

In [154]:
import os

# Resolve the config file relative to the current working directory
BASE_PATH = os.getcwd()
CONFIG_PATH = os.path.join(BASE_PATH, 'config', 'params.yaml')

# The context manager closes the file handle once the YAML is parsed
with open(CONFIG_PATH) as config_file:
    config = yaml.safe_load(config_file)
config

{'base': {'project_dir': '.', 'random_state': 42, 'log_level': 'DEBUG'},
 'data_load': {'target': 'data/raw/target.feather',
  'dataset': 'data/raw/user_features.feather',
  'target_processed': 'data/processed/target.feather',
  'dataset_processed': 'data/processed/user_features.feather'},
 'featurize': {'features_path': 'data/processed/features.feather',
  'categories': ['feature_17',
   'feature_21',
   'feature_11',
   'feature_16',
   'feature_22']},
 'data_split': {'split_oos': True,
  'test_size': 1,
  'train_index_path': 'data/processed/train_index.csv',
  'test_index_path': 'data/processed/test_index.csv'},
 'train': {'catboost_params': {'iterations': 20,
   'thread_count': 20,
   'has_time': True,
   'allow_writing_files': False},
  'top_K_coef': 0.05,
  'model_path': 'models/model.joblib',
  'train_metrics': 'reports/train_metrics.json',
  'train_metrics_path': 'reports/train_metrics.json',
  'train_metrics_png': 'reports/train_metrics.png',
  'train_plots_path': 'reports/tra

### Test required fields

In [155]:
config.keys()

dict_keys(['base', 'data_load', 'featurize', 'data_split', 'train'])

In [160]:
def test_check_required_fields(config):
    """Test that we not missed the required param sections
    """
    assert 'base' in config.keys()
    assert 'data_load' in config.keys()
    assert 'target' in config['data_load'].keys()
    assert 'test_size' in config['data_split'].keys()
    assert 'train' in config.keys()
    

test_check_required_fields(config)

### Test config value types

In [164]:
def test_config_value_types(config):
    
    assert isinstance(config['data_split']['test_size'], float), "Incorrect Type"
    
test_config_value_types(config)

AssertionError: Incorrect Type

### Test Model params

In [167]:
from catboost import CatBoostClassifier 

def test_model_params(config):
    """Test that the estimator can be created with the params in config.

    Also requires the ``loss_function`` entry of
    ``config['train']['catboost_params']`` to equal ``'Logloss'``; a missing
    entry surfaces as a KeyError.
    """
    catboost_params = config['train']['catboost_params']

    # initialize model with params from config
    estimator = CatBoostClassifier(**catboost_params)
    assert estimator

    assert catboost_params['loss_function'] == 'Logloss'
    
test_model_params(config)

KeyError: 'loss_function'

# Run all tests

To run all tests 
```bash
pytest
```

To run a specific test  
```bash
pytest tests/test_config.py
```

In [157]:
%%bash 
pytest

platform darwin -- Python 3.7.6, pytest-6.2.2, py-1.10.0, pluggy-0.13.1
rootdir: /Users/antongusarov/ML_REPA/github/predict-device-change
plugins: hypothesis-6.1.1
collected 5 items

test/property_based_test.py F                                            [ 20%]
test/test_demo.py ...                                                    [ 80%]
test/test_hypothesis_demo.py .                                           [100%]

__________________________________ test_sum_4 __________________________________

    @settings(verbosity=Verbosity.verbose)
>   @given(st.integers(), st.integers())
    def test_sum_4(num1, num2):

test/property_based_test.py:62: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

num1 = 0, num2 = 0

    @settings(verbosity=Verbosity.verbose)
    @given(st.integers(), st.integers())
    def test_sum_4(num1, num2):
>       assert my_sum(num1, num2) == num1 + num2
E       assert 1 == (0 + 0)
E        +  where 1 = my_sum(0, 0)

test/property

CalledProcessError: Command 'b'pytest\n'' returned non-zero exit status 1.