Test cases:

1. Produces the expected output.
2. Raises errors when appropriate (unfitted processor, format errors, etc.).
3. Produces the correct (or nearly correct) proportion of NAs.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): absolute, machine-specific path -- this cell only works where
# this directory exists. Presumably it points at the repository root so that
# `import PETsARD` resolves; confirm and consider a configurable path.
%cd /Users/alex/PETsARD

/Users/alex/PETsARD


In [3]:
import PETsARD

In [4]:
def metadata_builder(data):
    """Build the metadata dictionary describing a DataFrame.

    Args:
        data (pd.DataFrame): The table to describe.

    Returns:
        dict: A dictionary with two entries:
            - 'metadata_col': maps every column label to
              ``{'dtype': <column dtype>, 'na_percentage': <fraction of
              missing values in that column>}``.
            - 'metadata_global': ``{'row_num', 'col_num', 'na_percentage'}``
              where 'na_percentage' is the fraction of rows that contain at
              least one missing value.

    Raises:
        ValueError: If the column labels are not unique (a label-keyed
            dictionary cannot represent them).
    """
    if not data.columns.is_unique:
        raise ValueError('Column names must be unique to build metadata.')

    # Build the per-column entries directly instead of going through
    # reset_index()/merge(on='index'): that route depends on the helper column
    # being literally called 'index', which is not the case when the columns
    # Index carries a name.
    na_share_by_col = data.isna().mean(axis=0)
    metadata_col = {
        col: {'dtype': dtype, 'na_percentage': float(na_share)}
        for col, dtype, na_share in zip(data.columns, data.dtypes, na_share_by_col)
    }

    metadata_global = {
        'row_num': data.shape[0],
        'col_num': data.shape[1],
        # share of rows having a missing value in any column
        'na_percentage': data.isna().any(axis=1).mean(),
    }

    return {'metadata_col': metadata_col, 'metadata_global': metadata_global}

In [5]:
# Load the NHANES CSV. header_exist=False is passed, so the twelve column
# names are supplied explicitly through header_names.
loader = PETsARD.Loader(
    filepath='[NHANES] B.csv',
    header_exist=False,
    header_names=[
        'gen', 'age', 'race', 'edu', 'mar', 'bmi',
        'dep', 'pir', 'gh', 'mets', 'qm', 'dia',
    ],
)
# Peek at the first row to confirm the column names were applied.
print(loader.data.head(1))

    gen   age   race       edu      mar        bmi  dep  pir   gh  mets  qm  \
0  Male  62.0  White  Graduate  Married  27.799999    0    0  0.0   0.0  Q2   

   dia  
0    1  


In [6]:
# Short alias for the loaded NHANES table (plain assignment; no copy is made here).
df = loader.data

In [7]:
# Derive the metadata (per-column dtype and NA share, plus global shape/NA
# figures) from the data itself via metadata_builder.
m = metadata_builder(df)

In [8]:
# Hand-written metadata for the NHANES table. Every column entry has the same
# shape, so the entries are generated from (column, type) pairs.
# NOTE(review): this dict uses the key 'type', whereas metadata_builder emits
# 'dtype'; the HyperProcessor in this notebook is constructed from `m`, not
# from this dict.
metadata = {
    'metadata_col': {
        column: {'type': kind, 'na_percentage': 0.0}
        for column, kind in [
            ('gen', 'categorical'),
            ('age', 'numerical'),
            ('race', 'categorical'),
            ('edu', 'categorical'),
            ('mar', 'categorical'),
            ('bmi', 'numerical'),
            ('dep', 'numerical'),
            ('pir', 'numerical'),
            ('gh', 'numerical'),
            ('mets', 'numerical'),
            ('qm', 'categorical'),
            ('dia', 'numerical'),
        ]
    },
    'metadata_global': {
        'row_num': 4190,
        'col_num': 12,
        'na_percentage': 0.0,
    },
}

In [9]:
# Build the processor from the auto-generated metadata `m`
# (the hand-written `metadata` dict is not what gets passed here).
pm = PETsARD.HyperProcessor(m)

In [10]:
from PETsARD.Processor.Missingist import *
from PETsARD.Processor.Outlierist import *
from PETsARD.Processor.Encoder import *
from PETsARD.Processor.Scaler import *

In [11]:
# df.loc[[1,2,3,4,5], 'gen'] = np.nan
# df.loc[[10,20,30,40,5], 'age'] = np.nan

In [12]:
# test global outlierist transformation
#
# Processor choices for the 'gen' and 'age' columns, grouped by processing
# stage; other columns are not mentioned in this config.
config_1 = dict(
    missingist=dict(gen=Missingist_Drop(), age=Missingist_Mean()),
    outlierist=dict(gen=None, age=Outlierist_ZScore()),
    encoder=dict(gen=Encoder_Label()),
    scaler=dict(age=Scaler_Log()),
)

In [13]:
# Hand the per-column choices in config_1 to the processor.
pm.update_config(config_1)

In [14]:
# Show the resulting configuration to check which processor each column ended up with.
pm.get_config()

{'missingist': {'gen': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152af85b0>,
  'age': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152afb070>,
  'race': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152af8bb0>,
  'edu': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152af9cf0>,
  'mar': <PETsARD.Processor.Missingist.Missingist_Drop at 0x11018c220>,
  'bmi': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152af9d80>,
  'dep': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152af9f60>,
  'pir': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152afb700>,
  'gh': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152afb5e0>,
  'mets': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152afb4c0>,
  'qm': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152afb250>,
  'dia': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152afb2b0>},
 'outlierist': {'gen': None,
  'age': <PETsARD.Processor.Outlierist.Outlierist_ZScore at 0x152aaf580>,
  'race': None,
  

In [15]:
# Fit the configured processors on the NHANES data.
pm.fit(df)

In [16]:
# Forward pass: run the fitted processors over the same data they were fitted on.
df_transformed = pm.transform(df)

In [17]:
# Backward pass: map the transformed frame back for the round-trip check below.
df_inverse = pm.inverse_transform(df_transformed)

  adjusted_na_percentage = self._metadata['metadata_col'][col].get('na_percentage', 0.0)\


In [18]:
# Display the round-tripped frame for visual inspection.
df_inverse

Unnamed: 0,gen,age,race,edu,mar,bmi,dep,pir,gh,mets,qm,dia
0,Female,72.000000,Mexican,11th,Separated,28.6,0.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
1,Male,46.000000,White,Graduate,Parther,27.6,0.0,2.775558e-17,0.0,0.0,Q3,2.775558e-17
2,Male,45.000004,Other,11th,Never,24.1,0.0,2.775558e-17,0.0,0.0,Q3,2.775558e-17
3,Female,30.000002,Hispanic,College,Parther,26.6,0.0,2.775558e-17,0.0,0.0,Q4,2.775558e-17
4,Female,27.000002,Black,College,Never,38.0,0.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
...,...,...,...,...,...,...,...,...,...,...,...,...
2144,Female,41.000000,Other,Graduate,Married,20.9,0.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
2145,Male,34.000000,Black,Graduate,Married,30.9,0.0,2.775558e-17,0.0,0.0,Q3,2.775558e-17
2146,Male,25.000002,Other,Graduate,Never,21.0,0.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
2147,Female,80.000000,White,College,Widowed,31.0,0.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17


In [19]:
# Round-trip check: compare the inverse-transformed frame with the original,
# exactly for all columns and approximately for the numeric 'age' and 'bmi'.
# The comparisons raise when the two frames cannot be aligned (presumably the
# case here because rows were removed during transform -- confirm), and then
# the check cannot be made. Catch Exception rather than using a bare `except:`
# so KeyboardInterrupt/SystemExit still propagate, and report the cause
# instead of hiding it.
try:
    print(f'Inverse successful:\n{(df_inverse == df).all()}')
    print(f'Numeric close (age): {np.isclose(df_inverse.age, df.age).all()}')
    print(f'Numeric close (bmi): {np.isclose(df_inverse.bmi, df.bmi).all()}')
except Exception as err:
    print(f'Not applicable. ({type(err).__name__}: {err})')

Not applicable.


In [20]:
# Load the Adult Income CSV; '?' is passed as the NA marker for the three
# listed columns.
loader2 = PETsARD.Loader(
    filepath='[Adt Income] adult.csv',
    na_values=dict.fromkeys(['workclass', 'occupation', 'native-country'], '?'),
)
# Peek at the first row of the loaded table.
print(loader2.data.head(1))

   age workclass  fnlwgt education  educational-num marital-status  \
0   25   Private  226802      11th                7  Never-married   

          occupation relationship   race gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black   Male             0             0   

   hours-per-week native-country income  
0              40  United-States  <=50K  


In [21]:
# Short alias for the loaded Adult Income table (plain assignment; no copy is made here).
df2 = loader2.data

In [22]:
# Hand-written metadata for the Adult Income table, generated from
# (column, type, NA share) triples.
# NOTE(review): this dict uses the key 'type', whereas metadata_builder emits
# 'dtype'; the HyperProcessor in this notebook is constructed from `m2`, not
# from this dict.
metadata2 = {
    'metadata_col': {
        column: {'type': kind, 'na_percentage': na_share}
        for column, kind, na_share in [
            ('age', 'numerical', 0.0),
            ('workclass', 'categorical', 0.057307),
            ('fnlwgt', 'numerical', 0.0),
            ('education', 'categorical', 0.0),
            ('educational-num', 'numerical', 0.0),
            ('marital-status', 'categorical', 0.0),
            ('occupation', 'categorical', 0.057512),
            ('relationship', 'categorical', 0.0),
            ('race', 'categorical', 0.0),
            ('gender', 'categorical', 0.0),
            ('capital-gain', 'numerical', 0.0),
            ('capital-loss', 'numerical', 0.0),
            ('hours-per-week', 'numerical', 0.0),
            ('native-country', 'categorical', 0.017546),
            ('income', 'categorical', 0.0),
        ]
    },
    'metadata_global': {
        'row_num': 48842,
        'col_num': 15,
        'na_percentage': 0.07411653904426518,
    },
}

In [23]:
# Auto-generate the metadata for the Adult Income table.
m2 = metadata_builder(df2)

In [24]:
# Processor built from the auto-generated metadata `m2`; no update_config call
# follows for pm2 in the cells shown, so its initial configuration is used.
pm2 = PETsARD.HyperProcessor(m2)

In [25]:
# Inspect which processor each column was given.
pm2.get_config()

{'missingist': {'age': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152aaf4c0>,
  'workclass': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152c9dc90>,
  'fnlwgt': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152c9f2e0>,
  'education': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152c9f550>,
  'educational-num': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152c9db70>,
  'marital-status': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152c9ef50>,
  'occupation': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152c9e6e0>,
  'relationship': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152c9dc60>,
  'race': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152c9ebf0>,
  'gender': <PETsARD.Processor.Missingist.Missingist_Drop at 0x152c9eb00>,
  'capital-gain': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152c9e0e0>,
  'capital-loss': <PETsARD.Processor.Missingist.Missingist_Mean at 0x152c9ee00>,
  'hours-per-week': <PETsARD.Processor.Missing

In [26]:
# Fit the processors on the Adult Income data.
pm2.fit(df2)

In [27]:
# Forward pass over the same data the processors were fitted on.
df_transformed2 = pm2.transform(df2)

In [28]:
# Display the transformed frame for visual inspection.
df_transformed2

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,-0.995129,0.332021,0.351675,0.830750,-1.197259,0.484959,0.773202,0.732397,0.932198,0.127802,-0.144804,-0.217127,-0.034087,0.807281,0.657816
1,-0.046942,0.706556,-0.945524,0.168986,-0.419335,0.136328,0.937355,0.107684,0.745161,0.535248,-0.144804,-0.217127,0.772930,0.839387,0.500110
2,-0.776316,0.836402,1.394723,0.859869,0.747550,0.208799,0.979288,0.022567,0.846148,0.001545,-0.144804,-0.217127,-0.034087,0.854957,0.917141
3,-1.068066,0.504879,1.704525,0.476531,-0.030373,0.613341,0.720118,0.914734,0.541244,0.814146,-0.144804,-0.217127,-0.034087,0.234850,0.271649
4,-0.192816,0.978619,0.215911,0.642939,1.136512,0.364024,0.504195,0.043033,0.421443,0.116949,-0.144804,-0.217127,-0.034087,0.343178,0.468669
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26928,1.047121,0.726950,1.251867,0.722514,1.525474,0.003419,0.347428,0.053142,0.286519,0.648909,-0.144804,-0.217127,-0.034087,0.025041,0.767936
26929,-1.213941,0.484523,1.140952,0.373302,-0.030373,0.747606,0.979963,0.508238,0.528485,0.655147,-0.144804,-0.217127,-0.034087,0.540582,0.203961
26930,-0.849254,0.697513,0.640492,0.873538,0.747550,0.237407,0.972465,0.961112,0.308212,0.752251,-0.144804,-0.217127,-0.195490,0.571306,0.309038
26931,0.098933,0.022121,-0.334178,0.214839,-0.419335,0.381083,0.805290,0.083662,0.266699,0.097607,-0.144804,-0.217127,-0.034087,0.488695,0.981795


In [29]:
# Backward pass: map the transformed frame back to the original representation.
df_inverse2 = pm2.inverse_transform(df_transformed2)

In [30]:
# Display the round-tripped frame; compare by eye with df2 shown further down.
df_inverse2

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25.0,,226802.0,11th,7.0,Never-married,,Own-child,Black,Male,0.0,0.0,40.0,,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,24.0,Private,369667.0,Some-college,10.0,Never-married,Other-service,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
4,36.0,Federal-gov,212465.0,Bachelors,13.0,Married-civ-spouse,Adm-clerical,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26928,53.0,Private,321865.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
26929,22.0,Private,310152.0,Some-college,10.0,Never-married,Protective-serv,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
26930,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,<=50K
26931,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K


In [31]:
# Display the original Adult Income frame as the reference for the round trip.
df2

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [32]:
# Per-column share of missing values after the round trip, to compare with the
# per-column na_percentage figures of the original data (test case 3).
df_inverse2.isna().mean(axis=0)

age                0.000000
workclass          0.057290
fnlwgt             0.000000
education          0.000000
educational-num    0.000000
marital-status     0.000000
occupation         0.057476
relationship       0.000000
race               0.000000
gender             0.000000
capital-gain       0.000000
capital-loss       0.000000
hours-per-week     0.000000
native-country     0.017525
income             0.000000
dtype: float64

In [33]:
# Share of rows with at least one missing value after the round trip, to
# compare with the global na_percentage of the original data.
df_inverse2.isna().any(axis=1).mean()

0.07128801099023503

In [34]:
# Inspect the dtype of each column of the loaded Adult Income table.
df2.dtypes

age                    int8
workclass          category
fnlwgt                int32
education          category
educational-num        int8
marital-status     category
occupation         category
relationship       category
race               category
gender             category
capital-gain          int32
capital-loss          int16
hours-per-week         int8
native-country     category
income             category
dtype: object

In [35]:
# Check whether pandas classifies the dtype of the 'age' column as numeric.
pd.api.types.is_numeric_dtype(df2.age.dtypes)

True

In [37]:
# Load -> split -> preprocess -> synthesize pipeline on the Adult Income data.

# NOTE(review): rebinds `loader`, which earlier in the notebook held the
# NHANES loader.
loader = PETsARD.Loader(
    filepath='[Adt Income] adult.csv',
    na_values=dict.fromkeys(['workclass', 'occupation', 'native-country'], '?'),
)

# One sample with train_split_ratio=0.8.
splitter = PETsARD.Splitter(
    data=loader.data,
    num_samples=1,
    train_split_ratio=0.8,
)

# Preprocess the 'train' part of sample 1: all four stages are switched on,
# each with its method and column selection.
preproc = PETsARD.Preprocessor(
    data=splitter.data[1]['train'],
    missing=True,
    missing_method='drop',
    missing_columns=None,
    outlier=True,
    outlier_method='IQR',
    outlier_columns={'ignore': ['hours-per-week']},
    encoding=True,
    encoding_method='Label',
    encoding_columns=None,
    scaling=True,
    scaling_method='Standard',
    scaling_columns={'focus': 'fnlwgt'},
)

# Fit the synthesizer on the preprocessed data and draw synthetic samples.
synthesizer = PETsARD.Synthesizer(
    data=preproc.data,
    synthesizing_method='sdv-singletable-gaussiancoupula',
)
synthesizer.fit_sample()

Preprocessor - Outlierist (IQR): Dropped   232 rows on educational-num. Kept [3.0, 19.0] only.
Preprocessor - Outlierist (IQR): Dropped  3002 rows on capital-gain   . Kept [0.0, 0.0] only.
Preprocessor - Outlierist (IQR): Dropped  1693 rows on capital-loss   . Kept [0.0, 0.0] only.
Preprocessor - Outlierist (IQR): Dropped   222 rows on age            . Kept [-0.5, 75.5] only.
Preprocessor - Outlierist (IQR): Dropped  1028 rows on fnlwgt         . Kept [-63828.75, 419349.25] only.
Preprocessor - Outlierist (IQR): Totally Dropped  5985 in 36182 rows.
Preprocessor - Encoder (Label): Column education       been labelized from 0 to 13.
Preprocessor - Encoder (Label): Column native-country  been labelized from 0 to 39.
Preprocessor - Encoder (Label): Column workclass       been labelized from 0 to  6.
Preprocessor - Encoder (Label): Column marital-status  been labelized from 0 to  6.
Preprocessor - Encoder (Label): Column relationship    been labelized from 0 to  5.
Preprocessor - Encoder (L

  return _boost._beta_ppf(q, a, b)


In [38]:
# Reverse the preprocessing on the synthetic data. Each processor is looked up
# on `preproc` with getattr and falls back to None when the attribute is absent.
# NOTE(review): the recorded output of this cell ends with
# "ValueError: cannot convert float NaN to integer"; the cause is not visible
# in this notebook.
postproc = PETsARD.Postprocessor(
    data=synthesizer.data_syn,
    missingist=getattr(preproc, 'missingist', None),
    encoder=getattr(preproc, 'encoder', None),
    scaler=getattr(preproc, 'scaler', None),
)

Postprocessor - Scaler (StandardScaler): Decoding fnlwgt.
Postprocessor - Encoder (LabelEncoder): Decoding education.
Postprocessor - Encoder (LabelEncoder): Decoding native-country.
Postprocessor - Encoder (LabelEncoder): Decoding workclass.
Postprocessor - Encoder (LabelEncoder): Decoding marital-status.
Postprocessor - Encoder (LabelEncoder): Decoding relationship.
Postprocessor - Encoder (LabelEncoder): Decoding occupation.
Postprocessor - Encoder (LabelEncoder): Decoding income.
Postprocessor - Encoder (LabelEncoder): Decoding gender.
Postprocessor - Encoder (LabelEncoder): Decoding race.


ValueError: cannot convert float NaN to integer