Test cases:

1. The expected output is produced.
2. Errors are raised where appropriate (unfitted, format error, etc.).
3. The correct (or nearly correct) amount of NAs is produced.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): absolute, user-specific path — the notebook only runs as-is on a machine that has this directory.
%cd /Users/alex/PETsARD

/Users/alex/PETsARD


In [3]:
import PETsARD

In [4]:
# Load '[NHANES] B.csv' with header_exist=False and explicit column names,
# then print the first row as a quick sanity check.
loader = PETsARD.Loader(
    filepath='[NHANES] B.csv',
    header_exist=False,
    header_names=[
        'gen', 'age', 'race', 'edu', 'mar', 'bmi',
        'dep', 'pir', 'gh', 'mets', 'qm', 'dia',
    ],
)
print(loader.data.head(1))

    gen   age   race       edu      mar        bmi  dep  pir   gh  mets  qm  \
0  Male  62.0  White  Graduate  Married  27.799999    0    0  0.0   0.0  Q2   

   dia  
0    1  


In [5]:
df = loader.data

In [6]:
m = PETsARD.Metadata()
m.build_metadata(df)

In [7]:
# Hand-written metadata: one {'type', 'na_percentage'} entry per column plus
# table-level counts. Column order follows the header names given to the loader.
# NOTE(review): not referenced by the later visible cells, which use `m` instead.
metadata = {
    'col': {
        column: {'type': kind, 'na_percentage': 0.0}
        for column, kind in (
            ('gen', 'categorical'),
            ('age', 'numerical'),
            ('race', 'categorical'),
            ('edu', 'categorical'),
            ('mar', 'categorical'),
            ('bmi', 'numerical'),
            ('dep', 'numerical'),
            ('pir', 'numerical'),
            ('gh', 'numerical'),
            ('mets', 'numerical'),
            ('qm', 'categorical'),
            ('dia', 'numerical'),
        )
    },
    'global': {
        'row_num': 4190,
        'col_num': 12,
        'na_percentage': 0.0,
    },
}

In [8]:
pm = PETsARD.Processor.Base.Processor(m)

No self-defined config passed.  Generate a config automatically.


In [9]:
from PETsARD.Processor.Missingist import *
from PETsARD.Processor.Outlierist import *
from PETsARD.Processor.Encoder import *
from PETsARD.Processor.Scaler import *

In [10]:
# df.loc[[1,2,3,4,5], 'gen'] = np.nan
# df.loc[[10,20,30,40,5], 'age'] = np.nan

In [11]:
# test global outlierist transformation

# Per-column overrides expressed as processor instances, keyed first by
# processor stage and then by column name.
config_1 = {
    'missingist': {
        'gen': MissingistDrop(),
        'age': MissingistSimple(5),
    },
    'outlierist': {
        'gen': None,
        'age': OutlieristZScore(),
    },
    'encoder': {
        'gen': EncoderLabel(),
    },
    'scaler': {
        'age': ScalerLog(),
    },
}

In [12]:
# Per-column overrides given mostly as string identifiers (one instance for
# 'gen' under 'missingist'), keyed by processor stage and then column name.
config_2 = {
    'missingist': {
        'gen': MissingistDrop(),
        'age': 'missingist_simple',
    },
    'outlierist': {
        'gen': None,
        'age': 'outlierist_lof',
    },
    'encoder': {
        'gen': 'encoder_uniform',
    },
    'scaler': {
        'age': 'scaler_minmax',
    },
}

In [13]:
# Apply the overrides from config_2 to the processor's config.
# NOTE(review): config_1 is defined above but is never passed to the processor in the visible cells.
pm.update_config(config_2)

In [14]:
pm.get_config()

{'missingist': {'gen': <PETsARD.Processor.Missingist.MissingistDrop at 0x14bec63b0>,
  'age': <PETsARD.Processor.Missingist.MissingistMean at 0x14bec4a30>,
  'race': <PETsARD.Processor.Missingist.MissingistDrop at 0x14bec5d20>,
  'edu': <PETsARD.Processor.Missingist.MissingistDrop at 0x14bec4e20>,
  'mar': <PETsARD.Processor.Missingist.MissingistDrop at 0x14bec4bb0>,
  'bmi': <PETsARD.Processor.Missingist.MissingistMean at 0x14bec6e60>,
  'dep': <PETsARD.Processor.Missingist.MissingistMean at 0x14bec70a0>,
  'pir': <PETsARD.Processor.Missingist.MissingistMean at 0x14bec72b0>,
  'gh': <PETsARD.Processor.Missingist.MissingistMean at 0x14bec74c0>,
  'mets': <PETsARD.Processor.Missingist.MissingistMean at 0x14bec76d0>,
  'qm': <PETsARD.Processor.Missingist.MissingistDrop at 0x14bec78e0>,
  'dia': <PETsARD.Processor.Missingist.MissingistMean at 0x14bec7a90>},
 'outlierist': {'gen': None,
  'age': <PETsARD.Processor.Outlierist.OutlieristIQR at 0x14bec5390>,
  'race': None,
  'edu': None,
  '

In [15]:
pm.fit(df)

In [16]:
df_transformed = pm.transform(df)

In [17]:
df_transformed

Unnamed: 0,gen,age,race,edu,mar,bmi,dep,pir,gh,mets,qm,dia
0,1,3.0,4,3,1,1.0,0.0,0.0,0.0,0.0,1,4.0
1,1,2.0,4,4,0,1.0,0.0,4.0,0.0,0.0,0,0.0
2,1,4.0,4,4,1,1.0,0.0,0.0,4.0,0.0,2,4.0
3,0,3.0,4,3,3,2.0,4.0,0.0,0.0,0.0,2,0.0
4,0,1.0,0,2,0,0.0,4.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
4185,1,0.0,3,3,2,0.0,0.0,0.0,0.0,0.0,0,0.0
4186,0,4.0,4,4,5,0.0,4.0,0.0,0.0,0.0,0,0.0
4187,0,4.0,4,2,5,1.0,0.0,0.0,0.0,0.0,0,0.0
4188,1,1.0,4,1,1,1.0,0.0,4.0,0.0,0.0,0,0.0


In [18]:
df_inverse = pm.inverse_transform(df_transformed)

In [19]:
df_inverse

Unnamed: 0,gen,age,race,edu,mar,bmi,dep,pir,gh,mets,qm,dia
0,Male,62.000004,White,Graduate,Married,27.799999,0.0,2.775558e-17,0.0,0.0,Q2,1.000000e+00
1,Male,53.000004,White,HighSchool,Divorced,30.799999,0.0,1.000000e+00,0.0,0.0,Q1,2.775558e-17
2,Male,78.000000,White,HighSchool,Married,28.799999,0.0,2.775558e-17,0.0,0.0,Q3,1.000000e+00
3,Female,56.000000,White,Graduate,Parther,42.400002,1.0,2.775558e-17,0.0,0.0,Q3,2.775558e-17
4,Female,42.000000,Black,College,Divorced,20.299999,1.0,2.775558e-17,0.0,0.0,Q4,2.775558e-17
...,...,...,...,...,...,...,...,...,...,...,...,...
4162,Male,25.000000,Other,Graduate,Never,21.000000,0.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
4163,Female,76.000000,White,HighSchool,Widowed,21.500000,1.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
4164,Female,80.000000,White,College,Widowed,31.000000,0.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
4165,Male,35.000000,White,9th,Married,26.000000,0.0,1.000000e+00,0.0,0.0,Q1,2.775558e-17


In [20]:
# Round-trip check: compare the inverse-transformed frame with the original.
# Both the element-wise DataFrame comparison and np.isclose raise ValueError
# when the two frames differ in shape/labels (e.g. after rows were dropped), so
# only that case is reported as "Not applicable.". Any other error now
# surfaces instead of being silently swallowed by a bare `except:`.
try:
    print(f'Inverse successful:\n{(df_inverse == df).all()}')
    print(f'Numeric close (age): {np.isclose(df_inverse.age, df.age).all()}')
    print(f'Numeric close (bmi): {np.isclose(df_inverse.bmi, df.bmi).all()}')
except ValueError:
    print('Not applicable.')

Not applicable.


In [21]:
# Load '[Adt Income] adult.csv', passing '?' as the na_values marker for the
# three listed columns, then print the first row as a quick sanity check.
loader2 = PETsARD.Loader(
    filepath='[Adt Income] adult.csv',
    na_values=dict.fromkeys(
        ['workclass', 'occupation', 'native-country'],
        '?',
    ),
)
print(loader2.data.head(1))

   age workclass  fnlwgt education  educational-num marital-status  \
0   25   Private  226802      11th                7  Never-married   

          occupation relationship   race gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black   Male             0             0   

   hours-per-week native-country income  
0              40  United-States  <=50K  


In [22]:
df2 = loader2.data

In [23]:
# Hand-written metadata for the second dataset: per-column type and NA share,
# plus table-level counts.
# NOTE(review): the top-level keys here are 'metadata_col'/'metadata_global',
# whereas the earlier `metadata` dict uses 'col'/'global' — confirm which
# schema is current. Not referenced by the later visible cells (they use `m2`).
metadata2 = {
    'metadata_col': {
        column: {'type': kind, 'na_percentage': na_share}
        for column, kind, na_share in (
            ('age', 'numerical', 0.0),
            ('workclass', 'categorical', 0.057307),
            ('fnlwgt', 'numerical', 0.0),
            ('education', 'categorical', 0.0),
            ('educational-num', 'numerical', 0.0),
            ('marital-status', 'categorical', 0.0),
            ('occupation', 'categorical', 0.057512),
            ('relationship', 'categorical', 0.0),
            ('race', 'categorical', 0.0),
            ('gender', 'categorical', 0.0),
            ('capital-gain', 'numerical', 0.0),
            ('capital-loss', 'numerical', 0.0),
            ('hours-per-week', 'numerical', 0.0),
            ('native-country', 'categorical', 0.017546),
            ('income', 'categorical', 0.0),
        )
    },
    'metadata_global': {
        'row_num': 48842,
        'col_num': 15,
        'na_percentage': 0.07411653904426518,
    },
}

In [24]:
m2 = PETsARD.Metadata()
m2.build_metadata(df2)

In [25]:
pm2 = PETsARD.Processor.Base.Processor(m2)

No self-defined config passed.  Generate a config automatically.


In [24]:
pm2.get_config()

{'missingist': {'age': <PETsARD.Processor.Missingist.MissingistMean at 0x129249c90>,
  'workclass': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924a710>,
  'fnlwgt': <PETsARD.Processor.Missingist.MissingistMean at 0x12924b8e0>,
  'education': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924a320>,
  'educational-num': <PETsARD.Processor.Missingist.MissingistMean at 0x12924ba00>,
  'marital-status': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924a9b0>,
  'occupation': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924a1a0>,
  'relationship': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924b910>,
  'race': <PETsARD.Processor.Missingist.MissingistDrop at 0x1291c0520>,
  'gender': <PETsARD.Processor.Missingist.MissingistDrop at 0x1292493c0>,
  'capital-gain': <PETsARD.Processor.Missingist.MissingistMean at 0x129249b70>,
  'capital-loss': <PETsARD.Processor.Missingist.MissingistMean at 0x1292480a0>,
  'hours-per-week': <PETsARD.Processor.Missingist.Missingi

In [25]:
pm2.fit(df2)

In [26]:
pm2.get_config()

{'missingist': {'age': <PETsARD.Processor.Missingist.MissingistMean at 0x129249c90>,
  'workclass': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924a710>,
  'fnlwgt': <PETsARD.Processor.Missingist.MissingistMean at 0x12924b8e0>,
  'education': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924a320>,
  'educational-num': <PETsARD.Processor.Missingist.MissingistMean at 0x12924ba00>,
  'marital-status': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924a9b0>,
  'occupation': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924a1a0>,
  'relationship': <PETsARD.Processor.Missingist.MissingistDrop at 0x12924b910>,
  'race': <PETsARD.Processor.Missingist.MissingistDrop at 0x1291c0520>,
  'gender': <PETsARD.Processor.Missingist.MissingistDrop at 0x1292493c0>,
  'capital-gain': <PETsARD.Processor.Missingist.MissingistMean at 0x129249b70>,
  'capital-loss': <PETsARD.Processor.Missingist.MissingistMean at 0x1292480a0>,
  'hours-per-week': <PETsARD.Processor.Missingist.Missingi

In [27]:
pm2.fit(df2)

In [28]:
pm2.get_config()

{'missingist': {'age': <PETsARD.Processor.Missingist.MissingistMean at 0x1525636d0>,
  'workclass': <PETsARD.Processor.Missingist.MissingistDrop at 0x152561600>,
  'fnlwgt': <PETsARD.Processor.Missingist.MissingistMean at 0x152561480>,
  'education': <PETsARD.Processor.Missingist.MissingistDrop at 0x1525614e0>,
  'educational-num': <PETsARD.Processor.Missingist.MissingistMean at 0x1525631c0>,
  'marital-status': <PETsARD.Processor.Missingist.MissingistDrop at 0x1525629b0>,
  'occupation': <PETsARD.Processor.Missingist.MissingistDrop at 0x152562260>,
  'relationship': <PETsARD.Processor.Missingist.MissingistDrop at 0x1525637c0>,
  'race': <PETsARD.Processor.Missingist.MissingistDrop at 0x1525626e0>,
  'gender': <PETsARD.Processor.Missingist.MissingistDrop at 0x152561cc0>,
  'capital-gain': <PETsARD.Processor.Missingist.MissingistMean at 0x152563490>,
  'capital-loss': <PETsARD.Processor.Missingist.MissingistMean at 0x152563ca0>,
  'hours-per-week': <PETsARD.Processor.Missingist.Missingi

In [29]:
df_transformed2 = pm2.transform(df2)

In [30]:
df_transformed2

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,-0.995129,0.631837,0.351675,0.814745,-1.197259,0.510698,0.806791,0.769103,0.929114,0.154251,-0.144804,-0.217127,-0.034087,0.845464,0.028602
1,-0.046942,0.702255,-0.945524,0.071053,-0.419335,0.452504,0.935365,0.167182,0.138599,0.582784,-0.144804,-0.217127,0.772930,0.695485,0.561114
2,-0.776316,0.871271,1.394723,0.854672,0.747550,0.420686,0.976402,0.121193,0.770646,0.081014,-0.144804,-0.217127,-0.034087,0.728586,0.879144
3,-1.068066,0.370096,1.704525,0.393624,-0.030373,0.631837,0.679247,0.818709,0.279015,0.790338,-0.144804,-0.217127,-0.034087,0.187829,0.295074
4,-0.192816,0.982815,0.215911,0.638106,1.136512,0.076023,0.457109,0.195797,0.109487,0.210484,-0.144804,-0.217127,-0.034087,0.859182,0.381862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26928,1.047121,0.482926,1.251867,0.723257,1.525474,0.157599,0.292969,0.314922,0.055393,0.117121,-0.144804,-0.217127,-0.034087,0.324058,0.793714
26929,-1.213941,0.366248,1.140952,0.371208,-0.030373,0.581450,0.988101,0.469510,0.257647,0.382453,-0.144804,-0.217127,-0.034087,0.748028,0.714182
26930,-0.849254,0.705767,0.640492,0.872656,0.747550,0.180107,0.963160,0.956816,0.290843,0.828072,-0.144804,-0.217127,-0.195490,0.614578,0.466726
26931,0.098933,0.355065,-0.334178,0.116609,-0.419335,0.428931,0.768334,0.100992,0.814462,0.380930,-0.144804,-0.217127,-0.034087,0.401566,0.894545


In [31]:
df_inverse2 = pm2.inverse_transform(df_transformed2)

In [32]:
df_inverse2

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,24.0,Private,369667.0,Some-college,10.0,Never-married,Other-service,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
4,36.0,Federal-gov,212465.0,Bachelors,13.0,Married-civ-spouse,Adm-clerical,Husband,White,Male,0.0,0.0,40.0,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26928,53.0,Private,321865.0,Masters,14.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
26929,22.0,Private,310152.0,Some-college,10.0,Never-married,Protective-serv,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
26930,27.0,Private,257302.0,Assoc-acdm,12.0,Married-civ-spouse,Tech-support,Wife,White,Female,0.0,0.0,38.0,United-States,<=50K
26931,40.0,Private,154374.0,HS-grad,9.0,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0.0,0.0,40.0,United-States,>50K


In [33]:
df2

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [34]:
df_inverse2.isna().mean(axis=0)

age                0.000000
workclass          0.057290
fnlwgt             0.000000
education          0.000000
educational-num    0.000000
marital-status     0.000000
occupation         0.057476
relationship       0.000000
race               0.000000
gender             0.000000
capital-gain       0.000000
capital-loss       0.000000
hours-per-week     0.000000
native-country     0.017525
income             0.000000
dtype: float64

In [35]:
df_inverse2.isna().any(axis=1).mean()

0.07125088181784428

In [36]:
pm.get_changes()

Unnamed: 0,processor,col,current,default
0,missingist,age,MissingistSimple,MissingistMean
1,outlierist,gen,OutlieristLOF,NoneType
2,outlierist,age,OutlieristLOF,OutlieristIQR
3,outlierist,race,OutlieristLOF,NoneType
4,outlierist,edu,OutlieristLOF,NoneType
5,outlierist,mar,OutlieristLOF,NoneType
6,outlierist,bmi,OutlieristLOF,OutlieristIQR
7,outlierist,dep,OutlieristLOF,OutlieristIQR
8,outlierist,pir,OutlieristLOF,OutlieristIQR
9,outlierist,gh,OutlieristLOF,OutlieristIQR


In [37]:
pm._default_processor['outlierist']['categorical']

<function PETsARD.Processor.Base.Processor.__init__.<locals>.<lambda>()>

In [38]:
pm._default_processor['outlierist']['numerical']

PETsARD.Processor.Outlierist.OutlieristIQR

In [39]:
from sdv.metadata import SingleTableMetadata

In [40]:
sdvmd = SingleTableMetadata()

In [41]:
sdvmd.detect_from_dataframe(df)

In [42]:
t = sdvmd.to_dict()

In [43]:
sdvmd2 = SingleTableMetadata()

In [44]:
mm = m.to_sdv()