test cases:

1. produces the expected output
2. raises errors where appropriate (unfitted processor, format error, etc.)
3. produces the correct (or nearly correct) proportion of NAs

In [1]:
import pandas as pd
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# NOTE(review): absolute, machine-specific path — this cell only works where
# this exact directory exists; point it at your own PETsARD checkout.
%cd /Users/alex/PETsARD

/Users/alex/PETsARD


In [3]:
import PETsARD

In [4]:
# Load the NHANES CSV, which is read without a header row, so the twelve
# column names are supplied explicitly.
loader = PETsARD.Loader(
    filepath='[NHANES] B.csv',
    header_exist=False,
    header_names='gen age race edu mar bmi dep pir gh mets qm dia'.split(),
)
loader.load()

# Print just the first record as a quick check of the column assignment.
print(loader.data.head(1))

    gen   age   race       edu      mar        bmi  dep  pir   gh  mets  qm  \
0  Male  62.0  White  Graduate  Married  27.799999    0    0  0.0   0.0  Q2   

   dia  
0    1  


In [5]:
# Describer input: the loaded frame wrapped in a dict under the key 'data'.
ld = {'data': loader.data}

In [24]:
# Settings for the 'summary' describer. Plain statistics are requested by
# name; arbitrary quantiles are requested as {'quantile': q} dicts.
config = dict(
    method='summary',
    describe=[
        'mean', 'median', {'quantile': 0.01}, 'q1', 'corr',
        'row_count', 'cov', 'col_count', 'global_na_count',
        'std', 'var', 'min', 'max', 'range', 'skew', 'kurtosis', 'q3', 'iqr',
        {'quantile': 0.95}, 'col_na_count', 'nunique', {'quantile': 0.85},
    ],
)

In [25]:
from PETsARD import Describer

In [26]:
# Build the describer from the 'summary' config.
des = Describer(config)

In [27]:
# Pass the {'data': ...} dict to the describer.
des.create(ld)

In [29]:
# Run the evaluation. Presumably this computes the requested statistics that
# get_global() / get_columnwise() read back — confirm against Describer.
des.eval()

In [30]:
# Dataset-level results (the recorded output has row_count, col_count, na_count).
des.get_global()

Unnamed: 0,row_count,col_count,na_count
0,4190,12,0


In [31]:
# Per-column results. In the recorded output, categorical columns carry only
# na_count and nunique, while numerical columns have nunique left empty.
des.get_columnwise()

Unnamed: 0,mean,median,1.0 th quantile,q1,std,var,min,max,range,skew,kurtosis,q3,iqr,95.0 th quantile,na_count,nunique,85.0 th quantile
age,50.455849,50.0,20.0,35.0,17.887312,319.955902,20.0,80.0,60.0,0.038668,-1.167954,65.0,30.0,80.0,0,,72.0
bmi,29.18401,28.1,17.9,24.4,6.850947,46.935482,14.5,67.300003,52.800003,1.136191,2.211324,32.700001,8.300001,42.155,0,,35.599998
dep,0.209308,0.0,0.0,0.0,0.406863,0.165538,0.0,1.0,1.0,1.429626,0.04385,0.0,0.0,1.0,0,,1.0
pir,0.210979,0.0,0.0,0.0,0.408052,0.166506,0.0,1.0,1.0,1.417268,0.008654,0.0,0.0,1.0,0,,1.0
gh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,0.0
mets,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,,0.0
dia,0.208353,0.0,0.0,0.0,0.406179,0.164982,0.0,1.0,1.0,1.436736,0.06424,0.0,0.0,1.0,0,,1.0
gen,,,,,,,,,,,,,,,0,2.0,
race,,,,,,,,,,,,,,,0,5.0,
edu,,,,,,,,,,,,,,,0,5.0,


In [5]:
# Short alias for the loaded NHANES frame (plain assignment, not a copy).
df = loader.data

In [6]:
# Build metadata from the NHANES frame; `m` is what the Processor is given.
m = PETsARD.Metadata()
m.build_metadata(df)

In [7]:
# Hand-written metadata for the NHANES frame: per-column type and NA share,
# plus dataset-level counts.
# NOTE(review): no later cell visible in this notebook reads `metadata` (the
# Processor is built from `m`); presumably kept as a reference — confirm.
metadata = {
    'col': {
        name: {
            'type': (
                'categorical'
                if name in ('gen', 'race', 'edu', 'mar', 'qm')
                else 'numerical'
            ),
            'na_percentage': 0.0,
        }
        for name in ('gen', 'age', 'race', 'edu', 'mar', 'bmi',
                     'dep', 'pir', 'gh', 'mets', 'qm', 'dia')
    },
    'global': {'row_num': 4190, 'col_num': 12, 'na_percentage': 0.0},
}

In [8]:
# No custom config is given here, so the Processor reports that it generates
# one automatically (see the message in the recorded output).
pm = PETsARD.Processor(m)

No self-defined config passed.  Generate a config automatically.


In [9]:
from PETsARD.processor.missing import *
from PETsARD.processor.outlier import *
from PETsARD.processor.encoder import *
from PETsARD.processor.scaler import *

In [10]:
# Disabled: uncomment to inject NaNs into 'gen' and 'age' before processing.
# NOTE(review): presumably meant to exercise the missing-value handlers — confirm.
# df.loc[[1,2,3,4,5], 'gen'] = np.nan
# df.loc[[10,20,30,40,5], 'age'] = np.nan

In [11]:
# Per-column processor overrides, keyed by processing stage. A processor can
# be given as an instance (MissingDrop()), as a string alias, or as None.
# NOTE(review): only 'age' requests 'outlier_lof' here, yet the recorded
# pm.get_changes() output lists OutlierLOF for every column it shows —
# presumably LOF is applied to all columns once requested; confirm.
config_2 = dict(
    missing=dict(gen=MissingDrop(), age='missing_simple'),
    outlier=dict(gen=None, age='outlier_lof'),
    encoder=dict(gen='encoder_onehot', race='encoder_onehot'),
    scaler=dict(age='scaler_minmax'),
)

In [12]:
# Apply the per-column overrides to the processor's config.
pm.update_config(config_2)

In [13]:
# Show the resulting config. In the recorded output 'age' is now MissingSimple,
# while columns not named in config_2 keep other processors (e.g. 'bmi' is
# MissingMean).
pm.get_config()

{'missing': {'gen': <PETsARD.processor.missing.MissingDrop at 0x28afa7d90>,
  'age': <PETsARD.processor.missing.MissingSimple at 0x28af374f0>,
  'race': <PETsARD.processor.missing.MissingDrop at 0x28afa7160>,
  'edu': <PETsARD.processor.missing.MissingDrop at 0x28afa6fb0>,
  'mar': <PETsARD.processor.missing.MissingDrop at 0x28afa6e60>,
  'bmi': <PETsARD.processor.missing.MissingMean at 0x28afa6a40>,
  'dep': <PETsARD.processor.missing.MissingMean at 0x28afa7e50>,
  'pir': <PETsARD.processor.missing.MissingMean at 0x28afa4460>,
  'gh': <PETsARD.processor.missing.MissingMean at 0x28afa6470>,
  'mets': <PETsARD.processor.missing.MissingMean at 0x28afa74f0>,
  'qm': <PETsARD.processor.missing.MissingDrop at 0x28afa5510>,
  'dia': <PETsARD.processor.missing.MissingMean at 0x28afa6980>},
 'outlier': {'gen': None,
  'age': <PETsARD.processor.outlier.OutlierLOF at 0x28afa72b0>,
  'race': None,
  'edu': None,
  'mar': None,
  'bmi': <PETsARD.processor.outlier.OutlierIQR at 0x107afee30>,
  'dep

In [14]:
# Fit the processors on the NHANES frame before transforming.
pm.fit(df)

In [15]:
# Transform with the fitted processors. The recorded output shows pandas
# dtype-incompatibility warnings coming from a `.loc` assignment, apparently
# inside the library code.
df_transformed = pm.transform(df)

Length: 4190
Categories (2, object): ['Female', 'Male']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  transformed.loc[:, col] = self._config.get(col,
Length: 4190
Categories (5, object): ['Black', 'Hispanic', 'Mexican', 'Other', 'White']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  transformed.loc[:, col] = self._config.get(col,
Length: 4190
Categories (5, object): ['11th', '9th', 'College', 'Graduate', 'HighSchool']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  transformed.loc[:, col] = self._config.get(col,
Length: 4190
Categories (6, object): ['Divorced', 'Married', 'Never', 'Parther', 'Separated', 'Widowed']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  transformed.loc[:, col] = self._config.get(col,
Length: 4190
Categories (4, object): ['Q1', 'Q2', 'Q3', 'Q4']' has dtype incompatible with bool, please explicitl

In [16]:
# Inspect the result: 'gen' and 'race' appear as one-hot columns, and the last
# index shown in the recorded output is 4165.
df_transformed

Unnamed: 0,age,edu,mar,bmi,dep,pir,gh,mets,qm,dia,gen_Female,gen_Male,race_Black,race_Hispanic,race_Mexican,race_Other,race_White
0,0.700000,0.501014,0.510496,-0.202041,-0.514504,-0.517100,0.0,0.0,0.439481,1.949241,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.550000,0.593540,0.722505,0.235906,-0.514504,1.933861,0.0,0.0,0.221088,-0.513020,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.966667,0.570027,0.511464,-0.056059,-0.514504,-0.517100,0.0,0.0,0.941341,1.949241,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.600000,0.440567,0.843059,1.929305,1.943618,-0.517100,0.0,0.0,0.878165,-0.513020,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.366667,0.250450,0.798162,-1.296911,1.943618,-0.517100,0.0,0.0,0.571218,-0.513020,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4162,0.083333,0.309831,0.691554,-1.194723,-0.514504,-0.517100,0.0,0.0,0.171890,-0.513020,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4163,0.933333,0.571986,0.940840,-1.121732,1.943618,-0.517100,0.0,0.0,0.230822,-0.513020,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4164,1.000000,0.198564,0.934402,0.265103,-0.514504,-0.517100,0.0,0.0,0.265541,-0.513020,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4165,0.250000,0.834683,0.126036,-0.464810,-0.514504,1.933861,0.0,0.0,0.056893,-0.513020,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# Map the transformed frame back to the original representation.
df_inverse = pm.inverse_transform(df_transformed)

In [18]:
# Inspect the round trip. Values such as 2.775558e-17 in the recorded output
# are presumably floating-point residue of an original 0 — confirm.
df_inverse

Unnamed: 0,gen,age,race,edu,mar,bmi,dep,pir,gh,mets,qm,dia
0,Male,62.000004,White,Graduate,Married,27.799999,0.0,2.775558e-17,0.0,0.0,Q2,1.000000e+00
1,Male,53.000004,White,HighSchool,Divorced,30.799999,0.0,1.000000e+00,0.0,0.0,Q1,2.775558e-17
2,Male,78.000000,White,HighSchool,Married,28.799999,0.0,2.775558e-17,0.0,0.0,Q3,1.000000e+00
3,Female,56.000000,White,Graduate,Parther,42.400002,1.0,2.775558e-17,0.0,0.0,Q3,2.775558e-17
4,Female,42.000000,Black,College,Divorced,20.299999,1.0,2.775558e-17,0.0,0.0,Q4,2.775558e-17
...,...,...,...,...,...,...,...,...,...,...,...,...
4162,Male,25.000000,Other,Graduate,Never,21.000000,0.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
4163,Female,76.000000,White,HighSchool,Widowed,21.500000,1.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
4164,Female,80.000000,White,College,Widowed,31.000000,0.0,2.775558e-17,0.0,0.0,Q1,2.775558e-17
4165,Male,35.000000,White,9th,Married,26.000000,0.0,1.000000e+00,0.0,0.0,Q1,2.775558e-17


In [19]:
# Round-trip check: compare the inverse-transformed frame with the original,
# exactly for all columns and approximately for the numeric 'age' and 'bmi'.
# NOTE(review): the displayed df_inverse ends at index 4165 while the describer
# reports 4190 rows for the loaded data, so these comparisons are likely to
# fail on mismatched shape/labels — confirm.
try:
    print(f'Inverse successful:\n{(df_inverse == df).all()}')
    print(f'Numeric close (age): {np.isclose(df_inverse.age, df.age).all()}')
    print(f'Numeric close (bmi): {np.isclose(df_inverse.bmi, df.bmi).all()}')
except Exception as err:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and report the reason
    # instead of hiding it.
    print(f'Not applicable: {type(err).__name__}: {err}')

Not applicable.


In [20]:
# Load the Adult Income CSV, treating '?' as a missing value in three columns.
loader2 = PETsARD.Loader(
    filepath='[Adt Income] adult.csv',
    na_values={
        column: '?'
        for column in ['workclass', 'occupation', 'native-country']
    },
)
# Call load() explicitly, as the NHANES loader cell does, so this cell does
# not rely on the constructor having already populated `loader2.data`.
loader2.load()

# Print just the first record as a quick check of the parsed columns.
print(loader2.data.head(1))

   age workclass  fnlwgt education  educational-num marital-status  \
0   25   Private  226802      11th                7  Never-married   

          occupation relationship   race gender  capital-gain  capital-loss  \
0  Machine-op-inspct    Own-child  Black   Male             0             0   

   hours-per-week native-country income  
0              40  United-States  <=50K  


In [21]:
# Short alias for the loaded Adult Income frame (plain assignment, not a copy).
df2 = loader2.data

In [22]:
# Hand-written metadata for the Adult Income frame: per-column type and NA
# share, plus dataset-level counts.
# NOTE(review): no later cell visible in this notebook reads `metadata2`, and
# its top-level keys ('metadata_col'/'metadata_global') differ from those of
# `metadata` ('col'/'global') — confirm which layout is intended.
metadata2 = {
    'metadata_col': {
        name: {'type': kind, 'na_percentage': na_share}
        for name, kind, na_share in [
            ('age', 'numerical', 0.0),
            ('workclass', 'categorical', 0.057307),
            ('fnlwgt', 'numerical', 0.0),
            ('education', 'categorical', 0.0),
            ('educational-num', 'numerical', 0.0),
            ('marital-status', 'categorical', 0.0),
            ('occupation', 'categorical', 0.057512),
            ('relationship', 'categorical', 0.0),
            ('race', 'categorical', 0.0),
            ('gender', 'categorical', 0.0),
            ('capital-gain', 'numerical', 0.0),
            ('capital-loss', 'numerical', 0.0),
            ('hours-per-week', 'numerical', 0.0),
            ('native-country', 'categorical', 0.017546),
            ('income', 'categorical', 0.0),
        ]
    },
    'metadata_global': {
        'row_num': 48842,
        'col_num': 15,
        'na_percentage': 0.07411653904426518,
    },
}

In [23]:
# Build metadata from the Adult Income frame; `m2` is what the second
# Processor is given.
m2 = PETsARD.Metadata()
m2.build_metadata(df2)

In [24]:
# No custom config is given here either, so the Processor reports generating
# one automatically (see the message in the recorded output).
pm2 = PETsARD.Processor(m2)

No self-defined config passed.  Generate a config automatically.


In [25]:
# Show the automatically generated config. In the visible part of the recorded
# output, numerical columns get MissingMean and categorical ones MissingDrop.
pm2.get_config()

{'missing': {'age': <PETsARD.processor.missing.MissingMean at 0x28af75750>,
  'workclass': <PETsARD.processor.missing.MissingDrop at 0x28af76140>,
  'fnlwgt': <PETsARD.processor.missing.MissingMean at 0x107aff160>,
  'education': <PETsARD.processor.missing.MissingDrop at 0x28af77280>,
  'educational-num': <PETsARD.processor.missing.MissingMean at 0x28af75960>,
  'marital-status': <PETsARD.processor.missing.MissingDrop at 0x28af77c70>,
  'occupation': <PETsARD.processor.missing.MissingDrop at 0x28af771f0>,
  'relationship': <PETsARD.processor.missing.MissingDrop at 0x28af77cd0>,
  'race': <PETsARD.processor.missing.MissingDrop at 0x28af772b0>,
  'gender': <PETsARD.processor.missing.MissingDrop at 0x28af77d60>,
  'capital-gain': <PETsARD.processor.missing.MissingMean at 0x28af76980>,
  'capital-loss': <PETsARD.processor.missing.MissingMean at 0x28afb00d0>,
  'hours-per-week': <PETsARD.processor.missing.MissingMean at 0x28afb1f00>,
  'native-country': <PETsARD.processor.missing.MissingDro

In [26]:
# Fit with an explicit `sequence`: 'missing' followed by 'discretizing'.
pm2.fit(df2, sequence=['missing', 'discretizing'])

In [27]:
# Transform the Adult Income frame. The recorded warnings report Length: 45222
# and the same pandas dtype-incompatibility message as the first dataset.
df_transformed2 = pm2.transform(df2)

Length: 45222
Categories (8, object): ['Federal-gov', 'Local-gov', 'Never-worked', 'Private', 'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  transformed.loc[:, col] = self._config.get(col,
Length: 45222
Categories (16, object): ['10th', '11th', '12th', '1st-4th', ..., 'Masters', 'Preschool', 'Prof-school', 'Some-college']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  transformed.loc[:, col] = self._config.get(col,
Length: 45222
Categories (7, object): ['Divorced', 'Married-AF-spouse', 'Married-civ-spouse', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed']' has dtype incompatible with bool, please explicitly cast to a compatible dtype first.
  transformed.loc[:, col] = self._config.get(col,
Length: 45222
Categories (14, object): ['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial', ..., 'Protective-serv', 'Sale

In [28]:
# Inspect the result: every column holds small numeric codes in the recorded
# output.
df_transformed2

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.0,3,0.0,1,2.0,4,6,3,2,1,0.0,0.0,1.0,38,0
1,1.0,3,0.0,11,2.0,2,4,0,4,1,0.0,0.0,2.0,38,0
2,0.0,1,1.0,7,3.0,2,10,0,4,1,0.0,0.0,1.0,38,1
3,1.0,3,0.0,15,3.0,2,6,0,2,1,0.0,0.0,1.0,38,1
4,1.0,3,0.0,0,1.0,4,7,1,4,1,0.0,0.0,1.0,38,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,0.0,3,0.0,7,3.0,2,12,5,4,0,0.0,0.0,1.0,38,0
45218,1.0,3,0.0,11,2.0,2,6,0,4,1,0.0,0.0,1.0,38,1
45219,2.0,3,0.0,11,2.0,6,0,4,4,0,0.0,0.0,1.0,38,0
45220,0.0,3,0.0,11,2.0,4,0,3,4,1,0.0,0.0,0.0,38,0


In [29]:
# Map the coded frame back to the original representation.
df_inverse2 = pm2.inverse_transform(df_transformed2)

In [30]:
# Inspect the round trip. In the recorded output, numerical columns come back
# as a few repeated values (e.g. age 24.3 / 38.9 / 53.5), not the original
# numbers.
df_inverse2

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,24.3,Private,160096.5,11th,8.5,Never-married,Machine-op-inspct,Own-child,Black,Male,9999.9,435.6,30.4,United-States,<=50K
1,38.9,Private,160096.5,HS-grad,8.5,Married-civ-spouse,Farming-fishing,Husband,White,Male,9999.9,435.6,50.0,United-States,<=50K
2,24.3,Local-gov,455719.5,Assoc-acdm,11.5,Married-civ-spouse,Protective-serv,Husband,White,Male,9999.9,435.6,30.4,United-States,>50K
3,38.9,Private,160096.5,Some-college,11.5,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,9999.9,435.6,30.4,United-States,>50K
4,38.9,Private,160096.5,10th,5.5,Never-married,Other-service,Not-in-family,White,Male,9999.9,435.6,30.4,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45217,24.3,Private,160096.5,Assoc-acdm,11.5,Married-civ-spouse,Tech-support,Wife,White,Female,9999.9,435.6,30.4,United-States,<=50K
45218,38.9,Private,160096.5,HS-grad,8.5,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,9999.9,435.6,30.4,United-States,>50K
45219,53.5,Private,160096.5,HS-grad,8.5,Widowed,Adm-clerical,Unmarried,White,Female,9999.9,435.6,30.4,United-States,<=50K
45220,24.3,Private,160096.5,HS-grad,8.5,Never-married,Adm-clerical,Own-child,White,Male,9999.9,435.6,10.8,United-States,<=50K


In [31]:
# Original Adult Income frame, shown for comparison with df_inverse2.
df2

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,,103497,Some-college,10,Never-married,,Own-child,White,Female,0,0,30,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
48838,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
48839,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
48840,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [32]:
# Share of missing values per column after the inverse transform. In the
# recorded output these are close to the na_percentage values written in
# metadata2 (e.g. workclass 0.057295 vs 0.057307).
df_inverse2.isna().mean(axis='index')

age                0.000000
workclass          0.057295
fnlwgt             0.000000
education          0.000000
educational-num    0.000000
marital-status     0.000000
occupation         0.057494
relationship       0.000000
race               0.000000
gender             0.000000
capital-gain       0.000000
capital-loss       0.000000
hours-per-week     0.000000
native-country     0.017536
income             0.000000
dtype: float64

In [33]:
# Share of rows that contain at least one missing value after the inverse
# transform (recorded: about 0.0710; metadata2 lists 0.0741 as the global
# na_percentage).
df_inverse2.isna().any(axis='columns').mean()

0.07098314979434789

In [34]:
# Inspect the config of the fitted NHANES processor through the public
# accessor used in the earlier cells, instead of reading the private `_config`
# attribute directly.
pm.get_config()

{'missing': {'gen': <PETsARD.processor.missing.MissingDrop at 0x28afa7d90>,
  'age': <PETsARD.processor.missing.MissingSimple at 0x28af374f0>,
  'race': <PETsARD.processor.missing.MissingDrop at 0x28afa7160>,
  'edu': <PETsARD.processor.missing.MissingDrop at 0x28afa6fb0>,
  'mar': <PETsARD.processor.missing.MissingDrop at 0x28afa6e60>,
  'bmi': <PETsARD.processor.missing.MissingMean at 0x28afa6a40>,
  'dep': <PETsARD.processor.missing.MissingMean at 0x28afa7e50>,
  'pir': <PETsARD.processor.missing.MissingMean at 0x28afa4460>,
  'gh': <PETsARD.processor.missing.MissingMean at 0x28afa6470>,
  'mets': <PETsARD.processor.missing.MissingMean at 0x28afa74f0>,
  'qm': <PETsARD.processor.missing.MissingDrop at 0x28afa5510>,
  'dia': <PETsARD.processor.missing.MissingMean at 0x28afa6980>},
 'outlier': {'gen': <PETsARD.processor.outlier.OutlierLOF at 0x107acbb50>,
  'age': <PETsARD.processor.outlier.OutlierLOF at 0x28afa6a70>,
  'race': <PETsARD.processor.outlier.OutlierLOF at 0x28afa7b50>,
  

In [35]:
# Tabulate processor/column pairs with their current and default processors;
# every row in the recorded output has a current processor that differs from
# the default.
pm.get_changes()

Unnamed: 0,processor,col,current,default
0,missing,age,MissingSimple,MissingMean
1,outlier,gen,OutlierLOF,NoneType
2,outlier,age,OutlierLOF,OutlierIQR
3,outlier,race,OutlierLOF,NoneType
4,outlier,edu,OutlierLOF,NoneType
5,outlier,mar,OutlierLOF,NoneType
6,outlier,bmi,OutlierLOF,OutlierIQR
7,outlier,dep,OutlierLOF,OutlierIQR
8,outlier,pir,OutlierLOF,OutlierIQR
9,outlier,gh,OutlierLOF,OutlierIQR


In [36]:
from PETsARD.synthesizer import Synthesizer

In [37]:
# Set up a synthesizer on three columns of the transformed Adult Income
# frame, using the 'smartnoise-mwem' method.
syn = Synthesizer(
    df_transformed2.loc[:, ['age', 'workclass', 'marital-status']],
    synthesizing_method='smartnoise-mwem',
)

In [38]:
# Fit the synthesizer (the recorded run reports about 10.75 s for mwem).
syn.fit()

Synthesizer (SmartNoise): Fitting mwem.
Synthesizer (SmartNoise): Fitting  mwem spent 10.7503 sec.


In [39]:
# Draw synthetic rows; the recorded run sampled 45222 rows ("same as raw").
# NOTE(review): no random seed is set anywhere visible in this notebook, so
# sampled values presumably differ between runs — confirm.
syn.sample()

Synthesizer (SmartNoise): Sampling mwem # 45222 rows (same as raw) in 0.6409 sec.


In [40]:
# Inspect the synthetic data; the recorded output has the same three columns,
# holding integer codes.
syn.data_syn

Unnamed: 0,age,workclass,marital-status
0,2,4,2
1,1,3,0
2,0,3,4
3,3,5,2
4,2,3,2
...,...,...,...
45217,0,5,2
45218,1,6,4
45219,0,3,4
45220,1,5,2
