In [43]:
# Suppress every warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' filter also hides diagnostics emitted by the
# cells below; consider narrowing it to specific warning categories.
import warnings
warnings.filterwarnings('ignore')

In [31]:
# Class for stratifying data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from snsynth.mst import MSTSynthesizer
from snsynth.mwem import MWEMSynthesizer

class StratifiedDataset:
    """Integer-encoded copy of a DataFrame whose rows are tagged with a stratum id.

    A stratum is the tuple of a row's values in ``strata_cols``. Ids are handed
    out in order of first appearance, starting at 0, and stored in the
    ``strata_index`` column of ``self.df``.

    Attributes:
        df: copy of the input frame with categorical columns replaced by integer
            codes, plus the ``strata_index`` column.
        num_to_cat: ``{column: {code: original value}}`` for every encoded column.
        strata_to_id / id_to_strata: the two directions of the stratum <-> id map.
    """

    def __init__(self, df, strata_cols, categorical_columns=None, default_bins=10, smallest_strata=0.001):
        """Encode ``df`` and assign every row to a stratum.

        Args:
            df: source DataFrame; it is copied, never modified.
            strata_cols: column names whose value combination defines a stratum.
            categorical_columns: columns to integer-encode; ``None`` encodes nothing.
            default_bins: stored on the instance; not used by this class itself.
            smallest_strata: minimum stratum size, as a fraction of all rows,
                used by ``strata_size_filter``.
        """
        # per-column mapping from integer code back to the original category
        self.num_to_cat = {}

        # encode a copy so the caller's frame keeps its original values
        self.df = self.force_data_categorical_to_numeric(df.copy(), cat_columns=categorical_columns)
        # placeholder column; overwritten below once the ids are known
        self.df['strata_index'] = -1
        self.strata_cols = strata_cols
        self.default_bins = default_bins
        self.smallest_strata = smallest_strata

        # stratum tuple <-> integer id
        self.strata_to_id = {}
        self.id_to_strata = {}

        # Strata are keyed on the ORIGINAL (un-encoded) values. The ids are
        # collected in row order and written back in one positional assignment:
        # label-based `.loc[index] = ...` would overwrite every row sharing a
        # duplicated index label, and is far slower per row.
        row_ids = []
        for _, row in df.iterrows():
            stratum = self.calculate_strata_for_row(row)
            stratum_id = self.strata_to_id.get(stratum)
            if stratum_id is None:
                stratum_id = len(self.strata_to_id)
                self.strata_to_id[stratum] = stratum_id
                self.id_to_strata[stratum_id] = stratum
            row_ids.append(stratum_id)
        if row_ids:
            self.df['strata_index'] = row_ids

    def calculate_strata_for_row(self, row):
        """Return the stratum key of ``row``: a tuple of its ``strata_cols`` values."""
        return tuple(row[col] for col in self.strata_cols)

    def get_strata_count(self):
        """Return the number of distinct strata found in the data."""
        return len(self.strata_to_id)

    def strata_size_filter(self, strata, verbose=False):
        """Return True when ``strata`` has strictly more rows than the size threshold.

        The threshold is ``smallest_strata`` times the total number of rows.
        """
        threshold = self.smallest_strata * self.df.shape[0]
        check = strata.shape[0] > threshold
        if verbose:
            print('Strata size:', strata.shape[0], 'Smallest strata size:', threshold)
        return check

    def get_strata_dfs(self, limit_size=False, remove_strata_index=True):
        """Return one DataFrame per stratum, ordered by stratum id.

        Args:
            limit_size: when True, strata rejected by ``strata_size_filter`` are skipped.
            remove_strata_index: when True, the ``strata_index`` column is dropped
                from the returned frames (``self.df`` keeps it).
        """
        strata_dfs = []
        for strata_index in range(self.get_strata_count()):
            strata = self.df[self.df['strata_index'] == strata_index]
            if limit_size and not self.strata_size_filter(strata):
                continue
            if remove_strata_index:
                # non-inplace drop: never touches self.df and avoids pandas'
                # chained-assignment warning on a filtered slice
                strata = strata.drop(columns='strata_index')
            strata_dfs.append(strata)
        return strata_dfs

    def force_data_categorical_to_numeric(self, df, cat_columns=None):
        """Replace the listed columns of ``df`` with integer category codes.

        ``df`` is modified in place and also returned. Names in ``cat_columns``
        that are not columns of ``df`` are ignored. For every encoded column the
        code -> original value mapping is stored in ``self.num_to_cat``,
        replacing any mapping stored there by an earlier call.

        Args:
            df: DataFrame to encode (mutated).
            cat_columns: iterable of column names; ``None`` means no columns.
        """
        if cat_columns is None:
            cat_columns = ()
        for col in cat_columns:
            if col in df.columns:
                as_category = df[col].astype('category')
                # save mapping back to original values
                self.num_to_cat[col] = dict(enumerate(as_category.cat.categories))
                df[col] = as_category.cat.codes
        return df

class StratifiedSynthesizer:
    """Fit one synthesizer per stratum and sample from them proportionally.

    ``synthesizer_class`` is instantiated as ``synthesizer_class(epsilon=..., **kwargs)``
    and must provide ``fit(df)`` and ``sample(n)``. Without ``strata_cols`` a
    single synthesizer is fitted on the whole frame.
    """

    def __init__(self, synthesizer_class, kwargs=None, epsilon=1.0, smallest_strata=0.001):
        """Store the configuration; nothing is fitted yet.

        Args:
            synthesizer_class: class of the per-stratum synthesizer.
            kwargs: extra keyword arguments for ``synthesizer_class``, or None.
            epsilon: value passed as ``epsilon=`` to every synthesizer instance.
            smallest_strata: strata holding at most this fraction of the rows
                are skipped (forwarded to ``StratifiedDataset``).
        """
        self.synthesizer_class = synthesizer_class
        self.smallest_strata = smallest_strata
        self.strata_synthesizers = None
        # defined up front so the attribute exists before fit() is called
        self.stratified_dataset = None
        self.epsilon = epsilon
        self.kwargs = kwargs
        # row counts of the fitted strata, aligned with strata_synthesizers
        self._strata_sizes = None

    def _new_synthesizer(self):
        """Build a fresh, unfitted synthesizer from the stored configuration."""
        return self.synthesizer_class(epsilon=self.epsilon, **(self.kwargs or {}))

    @staticmethod
    def _allocate(n_samples, sizes):
        """Split ``n_samples`` across strata proportionally to ``sizes``.

        Uses largest-remainder rounding in integer arithmetic, so the counts
        always add up to ``n_samples``; ties go to the earlier stratum.
        """
        n_samples = int(n_samples)
        total = sum(sizes)
        if total <= 0 or n_samples <= 0:
            return [0] * len(sizes)
        counts = [n_samples * size // total for size in sizes]
        remainders = [n_samples * size % total for size in sizes]
        shortfall = n_samples - sum(counts)
        # sorted() is stable, so equal remainders keep stratum order
        by_remainder = sorted(range(len(sizes)), key=lambda i: -remainders[i])
        for i in by_remainder[:shortfall]:
            counts[i] += 1
        return counts

    def fit(self, df, strata_cols=None, categorical_columns=None):
        """Fit on ``df`` and return ``self``.

        Args:
            df: training DataFrame.
            strata_cols: columns defining the strata; None fits one synthesizer
                on ``df`` as given.
            categorical_columns: columns to integer-encode before stratified
                fitting (not used on the un-stratified path).
        """
        self.strata_cols = strata_cols
        self.categorical_columns = categorical_columns

        # Fit normally if no strata_cols are provided
        if self.strata_cols is None:
            self.stratified_dataset = None
            self._strata_sizes = None
            synth = self._new_synthesizer()
            synth.fit(df)
            self.strata_synthesizers = [synth]
            return self

        # Fit on each stratum that passes the size filter
        self.stratified_dataset = StratifiedDataset(df, self.strata_cols,
                                                    smallest_strata=self.smallest_strata,
                                                    categorical_columns=self.categorical_columns)
        synthesizers = []
        sizes = []
        for strata_df in self.stratified_dataset.get_strata_dfs(limit_size=True):
            print('Fitting synthesizer on strata with size', strata_df.shape[0])
            synth = self._new_synthesizer()
            synth.fit(strata_df)
            synthesizers.append(synth)
            # remembered so sample() does not have to rebuild the strata frames
            sizes.append(strata_df.shape[0])
        self.strata_synthesizers = synthesizers
        self._strata_sizes = sizes
        return self

    def sample(self, n_samples):
        """Return ``n_samples`` synthetic rows.

        On the stratified path each fitted stratum contributes a share
        proportional to its size among the fitted strata, so the shares add up
        to exactly ``n_samples`` even when small strata were skipped during
        fitting. The result is a single DataFrame with a fresh 0..n-1 index.

        Raises:
            AssertionError: if ``fit`` has not been called.
            ValueError: if stratified fitting kept no stratum at all.
        """
        # explicit raise instead of `assert` so the check survives `python -O`
        if self.strata_synthesizers is None:
            raise AssertionError('Synthesizer not fitted')
        # Sample normally if no strata_cols are provided
        if self.stratified_dataset is None:
            return self.strata_synthesizers[0].sample(n_samples)

        if not self.strata_synthesizers:
            raise ValueError('No stratum was large enough to fit a synthesizer; '
                             'lower smallest_strata or use fewer strata_cols')

        counts = self._allocate(n_samples, self._strata_sizes)
        # strata allotted zero rows are skipped rather than asked for 0 samples
        samples = [synth.sample(count)
                   for synth, count in zip(self.strata_synthesizers, counts)
                   if count > 0]
        if not samples:
            return pd.DataFrame()
        return pd.concat(samples, ignore_index=True)



In [32]:
# Download (or read from the local cache) version 1 of the OpenML 'adult'
# dataset and build the working frame `df` = features + 'target' column.
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml

adult = fetch_openml('adult', version=1, cache=True)
target = adult['target']
df = adult['data']
# the label is added to the same frame object that `adult['data']` refers to
df['target'] = target
remove_columns = True
if remove_columns:
    # keep every column except the five listed below; names that are not
    # present in df are simply not matched by isin()
    columns_to_keep = df.columns[~df.columns.isin(['fnlwgt','native-country','relationship','education-num','occupation'])]
    df = df[columns_to_keep]
# number of distinct values in each remaining column
df.nunique()

age                5
workclass          8
education         16
marital-status     7
race               5
sex                2
capitalgain        5
capitalloss        5
hoursperweek       5
target             2
dtype: int64

In [33]:
# Columns to integer-encode. The list also names columns that are not in df
# ('occupation', 'relationship', 'native-country' were removed above);
# StratifiedDataset skips names that are not present in the frame.
non_numeric_columns = ['education','workclass','marital-status','occupation','relationship','sex','race','native-country','target']
# one stratum per distinct (race, sex) combination
sdf = StratifiedDataset(df, strata_cols=['race','sex'], categorical_columns=non_numeric_columns)
# rows per stratum id
sdf.df['strata_index'].value_counts()

0    28735
3    13027
1     2377
2     2308
4     1002
7      517
5      285
9      251
8      185
6      155
Name: strata_index, dtype: int64

In [34]:
# NOTE(review): despite the variable name, this wraps MSTSynthesizer, not MWEM.
# Uses the constructor defaults (epsilon=1.0, smallest_strata=0.001).
stratified_mwem = StratifiedSynthesizer(MSTSynthesizer)

In [None]:
# fit one synthesizer per (race, sex) stratum that passes the size filter
stratified_mwem.fit(df, strata_cols=['race','sex'], categorical_columns=non_numeric_columns)

In [None]:
# list of fitted per-stratum synthesizers
stratified_mwem.strata_synthesizers

In [None]:
# request as many synthetic rows as there are real rows
synth_sample = stratified_mwem.sample(len(df))

In [None]:
# summary statistics of the synthetic sample
synth_sample.describe()

In [None]:
# Summary statistics of the real data after integer-encoding.
# NOTE: force_data_categorical_to_numeric encodes the frame it is given in
# place, so after this cell the global `df` itself holds integer codes.
stratified_mwem.stratified_dataset.force_data_categorical_to_numeric(df, non_numeric_columns).describe()

In [47]:
# Baseline: train a logistic regression on the real data and report accuracy
# on a held-out 20% split of the real data.
# NOTE(review): presumably relies on `df` already having been integer-encoded
# in place by an earlier force_data_categorical_to_numeric(df, ...) call —
# confirm this cell works on a fresh kernel run.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.2, random_state=42)

clf = LogisticRegression(random_state=0).fit(X_train_real, y_train_real)

clf.score(X_test_real, y_test_real)

0.8099088954857201

In [48]:
# Train the same model on the synthetic sample, then score it on the REAL
# held-out split (X_test_real / y_test_real come from the previous cell).
# The synthetic test split (X_test_synth / y_test_synth) is not used here.
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split

X_train_synth, X_test_synth, y_train_synth, y_test_synth = train_test_split(synth_sample.drop('target', axis=1), synth_sample['target'], test_size=0.2, random_state=42)

# rebinds `clf`: the model fitted on real data is no longer reachable by name
clf = LogisticRegression(random_state=0).fit(X_train_synth, y_train_synth)

clf.score(X_test_real, y_test_real)


0.8046882997236156

In [50]:
# Un-stratified comparison: a single MSTSynthesizer fitted on the whole frame.
# NOTE(review): the variable is called `mwem` but holds an MSTSynthesizer.
mwem = MSTSynthesizer(epsilon=1.0)
# the encoding helper modifies `df` in place and overwrites the num_to_cat
# mapping stored on stratified_mwem.stratified_dataset
mwem.fit(stratified_mwem.stratified_dataset.force_data_categorical_to_numeric(df, non_numeric_columns))
synth_df_full = mwem.sample(len(df))

In [7]:
class PlotGenerator:
    """
    Plots comparing synthetic data against the real data.

    The synth_data_dict is of the form:
    {
        'synthesizer': {
            '0.1': synth_df, 
            '0.5': synth_df, 
            '1.0': synth_df, 
            '3.0': synth_df,
            '6.0': synth_df,
        },
    }
    i.e. one entry per synthesizer, each mapping an epsilon label to the
    synthetic data generated at that epsilon.
    """
    def __init__(self, real_data, synth_data_dict, strata_cols, categorical_columns):
        """Store the real data, the synthetic data sets and the column metadata."""
        self.real_data = real_data
        self.synth_data_dict = synth_data_dict
        self.strata_cols = strata_cols
        self.categorical_columns = categorical_columns

    def error(self, synth_df):
        """Return a scalar error of ``synth_df`` relative to ``self.real_data``.

        No metric is defined in this class; ``error_plot_across_epsilons``
        depends on this hook, so subclasses must override it.

        Raises:
            NotImplementedError: always, until a subclass provides the metric.
        """
        raise NotImplementedError(
            'PlotGenerator.error is not implemented; override it in a subclass '
            'to define the error metric')

    def error_plot_across_epsilons(self, synthesizer='synthesizer'):
        """
        Plot the error of the synthetic data for each epsilon

        Args:
            synthesizer: key of ``synth_data_dict`` whose epsilon -> data
                mapping is plotted; defaults to the literal 'synthesizer'.
        """
        by_epsilon = self.synth_data_dict[synthesizer]

        # Calculate the error for each epsilon, in the dict's own order
        epsilons = list(by_epsilon)
        errors = [self.error(by_epsilon[epsilon]) for epsilon in epsilons]

        # Plot the error for each epsilon
        plt.plot(epsilons, errors)
        plt.xlabel('Epsilon')
        plt.ylabel('Error')
        plt.title('Error of synthetic data for each epsilon')
        plt.show()


AttributeError: 'numpy.ndarray' object has no attribute 'to_numeric'

In [12]:
# Convert df to a float32 NumPy matrix. np.asarray accepts both a DataFrame and
# an ndarray, so re-running this cell is harmless (the previous `df.values`
# form raised AttributeError once df had already been converted).
# NOTE: `df` changes type here; the cells below expect the ndarray.
df = np.asarray(df, dtype=np.float32)

In [13]:
# Thin SVD: with full_matrices=False `u` has shape (n_rows, n_cols) instead of
# (n_rows, n_rows), which for tens of thousands of rows is the difference
# between a small array and a multi-gigabyte one. The singular values `s` are
# identical either way. Note that the third return value is V^H, not V.
u, s, v = np.linalg.svd(df, full_matrices=False)
# numerical rank: number of singular values above an absolute cutoff
rank = np.sum(s > 1e-10)

In [14]:
# cross-check of the manual rank above using NumPy's own default tolerance
np.linalg.matrix_rank(df)

10

In [17]:
# (rows, columns) of the float32 matrix
df.shape

(48842, 10)

In [18]:
# singular values of df returned by the SVD cell
s

array([1326.5604  ,  568.77875 ,  338.82434 ,  298.62393 ,  259.0123  ,
        226.29294 ,  184.15746 ,  161.76102 ,  121.585144,  100.41244 ],
      dtype=float32)