In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import torch

In [None]:
class DataLoader:
    """A data interface for public data.

    Wraps a pandas DataFrame and provides helpers to one-hot encode the
    categorical features, min-max normalize the continuous features, split
    the data, and convert encoded rows back to a readable form.
    """

    def __init__(self, params):
        """Init method

        :param params: Dictionary holding the entries listed below.

        Entries of ``params``:

        :param dataframe: Pandas DataFrame.
        :param continuous_features: List of names of continuous features. The remaining features are categorical features.
        :param outcome_name: Outcome feature name.
        :param permitted_range (optional): Dictionary with feature names as keys and permitted range as values. Defaults to the range inferred from the data.
        :param test_size (optional): Proportion of test set split. Defaults to 0.2.
        :param test_split_random_state (optional): Random state for train test split. Defaults to 17.

        :raises ValueError: if ``dataframe`` is not a DataFrame, ``continuous_features``
            is not a list, or ``outcome_name`` is not a str.

        Note: the passed DataFrame is stored without copying and its categorical
        columns are converted to the ``category`` dtype in place.
        """

        if isinstance(params['dataframe'], pd.DataFrame):
            self.data_df = params['dataframe']
        else:
            raise ValueError("should provide a pandas dataframe")

        if type(params['continuous_features']) is list:
            self.continuous_feature_names = params['continuous_features']
        else:
            raise ValueError(
                "should provide the name(s) of continuous features in the data")

        if type(params['outcome_name']) is str:
            self.outcome_name = params['outcome_name']
        else:
            raise ValueError("should provide the name of outcome feature")

        all_columns = self.data_df.columns.tolist()

        # Every column that is neither continuous nor the outcome is categorical.
        non_categorical = self.continuous_feature_names + [self.outcome_name]
        self.categorical_feature_names = [
            name for name in all_columns if name not in non_categorical]

        self.feature_names = [
            name for name in all_columns if name != self.outcome_name]

        self.continuous_feature_indexes = [self.data_df.columns.get_loc(
            name) for name in self.continuous_feature_names if name in self.data_df]

        self.categorical_feature_indexes = [self.data_df.columns.get_loc(
            name) for name in self.categorical_feature_names if name in self.data_df]

        self.test_size = params.get('test_size', 0.2)
        self.test_split_random_state = params.get('test_split_random_state', 17)

        if len(self.categorical_feature_names) > 0:
            self.data_df[self.categorical_feature_names] = self.data_df[self.categorical_feature_names].astype(
                'category')
            self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
            self.encoded_feature_names = [
                x for x in self.one_hot_encoded_data.columns.tolist() if x != self.outcome_name]
        else:
            # one-hot-encoded data is same as original data if there are no categorical features.
            self.one_hot_encoded_data = self.data_df
            self.encoded_feature_names = self.feature_names

        self.train_df, self.test_df = self.split_data(self.data_df)
        if 'permitted_range' in params:
            self.permitted_range = params['permitted_range']
        else:
            self.permitted_range = self.get_features_range()

    def get_features_range(self):
        """Returns {feature name: [min, max]} for every continuous feature, computed on the full data."""
        ranges = {}
        for feature_name in self.continuous_feature_names:
            ranges[feature_name] = [
                self.data_df[feature_name].min(), self.data_df[feature_name].max()]
        return ranges

    def get_data_type(self, col):
        """Infers data type ('int' or 'float') of a feature from its values.

        Returns 'int' as soon as a Python int is seen, 'float' as soon as a value
        with a non-zero fractional part is seen, and 'int' otherwise.
        """
        for instance in col.tolist():
            if isinstance(instance, int):
                return 'int'
            else:
                # the digits after the decimal point of the value's string form
                if float(str(instance).split('.')[1]) > 0:
                    return 'float'
        return 'int'

    def one_hot_encode_data(self, data):
        """One-hot-encodes the categorical features of ``data`` (no level is dropped)."""
        return pd.get_dummies(data, drop_first=False, columns=self.categorical_feature_names)

    def normalize_data(self, df):
        """Normalizes continuous features to make them fall in the range [0,1].

        The min and max are taken from the full ``self.data_df``; ``df`` is not modified.
        """
        result = df.copy()
        for feature_name in self.continuous_feature_names:
            max_value = self.data_df[feature_name].max()
            min_value = self.data_df[feature_name].min()
            result[feature_name] = (
                df[feature_name] - min_value) / (max_value - min_value)
        return result

    def de_normalize_data(self, df):
        """De-normalizes continuous features from [0,1] range to original range."""
        result = df.copy()
        for feature_name in self.continuous_feature_names:
            max_value = self.data_df[feature_name].max()
            min_value = self.data_df[feature_name].min()
            result[feature_name] = (
                df[feature_name]*(max_value - min_value)) + min_value
        return result

    def get_minx_maxx(self, normalized=True):
        """Gets the min/max value of features in normalized or de-normalized form.

        :param normalized: If True, the permitted range is rescaled with the data min/max.
        :return: Two arrays of shape (1, number of encoded features). Entries of
            one-hot columns stay at 0.0 (min) and 1.0 (max).
        """
        minx = np.array([[0.0]*len(self.encoded_feature_names)])
        maxx = np.array([[1.0]*len(self.encoded_feature_names)])

        for feature_name in self.continuous_feature_names:
            # Look the column position up by name: the order of
            # continuous_feature_names is chosen by the caller and need not match
            # the column order of the encoded data.
            idx = self.encoded_feature_names.index(feature_name)
            max_value = self.data_df[feature_name].max()
            min_value = self.data_df[feature_name].min()
            low, high = self.permitted_range[feature_name][0], self.permitted_range[feature_name][1]

            if normalized:
                minx[0][idx] = (low - min_value) / (max_value - min_value)
                maxx[0][idx] = (high - min_value) / (max_value - min_value)
            else:
                minx[0][idx] = low
                maxx[0][idx] = high
        return minx, maxx

    def split_data(self, data):
        """Splits ``data`` into train and test parts using the configured size and random state."""
        train_df, test_df = train_test_split(
            data, test_size=self.test_size, random_state=self.test_split_random_state)
        return train_df, test_df

    def get_mads_from_training_data(self, normalized=False):
        """Computes Median Absolute Deviation of continuous features.

        With ``normalized=False`` the full ``data_df`` is used; with
        ``normalized=True`` the normalized train split is used.
        """

        mads = {}
        if normalized is False:
            for feature in self.continuous_feature_names:
                mads[feature] = np.median(
                    abs(self.data_df[feature].values - np.median(self.data_df[feature].values)))
        else:
            normalized_train_df = self.normalize_data(self.train_df)
            for feature in self.continuous_feature_names:
                mads[feature] = np.median(
                    abs(normalized_train_df[feature].values - np.median(normalized_train_df[feature].values)))
        return mads

    def get_data_params(self):
        """Gets all data related params for DiCE.

        :return: (minx, maxx, encoded categorical feature indexes); the indexes are
            also stored on ``self.encoded_categorical_feature_indexes``.
        """

        minx, maxx = self.get_minx_maxx(normalized=True)

        # get the column indexes of categorical features after one-hot-encoding
        self.encoded_categorical_feature_indexes = self.get_encoded_categorical_feature_indexes()

        return minx, maxx, self.encoded_categorical_feature_indexes

    def get_encoded_categorical_feature_indexes(self):
        """Gets the column indexes of categorical features after one-hot-encoding.

        :return: One list of encoded column positions per categorical feature;
            columns are matched by name prefix.
        """
        cols = []
        for col_parent in self.categorical_feature_names:
            cols.append([colidx for colidx, col in enumerate(self.encoded_feature_names)
                         if col.startswith(col_parent)])
        return cols

    def get_indexes_of_features_to_vary(self, features_to_vary='all'):
        """Gets indexes from feature names of one-hot-encoded data."""
        if features_to_vary == "all":
            return list(range(len(self.encoded_feature_names)))
        else:
            return [colidx for colidx, col in enumerate(self.encoded_feature_names) if col.startswith(tuple(features_to_vary))]

    def from_dummies(self, data, prefix_sep='_'):
        """Gets the original data from dummy encoded data with k levels.

        For every categorical feature, the dummy column with the largest value in a
        row determines the level; the dummy columns are replaced by one
        categorical column. ``data`` itself is not modified.
        """
        out = data.copy()
        for feature in self.categorical_feature_names:
            prefix = feature + prefix_sep
            cols = [c for c in data.columns if prefix in c]
            labs = [c.replace(prefix, "") for c in cols]
            out[feature] = pd.Categorical(
                np.array(labs)[np.argmax(data[cols].values, axis=1)])
            out = out.drop(cols, axis=1)
        return out

    def get_decimal_precisions(self):
        """Gets the precision of continuous features in the data.

        :return: List with one entry per feature; entry ``ix`` holds the number of
            decimal digits of the first value with a non-zero fractional part found
            for the ``ix``-th continuous feature (0 if none or if an int is seen first).
        """
        precisions = [0]*len(self.feature_names)
        for ix, col in enumerate(self.continuous_feature_names):
            precisions[ix] = 0
            for instance in self.data_df[col].tolist():
                if isinstance(instance, int):
                    precisions[ix] = 0
                    break
                else:
                    if float(str(instance).split('.')[1]) > 0:
                        precisions[ix] = len(str(instance).split('.')[1])
                        break
        return precisions

    def get_decoded_data(self, data):
        """Gets the original data from dummy encoded data.

        A numpy array is first wrapped into a DataFrame whose columns are the
        encoded feature names.
        """
        if isinstance(data, np.ndarray):
            index = list(range(len(data)))
            data = pd.DataFrame(data=data, index=index,
                                columns=self.encoded_feature_names)
        return self.from_dummies(data)

    def prepare_df_for_encoding(self):
        """Facilitates prepare_query_instance() function.

        Builds a frame with one column per categorical feature listing all of its
        levels, plus one empty column per continuous feature. Requires at least
        one categorical feature.
        """
        levels = []
        colnames = self.categorical_feature_names
        for cat_feature in colnames:
            levels.append(self.data_df[cat_feature].cat.categories.tolist())

        df = pd.DataFrame({colnames[0]: levels[0]})
        for col in range(1, len(colnames)):
            temp_df = pd.DataFrame({colnames[col]: levels[col]})
            df = pd.concat([df, temp_df], axis=1, sort=False)

        colnames = self.continuous_feature_names
        for col in range(0, len(colnames)):
            temp_df = pd.DataFrame({colnames[col]: []})
            df = pd.concat([df, temp_df], axis=1, sort=False)

        return df

    def prepare_query_instance(self, query_instance, encode):
        """Prepares user defined test input for DiCE.

        :param query_instance: A list of feature values (in ``feature_names`` order)
            or a dict mapping feature names to values.
        :param encode: If False, only normalize; otherwise also one-hot encode.
        :return: DataFrame holding the prepared query row(s).
        :raises ValueError: if ``query_instance`` is neither a list nor a dict.
        """

        if isinstance(query_instance, list):
            query_instance = {'row1': query_instance}
            test = pd.DataFrame.from_dict(
                query_instance, orient='index', columns=self.feature_names)

        elif isinstance(query_instance, dict):
            query_instance = dict(zip(query_instance.keys(), [[q] for q in query_instance.values()]))
            test = pd.DataFrame(query_instance, columns=self.feature_names)

        else:
            raise ValueError("query_instance should be a list or a dict")

        test = test.reset_index(drop=True)

        if encode is False:
            return self.normalize_data(test)
        else:
            # Prepend one row per categorical level so that get_dummies creates
            # every dummy column, even for levels absent from the query.
            temp = self.prepare_df_for_encoding()

            # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
            temp = pd.concat([temp, test], ignore_index=True, sort=False)
            temp = self.one_hot_encode_data(temp)
            temp = self.normalize_data(temp)

            return temp.tail(test.shape[0]).reset_index(drop=True)

    def get_dev_data(self, model_interface, desired_class, filter_threshold=0.5):
        """Constructs dev data by extracting part of the test data for which finding counterfactuals make sense.

        :param model_interface: Object providing ``load_model()`` and ``get_output(tensor)``.
        :param desired_class: If 0, keep rows predicted above the threshold; otherwise rows below it.
        :param filter_threshold: Threshold applied to the predicted value.
        :return: (dev data in readable form, list of predictions for those rows).

        NOTE(review): this method uses ``tf`` (TensorFlow 1.x session API), which is
        not imported in the visible part of this file - confirm before calling.
        """

        # create TensorFlow session if one is not already created
        if tf.get_default_session() is not None:
            self.data_sess = tf.get_default_session()
        else:
            self.data_sess = tf.InteractiveSession()

        # loading trained model
        model_interface.load_model()

        # get the permitted range of change for each feature
        minx, maxx = self.get_minx_maxx(normalized=True)

        # get the transformed data: continuous features are normalized to fall in the range [0,1], and categorical features are one-hot encoded
        data_df_transformed = self.normalize_data(self.one_hot_encoded_data)

        # split data - note that normalize_data() takes min/max from the full data_df,
        # i.e. the normalization statistics include the test rows.
        _, test = self.split_data(data_df_transformed)
        test = test.drop_duplicates(
            subset=self.encoded_feature_names).reset_index(drop=True)

        # finding target predicted probabilities
        input_tensor = tf.Variable(minx, dtype=tf.float32)
        output_tensor = model_interface.get_output(
            input_tensor)  # model(input_tensor)
        temp_data = test[self.encoded_feature_names].values.astype(np.float32)
        dev_preds = [self.data_sess.run(output_tensor, feed_dict={
                                        input_tensor: np.array([dt])}) for dt in temp_data]
        dev_preds = [dev_preds[i][0][0] for i in range(len(dev_preds))]

        # filtering examples which have predicted value >/< threshold
        dev_data = test[self.encoded_feature_names]
        if desired_class == 0:
            idxs = [i for i in range(len(dev_preds))
                    if dev_preds[i] > filter_threshold]
        else:
            idxs = [i for i in range(len(dev_preds))
                    if dev_preds[i] < filter_threshold]
        dev_data = dev_data.iloc[idxs]
        dev_preds = [dev_preds[i] for i in idxs]

        # convert from one-hot encoded vals to user interpretable format
        dev_data = self.from_dummies(dev_data)
        dev_data = self.de_normalize_data(dev_data)
        return dev_data, dev_preds  # values.tolist()

In [None]:
# Directory of the Adult dataset: the CSV is read from here and the generated
# .pt tensors are written here as well.
base_dir = '../../data/datasets/adult/'

In [None]:
# Column roles: everything that is neither continuous nor the outcome is
# treated as categorical by DataLoader.
outcome_name = 'income'
continuous_features = ['age', 'hours_per_week']
data = 'adult'

# Read the raw CSV and hand a copy to DataLoader, which converts the
# categorical columns of the frame it receives in place.
dataset = pd.read_csv(base_dir + 'adult.csv')
params = {
    'dataframe': dataset.copy(),
    'continuous_features': continuous_features,
    'outcome_name': outcome_name,
}
d = DataLoader(params)

# Alternative via the dice_ml package:
# d = dice_ml.Data(dataframe=dataset, continuous_features=['age', 'hours_per_week'], outcome_name='income')

# Shuffle all rows with a fixed seed so the slicing below is reproducible.
data_df = d.data_df.copy().sample(n=len(d.data_df), random_state=100)

In [None]:
# One-hot encode the categorical columns and min-max scale the continuous ones.
encoded_data = d.normalize_data(d.one_hot_encode_data(data_df))

# Move the outcome column to the last position. It is located by name instead
# of by a fixed position, so the cell keeps working if the number of
# continuous features changes.
cols = [col for col in encoded_data.columns if col != d.outcome_name] + [d.outcome_name]
encoded_data = encoded_data[cols]
columns = encoded_data.columns.tolist()

# Cast to float explicitly: pandas >= 2.0 returns bool dummy columns, and
# mixing them with float columns would otherwise give an object array that
# torch.tensor() cannot convert.
# NOTE(review): assumes the outcome column is numeric (it is compared with 0/1 below).
dataset = encoded_data.to_numpy(dtype=np.float64)

In [None]:
# Display the column order of the encoded data (the previous cell moves the
# outcome column to the end).
columns

## Version 1

In [None]:
# Partition the encoded rows by the one-hot gender indicator columns.
idx = columns.index('sex_Male')
male_mask = dataset[:, idx] == 1
data_male = dataset[male_mask]
idx = columns.index('sex_Female')
female_mask = dataset[:, idx] == 1
data_female = dataset[female_mask]
print('Gender Stats: ', data_male.shape, data_female.shape)

# The outcome is the last column: 0 = low income, 1 = high income.
male_income = data_male[:, -1]
data_male_low_inc = data_male[male_income == 0]
data_male_high_inc = data_male[male_income == 1]
female_income = data_female[:, -1]
data_female_low_inc = data_female[female_income == 0]
data_female_high_inc = data_female[female_income == 1]

# Sizes as (high income, low income):
#   Male:   train pool (1000, 64), test (60, 940)
#   Female: train pool (64, 1000), test (940, 60)
# Validation rows are carved out of the train pool: 60 of the 1000 and 4 of the 64.
# Train/val rows come from the start of each group, test rows from its end
# (the very last row of a group is not used).

# Male dataset
test_high_inc = data_male_high_inc[-61:-1]
test_low_inc = data_male_low_inc[-941:-1]
val_high_inc = data_male_high_inc[940:1000]
train_high_inc = data_male_high_inc[:940]
val_low_inc = data_male_low_inc[60:64]
train_low_inc = data_male_low_inc[:60]

train_male = np.vstack((train_high_inc, train_low_inc))
val_male = np.vstack((val_high_inc, val_low_inc))
test_male = np.vstack((test_high_inc, test_low_inc))

# Female dataset
test_high_inc = data_female_high_inc[-941:-1]
test_low_inc = data_female_low_inc[-61:-1]
val_high_inc = data_female_high_inc[60:64]
train_high_inc = data_female_high_inc[:60]
val_low_inc = data_female_low_inc[940:1000]
train_low_inc = data_female_low_inc[:940]

train_female = np.vstack((train_high_inc, train_low_inc))
val_female = np.vstack((val_high_inc, val_low_inc))
test_female = np.vstack((test_high_inc, test_low_inc))

In [None]:
# Shapes of the final splits. The female line now reports the female arrays;
# it previously printed the male shapes a second time.
print('Male Final: ', train_male.shape, val_male.shape, test_male.shape)
print('Female Final: ', train_female.shape, val_female.shape, test_female.shape)

# Per split: number of low-income rows (label 0) followed by high-income rows (label 1).
print('Sanity Check Male')
print('Train', np.sum(train_male[:, -1]==0), np.sum(train_male[:, -1]==1))
print('Val', np.sum(val_male[:, -1]==0), np.sum(val_male[:, -1]==1))
print('Test', np.sum(test_male[:, -1]==0), np.sum(test_male[:, -1]==1))

print('Sanity Check Female')
print('Train', np.sum(train_female[:, -1]==0), np.sum(train_female[:, -1]==1))
print('Val', np.sum(val_female[:, -1]==0), np.sum(val_female[:, -1]==1))
print('Test', np.sum(test_female[:, -1]==0), np.sum(test_female[:, -1]==1))


In [None]:
#Male
torch.save(torch.tensor(train_male[:, :-1]), base_dir + 'male_train_data.pt')
torch.save(torch.tensor(train_male[:, -1]), base_dir + 'male_train_label.pt')

torch.save(torch.tensor(val_male[:, :-1]), base_dir + 'male_val_data.pt')
torch.save(torch.tensor(val_male[:, -1]), base_dir + 'male_val_label.pt')

torch.save(torch.tensor(test_male[:, :-1]), base_dir + 'male_test_data.pt')
torch.save(torch.tensor(test_male[:, -1]), base_dir + 'male_test_label.pt')

#Female
torch.save(torch.tensor(train_female[:, :-1]), base_dir + 'female_train_data.pt')
torch.save(torch.tensor(train_female[:, -1]), base_dir + 'female_train_label.pt')

torch.save(torch.tensor(val_female[:, :-1]), base_dir + 'female_val_data.pt')
torch.save(torch.tensor(val_female[:, -1]), base_dir + 'female_val_label.pt')

torch.save(torch.tensor(test_female[:, :-1]), base_dir + 'female_test_data.pt')
torch.save(torch.tensor(test_female[:, -1]), base_dir + 'female_test_label.pt')

## Version 2

In [None]:
# Partition the encoded rows by the one-hot gender indicator columns.
idx = columns.index('sex_Male')
male_mask = dataset[:, idx] == 1
data_male = dataset[male_mask]
idx = columns.index('sex_Female')
female_mask = dataset[:, idx] == 1
data_female = dataset[female_mask]
print('Gender Stats: ', data_male.shape, data_female.shape)

# The outcome is the last column: 0 = low income, 1 = high income.
male_income = data_male[:, -1]
data_male_low_inc = data_male[male_income == 0]
data_male_high_inc = data_male[male_income == 1]
female_income = data_female[:, -1]
data_female_low_inc = data_female[female_income == 0]
data_female_high_inc = data_female[female_income == 1]

print('Male: ', data_male_low_inc.shape, data_male_high_inc.shape)
print('Female: ', data_female_low_inc.shape, data_female_high_inc.shape)

In [None]:
# Two groups (Male, Female) equally represented in each domain.
total_domains = 3
val_frac = 0.1
# Per domain: fraction of high-income rows among males; females get 1 - this value.
domain_spur_prob = [0.7, 0.6, 0.1]

# Next unused row of every (gender, income) group, so that rows handed out to
# one split/domain are never reused by a later one.
start_idx = dict.fromkeys(['male_low', 'male_high', 'female_low', 'female_high'], 0)

group_data = {
    'male_low': data_male_low_inc,
    'male_high': data_male_high_inc,
    'female_low': data_female_low_inc,
    'female_high': data_female_high_inc,
}

for idx in range(total_domains):

    group_size = 450 if idx in [0, 1] else 100
    domain_size = 2*group_size
    val_size = int(val_frac*group_size)
    train_size = group_size - val_size
    print('Domain: ', idx)
    print('Train Size for each group: ', train_size, 'Val Size for each group: ', val_size)

    for data_case in ['train', 'val', 'test']:

        # The first two domains only provide train/val data, the third only test data.
        is_test = data_case == 'test'
        if (is_test and idx in [0, 1]) or (not is_test and idx in [2]):
            continue

        # The test split is sized like a train split.
        data_size = val_size if data_case == 'val' else train_size

        curr_data = {}
        for gender in ['male', 'female']:
            if gender == 'male':
                prob = domain_spur_prob[idx]
            else:
                prob = 1.0 - domain_spur_prob[idx]

            for income_case in ['low', 'high']:
                high_count = int(prob*data_size)
                inc_size = high_count if income_case == 'high' else data_size - high_count

                group_key = gender+'_'+income_case
                offset = start_idx[group_key]
                curr_data[group_key] = group_data[group_key][offset: offset + inc_size]
                start_idx[group_key] = offset + inc_size

                print('Data: ', data_case, 'Gender: ', gender, ' Income :', income_case, inc_size)

        # Stack the four groups; the spurious attribute is 0 for female rows and 1 for male rows.
        save_data = []
        spur_corr = []
        for key, rows in curr_data.items():
            save_data.append(rows)
            make_flags = np.zeros if 'female' in key else np.ones
            spur_corr.append(make_flags(rows.shape[0]))

        save_data = torch.tensor(np.vstack(save_data))
        spur_corr = torch.tensor(np.hstack(spur_corr))
        print(save_data.shape, spur_corr.shape)

        # Features, label (last column) and spurious attribute go to separate files.
        out_prefix = base_dir + 'd' + str(idx+1) + '_' + data_case
        torch.save(save_data[:, :-1], out_prefix + '_data.pt')
        torch.save(save_data[:, -1], out_prefix + '_label.pt')
        torch.save(spur_corr, out_prefix + '_spur.pt')

    print('\n')

### Domain 1

In [None]:



# Sizes as (high income, low income):
#   Male:   train pool (1000, 64)
#   Female: train pool (64, 1000)
# Validation rows are carved out of the train pool: 60 of the 1000 and 4 of the 64.
# This cell reuses the data_*_inc arrays created by an earlier cell.

# Male dataset
val_high_inc = data_male_high_inc[940:1000]
train_high_inc = data_male_high_inc[:940]
val_low_inc = data_male_low_inc[60:64]
train_low_inc = data_male_low_inc[:60]

train_male = np.vstack((train_high_inc, train_low_inc))
val_male = np.vstack((val_high_inc, val_low_inc))

# Female dataset
val_high_inc = data_female_high_inc[60:64]
train_high_inc = data_female_high_inc[:60]
val_low_inc = data_female_low_inc[940:1000]
train_low_inc = data_female_low_inc[:940]

train_female = np.vstack((train_high_inc, train_low_inc))
val_female = np.vstack((val_high_inc, val_low_inc))

# Male rows first, then female rows.
train_data_1 = np.vstack((train_male, train_female))
val_data_1 = np.vstack((val_male, val_female))

# Spurious attribute aligned with the rows above: 0 for male, 1 for female.
# NOTE(review): the Version 2 loop cell uses the opposite encoding
# (female = 0, male = 1) - confirm which one is intended.
spur_male = np.zeros(len(train_male))
spur_female = np.ones(len(train_female))
train_spur_1 = np.hstack((spur_male, spur_female))

spur_male = np.zeros(len(val_male))
spur_female = np.ones(len(val_female))
val_spur_1 = np.hstack((spur_male, spur_female))

print('Male Final: ', train_male.shape, val_male.shape)
print('Female Final: ', train_female.shape, val_female.shape)
print('Spur Feature Final: ', train_spur_1.shape, val_spur_1.shape)

# Per split: number of low-income rows (label 0) followed by high-income rows (label 1).
print('Sanity Check Male')
print('Train', np.sum(train_male[:, -1]==0), np.sum(train_male[:, -1]==1))
print('Val', np.sum(val_male[:, -1]==0), np.sum(val_male[:, -1]==1))

print('Sanity Check Female')
print('Train', np.sum(train_female[:, -1]==0), np.sum(train_female[:, -1]==1))
print('Val', np.sum(val_female[:, -1]==0), np.sum(val_female[:, -1]==1))

### Domain 2

In [None]:
# Partition the encoded rows by the one-hot gender indicator columns.
idx = columns.index('sex_Male')
male_mask = dataset[:, idx] == 1
data_male = dataset[male_mask]
idx = columns.index('sex_Female')
female_mask = dataset[:, idx] == 1
data_female = dataset[female_mask]
print('Gender Stats: ', data_male.shape, data_female.shape)

# The outcome is the last column: 0 = low income, 1 = high income.
male_income = data_male[:, -1]
data_male_low_inc = data_male[male_income == 0]
data_male_high_inc = data_male[male_income == 1]
female_income = data_female[:, -1]
data_female_low_inc = data_female[female_income == 0]
data_female_high_inc = data_female[female_income == 1]

# Sizes as (high income, low income):
#   Male:   train pool (1000, 190)
#   Female: train pool (190, 1000)
# Validation rows are carved out of the train pool: 160 of the 1000 and 30 of the 190.
# NOTE(review): the rows are taken from the start of each group, exactly like
# in the Domain 1 cell, so both domains share rows - confirm this is intended.

# Male dataset
val_high_inc = data_male_high_inc[840:1000]
train_high_inc = data_male_high_inc[:840]
val_low_inc = data_male_low_inc[160:190]
train_low_inc = data_male_low_inc[:160]

train_male = np.vstack((train_high_inc, train_low_inc))
val_male = np.vstack((val_high_inc, val_low_inc))

# Female dataset
val_high_inc = data_female_high_inc[160:190]
train_high_inc = data_female_high_inc[:160]
val_low_inc = data_female_low_inc[840:1000]
train_low_inc = data_female_low_inc[:840]

train_female = np.vstack((train_high_inc, train_low_inc))
val_female = np.vstack((val_high_inc, val_low_inc))

# Male rows first, then female rows.
train_data_2 = np.vstack((train_male, train_female))
val_data_2 = np.vstack((val_male, val_female))

# Spurious attribute aligned with the rows above: 0 for male, 1 for female.
spur_male = np.zeros(len(train_male))
spur_female = np.ones(len(train_female))
train_spur_2 = np.hstack((spur_male, spur_female))

spur_male = np.zeros(len(val_male))
spur_female = np.ones(len(val_female))
val_spur_2 = np.hstack((spur_male, spur_female))

print('Male Final: ', train_male.shape, val_male.shape)
print('Female Final: ', train_female.shape, val_female.shape)
print('Spur Feature Final: ', train_spur_2.shape, val_spur_2.shape)

# Per split: number of low-income rows (label 0) followed by high-income rows (label 1).
print('Sanity Check Male')
print('Train', np.sum(train_male[:, -1]==0), np.sum(train_male[:, -1]==1))
print('Val', np.sum(val_male[:, -1]==0), np.sum(val_male[:, -1]==1))

print('Sanity Check Female')
print('Train', np.sum(train_female[:, -1]==0), np.sum(train_female[:, -1]==1))
print('Val', np.sum(val_female[:, -1]==0), np.sum(val_female[:, -1]==1))

### Domain 3

In [None]:
# Partition the encoded rows by the one-hot gender indicator columns.
idx = columns.index('sex_Male')
male_mask = dataset[:, idx] == 1
data_male = dataset[male_mask]
idx = columns.index('sex_Female')
female_mask = dataset[:, idx] == 1
data_female = dataset[female_mask]
print('Gender Stats: ', data_male.shape, data_female.shape)

# The outcome is the last column: 0 = low income, 1 = high income.
male_income = data_male[:, -1]
data_male_low_inc = data_male[male_income == 0]
data_male_high_inc = data_male[male_income == 1]
female_income = data_female[:, -1]
data_female_low_inc = data_female[female_income == 0]
data_female_high_inc = data_female[female_income == 1]

# Test-only domain. Sizes as (high income, low income):
#   Male:   test (60, 940)
#   Female: test (940, 60)
# Rows are taken from the end of each group (the very last row is not used).

# Male dataset
test_high_inc = data_male_high_inc[-61:-1]
test_low_inc = data_male_low_inc[-941:-1]
test_male = np.vstack((test_high_inc, test_low_inc))

# Female dataset
test_high_inc = data_female_high_inc[-941:-1]
test_low_inc = data_female_low_inc[-61:-1]
test_female = np.vstack((test_high_inc, test_low_inc))

# Male rows first, then female rows.
test_data = np.vstack((test_male, test_female))

# Spurious attribute aligned with the rows above: 0 for male, 1 for female.
spur_male = np.zeros(len(test_male))
spur_female = np.ones(len(test_female))
test_spur = np.hstack((spur_male, spur_female))

print('Male Final: ',  test_male.shape)
print('Female Final: ', test_female.shape)
print('Spur Feature Final: ', test_spur.shape)

# Number of low-income rows (label 0) followed by high-income rows (label 1).
print('Sanity Check Male')
print('Test', np.sum(test_male[:, -1]==0), np.sum(test_male[:, -1]==1))

print('Sanity Check Female')
print('Test', np.sum(test_female[:, -1]==0), np.sum(test_female[:, -1]==1))

In [None]:
#Domain 1
torch.save(torch.tensor(train_data_1[:, :-1]), base_dir + 'd1_train_data.pt')
torch.save(torch.tensor(train_data_1[:, -1]), base_dir + 'd1_train_label.pt')
torch.save(torch.tensor(train_spur_1), base_dir + 'd1_train_spur.pt')

torch.save(torch.tensor(val_data_1[:, :-1]), base_dir + 'd1_val_data.pt')
torch.save(torch.tensor(val_data_1[:, -1]), base_dir + 'd1_val_label.pt')
torch.save(torch.tensor(val_spur_1), base_dir + 'd1_val_spur.pt')

#Domain 2
torch.save(torch.tensor(train_data_2[:, :-1]), base_dir + 'd2_train_data.pt')
torch.save(torch.tensor(train_data_2[:, -1]), base_dir + 'd2_train_label.pt')
torch.save(torch.tensor(train_spur_2), base_dir + 'd2_train_spur.pt')

torch.save(torch.tensor(val_data_2[:, :-1]), base_dir + 'd2_val_data.pt')
torch.save(torch.tensor(val_data_2[:, -1]), base_dir + 'd2_val_label.pt')
torch.save(torch.tensor(val_spur_2), base_dir + 'd2_val_spur.pt')

#Domain 3
torch.save(torch.tensor(test_data[:, :-1]), base_dir + 'd3_test_data.pt')
torch.save(torch.tensor(test_data[:, -1]), base_dir + 'd3_test_label.pt')
torch.save(torch.tensor(test_spur), base_dir + 'd3_test_spur.pt')