In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Column labels for the header-less covtype.data file: ten quantitative
# measurements, four wilderness-area indicators, forty soil-type indicators
# and finally the class label.
column_names = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    # Indicator columns are numbered from zero, so generate the names.
    *(f"Wilderness_Area_{idx}" for idx in range(4)),
    *(f"Soil_Type_{idx}" for idx in range(40)),
    "Cover_type",
]

In [3]:
# The raw file carries no header row, so the labels come from column_names.
data = pd.read_csv(
    "../data/covertype/covtype.data",
    header=None,
    names=column_names,
)

In [4]:
# Preview the first five rows to check that the column labels line up.
data.head(n=5)

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Cover_type
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5


In [5]:
# Relabel class 7 as 0; every other class label is kept unchanged.
target = data["Cover_type"].replace(7, 0).tolist()
data["Cover_type"] = target

In [6]:
from collections import Counter

# Class frequencies after relabelling, keyed by class label.
Counter(data["Cover_type"].tolist())

Counter({5: 9493, 2: 283301, 1: 211840, 0: 20510, 3: 35754, 6: 17367, 4: 2747})

In [7]:
# Persist the relabelled frame. index=True is the pandas default and writes
# the row index as an unnamed first column of the CSV.
data.to_csv("../data/covertype/covtype.csv", index=True)

In [14]:
## Use this function to retrieve X, Z, y arrays for training ML models
def dataset_to_numpy(
    _df,
    _feature_cols: list,
    _metadata: dict,
    num_sensitive_features: int = 1,
    sensitive_features_last: bool = True,
):
    """Split a dataframe into scaled features, a sensitive attribute and targets.

    Args:
        _df: pandas dataframe holding the feature, sensitive and target columns.
        _feature_cols: list of feature column names; it must contain the
            sensitive columns listed in ``_metadata["protected_atts"]``.
        _metadata: dictionary with the keys ``"protected_atts"``,
            ``"protected_att_values"`` and ``"target_variable"``. An optional
            ``"dummy_cols"`` entry names the columns to one-hot encode.
        num_sensitive_features: number of sensitive features to use; requests
            larger than the configured list are capped by the slice.
        sensitive_features_last: if True, check that every configured
            protected value occurs in its sensitive column. The reordered
            one-hot encoding of the sensitive features is not part of the
            return value, so this flag only toggles that check.

    Returns:
        A tuple ``(X, z, y)``:
        ``X`` is the min-max scaled, one-hot encoded feature matrix without
        the sensitive columns, ``z`` holds the raw values of the first
        sensitive column, and ``y`` holds the target column values.

    Raises:
        AssertionError: if a sensitive column does not have exactly two
            distinct values, or (with ``sensitive_features_last``) if a
            configured protected value never occurs in its column.
    """
    # Slicing caps the request at the number of configured attributes.
    sensitive_cols = _metadata["protected_atts"][:num_sensitive_features]
    print(f"Using {sensitive_cols} as sensitive feature(s).")

    # Take the sensitive features separately from the remaining features.
    _X = _df[_feature_cols]
    _Z = _X[sensitive_cols]
    _X = _X.drop(columns=sensitive_cols)

    # Validate before encoding/scaling so bad metadata fails fast.
    # Current implementation assumes each sensitive feature is binary.
    for col in sensitive_cols:
        assert len(_Z[col].unique()) == 2, "Sensitive feature is not binary!"
    if sensitive_features_last:
        for i, col in enumerate(_Z.columns):
            assert (
                _metadata["protected_att_values"][i] in _Z[col].unique()
            ), "Protected attribute value not found in data!"

    # 1-hot encode, then scale the *encoded* frame. dtype=float keeps the
    # indicator columns numeric instead of bool before they reach the scaler.
    dummy_cols = _metadata.get("dummy_cols")
    _X2 = pd.get_dummies(_X, columns=dummy_cols, drop_first=False, dtype=float)
    _X = MinMaxScaler().fit_transform(_X2)

    _y = _df[_metadata["target_variable"]].values
    # Only the first sensitive column is returned, as a 1-D array.
    return _X, _Z.to_numpy()[:, 0], _y

In [8]:
# Dataset description passed to dataset_to_numpy as its _metadata argument.
metadata_covtype = {
    "name": "Covertype",
    "target_variable": "Cover_type",
    "protected_atts": ["Soil_Type_0"],
    "protected_att_values": [0],
    # Intentionally empty. The key used to be listed twice, and the first
    # entry ("Gender = Female", from another dataset) was silently overwritten.
    "protected_att_descriptions": [],
}

In [15]:
# column_names also contains the target column. Passing it unchanged would
# put Cover_type into the feature matrix (label leakage), so drop it here.
feature_cols = [
    col for col in column_names if col != metadata_covtype["target_variable"]
]

numpy_dataset = dataset_to_numpy(
    _df=data,
    _feature_cols=feature_cols,
    _metadata=metadata_covtype,
    num_sensitive_features=1,
)

Using ['Soil_Type_0'] as sensitive feature(s).


In [10]:
# Sanity check on the result: dataset_to_numpy returns a 3-tuple
# (features, sensitive attribute, target), so this is expected to be 3.
len(numpy_dataset)

2

In [17]:
import random
from pistacchio_simulator.FederatedDataset.Utils.custom_dataset import (
    TabularDataset,
)

# Fixed seed so the split is identical after Restart Kernel -> Run All.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8


def _make_tabular_dataset(features, sensitive, labels):
    """Wrap one split in a TabularDataset.

    A constant column of ones is appended to the features and all three
    arrays are cast to float32.
    """
    ones_column = np.ones((features.shape[0], 1))
    return TabularDataset(
        x=np.hstack((features, ones_column)).astype(np.float32),
        z=sensitive.astype(np.float32),
        y=labels.astype(np.float32),
    )


x = numpy_dataset[0]
y = numpy_dataset[2]
z = numpy_dataset[1]

# Shuffle row positions once with a private, seeded generator and apply the
# same order to all three arrays. This keeps rows aligned without building
# a Python list of per-row tuples.
order = list(range(len(y)))
random.Random(SPLIT_SEED).shuffle(order)
order = np.asarray(order, dtype=np.intp)
x, y, z = x[order], y[order], z[order]
train_size = int(len(y) * TRAIN_FRACTION)

x_train, x_test = x[:train_size], x[train_size:]
y_train, y_test = y[:train_size], y[train_size:]
z_train, z_test = z[:train_size], z[train_size:]

train_dataset = _make_tabular_dataset(x_train, z_train, y_train)
test_dataset = _make_tabular_dataset(x_test, z_test, y_test)

In [None]:
# @staticmethod
# def download_covertype() -> (
#     Tuple[
#         torch.utils.data.DataLoader,
#         torch.utils.data.DataLoader,
#     ]
# ):
#     """This function downloads the covertype dataset.

#     Returns
#     -------
#         Tuple[torch.utils.data.DataLoader, torch.utils.data.DataLoader]:
#         the train and test dataset
#     """
#     data = pd.read_fwf("../data/covertype/covtype.data")

#     covtype_df = pd.DataFrame(data[0]).astype("int32")

#     dutch_df["sex_binary"] = np.where(dutch_df["sex"] == 1, 1, 0)
#     dutch_df["occupation_binary"] = np.where(dutch_df["occupation"] >= 300, 1, 0)

#     del dutch_df["sex"]
#     del dutch_df["occupation"]

#     dutch_df_feature_columns = [
#         "age",
#         "household_position",
#         "household_size",
#         "prev_residence_place",
#         "citizenship",
#         "country_birth",
#         "edu_level",
#         "economic_status",
#         "cur_eco_activity",
#         "Marital_status",
#         "sex_binary",
#     ]

#     metadata_dutch = {
#         "name": "Dutch census",
#         "code": ["DU1"],
#         "protected_atts": ["sex_binary"],
#         "protected_att_values": [0],
#         "protected_att_descriptions": ["Gender = Female"],
#         "target_variable": "occupation_binary",
#     }

#     tmp = DatasetDownloader.dataset_to_numpy(
#         dutch_df, dutch_df_feature_columns, metadata_dutch, num_sensitive_features=1
#     )

#     x = tmp[0]
#     y = tmp[2]
#     z = tmp[1]

#     xyz = list(zip(x, y, z))
#     random.shuffle(xyz)
#     x, y, z = zip(*xyz)
#     train_size = int(len(y) * 0.8)

#     x_train = np.array(x[:train_size])
#     x_test = np.array(x[train_size:])
#     y_train = np.array(y[:train_size])
#     y_test = np.array(y[train_size:])
#     z_train = np.array(z[:train_size])
#     z_test = np.array(z[train_size:])

#     train_dataset = TabularDataset(
#         x=np.hstack((x_train, np.ones((x_train.shape[0], 1)))).astype(np.float32),
#         z=z_train.astype(np.float32),
#         y=y_train.astype(np.float32),
#     )

#     test_dataset = TabularDataset(
#         x=np.hstack((x_test, np.ones((x_test.shape[0], 1)))).astype(np.float32),
#         z=z_test.astype(np.float32),
#         y=y_test.astype(np.float32),
#     )

#     return train_dataset, test_dataset