# What's wrong with Fair Representations of input data?

(Before we start, let's just import some stuff that we'll need later)

In [1]:
from typing import Optional, Tuple

import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
import torch.distributions as td
import torch.nn.functional as F
from torch import Tensor, nn
from torch.optim import Adam, lr_scheduler
from torch.utils.data import DataLoader
from tqdm import trange

from ethicml.algorithms.inprocess import LR, LRCV, InAlgorithm, Majority
from ethicml.algorithms.inprocess.blind import Blind
from ethicml.data import adult
from ethicml.evaluators import metric_per_sensitive_attribute
from ethicml.implementations.pytorch_common import CustomDataset
from ethicml.metrics import Accuracy, Metric, ProbPos
from ethicml.preprocessing import scale_continuous, train_test_split
from ethicml.utility import DataTuple

## Scenario 1

We have some input data $x$. We want a function that produces a version of this data ($z_x$), such that $z_x$ is independent of some protected characteristic $s$. In other words we want to find $e: X \rightarrow Z_x ~~\mathrm{s.t.}~ Z_x \perp S$.

Let's look at that. 

![setup1](assets/setup1.png)

The red line indicates that you cannot learn $S$ from $Z_x$; there is no mutual information between these two variables. They are independent.

The problem here is that the easiest way for a network to achieve this is to just learn nothing. Make $Z_x$ all $0$'s and your job is done. But that's a claim... let's demonstrate that.

### Building

Let's build the adversary

In [2]:
class GradReverse(torch.autograd.Function):
    """Gradient reversal layer"""

    @staticmethod
    def forward(ctx, x: Tensor, lambda_: float) -> Tensor:
        ctx.lambda_ = lambda_
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output: Tensor) -> Tuple[Tensor, Optional[Tensor]]:
        return grad_output.neg().mul(ctx.lambda_), None


def grad_reverse(features: Tensor, lambda_: float = 1.0) -> Tensor:
    return GradReverse.apply(features, lambda_)

class FeatureAdv(nn.Module):
    def __init__(self, latent_dim: int):
        super().__init__()
        self.hid = nn.Linear(latent_dim, 100)
        self.hid_1 = nn.Linear(100, 100)
        self.bn_1 = nn.BatchNorm1d(100)
        self.out = nn.Linear(100, 1)

    def forward(self, z: td.Distribution):
        s = self.bn_1(F.relu(self.hid(grad_reverse(z))))
        return self.out(s)

The encoder

In [3]:
class FeatureEncoder(nn.Module):
    def __init__(self, in_size: int, latent_dim: int):
        super().__init__()
        self.hid_1 = nn.Linear(in_size, 100)
        self.bn_1 = nn.BatchNorm1d(100)
        self.hid_2 = nn.Linear(100, 100)
        self.bn_2 = nn.BatchNorm1d(100)

        self.mu = nn.Linear(100, latent_dim)
        self.logvar = nn.Linear(100, latent_dim)

    def forward(self, z: torch.Tensor):
        x = self.bn_1(F.relu(self.hid_1(z)))
        x = F.relu(self.hid_2(x))
        return td.Normal(loc=self.mu(x), scale=F.softplus(self.logvar(x)))

Then the model

In [4]:
class Model1(nn.Module):
    def __init__(self, in_size: int, latent_dim: int):
        super().__init__()
        self.enc = FeatureEncoder(in_size, latent_dim)
        self.adv = FeatureAdv(latent_dim)

    def forward(self, x):
        z = self.enc(x)
        s = self.adv(z.rsample())
        return z, s

And let's add some helper code

In [5]:
def evaluate_z(
    train: DataTuple,
    test: DataTuple,
    model: InAlgorithm = LRCV,
    metric: Metric = Accuracy,
    per_sens: bool = False,
):
    model = model()
    preds = model.run(train, test)

    if per_sens:
        score = metric_per_sensitive_attribute(preds, test, metric())
        print(f"{metric().name}: {score}\n")
    else:
        score = metric().score(preds, test)
        print(f"{metric().name}: {score:.3f}\n")

In [6]:
def encode1(loader: DataLoader, model: nn.Module, latent_dims: int):
    feats_train_encs: pd.DataFrame = pd.DataFrame(columns=list(range(latent_dims)))
    model.eval()
    with torch.no_grad():
        for (x, s, y) in loader:
            z, _ = model(x)
            feats_train_encs = pd.concat(
                [
                    feats_train_encs,
                    pd.DataFrame(z.sample().cpu().numpy(), columns=list(range(latent_dims))),
                ],
                axis="rows",
                ignore_index=True,
            )

    return feats_train_encs

In [7]:
dataset = adult()
data = dataset.load()
scaler = StandardScaler()
data, scaler2 = scale_continuous(dataset, data, scaler)

_train, _test = train_test_split(data, train_percentage=0.9)
train_data = CustomDataset(_train)
train_loader = DataLoader(train_data, batch_size=256)

test_data = CustomDataset(_test)
test_loader = DataLoader(test_data, batch_size=256)

In [8]:
print(f"Performance on the original data...")
evaluate_z(_train, _test)

print(f"Majority classifier...")
evaluate_z(_train, _test, model=Majority)

print(f"Random classifier...")
evaluate_z(_train, _test, model=Blind)

Performance on the original data...
Accuracy: 0.852

Majority classifier...
Accuracy: 0.749

Random classifier...
Accuracy: 0.500



In [9]:
epochs=15
latent_dims=50

model1 = Model1(len(_train.x.columns), latent_dims)
optimizer = Adam(model1.parameters(), lr=1e-3)
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.98)

with trange(epochs) as t:
    for epoch in t:
        for (x, s, y) in train_loader:
            z, s_pred = model1(x)

            feat_prior = td.Normal(loc=torch.zeros(latent_dims), scale=torch.ones(latent_dims))
            feat_kl_loss = td.kl.kl_divergence(z, feat_prior)

            feat_sens_loss = F.binary_cross_entropy_with_logits(s_pred, s, reduction="mean")

            loss = feat_kl_loss.mean() + feat_sens_loss

            t.set_postfix(loss=loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        scheduler.step()

post_train = DataTuple(x=encode1(train_loader, model1, latent_dims), s=_train.s, y=_train.y)
post_test = DataTuple(x=encode1(test_loader, model1, latent_dims), s=_test.s, y=_test.y)

100%|██████████| 15/15 [00:29<00:00,  1.96s/it, loss=0.646]


In [10]:
print(f"Performance on the embeddings...")
evaluate_z(post_train, post_test)

print(f"Fairness on the original data...")
evaluate_z(_train, _test, metric=ProbPos, per_sens=True)

print(f"Fairness on the embeddings...")
evaluate_z(post_train, post_test, metric=ProbPos, per_sens=True)

Performance on the embeddings...
Accuracy: 0.749

Fairness on the original data...
prob_pos: {'sex_Male_1': 0.26198905696813646, 'sex_Male_0': 0.08615819209039548}

Fairness on the embeddings...
prob_pos: {'sex_Male_1': 0.0, 'sex_Male_0': 0.0}



Why is this? Well, we're using a variational autoencoder model and using a prior gaussian distribuiton as a regulariser. We could do away with this, but you'd end up in a similar position. (If you want to try this out, use the rocket icon at the top of the page to run this page as a notebook on mybinder.org.)

## Scenario 2

So the problem is that our representation doesn't have any direction. It's goal is to make $S$ unrecognizable from $Z$. Which it does, it's just that you can't tell anything else from $Z$ either.

So let's give $Z$ some direction.

![setup2](assets/setup2.png)

In this case we want $Z$ to have no information about $S$, but also be representative of $Y$.

We can re-use most of the parts from before, but we need a predictor.

In [11]:
class EmbeddingPredictor(nn.Module):
    def __init__(self, latent_dim: int):
        super().__init__()
        self.hid = nn.Linear(latent_dim, 100)
        self.hid_1 = nn.Linear(100, 100)
        self.bn_1 = nn.BatchNorm1d(100)
        self.out = nn.Linear(100, 1)

    def forward(self, z: td.Distribution):
        y = self.bn_1(F.relu(self.hid(z)))
        return self.out(y)

and slightly update one of our helper functions

In [12]:
def encode2(loader: DataLoader, model: nn.Module, latent_dims: int):
    feats_train_encs: pd.DataFrame = pd.DataFrame(columns=list(range(latent_dims)))
    model.eval()
    with torch.no_grad():
        for (x, s, y) in loader:
            z, _, _ = model(x)
            feats_train_encs = pd.concat(
                [
                    feats_train_encs,
                    pd.DataFrame(z.sample().cpu().numpy(), columns=list(range(latent_dims))),
                ],
                axis="rows",
                ignore_index=True,
            )

    return feats_train_encs

Let's build the model

In [13]:
class Model2(nn.Module):
    def __init__(self, in_size: int, latent_dim: int):
        super().__init__()
        self.enc = FeatureEncoder(in_size, latent_dim)
        self.adv = FeatureAdv(latent_dim)
        self.pred = EmbeddingPredictor(latent_dim)

    def forward(self, x):
        z = self.enc(x)
        s = self.adv(z.rsample())
        y = self.pred(z.rsample())
        return z, s, y

In [14]:
model2 = Model2(len(_train.x.columns), latent_dims)
optimizer = Adam(model2.parameters(), lr=1e-3)
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.98)

with trange(epochs) as t:
    for epoch in t:
        for (x, s, y) in train_loader:
            z, s_pred, y_pred = model2(x)

            feat_prior = td.Normal(loc=torch.zeros(latent_dims), scale=torch.ones(latent_dims))
            feat_kl_loss = td.kl.kl_divergence(z, feat_prior)

            feat_sens_loss = F.binary_cross_entropy_with_logits(s_pred, s, reduction="mean")
            pred_y_loss = F.binary_cross_entropy_with_logits(y_pred, y, reduction="mean")

            loss = feat_kl_loss.mean() + feat_sens_loss + pred_y_loss

            t.set_postfix(loss=loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        scheduler.step()

post_train = DataTuple(x=encode2(train_loader, model2, latent_dims), s=_train.s, y=_train.y)
post_test = DataTuple(x=encode2(test_loader, model2, latent_dims), s=_test.s, y=_test.y)

100%|██████████| 15/15 [00:45<00:00,  3.03s/it, loss=0.92] 


In [15]:
print(f"Performance on the embeddings...")
evaluate_z(post_train, post_test)

print(f"Fairness on the embeddings...")
evaluate_z(post_train, post_test, metric=ProbPos, per_sens=True)

Performance on the embeddings...
Accuracy: 0.845

Fairness on the embeddings...
prob_pos: {'sex_Male_1': 0.25329900225297713, 'sex_Male_0': 0.11581920903954802}



Well, this is certainly more accurate than before, but although were a bit more equal, we're not really doing a great job. The reason for this is that there is a tension between removing information that is relevant to $S$ and keeping information that is relevant for $Y$. To demonstrate this, let's tweak the above model to remove this tension.

## Scenario 3

![setup3](assets/setup3.png)

In this setup we remove the tension. $Z$ can freely remove $S$, and $Y$ can get all the information it needs about $S$ directly.

We'll just update the helper function.

In [16]:
def encode3(loader: DataLoader, model: nn.Module, latent_dims: int):
    feats_train_encs: pd.DataFrame = pd.DataFrame(columns=list(range(latent_dims)))
    model.eval()
    with torch.no_grad():
        for (x, s, y) in loader:
            z, _, _ = model(x, s)
            feats_train_encs = pd.concat(
                [
                    feats_train_encs,
                    pd.DataFrame(z.sample().cpu().numpy(), columns=list(range(latent_dims))),
                ],
                axis="rows",
                ignore_index=True,
            )

    return feats_train_encs

And both the predictor and the model

In [17]:
class EmbeddingAndSPredictor(nn.Module):
    def __init__(self, latent_dim: int):
        super().__init__()
        self.hid = nn.Linear(latent_dim+1, 100)
        self.hid_1 = nn.Linear(100, 100)
        self.bn_1 = nn.BatchNorm1d(100)
        self.out = nn.Linear(100, 1)

    def forward(self, z: td.Distribution, s: torch.Tensor):
        y = self.bn_1(F.relu(self.hid(torch.cat([z, s], dim=1))))
        return self.out(y)

class Model3(nn.Module):
    def __init__(self, in_size: int, latent_dim: int):
        super().__init__()
        self.enc = FeatureEncoder(in_size, latent_dim)
        self.adv = FeatureAdv(latent_dim)
        self.pred = EmbeddingAndSPredictor(latent_dim)

    def forward(self, x, s):
        z = self.enc(x)
        z_sample  = z.rsample()
        s_pred = self.adv(z_sample)
        y = self.pred(z_sample, s)
        return z, s_pred, y

In [18]:
model3 = Model3(len(_train.x.columns), latent_dims)
optimizer = Adam(model3.parameters(), lr=1e-3)
scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.98)

with trange(epochs) as t:
    for epoch in t:
        for (x, s, y) in train_loader:
            z, s_pred, y_pred = model3(x, s)

            feat_prior = td.Normal(loc=torch.zeros(latent_dims), scale=torch.ones(latent_dims))
            feat_kl_loss = td.kl.kl_divergence(z, feat_prior)

            feat_sens_loss = F.binary_cross_entropy_with_logits(s_pred, s, reduction="mean")
            pred_y_loss = F.binary_cross_entropy_with_logits(y_pred, y, reduction="mean")

            loss = feat_kl_loss.mean() + feat_sens_loss + pred_y_loss

            t.set_postfix(loss=loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        scheduler.step()

post_train = DataTuple(x=encode3(train_loader, model3, latent_dims), s=_train.s, y=_train.y)
post_test = DataTuple(x=encode3(test_loader, model3, latent_dims), s=_test.s, y=_test.y)

100%|██████████| 15/15 [00:56<00:00,  3.75s/it, loss=0.925]


In [19]:
print(f"Performance on the embeddings...")
evaluate_z(post_train, post_test)

print(f"Fairness on the embeddings...")
evaluate_z(post_train, post_test, metric=ProbPos, per_sens=True)

Performance on the embeddings...
Accuracy: 0.839

Fairness on the embeddings...
prob_pos: {'sex_Male_1': 0.221435468297393, 'sex_Male_0': 0.16242937853107345}



This is in line with our understanding - we lose a bit more accuracy, but the probability of a positive outcome is more equal across the groups. This would probably get closer to parity if we had either a more complicated model, or trained for longer.