# Modeling bias

In [1]:
import numpy as np
import pandas as pd

We will have a target variable $Y$, a protected attribute $A$, and data $X$. The true target is drawn i.i.d.: each $Y$ is 1 with probability $p_Y$, independently of the other samples.

In [2]:
# --- simulation settings ---
N = 100          # number of samples
pY = .4          # P(Y = 1)
pA = .2          # P(A = 1)
mu_X = [4, 8]    # mean of X when Y = 0 and when Y = 1, respectively

# draw the true labels: 1 with probability pY
Y = np.random.choice([0, 1], size=N, p=[1 - pY, pY])
# look up the class-conditional mean for every sample
muXi = np.take(mu_X, Y).tolist()
# sample X around those means, leaving the standard deviation at its default
X = np.random.normal(loc=muXi)
# draw the protected attribute independently of Y: 1 with probability pA
A = np.random.choice([0, 1], size=N, p=[1 - pA, pA])

# one row per sample, columns ordered X, Y, A
data = np.column_stack((X, Y, A))
df_nobias = pd.DataFrame(data, columns=['X', 'Y', 'A'])
df_nobias.head()

Unnamed: 0,X,Y,A
0,5.302487,0.0,1.0
1,8.803787,1.0,0.0
2,4.300867,0.0,1.0
3,7.934393,1.0,0.0
4,3.289537,0.0,1.0


In [3]:
# number of samples in each (Y, A) combination
df_nobias.groupby(by=['Y', 'A']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,X
Y,A,Unnamed: 2_level_1
0.0,0.0,43
0.0,1.0,10
1.0,0.0,38
1.0,1.0,9


Now, we might say that there is some relationship between $Y$ and $A$: here the probability that $A = 1$ depends on the value of $Y$.

In [4]:
# --- simulation settings ---
N = 100
pY = .4            # P(Y = 1)
pA = [.2, .1]      # P(A = 1 | Y = 0) and P(A = 1 | Y = 1)
mu_X = [4, 8]      # mean of X when Y = 0 and when Y = 1, respectively

# draw the true labels: 1 with probability pY
Y = np.random.choice([0, 1], size=N, p=[1 - pY, pY])
# look up the class-conditional mean for every sample
muXi = np.take(mu_X, Y).tolist()
# sample X around those means, leaving the standard deviation at its default
X = np.random.normal(loc=muXi)

# [P(A = 0), P(A = 1)] for each value of Y
pA_given_Y = [[1 - p, p] for p in pA]
# draw A one sample at a time: 1 with prob pA[0] if Y = 0, or pA[1] if Y = 1
A = [np.random.choice([0, 1], p=pA_given_Y[y_i]) for y_i in Y]

# one row per sample, columns ordered X, Y, A
data = np.column_stack((X, Y, A))
df_AYbias = pd.DataFrame(data, columns=['X', 'Y', 'A'])
df_AYbias.head()

Unnamed: 0,X,Y,A
0,3.046544,0.0,0.0
1,6.311724,1.0,0.0
2,2.970436,0.0,0.0
3,3.136143,0.0,0.0
4,9.050398,1.0,1.0


In [5]:
# number of samples in each (Y, A) combination
df_AYbias.groupby(by=['Y', 'A']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,X
Y,A,Unnamed: 2_level_1
0.0,0.0,47
0.0,1.0,9
1.0,0.0,41
1.0,1.0,3


Or, more egregiously, the target $Y$ itself could depend on the protected attribute $A$:

In [6]:
# --- simulation settings ---
N = 100
pA = .2            # P(A = 1)
pY = [.4, .25]     # P(Y = 1 | A = 0) and P(Y = 1 | A = 1)
mu_X = [4, 8]      # mean of X when Y = 0 and when Y = 1, respectively

# draw the protected attribute first: 1 with probability pA
A = np.random.choice([0, 1], size=N, p=[1 - pA, pA])
# [P(Y = 0), P(Y = 1)] for each value of A
pY_given_A = [[1 - p, p] for p in pY]
# draw Y one sample at a time: 1 with prob pY[0] if A = 0, or pY[1] if A = 1
Y = [np.random.choice([0, 1], p=pY_given_A[a_i]) for a_i in A]
# look up the class-conditional mean for every sample
muXi = np.take(mu_X, Y).tolist()
# sample X around those means, leaving the standard deviation at its default
X = np.random.normal(loc=muXi)

# one row per sample, columns ordered X, Y, A
data = np.column_stack((X, Y, A))
df_AYbias2 = pd.DataFrame(data, columns=['X', 'Y', 'A'])
df_AYbias2.head()

Unnamed: 0,X,Y,A
0,2.87213,0.0,0.0
1,3.834736,0.0,0.0
2,3.324976,0.0,1.0
3,6.598024,1.0,1.0
4,4.05474,0.0,0.0


In [7]:
# number of samples in each (Y, A) combination
df_AYbias2.groupby(by=['Y', 'A']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,X
Y,A,Unnamed: 2_level_1
0.0,0.0,50
0.0,1.0,11
1.0,0.0,36
1.0,1.0,3


For the models above, though, the features are predictive in the same way for both groups: while the decision $Y$ depends on $A$ in these models, the measurements $X$ still predict $Y$ equally well for each group. We could instead suggest that the recorded decisions are themselves biased with respect to the other measured variables.

In [8]:
# --- simulation settings ---
N = 1000
pA = .2            # P(A = 1)
pY = [.4, .4]      # P(Ytrue = 1 | A = 0) and P(Ytrue = 1 | A = 1)
pErr = .5          # chance that a true 1 is recorded as 0 when A = 1
mu_X = [4, 8]      # mean of X when Ytrue = 0 and when Ytrue = 1, respectively

# draw the protected attribute: 1 with probability pA
A = np.random.choice([0, 1], size=N, p=[1 - pA, pA])
# [P(Ytrue = 0), P(Ytrue = 1)] for each value of A
pY_given_A = [[1 - p, p] for p in pY]
# draw the true label one sample at a time, conditioned on A
Ytrue = [np.random.choice([0, 1], p=pY_given_A[a_i]) for a_i in A]
# X is generated from the TRUE label, not the observed one
muXi = np.take(mu_X, Ytrue).tolist()
X = np.random.normal(loc=muXi)


def record_faithfully(y):
    """A = 0: the observed label equals the true label."""
    return y


def record_with_misses(y):
    """A = 1: a true 0 stays 0; a true 1 is flipped to 0 with prob pErr."""
    keep = np.random.choice([0, 1], p=[pErr, 1 - pErr])
    return y * keep


# observation model, keyed by the value of A
bias_model = {0: record_faithfully, 1: record_with_misses}
Yobs = [bias_model[a_i](y_i) for a_i, y_i in zip(A, Ytrue)]

# one row per sample, columns ordered X, observed Y, A
data = np.column_stack((X, Yobs, A))
df_Ybias = pd.DataFrame(data, columns=['X', 'Y', 'A'])
df_Ybias.head()

Unnamed: 0,X,Y,A
0,3.382058,0.0,0.0
1,3.551526,0.0,0.0
2,5.974229,1.0,0.0
3,6.762927,1.0,1.0
4,2.918533,0.0,0.0


In [9]:
# number of samples in each (A, observed Y) combination
df_Ybias.groupby(by=['A', 'Y']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,X
A,Y,Unnamed: 2_level_1
0.0,0.0,514
0.0,1.0,284
1.0,0.0,161
1.0,1.0,41


In [10]:
# fraction of observed positives (mean of the 0/1 column Y) within each group A
df_Ybias.groupby(by='A')['Y'].mean()

A
0.0    0.35589
1.0    0.20297
Name: Y, dtype: float64

In [11]:
# average of the remaining columns within each (A, observed Y) combination
df_Ybias.groupby(by=['A', 'Y']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,X
A,Y,Unnamed: 2_level_1
0.0,0.0,3.992872
0.0,1.0,7.90055
1.0,0.0,4.855002
1.0,1.0,8.122094


In [12]:
# Threshold classifier: predict the positive class whenever X exceeds 6.
# astype returns a new Series rather than converting in place, so the cast
# has to be part of the assignment; calling it on its own line and dropping
# the result left Yhat as a boolean column.
df_Ybias['Yhat'] = (df_Ybias['X'] > 6).astype(int)
# per-sample correctness of the prediction against the recorded label Y
df_Ybias['acc'] = df_Ybias['Yhat'] == df_Ybias['Y']

In [13]:
# average of every non-grouping column within each (A, observed Y) combination
df_Ybias.groupby(by=['A', 'Y']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,X,Yhat,acc
A,Y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.0,3.992872,0.023346,0.976654
0.0,1.0,7.90055,0.964789,0.964789
1.0,0.0,4.855002,0.229814,0.770186
1.0,1.0,8.122094,1.0,1.0


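In this case, we predict $\hat{Y} = 1$ for a lot of samples in the $A = 1$ group where the observed $Y = 0$ (accuracy of about 0.77 there, versus about 0.98 for $A = 0$), because the data collection was biased: some true positives in that group were recorded as 0.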

We could also have the measurements $X$ look different for the two groups, with the difference also depending on whether $Y = 1$.