In [1]:
import sqlalchemy

ImportError: No module named 'sqlalchemy'

In [None]:
# Create a SQLAlchemy engine for the COMPAS SQLite database file.
# NOTE(review): the path is relative, so it presumably resolves against the
# kernel's working directory - confirm the notebook is run from its own folder.
engine = sqlalchemy.create_engine('sqlite:///../../data/COMPAS/compas.db')

In [None]:
# Build an inspector on the engine; it is used to look at the database schema.
inspector = sqlalchemy.inspect(engine)

In [None]:
# Display the names of the tables available through the engine.
inspector.get_table_names()

In [None]:
# Preview query: the first ten rows of the `compas` table.
# Engine.execute() with a raw SQL string was deprecated in SQLAlchemy 1.4 and
# removed in 2.0. Executing a text() construct through an explicit connection
# works on both old and new releases. The connection is deliberately left open
# so that the rows can still be fetched from `cursor` afterwards.
cursor = engine.connect().execute(sqlalchemy.text('SELECT * FROM compas LIMIT 10'))

In [None]:
# Fetch every row of the result held in `cursor` and show them as the cell output.
cursor.fetchall()

In [None]:
import pandas

In [None]:
# Load the `people` table into a DataFrame.
people = pandas.read_sql('people', engine)

In [None]:
# Preview the first rows of the `people` DataFrame.
people.head()

In [None]:
# List the column labels of the `people` DataFrame.
people.columns

In [None]:
# Load the `compas` table into a DataFrame.
compas = pandas.read_sql('compas', engine)

In [None]:
# Preview the first rows of the `compas` DataFrame.
compas.head()

In [None]:
# List the column labels of the `compas` DataFrame.
compas.columns

In [None]:
# In this setting we are mainly interested in whether the score $a$ we have
# given individuals is fair with respect to their sensitive attribute $z$ and
# their underlying quality $y$, i.e. whether or not recidivism occurred.
#
# The query joins `people` to `compas` (person_id = people.id) and keeps the
# columns needed for that question: race (presumably $z$), is_violent_recid
# (presumably $y$), score_text (presumably $a$) and agency_text.
df = pandas.read_sql('''SELECT race,
                        is_violent_recid,
                        agency_text,
                        compas.score_text FROM people JOIN compas ON person_id = people.id''', engine)

In [None]:
# Preview the first rows of the joined query result.
df.head()

In [None]:
# For every (race, is_violent_recid) group, count how often each score_text
# value occurs.
grouped_scores = df.groupby(['race', 'is_violent_recid'])['score_text']
counts = grouped_scores.value_counts()

In [None]:
# Display the per-group score_text counts.
counts

In [None]:
# Share of each score_text level within every (race, is_violent_recid) group:
# one column per group, one row per score level. normalize=True divides each
# count by the number of non-null scores in the group.
relative_counts = pandas.DataFrame({
    group_key: group_rows['score_text'].value_counts(normalize=True)
    for group_key, group_rows in df.groupby(['race', 'is_violent_recid'])
})
relative_counts

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Bar chart of the relative score frequencies, one bar group per score level.
# The legend is positioned with bbox_to_anchor=(1, 1).
score_axes = relative_counts.plot.bar()
score_axes.legend(bbox_to_anchor=(1, 1))

In [None]:
from statsmodels.formula.api import Logit, logit, mnlogit

In [None]:
# Binary target for the regression: 1 where score_text is 'High', else 0.
# Note that this adds a column to `df` in place.
df['scored_high'] = (df.score_text == 'High').astype(int)

In [None]:
# Logistic regression of the high-score indicator on race, the violent
# recidivism flag and the agency, specified with a formula.
model = logit(
    formula='scored_high ~ race + is_violent_recid + agency_text',
    data=df,
)

In [None]:
# Estimate the parameters of the logistic regression model.
fitted_model = model.fit()

In [None]:
# Display the summary report of the fitted model.
fitted_model.summary2()

# Conditional independence

Here you should try to measure the conditional independence of your model $P$ with respect to the sensitive variable (race). In particular, we wish to calculate the dependence of the risk classification $a$ on race $z$ given the individual's recidivism outcome $y$:
$$D(P(a \mid y, z), P(a \mid y)),$$
which corresponds to the policy for selecting the scores being balanced. We also wish to calculate the dependence of recidivism $y$ on race $z$ given the risk $a$:
$$D(P(y \mid a, z), P(y \mid a)),$$
which corresponds to the policy for selecting the scores being calibrated.

Here $D$ is some appropriate distance or divergence between distributions. It is suggested to use one of:

1. Total variation distance: https://en.wikipedia.org/wiki/Total_variation_distance_of_probability_measures
2. KL divergence: https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
3. $1/2$-Rényi divergence: https://en.wikipedia.org/wiki/R%C3%A9nyi_entropy#R%C3%A9nyi_divergence

Does the policy look fair with respect to any of these measures?
