In [1]:

# imports
import os
import sys
import types
import json

# figure size/format
fig_width = 7
fig_height = 5
fig_format = 'retina'
fig_dpi = 96

# matplotlib defaults / format
try:
  import matplotlib.pyplot as plt
  plt.rcParams['figure.figsize'] = (fig_width, fig_height)
  plt.rcParams['figure.dpi'] = fig_dpi
  plt.rcParams['savefig.dpi'] = fig_dpi
  from IPython.display import set_matplotlib_formats
  set_matplotlib_formats(fig_format)
except Exception:
  pass

# plotly use connected mode
try:
  import plotly.io as pio
  pio.renderers.default = "notebook_connected"
except Exception:
  pass

# enable pandas latex repr when targeting pdfs
try:
  import pandas as pd
  if fig_format == 'pdf':
    pd.set_option('display.latex.repr', True)
except Exception:
  pass



# output kernel dependencies
kernel_deps = dict()
for module in list(sys.modules.values()):
  # Some modules play games with sys.modules (e.g. email/__init__.py
  # in the standard library), and occasionally this can cause strange
  # failures in getattr.  Just ignore anything that's not an ordinary
  # module.
  if not isinstance(module, types.ModuleType):
    continue
  path = getattr(module, "__file__", None)
  if not path:
    continue
  if path.endswith(".pyc") or path.endswith(".pyo"):
    path = path[:-1]
  if not os.path.exists(path):
    continue
  kernel_deps[path] = os.stat(path).st_mtime
print(json.dumps(kernel_deps))

# set run_path if requested
if r'/Users/philchodrow/Documents/teaching/ml-notes/source':
  os.chdir(r'/Users/philchodrow/Documents/teaching/ml-notes/source')

# reset state
%reset

def ojs_define(**kwargs):
  import json
  try:
    # IPython 7.14 preferred import
    from IPython.display import display, HTML
  except:
    from IPython.core.display import display, HTML

  # do some minor magic for convenience when handling pandas
  # dataframes
  def convert(v):
    try:
      import pandas as pd
    except ModuleNotFoundError: # don't do the magic when pandas is not available
      return v
    if type(v) == pd.Series:
      v = pd.DataFrame(v)
    if type(v) == pd.DataFrame:
      j = json.loads(v.T.to_json(orient='split'))
      return dict((k,v) for (k,v) in zip(j["index"], j["data"]))
    else:
      return v
  
  v = dict(contents=list(dict(name=key, value=convert(value)) for (key, value) in kwargs.items()))
  display(HTML('<script type="ojs-define">' + json.dumps(v) + '</script>'), metadata=dict(ojs_define = True))
globals()["ojs_define"] = ojs_define



`set_matplotlib_formats` is deprecated since IPython 7.23, directly use `matplotlib_inline.backend_inline.set_matplotlib_formats()`





In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
sns.set_style("whitegrid")
np.set_printoptions(precision = 3)
pd.set_option('display.precision', 3)

url = "https://raw.githubusercontent.com/PhilChodrow/ml-notes/main/data/compas/compas.csv"
compas = pd.read_csv(url)

In [3]:
cols = ["sex", "race", "decile_score", "two_year_recid"]
compas = compas[cols]

# using Angwin's definition
compas["predicted_high_risk"] = 1*(compas["decile_score"] >= 4)


is_white = compas["race"] == "Caucasian"
is_black = compas["race"] == "African-American"

compas = compas[is_white | is_black]
compas = compas.copy()

# excerpt of the data

compas.head()

Unnamed: 0,sex,race,decile_score,two_year_recid,predicted_high_risk
1,Male,African-American,3,1,0
2,Male,African-American,4,1,1
3,Male,African-American,8,0,1
6,Male,Caucasian,6,1,1
8,Female,Caucasian,1,0,0


In [4]:
#---
def test_independence(df, group_col, target, pred):
    return df.groupby(group_col)[pred].aggregate([np.mean, len])
#---

In [5]:
test_independence(compas, "race", "two_year_recid", "predicted_high_risk")

Unnamed: 0_level_0,mean,len
race,Unnamed: 1_level_1,Unnamed: 2_level_1
African-American,0.692,3696
Caucasian,0.464,2454


In [6]:
#---
def test_error_rate_balance(df, group_col, target, pred):
    df_ = df.copy()
    
    # columns 
    df_["positive"]       = df_[target] == 1
    df_["negative"]       = df_[target] == 0
    
    df_["true_positive"]  = df_["positive"] & (df_[pred] == 1)
    df_["false_positive"] = df_["negative"] & (df_[pred] == 1)

    counts = df_.groupby(group_col)[["positive", "negative", "true_positive", "false_positive"]].sum()

    counts["true_positive_rate"]  = counts["true_positive"] / counts["positive"]
    counts["false_positive_rate"] = counts["false_positive"] / counts["negative"]

    return counts[["true_positive_rate", "false_positive_rate"]]
#---

In [7]:
#---
test_error_rate_balance(compas, "race", "two_year_recid", "predicted_high_risk")
#---

Unnamed: 0_level_0,true_positive_rate,false_positive_rate
race,Unnamed: 1_level_1,Unnamed: 2_level_1
African-American,0.813,0.564
Caucasian,0.64,0.35
