In [None]:
import os
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from etl import get_latest_cached


# path variables
root = os.path.join(os.getenv("REPO_ROOT"), "src")
%cd $root
sys.path.insert(0, root)

# plotting
%matplotlib inline
sns.set_style("whitegrid")

# Ingestion

In [None]:
# Load the frame returned by the project's etl helper.
df = get_latest_cached()

# Preview: first rows as a rich table, then the dimensions and column labels.
display(df.head())
print(df.shape, df.columns, sep="\n")

# Sanity checks

## Internal descriptions

In [None]:
# Ten most frequent PD_DESC values (value_counts sorts by descending frequency).
df["PD_DESC"].value_counts().head(10)

## Transit crimes

In [None]:
# Frequency of each TRANSIT_DISTRICT value.
# NOTE(review): presumably tied to NYPD transit enforcement -- confirm against the data dictionary.
df["TRANSIT_DISTRICT"].value_counts()

In [None]:
# Number of rows with a non-null TRANSIT_DISTRICT.
df["TRANSIT_DISTRICT"].count()

In [None]:
# Fraction of all rows that carry a non-null TRANSIT_DISTRICT.
df["TRANSIT_DISTRICT"].count() / len(df)

In [None]:
# Frequency of each STATION_NAME value.
df["STATION_NAME"].value_counts()

In [None]:
# Number of rows with a non-null STATION_NAME.
df["STATION_NAME"].count()

In [None]:
# Fraction of all rows that carry a non-null STATION_NAME.
df["STATION_NAME"].count() / len(df)

In [None]:
# True when STATION_NAME and TRANSIT_DISTRICT are missing on exactly the same rows.
df["STATION_NAME"].isna().eq(df["TRANSIT_DISTRICT"].isna()).all()

## Jurisdictions

In [None]:
# Frequency of each JURIS_DESC value.
df["JURIS_DESC"].value_counts()

In [None]:
# Rows under "N.Y. TRANSIT POLICE" jurisdiction that have no TRANSIT_DISTRICT.
df.loc[
    df["JURIS_DESC"].eq("N.Y. TRANSIT POLICE")
    & df["TRANSIT_DISTRICT"].isna()
]

In [None]:
# Rows outside "N.Y. TRANSIT POLICE" jurisdiction that still have a STATION_NAME.
df.loc[
    df["JURIS_DESC"].ne("N.Y. TRANSIT POLICE")
    & df["STATION_NAME"].notna()
]

## What happens in train stations?

In [None]:
# Share of every OFNS_DESC value relative to the total number of rows
# (rows with a missing OFNS_DESC still count in the denominator).
df["OFNS_DESC"].value_counts().div(len(df))

In [None]:
# Same offense breakdown, restricted to rows that have a STATION_NAME;
# the denominator is the number of such rows.
flt = df["STATION_NAME"].notna()
df.loc[flt, "OFNS_DESC"].value_counts().div(flt.sum())

## Breakdown by race

In [None]:
# Count rows with a non-null OFNS_DESC for every (SUSP_RACE, VIC_RACE) pair,
# then pivot SUSP_RACE into the columns so that rows are indexed by VIC_RACE.
cross_table = (
    df
    .groupby(['SUSP_RACE', 'VIC_RACE'])
    ['OFNS_DESC']
    .count()
    .unstack(level=0)
)
# Row-normalize: each VIC_RACE row is divided by its own total, so every row
# becomes a distribution over SUSP_RACE.
cross_table = cross_table.divide(cross_table.sum(axis=1), axis=0)

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
sns.heatmap(
    cross_table,
    cmap='RdBu_r',
    ax=ax,
    square=True,
    vmin=0,
    vmax=0.666,  # fixed upper bound of the color scale
    annot=True,
    fmt='.3f',
)
# State the normalization direction on the figure itself so it can be read
# without the code; the trailing semicolon suppresses the repr output.
ax.set(
    title="Suspect race share within each victim race (rows sum to 1)",
    xlabel="SUSP_RACE",
    ylabel="VIC_RACE",
);

In [None]:
# Offense counts split by whether VIC_RACE equals "ASIAN / PACIFIC ISLANDER";
# rows with any other (or missing) VIC_RACE fall into the False group.
anti_asian = (
    df.groupby(df["VIC_RACE"].eq("ASIAN / PACIFIC ISLANDER"))["OFNS_DESC"]
    .value_counts()
    .unstack(level=0)
)

# Turn each group's counts into shares of that group's total.
anti_asian = anti_asian.div(anti_asian.sum(axis=0), axis=1)
anti_asian.columns = anti_asian.columns.map({True: "AAPI", False: "Other"})

# Absolute and relative difference between the two offense mixes.
anti_asian = anti_asian.assign(
    abs_gap=anti_asian["AAPI"].sub(anti_asian["Other"]),
    rel_gap=anti_asian["AAPI"].div(anti_asian["Other"]),
)

# Offenses most over-represented among the AAPI group, by absolute gap.
anti_asian.sort_values(by="abs_gap", ascending=False).head(10)

# Brainstorm

* There is sufficient time coverage (history) and space coverage (all of NY), plus a modest feature set (attributes of complaints), to suggest that we can make informed analyses.
* Questions: 
    - Before/after COVID-19?
    - Victim demographics (age, socioeconomic) and district demographics?
    - How do complaint ratios compare to arrest / charge ratios?
    - Have things been getting better or worse over time (after normalizing for population growth)?
* Next steps:
    - Status of BJS master files?