# Bootstrapping Results. 

In [2]:
import pandas as pd
import datetime
import statsmodels.api as sm
from statsmodels.stats.weightstats import DescrStatsW
from scipy.stats import pearsonr, spearmanr
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import json
from tqdm import tqdm


In [3]:
# ---- Input locations ----
VALSET_PATH = '/share/pierson/nexar_data/dashcam-analysis/final_model_metrics/valset_2.csv'
# Template for the per-chunk CSVs; '%i' is filled with the chunk index.
BASE_CHUNKS_PATH = '/share/pierson/nexar_data/FINAL_CHUNKS_ETHNICITY_DATA/%i.csv'
N_CHUNKS = 20

# ---- Row filtering ----
# Rows sharing all of these columns are treated as duplicates.
COLS_TO_DEDUPLICATE_ON = ['lat', 'lng', 'timestamp']
# Data before this date is not used to analyze disparities / demographics.
MIN_DATE_FOR_DEMOGRAPHIC_ANALYSIS = datetime.datetime(2020, 10, 5)
# Threshold to define a positive prediction.
POSITIVE_CLASSIFICATION_THRESHOLD = 0.77

# ---- Census columns ----
# Name of the column used for location grouping, e.g. the one corresponding
# to Census Block Group or Census tract.
LOCATION_COL_TO_GROUP_ON = 'NAME'
# Needs to match whether Census tract or Block Group is being used.
TOTAL_POPULATION_COL = 'Estimate_Total'
WHITE_POPULATION_COL = 'Estimate_Total_Not_Hispanic_or_Latino_White_alone'
BLACK_POPULATION_COL = 'Estimate_Total_Not_Hispanic_or_Latino_Black_or_African_American_alone'
HISPANIC_POPULATION_COL = 'Estimate_Total_Hispanic_or_Latino'
ASIAN_POPULATION_COL = 'Estimate_Total_Not_Hispanic_or_Latino_Asian_alone'
POPULATION_COUNT_COLS = [
    WHITE_POPULATION_COL,
    BLACK_POPULATION_COL,
    HISPANIC_POPULATION_COL,
    ASIAN_POPULATION_COL,
    TOTAL_POPULATION_COL,
]

# ---- Analysis columns ----
TIME_AND_DATE_COL = 'time_and_date_of_image'
# Variables we want to look at correlations with ("demographic" may not be
# the best name for all of them).
DEMOGRAPHIC_COLS = [
    'density_cbg',
    'black_frac',
    'white_frac',
    'distance_from_nearest_crime_6hr',
    'distance_from_nearest_police_station',
    'median_household_income',
]
# Columns with police car predictions; per the original note these are
# defined by this analysis (presumably later in the notebook).
PREDICTION_COLS = [
    'above_threshold',
    'calibrated_prediction',
    'prediction_adjusted_for_police_station_distance',
]
MIN_POPULATION_IN_AREA = 500
BOROUGH_COL = 'boroname'
NEIGHBORHOOD_COL = 'ntaname'

# ---- Bootstrap / zoning settings ----
N_BOOTSTRAPS = 20
# Cutoff compared against the C / M / R columns of the CBG zone data when
# assigning a zone label.
ZONE_THRESHOLD = 0.5


In [4]:
def _read_chunk(chunk_idx):
    """Read one chunk CSV, report its row count, and return it as a DataFrame."""
    chunk_df = pd.read_csv(BASE_CHUNKS_PATH % chunk_idx)
    print('Read in chunk %i with %i rows' % (chunk_idx, len(chunk_df)))
    return chunk_df


# Load all N_CHUNKS chunk files in index order and stack them into one frame.
# Each chunk keeps its own row index, so index labels can repeat across chunks.
d = pd.concat([_read_chunk(chunk_idx) for chunk_idx in range(N_CHUNKS)])
#d.iloc[0][[a for a in d.columns if 'Margin of Error' not in a and 'Two races' not in a]] # just print out what dataframe looks like

Read in chunk 0 with 1115281 rows


KeyboardInterrupt: 

In [None]:
# Drop duplicate rows: a row is a duplicate when an earlier row has the same
# values in every column of COLS_TO_DEDUPLICATE_ON (the first one is kept).
duplicate_idxs = d.duplicated(subset=COLS_TO_DEDUPLICATE_ON)
n_duplicates = duplicate_idxs.sum()
duplicate_fraction = duplicate_idxs.mean()
print("warning: %i duplicates identified using %s, fraction %2.6f of rows; dropping rows" % (
    n_duplicates, COLS_TO_DEDUPLICATE_ON, duplicate_fraction))
is_first_occurrence = ~duplicate_idxs
d = d.loc[is_first_occurrence].copy()

# Attach a zoning label ('C', 'M' or 'R') to every image row via its GEOID20.
# NOTE(review): C / M / R presumably stand for commercial / manufacturing /
# residential zoning shares per CBG -- confirm against the zone data source.
cbg_zone_data = pd.read_csv('/share/pierson/nexar_data/5_other_datasets/cbgs_zone_data.csv')

# Sanity check: no CBG may qualify for more than one zone, and at least one CBG
# must qualify for a zone. This uses the same `>=` comparison as the labelling
# loop below; a strict `>` here would let a CBG sitting exactly on the
# threshold in two zones slip through and silently get the last label assigned.
n_zones_at_or_above_threshold = (1.*(cbg_zone_data['C'] >= ZONE_THRESHOLD)
                                 + 1.*(cbg_zone_data['M'] >= ZONE_THRESHOLD)
                                 + 1.*(cbg_zone_data['R'] >= ZONE_THRESHOLD))
assert n_zones_at_or_above_threshold.max() == 1, \
    "expected every CBG to reach ZONE_THRESHOLD in at most one zone (and at least one CBG to reach it)"

# Build the GEOID20 -> zone label lookup, one zone at a time.
cbg_zone_dict = {}
for zone_val in ['C', 'M', 'R']:
    zones = cbg_zone_data.loc[cbg_zone_data[zone_val] >= ZONE_THRESHOLD]
    print("%i CBGs classified as %s" % (len(zones), zone_val))
    cbg_zone_dict.update(dict.fromkeys(zones['GEOID20'].values, zone_val))
print(len(cbg_zone_dict))

# Rows whose GEOID20 has no zone label get None.
d['zone'] = d['GEOID20'].map(cbg_zone_dict.get)
print("zone classification of images")
print(d['zone'].value_counts(dropna=False))

def household_income_map(x):
    """Convert a raw median-household-income entry to a number.

    Three special codes are handled explicitly: '-' becomes None,
    '250,000+' becomes 250000 and '2,500-' becomes 2500. Any other value is
    passed to float().
    """
    special_codes = {
        '-': None,
        '250,000+': 250000,
        '2,500-': 2500,
    }
    if x in special_codes:
        return special_codes[x]
    return float(x)

# Derive Census-based variables: numeric median household income and the
# white / Black share of each area's total population.
d['median_household_income'] = d['median_household_income'].map(household_income_map)

total_population = d[TOTAL_POPULATION_COL]
fraction_sources = {
    'white_frac': WHITE_POPULATION_COL,
    'black_frac': BLACK_POPULATION_COL,
}
for frac_col, count_col in fraction_sources.items():
    d[frac_col] = d[count_col] / total_population

# Every non-missing share must lie in [0, 1].
for frac_col in fraction_sources:
    observed_fracs = d[frac_col].dropna()
    assert observed_fracs.max() <= 1
    assert observed_fracs.min() >= 0


# Derive the calendar date of each image and count how many distinct
# locations were observed on each date.
def _date_of_image(time_and_date_string):
    """Parse the leading 'YYYY-MM-DD' token of a timestamp string into a datetime."""
    date_token = time_and_date_string.split()[0]
    return datetime.datetime.strptime(date_token, '%Y-%m-%d')


d['date'] = d[TIME_AND_DATE_COL].map(_date_of_image)
rows_by_date = d.groupby('date')
locations_by_date = rows_by_date[LOCATION_COL_TO_GROUP_ON].nunique()
print('unique locations by', locations_by_date)

# Keep only rows dated on or after MIN_DATE_FOR_DEMOGRAPHIC_ANALYSIS; the
# result is an independent copy so later edits do not touch `d`.
print("In demographic analysis, filtering for locations after %s because more geographically representative" % MIN_DATE_FOR_DEMOGRAPHIC_ANALYSIS)
on_or_after_min_date = d['date'] >= MIN_DATE_FOR_DEMOGRAPHIC_ANALYSIS
d_for_demo_analysis = d.loc[on_or_after_min_date].copy()
n_rows_kept, n_rows_total = len(d_for_demo_analysis), len(d)
print("%i/%i rows remaining" % (n_rows_kept, n_rows_total))

# Replace missing population counts with 0, reporting for each column the
# fraction of rows affected. POPULATION_COUNT_COLS lists the white, Black,
# Hispanic, Asian and total population columns, in that order.
for col in POPULATION_COUNT_COLS:
    is_missing = d_for_demo_analysis[col].isnull()
    print("Setting fraction %2.6f of rows with %s = NA to 0" % (is_missing.mean(), col))
    d_for_demo_analysis.loc[is_missing, col] = 0