# Generating Sanity Check Sample
This notebook generates a small sample that is suitable for sanity-checking our overall analysis dataset, which clocks in at 28GB.

In [1]:
import os
from glob import glob
from pathlib import Path

import pandas as pd

### Helper Functions 

In [8]:
def convert_md_path_to_img_path(path, base, img_root="/share/pierson/nexar_data/raw_data/imgs"):
    """Map a metadata file path to the location of its raw image.

    Parameters
    ----------
    path : str
        Metadata file path. The dataset it belongs to is decided by a plain
        substring match on 'oct' (checked first) and then 'thursdays'.
    base : str
        Image file name to look for inside the thursdays sub-directories.
        Not used for 'oct' paths.
    img_root : str, optional
        Root directory of the raw images. Defaults to the shared location
        this notebook has always used.

    Returns
    -------
    str or None
        * 'oct' paths: ``<img_root>/oct_15-nov-15/<stem of path>``. No
          extension is appended and existence is not checked.
        * 'thursdays' paths: the first ``<img_root>/thursdays/dir_<i>/``
          (i = 0..3) that contains ``base``. Only the directory, with a
          trailing slash, is returned -- not the full image path.
        * ``None`` when ``base`` is in none of the thursdays directories or
          when ``path`` matches neither dataset.

    Notes
    -----
    NOTE(review): the two branches return different kinds of value (path stem
    vs. directory only). Kept as-is because callers are not visible from here;
    confirm what they expect.
    """
    if 'oct' in path:
        return img_root + "/oct_15-nov-15/" + Path(path).stem

    if 'thursdays' in path:
        # The thursdays images are spread over dir_0 .. dir_3; probe each in order.
        # A plain existence check is enough here: there is no need to open (and
        # leak) an image handle just to find out whether the file is present.
        for shard in range(4):
            candidate_dir = f"{img_root}/thursdays/dir_{shard}/"
            if os.path.isfile(candidate_dir + base):
                return candidate_dir

    # Unknown dataset, or image not found in any thursdays directory.
    return None

### Constants 

In [6]:
# Output directory (the aggregated analysis dataset also lives here).
OUTPUT_DIR = "/share/pierson/nexar_data/nypd-deployment-patterns/output"
# On-disk location of the aggregated analysis dataset; generate it with the merge_csvs.sh script first.
AGG_ANL_DATASET_PATH = OUTPUT_DIR + "/analysis_dataset.csv"
# Confidence threshold at or above which y-hat is taken to be 1 (TRUE), i.e. the image is treated as containing a police car.
CONF_THRESHOLD = 0.77

In [3]:
# Read the full aggregated analysis dataset into memory, using the pyarrow parser engine.
d = pd.read_csv(filepath_or_buffer=AGG_ANL_DATASET_PATH, engine="pyarrow")

In [4]:
# Summary statistics of the full dataset, as a quick look at per-column counts and value ranges.
d.describe()

Unnamed: 0,timestamp,lat,lng,split,conf,has_prediction,hour,day_of_week,day_of_month,month,...,Pop_20,BCT2020,GeoID,Hsp_20P,WNH_20P,BNH_20P,ANH_20P,ONH_20P,NH2pl_20P,density_tract
count,22305610.0,22305610.0,22305610.0,22305614.0,4123000.0,4123000.0,22305610.0,22305610.0,22305610.0,22305610.0,...,22191600.0,22191600.0,22191600.0,21821680.0,21672500.0,21748680.0,21384100.0,21337460.0,21830150.0,22191600.0
mean,1601069000000.0,22.29237,-55.503,1.0,0.1289979,1.0,14.36835,3.022419,14.01806,9.376895,...,4303.275,2543985.0,36054760000.0,25.7739,35.81166,18.5474,15.16012,1.620102,4.416331,59179.54
std,6466758000.0,42.12954,42.10516,0.0,0.2450485,0.0,6.006037,1.796533,8.554541,2.38261,...,2440.08,1187164.0,21430490.0,20.2486,27.06585,22.02066,15.3721,2.286612,4.807848,39802.01
min,1583366000000.0,-74.0392,-74.25456,1.0,0.0100098,1.0,0.0,0.0,1.0,3.0,...,0.0,1000201.0,36005000000.0,0.6,0.2,0.0,0.0,0.0,0.2,0.0
25%,1602252000000.0,40.64362,-73.97386,1.0,0.0152664,1.0,10.0,2.0,6.0,10.0,...,2645.0,1016200.0,36047030000.0,10.2,8.3,2.8,4.6,0.8,2.4,29269.43
50%,1603321000000.0,40.71472,-73.92599,1.0,0.0276184,1.0,16.0,3.0,12.0,10.0,...,4001.0,3016700.0,36061000000.0,17.7,33.0,7.5,10.0,1.1,3.9,54645.36
75%,1604358000000.0,40.76012,-73.83849,1.0,0.0796509,1.0,19.0,4.0,21.0,11.0,...,5766.0,3110600.0,36061020000.0,36.4,61.2,29.5,20.8,1.5,5.1,83689.48
max,1605503000000.0,40.91494,40.75177,1.0,0.98291,1.0,23.0,6.0,31.0,11.0,...,17222.0,5032300.0,36085030000.0,100.0,100.0,90.9,90.6,50.0,100.0,287831.6


### Recipe 

Randomly sample 100 rows from the overall dataset: 50 rows that have police cars (`conf >= CONF_THRESHOLD`) and 50 rows that do not. We also need to retrieve the 100 corresponding raw images.

In [7]:
# Split on the confidence threshold. Rows whose conf is missing satisfy neither
# comparison, so they end up in neither pool.
is_police = d.conf >= CONF_THRESHOLD
is_not_police = d.conf < CONF_THRESHOLD

# Draw 50 rows from each pool with a fixed seed so the sample is reproducible.
police_sample = d.loc[is_police].sample(n=50, random_state=8918)
no_police_sample = d.loc[is_not_police].sample(n=50, random_state=8918)

# Positives first, then negatives; original row indices are kept.
sanity_check_d = pd.concat([police_sample, no_police_sample])

In [9]:
# Summary statistics of the 100-row sample, for comparison against the full-dataset summary.
sanity_check_d.describe()

Unnamed: 0,timestamp,lat,lng,split,conf,has_prediction,hour,day_of_week,day_of_month,month,...,Pop_20,BCT2020,GeoID,Hsp_20P,WNH_20P,BNH_20P,ANH_20P,ONH_20P,NH2pl_20P,density_tract
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0,99.0
mean,1601124000000.0,24.678079,-57.892171,1.0,0.480197,1.0,14.14,3.26,12.7,9.44,...,4852.161616,2318783.0,36054180000.0,23.362626,38.039394,18.009091,14.850505,1.470707,4.282828,72305.87585
std,6723519000.0,40.003273,39.986936,0.0,0.431688,0.0,6.068664,1.867424,8.405746,2.495734,...,2209.252633,1190024.0,20239960.0,19.710034,26.273366,20.70423,12.548952,2.210931,2.100974,41292.620836
min,1583370000000.0,-73.999293,-74.119296,1.0,0.0103,1.0,0.0,0.0,1.0,3.0,...,27.0,1000700.0,36005000000.0,1.2,0.6,1.0,0.1,0.2,1.0,93.868117
25%,1602324000000.0,40.662309,-73.976293,1.0,0.020096,1.0,9.0,2.0,5.75,10.0,...,3439.0,1010950.0,36047030000.0,9.4,16.4,2.75,5.2,0.8,3.05,43090.326753
50%,1603428000000.0,40.72627,-73.944943,1.0,0.575317,1.0,16.0,3.0,12.0,10.0,...,4709.0,3000700.0,36061010000.0,15.6,34.5,9.2,12.8,1.1,4.2,71138.388601
75%,1604509000000.0,40.767751,-73.884354,1.0,0.923706,1.0,19.0,5.0,16.0,11.0,...,6359.5,3066800.0,36061020000.0,29.05,62.1,29.2,22.75,1.4,5.4,101895.480633
max,1605475000000.0,40.876707,40.747012,1.0,0.97168,1.0,23.0,6.0,31.0,11.0,...,10542.0,5013201.0,36085010000.0,81.5,92.2,81.4,53.7,20.4,14.8,204162.548743


In [None]:
# Display the sampled rows for manual inspection.
sanity_check_d