# Generating Sanity Check Sample
This notebook generates a small sample for sanity-checking our overall analysis dataset (the full dataset is 28GB on disk).

In [21]:
import os
import warnings
from glob import glob
from pathlib import Path

import pandas as pd
from PIL import Image

### Helper Functions 

In [28]:
def convert_md_path_to_img_path(path, image_ref, output_dir,
                                raw_img_root="/share/pierson/nexar_data/raw_data/imgs"):
    """Locate the raw image for a metadata row and save a copy into ``output_dir``.

    Parameters
    ----------
    path : str
        Metadata path for the row. If it contains the substring ``'processed'``
        the image is searched for in ``thursdays/dir_0`` .. ``thursdays/dir_3``;
        otherwise it is read from ``oct_15-nov-15/<stem of path>/``.
    image_ref : str
        Reference to the image; only its basename is used.
    output_dir : str
        Directory the image is saved into (created if it does not exist).
    raw_img_root : str, optional
        Root directory of the raw images. Defaults to the shared location
        this notebook has always used.

    Returns
    -------
    str or None
        * ``'processed'`` rows: the shard *directory* (with trailing slash) in
          which the image was found, or ``None`` (after a warning) if no shard
          contains it. Note this is the directory, not the file path; that
          return value is kept as-is for backward compatibility.
        * all other rows: the full path of the source image.

    Raises
    ------
    FileNotFoundError
        For non-``'processed'`` rows whose source image does not exist.
    """
    base = os.path.basename(image_ref)

    def _save_copy(src_path):
        # Create the destination first so that a missing output directory is
        # never mistaken for a missing source image.
        os.makedirs(output_dir, exist_ok=True)
        # Context manager closes the underlying file handle once saved.
        with Image.open(src_path) as img:
            img.save(f"{output_dir}/{base}")

    if 'processed' in path:
        for shard in range(4):
            shard_dir = f"{raw_img_root}/thursdays/dir_{shard}/"
            # Check existence separately from the copy so that only a genuinely
            # absent source moves us on to the next shard.
            if not os.path.isfile(shard_dir + base):
                continue
            _save_copy(shard_dir + base)
            return shard_dir
        # Best-effort, as before: report the miss instead of failing silently.
        warnings.warn(
            f"Image {base} not found in any thursdays/dir_* under {raw_img_root}"
        )
        return None

    src_path = f"{raw_img_root}/oct_15-nov-15/{Path(path).stem}/{base}"
    _save_copy(src_path)
    return src_path

### Constants 

In [32]:
# Directory holding the notebook's inputs (other than raw images) and outputs.
OUTPUT_DIR = "/share/pierson/nexar_data/nypd-deployment-patterns/output"
# Where the sampled raw images are copied to.
IMG_OUTPUT_DIR = "/share/pierson/nexar_data/nypd-deployment-patterns/output/sanity_check_imgs"

# On-disk location of the aggregated analysis dataset (produce it with the merge_csvs.sh script).
AGG_ANL_DATASET_PATH = "/share/pierson/nexar_data/nypd-deployment-patterns/output/analysis_dataset.csv"

# Confidence cut-off for y-hat = 1 (TRUE): an image is said to contain a police car
# when its conf reaches this threshold (the sampling cell tests conf >= CONF_THRESHOLD).
CONF_THRESHOLD = 0.77

In [38]:
# Load the full aggregated analysis dataset, parsing the CSV with the pyarrow engine.
d = pd.read_csv(
    AGG_ANL_DATASET_PATH,
    engine="pyarrow",
)

In [5]:
# Summary statistics of the full analysis dataset.
# NOTE(review): in the saved output, lat has min ~ -74.04 and lng has max ~ 40.75,
# which looks like some rows have lat/lng swapped -- confirm upstream.
d.describe()

Unnamed: 0,timestamp,lat,lng,split,conf,has_prediction,hour,day_of_week,day_of_month,month,...,Pop_20,BCT2020,GeoID,Hsp_20P,WNH_20P,BNH_20P,ANH_20P,ONH_20P,NH2pl_20P,density_tract
count,22305610.0,22305610.0,22305610.0,22305614.0,4123000.0,4123000.0,22305610.0,22305610.0,22305610.0,22305610.0,...,22191600.0,22191600.0,22191600.0,21821680.0,21672500.0,21748680.0,21384100.0,21337460.0,21830150.0,22191600.0
mean,1601069000000.0,22.29237,-55.503,1.0,0.1289979,1.0,14.36835,3.022419,14.01806,9.376895,...,4303.275,2543985.0,36054760000.0,25.7739,35.81166,18.5474,15.16012,1.620102,4.416331,59179.54
std,6466758000.0,42.12954,42.10516,0.0,0.2450485,0.0,6.006037,1.796533,8.554541,2.38261,...,2440.08,1187164.0,21430490.0,20.2486,27.06585,22.02066,15.3721,2.286612,4.807848,39802.01
min,1583366000000.0,-74.0392,-74.25456,1.0,0.0100098,1.0,0.0,0.0,1.0,3.0,...,0.0,1000201.0,36005000000.0,0.6,0.2,0.0,0.0,0.0,0.2,0.0
25%,1602252000000.0,40.64362,-73.97386,1.0,0.0152664,1.0,10.0,2.0,6.0,10.0,...,2645.0,1016200.0,36047030000.0,10.2,8.3,2.8,4.6,0.8,2.4,29269.43
50%,1603321000000.0,40.71472,-73.92599,1.0,0.0276184,1.0,16.0,3.0,12.0,10.0,...,4001.0,3016700.0,36061000000.0,17.7,33.0,7.5,10.0,1.1,3.9,54645.36
75%,1604358000000.0,40.76012,-73.83849,1.0,0.0796509,1.0,19.0,4.0,21.0,11.0,...,5766.0,3110600.0,36061020000.0,36.4,61.2,29.5,20.8,1.5,5.1,83689.48
max,1605503000000.0,40.91494,40.75177,1.0,0.98291,1.0,23.0,6.0,31.0,11.0,...,17222.0,5032300.0,36085030000.0,100.0,100.0,90.9,90.6,50.0,100.0,287831.6


### Recipe 

Randomly sample 100 rows from the overall dataset: 50 rows predicted to contain a police car (`conf >= CONF_THRESHOLD`) and 50 rows predicted not to (`conf < CONF_THRESHOLD`). We also need to retrieve the raw images for these 100 rows.

In [39]:
# Stratified draw: 50 rows at or above the confidence threshold followed by
# 50 rows below it, each sampled with the same fixed seed for reproducibility.
strata_masks = (d.conf >= CONF_THRESHOLD, d.conf < CONF_THRESHOLD)
sanity_check_d = pd.concat(
    [d.loc[mask].sample(n=50, random_state=8918) for mask in strata_masks]
)

In [40]:
# Summary statistics of the sampled rows, for comparison with the full-dataset summary.
sanity_check_d.describe()

Unnamed: 0,timestamp,lat,lng,split,conf,has_prediction,hour,day_of_week,day_of_month,month,...,Pop_20,BCT2020,GeoID,Hsp_20P,WNH_20P,BNH_20P,ANH_20P,ONH_20P,NH2pl_20P,density_tract
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,99.0,99.0,99.0,98.0,98.0,99.0,100.0
mean,1601568000000.0,40.731372,-73.944392,1.0,0.476714,1.0,13.92,3.16,14.8,9.54,...,4897.46,2175145.0,36054970000.0,24.074747,38.387879,16.50202,15.843878,1.573469,3.79596,66262.601924
std,6390437000.0,0.059666,0.058076,0.0,0.424249,0.0,6.229094,1.739151,9.415225,2.34163,...,2402.493685,1207467.0,20822430.0,19.839984,27.882027,20.632414,14.755366,1.819853,1.60127,35572.917727
min,1583370000000.0,40.585536,-74.021502,1.0,0.01017,1.0,0.0,0.0,1.0,3.0,...,0.0,1001300.0,36005000000.0,4.2,0.7,0.7,0.1,0.3,1.0,0.0
25%,1602543000000.0,40.700886,-73.991245,1.0,0.026779,1.0,9.0,2.0,7.0,10.0,...,3238.25,1008700.0,36047050000.0,9.45,7.6,2.2,5.025,0.8,2.55,40413.988824
50%,1603862000000.0,40.725232,-73.956708,1.0,0.754882,1.0,15.0,3.0,13.0,10.0,...,4645.5,2008400.0,36061000000.0,15.2,38.5,7.0,11.35,1.05,3.7,66653.377577
75%,1604524000000.0,40.75842,-73.919714,1.0,0.901611,1.0,18.25,4.0,23.0,11.0,...,6520.25,3059925.0,36061010000.0,33.65,64.5,22.05,22.475,1.575,5.1,91998.330247
max,1605450000000.0,40.8945,-73.786004,1.0,0.96582,1.0,23.0,6.0,31.0,11.0,...,11556.0,4094203.0,36081090000.0,84.5,88.1,80.4,71.1,14.1,7.2,163643.146522


In [41]:
# Persist the sampled rows (index included) alongside the other outputs.
sanity_check_csv_path = f"{OUTPUT_DIR}/sanity_check_dataset.csv"
sanity_check_d.to_csv(sanity_check_csv_path)

### Pulling Images

In [42]:
# Load md2img.csv from the output directory; its columns are merged into the sample below.
md2img_csv_path = f"{OUTPUT_DIR}/md2img.csv"
md2img = pd.read_csv(md2img_csv_path, engine="pyarrow")

In [43]:
# Left-join the md2img columns onto the sample; with no explicit key, pandas
# joins on every column name the two frames share.
sanity_check_d = pd.merge(sanity_check_d, md2img, how="left")

In [46]:
# Copy every sampled image into IMG_OUTPUT_DIR, printing what the helper returns for each row.
for parent_dir, image_ref in zip(sanity_check_d['parent_dir'], sanity_check_d['image_ref']):
    located = convert_md_path_to_img_path(parent_dir, image_ref, IMG_OUTPUT_DIR)
    print(located)

/share/pierson/nexar_data/raw_data/imgs/thursdays/dir_3/
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1603080000000/fod_56797529-BC96-418C-AF74-5C09AD52F394.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1602820800000/fod_20201016_204854_7109426e-7dd0-4a4d-ac0a-fde7c830a8b5.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1602302400000/fod_20201010_152740_176a5ad4-41e4-40f8-9dec-dea7d4055c2d.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1604466000000/fod_EF32485C-3185-4AEB-9FFD-B582C30F5E33.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1604898000000/fod_5A209D25-B825-4569-B374-66F2D61F1CF8.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1602561600000/fod_3767E25A-B65E-469C-9E7E-7A0DEF36777C.jpg
/share/pierson/nexar_data/raw_data/imgs/thursdays/dir_1/
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1603598400000/fod_20201025_080221_a470da64-cab9-4a9e-9b18-ce79317d311b.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-

/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1602734400000/fod_C5318F06-5B11-4C06-8AEA-20DFDD68ECCC.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1605243600000/fod_20201113_075942_d8dec60d-d35c-42ec-b68e-643a84da5199.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1603857600000/fod_843A6308-FC7C-4119-B06E-42746A9C4750.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1603512000000/fod_20201024_161826_c27bc3dd-aa39-4e2b-97ba-a8ab9d610730.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1604116800000/fod_312604E8-A2CE-44D3-8A4A-7F6EE7AA868F.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1604203200000/fod_9A7AC469-54D5-4A04-B9D3-D8722402D1F4.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1603425600000/fod_20201023_220417_fb4eb105-ab7b-458b-a3ed-31f1a97e1498.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-nov-15/1604552400000/fod_B411D921-90BE-424F-8630-B89EF6778505.jpg
/share/pierson/nexar_data/raw_data/imgs/oct_15-n

In [47]:
# Verify that every sampled row's image is actually present in IMG_OUTPUT_DIR.
# Checking by file name (instead of counting every *.jpg in the directory) keeps
# stale images from earlier runs from causing false failures or from masking
# images that are genuinely missing.
expected_imgs = {os.path.basename(ref) for ref in sanity_check_d['image_ref']}
missing_imgs = sorted(
    name for name in expected_imgs
    if not os.path.isfile(os.path.join(IMG_OUTPUT_DIR, name))
)
assert not missing_imgs, (
    f"{len(missing_imgs)} of {len(expected_imgs)} sampled images are missing "
    f"from {IMG_OUTPUT_DIR}: {missing_imgs[:5]}"
)