In [82]:
import pandas as pd 
import geopandas as gpd 
import numpy as np 
import json 
from glob import glob 

import sys 
# Add the parent directory to the import path so the `logger` module imported
# on the next line can be found (presumably logger.py lives one level up —
# verify against the repo layout).
sys.path.append("../")
from logger import setup_logger
# Logger dedicated to this notebook; level INFO so the progress messages
# emitted by the cells below are shown.
logger = setup_logger("analysis-df-assembly")
logger.setLevel("INFO")

import os 

logger.info("Modules loaded.")



[34m2024-10-22 19:50:26 - analysis-df-assembly - INFO - Modules loaded.[0m


In [83]:
# Output directories of the two ICAR model runs whose `estimate*.csv` files are
# globbed and loaded below. Paths are relative to the notebook's working
# directory. The last path component looks like a YYYYMMDD-HHMM run
# timestamp — TODO confirm.
ICAR_NONE_RUN='../runs/icar_none/simulated_False/ahl_True/20241021-1038'
ICAR_CHEATING_RUN='../runs/icar_cheating/simulated_False/ahl_True/20241022-1130'

In [84]:
# Collect the estimate CSVs written by each run. glob() gives no ordering
# guarantee (it depends on the file system), so sort the matches to keep the
# load order -- and the insertion order of the dicts built from these lists --
# reproducible across machines and re-runs.
ICAR_NONE_ESTIMATES = sorted(glob(f"{ICAR_NONE_RUN}/estimate*.csv"))
ICAR_CHEATING_ESTIMATES = sorted(glob(f"{ICAR_CHEATING_RUN}/estimate*.csv"))
# Report the counts so a missing or partially written run is visible immediately.
logger.info(f"Found {len(ICAR_NONE_ESTIMATES)} ICAR_NONE estimates and {len(ICAR_CHEATING_ESTIMATES)} ICAR_CHEATING estimates.")

[34m2024-10-22 19:50:26 - analysis-df-assembly - INFO - Found 2 ICAR_NONE estimates and 3 ICAR_CHEATING estimates.[0m


In [85]:
# Read each ICAR_CHEATING estimate CSV into a DataFrame and store it under the
# file's base name without extension (e.g. ".../estimate_p_y.csv" -> "estimate_p_y").
icar_cheating_estimates = {}
for f in ICAR_CHEATING_ESTIMATES:
    estimate_name = os.path.splitext(os.path.basename(f))[0]
    df = pd.read_csv(f).assign(
        # Cast via int, then to str, so tract ids become plain digit strings
        # (presumably to match the string GEOID used as the merge key later —
        # TODO confirm GEOID dtype).
        tract_id=lambda frame: frame['tract_id'].astype(int).astype(str)
    )
    icar_cheating_estimates[estimate_name] = df


In [86]:
# Echo the loaded dict (estimate name -> DataFrame) as a visual sanity check.
# NOTE(review): this prints the repr of every frame in full; showing the keys
# and each frame's .head() would keep the saved output much smaller.
icar_cheating_estimates

{'estimate_at_least_one_positive_image_by_area':          tract_id  empirical_estimate  at_least_one_positive_image_by_area  \
 0     36061000100                 NaN                             0.000000   
 1     36061000201            0.000000                             0.548568   
 2     36061000600            0.002367                             1.000000   
 3     36061001401            0.000000                             0.352224   
 4     36061001402            0.000000                             0.758596   
 ...           ...                 ...                                  ...   
 2322  36085017600            0.000000                             0.183523   
 2323  36085022802            0.004902                             0.409292   
 2324  36085029102            0.000000                             0.901318   
 2325  36005016100            0.000000                             0.472381   
 2326  36005016300            0.008000                             0.273413   
 
  

In [87]:
# Read each ICAR_NONE estimate CSV into a DataFrame and store it under the
# file's base name without extension (e.g. ".../estimate_p_y.csv" -> "estimate_p_y"),
# mirroring how the ICAR_CHEATING estimates are keyed.
icar_none_estimates = {}
for f in ICAR_NONE_ESTIMATES:
    df = pd.read_csv(f)
    # Cast via int, then to str, so tract ids become plain digit strings.
    df['tract_id'] = df['tract_id'].astype(int).astype(str)
    # The [0] must index the (root, ext) tuple returned by splitext(). Indexing
    # the basename instead took its first character and used the whole tuple as
    # the key, so every file collapsed onto the same entry and name lookups
    # such as ['estimate_p_y'] could never succeed.
    icar_none_estimates[os.path.splitext(os.path.basename(f))[0]] = df
    

In [88]:
# Switch between the two sets of loaded estimates: True selects the
# ICAR_CHEATING run (logged as "smoothed"), False the ICAR_NONE run
# (logged as "unsmoothed").
USE_SMOOTHING = True
icar_model_estimates = icar_cheating_estimates if USE_SMOOTHING else icar_none_estimates
logger.info("Using smoothed estimates." if USE_SMOOTHING else "Using unsmoothed estimates.")

[34m2024-10-22 19:50:26 - analysis-df-assembly - INFO - Using smoothed estimates.[0m


In [89]:
# Load the NYC 2020 census-tract geometries as a GeoDataFrame. The path is
# relative to the notebook's working directory.
# NOTE(review): the "-wi-" in the file name presumably marks the variant that
# is not clipped to the shoreline (the other file loaded in this notebook is
# logged as "water clipped") — confirm.
ct_nyc = gpd.read_file('geo/data/ct-nyc-wi-2020.geojson')
# Log the tract count; it is worth comparing against the row count of the
# estimate tables that get merged onto this frame.
logger.info(f"Loaded NYC CT shapefile with {len(ct_nyc.index)} CTs.")

[34m2024-10-22 19:50:26 - analysis-df-assembly - INFO - Loaded NYC CT shapefile with 2325 CTs.[0m


In [90]:
# Load a second set of NYC 2020 census-tract geometries, described by the log
# message below as the water-clipped version. The path is relative to the
# notebook's working directory.
ct_nyc_clip = gpd.read_file('geo/data/ct-nyc-2020.geojson')
# Log the tract count for comparison with the other tract file.
logger.info(f"Loaded NYC CT (water clipped) shapefile with {len(ct_nyc_clip.index)} CTs.")

[34m2024-10-22 19:50:26 - analysis-df-assembly - INFO - Loaded NYC CT (water clipped) shapefile with 2327 CTs.[0m


In [91]:
# Attach the three model-estimate tables to the tract geometries, matching the
# geometry column GEOID against each table's tract_id.
# - No `how=` is passed, so pandas performs its default inner join: tracts
#   without a matching tract_id in an estimate table are dropped from ct_nyc.
# - ct_nyc is reassigned on every line, so re-running this cell without first
#   reloading ct_nyc merges onto the already-merged frame.
# NOTE(review): pandas applies `suffixes` only to column names present on both
# sides of a given merge, so which columns end up carrying '_ct', '_p_y',
# '_p_alop' or '_p_alop_100' depends on the merge order here — inspect
# ct_nyc.columns to confirm the final names before relying on them.
ct_nyc = ct_nyc.merge(icar_model_estimates['estimate_p_y'], left_on='GEOID', right_on='tract_id', suffixes=('_ct', '_p_y'))
ct_nyc = ct_nyc.merge(icar_model_estimates['estimate_at_least_one_positive_image_by_area'], left_on='GEOID', right_on='tract_id', suffixes=('_ct', '_p_alop'))
ct_nyc = ct_nyc.merge(icar_model_estimates['estimate_at_least_one_positive_image_by_area_if_you_have_100_images'], left_on='GEOID', right_on='tract_id', suffixes=('_ct', '_p_alop_100'))
logger.info(f"Merged NYC CT shapefile with icar model estimates.")

[34m2024-10-22 19:50:26 - analysis-df-assembly - INFO - Merged NYC CT shapefile with icar model estimates.[0m
