In [191]:
import pandas as pd 
import geopandas as gpd 
import numpy as np 
import json 
from glob import glob 

# Add the parent directory to the import path so that the `logger` module
# imported below can be resolved (presumably a project-local helper that
# lives one level up -- confirm).
import sys 
sys.path.append("../")
from logger import setup_logger
# Logger dedicated to this notebook; the level is set to INFO explicitly so
# the progress messages emitted by the cells below are shown.
logger = setup_logger("analysis-df-assembly")
logger.setLevel("INFO")

import os 

logger.info("Modules loaded.")



[34m2024-10-22 21:56:53 - analysis-df-assembly - INFO - Modules loaded.[0m


In [192]:
# Run output directories that are searched for `estimate*.csv` files below.
# NOTE(review): the path segments look like model variant / simulated flag /
# ahl flag / run timestamp -- confirm against the layout of `../runs`.
ICAR_NONE_RUN='../runs/icar_none/simulated_False/ahl_True/20241021-1038'
ICAR_CHEATING_RUN='../runs/icar_cheating/simulated_False/ahl_True/20241022-1130'

In [193]:
# Collect the estimate CSV paths of each run. glob() returns matches in
# arbitrary order, so nothing below should depend on list position.
ICAR_NONE_ESTIMATES = glob(f"{ICAR_NONE_RUN}/estimate*.csv")
ICAR_CHEATING_ESTIMATES = glob(f"{ICAR_CHEATING_RUN}/estimate*.csv")
# Log both counts so a missing or incomplete run directory is visible early.
logger.info(f"Found {len(ICAR_NONE_ESTIMATES)} ICAR_NONE estimates and {len(ICAR_CHEATING_ESTIMATES)} ICAR_CHEATING estimates.")

[34m2024-10-22 21:56:53 - analysis-df-assembly - INFO - Found 2 ICAR_NONE estimates and 3 ICAR_CHEATING estimates.[0m


In [194]:
# Read every ICAR_CHEATING estimate CSV into a dict keyed by the file name
# without directory or extension (".../estimate_p_y.csv" -> "estimate_p_y").
icar_cheating_estimates = {}
for csv_path in ICAR_CHEATING_ESTIMATES:
    file_stem, _ext = os.path.splitext(os.path.basename(csv_path))
    estimates = pd.read_csv(csv_path)
    # Tract ids are cast through int to str so they become plain digit
    # strings that can be matched against GEOID in the later merges.
    tract_ids = estimates['tract_id'].astype(int).astype(str)
    estimates['tract_id'] = tract_ids
    icar_cheating_estimates[file_stem] = estimates


In [195]:
# Read every ICAR_NONE estimate CSV into a dict keyed by the file name
# without directory or extension (".../estimate_p_y.csv" -> "estimate_p_y"),
# mirroring how the ICAR_CHEATING estimates are keyed.
icar_none_estimates = {} 
for f in ICAR_NONE_ESTIMATES:
    df = pd.read_csv(f)
    # Tract ids are cast through int to str so they become plain digit
    # strings that can be matched against GEOID in the later merges.
    df['tract_id'] = df['tract_id'].astype(int).astype(str)
    # splitext() has to receive the whole basename and be indexed afterwards.
    # Indexing the basename first (`basename(f)[0]`) reduces it to its first
    # character and makes the key the (root, ext) tuple, so every frame would
    # overwrite the previous one under the same key.
    icar_none_estimates[os.path.splitext(os.path.basename(f))[0]] = df
    

In [196]:
# Switch between the two sets of model estimates: the ICAR_CHEATING frames
# when smoothing is requested, the ICAR_NONE frames otherwise.
USE_SMOOTHING = True
icar_model_estimates = icar_cheating_estimates if USE_SMOOTHING else icar_none_estimates
logger.info("Using smoothed estimates." if USE_SMOOTHING else "Using unsmoothed estimates.")

[34m2024-10-22 21:56:53 - analysis-df-assembly - INFO - Using smoothed estimates.[0m


In [197]:
# Load the 2020 census-tract file for NYC.
ct_nyc = gpd.read_file('geo/data/ct-nyc-wi-2020.geojson')


# Columns that are not carried into the analysis frame. 'geometry' is among
# them, so only tabular attributes remain after the drop.
TO_DROP = ['OBJECTID','BoroCode','CT2020','CDEligibil','NTA2020','CDTA2020','Shape__Area','Shape__Length','geometry']
# In-place drop: re-running this cell without re-reading the file would fail
# because the columns are already gone.
ct_nyc.drop(columns=TO_DROP, inplace=True)

logger.info(f"Loaded NYC CT shapefile with {len(ct_nyc.index)} CTs.")

[34m2024-10-22 21:56:53 - analysis-df-assembly - INFO - Loaded NYC CT shapefile with 2325 CTs.[0m


In [198]:
# Inspect which columns are left after the drop above.
ct_nyc.columns

Index(['CTLabel', 'BoroName', 'BoroCT2020', 'NTAName', 'CDTANAME', 'GEOID',
       'PUMA'],
      dtype='object')

In [199]:
# Second census-tract file; the log message describes it as the water-clipped
# variant.
# NOTE(review): ct_nyc_clip is not referenced again in the visible cells --
# confirm it is still needed.
ct_nyc_clip = gpd.read_file('geo/data/ct-nyc-2020.geojson')
logger.info(f"Loaded NYC CT (water clipped) shapefile with {len(ct_nyc_clip.index)} CTs.")

[34m2024-10-22 21:56:54 - analysis-df-assembly - INFO - Loaded NYC CT (water clipped) shapefile with 2327 CTs.[0m


In [200]:
# Estimate tables to attach to the tract frame, in order, each paired with
# the suffix used for its columns when a column name exists on both sides.
_ESTIMATE_MERGES = (
    ('estimate_p_y', '_p_y'),
    ('estimate_at_least_one_positive_image_by_area', '_p_alop'),
    ('estimate_at_least_one_positive_image_by_area_if_you_have_100_images', '_p_alop_100'),
)

# Merge one table at a time on GEOID == tract_id. pandas applies the suffixes
# only to overlapping column names, so columns already present in ct_nyc
# receive '_ct' even when they came from a previously merged estimate table.
for estimate_key, right_suffix in _ESTIMATE_MERGES:
    ct_nyc = ct_nyc.merge(
        icar_model_estimates[estimate_key],
        left_on='GEOID',
        right_on='tract_id',
        suffixes=('_ct', right_suffix),
    )
logger.info(f"Merged NYC CT shapefile with icar model estimates.")

[34m2024-10-22 21:56:54 - analysis-df-assembly - INFO - Merged NYC CT shapefile with icar model estimates.[0m


In [201]:
# Load the ACS DP05 variable metadata.
dp05_nyc_md = pd.read_json('demo/data/acs22_dp05_md.json')

# Flatten the nested 'variables' records into columns, keeping the index of
# the frame that was read from the JSON.
dp05_nyc_md = pd.json_normalize(dp05_nyc_md['variables']).set_index(dp05_nyc_md.index)

# Labels are hierarchical, with levels separated by '!!'. Count the separators
# once per label and keep the extremes.
sep_counts = dp05_nyc_md['label'].apply(lambda x: x.count('!!'))
min_sep = min(sep_counts)  # kept for inspection; no longer drives the loop below
max_sep = max(sep_counts)

# Create one 'desc_i' column per label level, i = 1 .. max_sep + 1. The loop
# always starts at 1: starting at min_sep + 1 would skip 'desc_1' whenever
# every label contains a separator, and the 'desc_1' filter below would then
# raise a KeyError. Labels with fewer than i levels get None.
for i in range(1, max_sep + 2):
    dp05_nyc_md[f'desc_{i}'] = dp05_nyc_md['label'].apply(
        lambda x: x.split('!!')[i-1] if len(x.split('!!')) >= i else None
    )

# Drop metadata fields that are not needed once the label has been split.
# Note that this rebinds the notebook-level TO_DROP name.
TO_DROP = ['label','concept','predicateType','group','limit','predicateOnly']
dp05_nyc_md = dp05_nyc_md.drop(columns=TO_DROP)

# Keep only the variables whose first label level is 'Estimate'.
desc_1_filter = ['Estimate']
dp05_nyc_md = dp05_nyc_md[dp05_nyc_md['desc_1'].isin(desc_1_filter)]

# Order the remaining variables by index; nothing is displayed by this cell.
dp05_nyc_md = dp05_nyc_md.sort_index()

In [202]:
# Read the ACS DP05 table.
dp05_nyc = pd.read_json('demo/data/acs22_dp05.json', orient='records')

# The first row is used as the header and then removed from the data rows.
dp05_nyc.columns = dp05_nyc.iloc[0]
dp05_nyc = dp05_nyc[1:]

# The tract id is the part of GEO_ID that follows the 'US' marker.
dp05_nyc['tract_id'] = dp05_nyc['GEO_ID'].str.split('US', expand=True)[1]

# DP05 variable code -> readable column name for the race/ethnicity counts.
RACE_COLS = dict(
    DP05_0079E='nhl_white_alone',
    DP05_0080E='nhl_black_alone',
    DP05_0073E='hispanic_alone',
    DP05_0082E='nhl_asian_alone',
)

# Select the race columns, give them readable names and index them by tract.
race_nyc = dp05_nyc[list(RACE_COLS)].rename(columns=RACE_COLS)
race_nyc.index = dp05_nyc['tract_id']
race_nyc

Unnamed: 0_level_0,nhl_white_alone,nhl_black_alone,hispanic_alone,nhl_asian_alone
tract_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
36005000100,1098,2000,1172,123
36005000200,83,1281,3109,299
36005000400,283,1559,4212,103
36005001600,106,2132,3507,148
36005001901,306,942,842,0
...,...,...,...,...
36085030302,2209,1568,1625,918
36085031901,289,1626,1469,224
36085031902,473,2388,1913,217
36085032300,109,421,394,21


In [203]:
# Attach the race counts by matching GEOID against race_nyc's tract_id index.
# merge() defaults to an inner join, so tracts without a match are dropped.
ct_nyc = ct_nyc.merge(race_nyc, left_on='GEOID', right_index=True)  

In [204]:
# Count missing values per column as a sanity check on the merges above.
ct_nyc.isna().sum()

CTLabel                                                                0
BoroName                                                               0
BoroCT2020                                                             0
NTAName                                                                0
CDTANAME                                                               0
GEOID                                                                  0
PUMA                                                                   0
tract_id_ct                                                            0
empirical_estimate_ct                                                  6
p_y                                                                    0
p_y_CI_lower                                                           0
p_y_CI_upper                                                           0
n_images_by_area_ct                                                    0
tract_id_p_alop                                    

In [205]:
# Use the tract GEOID as the row index of the assembled frame.
ct_nyc = ct_nyc.set_index('GEOID')



In [206]:
# Write the assembled frame to disk; the GEOID index is written as the first
# CSV column (to_csv writes the index by default).
ct_nyc.to_csv('analysis_df.csv')