In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import math
import seaborn as sns

%matplotlib inline
tqdm.pandas()

# 1. Prepare dataset

- `bg20_adi`: Neighborhood Atlas's newly published 2020 ADI

In [2]:
bg20_adi = pd.read_csv('../data/US_2020_ADI_Census Block Group_v3.2.csv', 
                        dtype={'FIPS':str, 'ADI_NATRANK':str, 'ADI_STATERNK':str})
bg20_adi = bg20_adi[['FIPS', 'ADI_NATRANK', 'ADI_STATERNK']]
bg20_adi = bg20_adi.rename(columns={'FIPS':'bg_fips'})

In [3]:
bg20_adi

Unnamed: 0,bg_fips,ADI_NATRANK,ADI_STATERNK
0,010010201001,73,5
1,010010201002,62,3
2,010010202001,83,7
3,010010202002,87,7
4,010010203001,73,5
...,...,...,...
242330,721537506011,92,6
242331,721537506012,87,4
242332,721537506013,93,7
242333,721537506021,98,10


`food_valid`: we need to import this (which includes all original addresses we collected by computing geo-distances between a food pantry point and neighborhoods within 25 miles) -> Then, we need to merge it with the newly published 2020 ADI values for the updated database.

In [4]:
food_valid = pd.read_pickle('../data/food_valid_filtered_all_combined.pkl')

In [5]:
food_valid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37154567 entries, 0 to 37154566
Data columns (total 8 columns):
 #   Column       Dtype  
---  ------       -----  
 0   address      object 
 1   ad_lat       float64
 2   ad_lon       float64
 3   ad_bg_fips   object 
 4   bg_fips      object 
 5   bg_lat       float64
 6   bg_lon       float64
 7   distance_mi  float64
dtypes: float64(5), object(3)
memory usage: 2.2+ GB


## Merge 2020 ADI with `food_valid`

In [6]:
print(len(food_valid['address'].unique()), len(food_valid['bg_fips'].unique()))

34475 238536


In [7]:
food_valid_adi = food_valid.merge(bg20_adi.drop_duplicates(subset=['bg_fips']), how='left', on='bg_fips')

In [8]:
food_valid_adi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37154567 entries, 0 to 37154566
Data columns (total 10 columns):
 #   Column        Dtype  
---  ------        -----  
 0   address       object 
 1   ad_lat        float64
 2   ad_lon        float64
 3   ad_bg_fips    object 
 4   bg_fips       object 
 5   bg_lat        float64
 6   bg_lon        float64
 7   distance_mi   float64
 8   ADI_NATRANK   object 
 9   ADI_STATERNK  object 
dtypes: float64(5), object(5)
memory usage: 3.0+ GB


In [9]:
food_valid_adi_no_nan = food_valid_adi.loc[food_valid_adi['ADI_NATRANK'].isna()==False]
food_valid_adi_no_nan_no_invalid = food_valid_adi_no_nan.loc[food_valid_adi_no_nan['ADI_NATRANK'].str.isnumeric()]

In [10]:
food_valid_adi_no_nan_no_invalid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35940827 entries, 0 to 37154566
Data columns (total 10 columns):
 #   Column        Dtype  
---  ------        -----  
 0   address       object 
 1   ad_lat        float64
 2   ad_lon        float64
 3   ad_bg_fips    object 
 4   bg_fips       object 
 5   bg_lat        float64
 6   bg_lon        float64
 7   distance_mi   float64
 8   ADI_NATRANK   object 
 9   ADI_STATERNK  object 
dtypes: float64(5), object(5)
memory usage: 2.9+ GB


In [11]:
food_valid_adi_no_nan_no_invalid['ADI_NATRANK'] = food_valid_adi_no_nan_no_invalid['ADI_NATRANK'].astype('int')
food_valid_adi_no_nan_no_invalid['ADI_STATERNK'] = food_valid_adi_no_nan_no_invalid['ADI_STATERNK'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  food_valid_adi_no_nan_no_invalid['ADI_NATRANK'] = food_valid_adi_no_nan_no_invalid['ADI_NATRANK'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  food_valid_adi_no_nan_no_invalid['ADI_STATERNK'] = food_valid_adi_no_nan_no_invalid['ADI_STATERNK'].astype('int')


In [12]:
food_valid_adi_no_nan_no_invalid

Unnamed: 0,address,ad_lat,ad_lon,ad_bg_fips,bg_fips,bg_lat,bg_lon,distance_mi,ADI_NATRANK,ADI_STATERNK
0,"80 Richmond Townhouse Rd, Carolina, RI 02812",41.489365,-71.660706,440090506004,090116903001,41.362105,-72.107022,24.720511,47,7
1,"80 Richmond Townhouse Rd, Carolina, RI 02812",41.489365,-71.660706,440090506004,090116904001,41.356744,-72.108543,24.928589,82,10
2,"80 Richmond Townhouse Rd, Carolina, RI 02812",41.489365,-71.660706,440090506004,090116904002,41.351946,-72.106542,24.957116,63,9
3,"80 Richmond Townhouse Rd, Carolina, RI 02812",41.489365,-71.660706,440090506004,090116905003,41.353296,-72.101670,24.688172,68,9
4,"80 Richmond Townhouse Rd, Carolina, RI 02812",41.489365,-71.660706,440090506004,090116905001,41.358894,-72.102416,24.578674,73,10
...,...,...,...,...,...,...,...,...,...,...
37154562,"1409 Park Ave, Woonsocket, RI 02895",41.984573,-71.515750,440070175003,440070184002,42.014946,-71.466851,3.270068,53,8
37154563,"1409 Park Ave, Woonsocket, RI 02895",41.984573,-71.515750,440070175003,440070184004,42.008856,-71.470713,2.855163,53,8
37154564,"1409 Park Ave, Woonsocket, RI 02895",41.984573,-71.515750,440070175003,440070185001,41.990746,-71.488182,1.477591,50,7
37154565,"1409 Park Ave, Woonsocket, RI 02895",41.984573,-71.515750,440070175003,440070185002,41.998316,-71.481420,2.001014,55,8


In [13]:
food_valid_adi_no_nan_no_invalid = food_valid_adi_no_nan_no_invalid.reset_index(drop=True)

Match with state and county

In [14]:
def find_state_fips(bg, state_data):
    
    st_fips = bg[:2]
    for i in range(len(state_data)):
        if state_data['st_fips'][i] == st_fips:            
            return state_data['st_abb'][i]

state_fips = pd.read_csv('../data/us-state-fips.csv',
                        dtype = {'stname':str, ' st':str, ' stusps':str})
state_fips = state_fips.rename(columns={'stname':'STATE', ' st':'st_fips', ' stusps':'st_abb'})


state_fips.st_abb = state_fips.st_abb.str.strip()
state_fips.st_fips = state_fips.st_fips.str.strip()


(ATTENTION!!) Running the below snippet will take nearly 5 hours to complete all matching between BG and its state. Please be aware that. 

In [35]:
food_valid_adi_no_nan_no_invalid['bg_state'] = food_valid_adi_no_nan_no_invalid.progress_apply(lambda x:find_state_fips(x.bg_fips, state_fips), axis=1)
food_valid_adi_no_nan_no_invalid['address_state'] = food_valid_adi_no_nan_no_invalid.progress_apply(lambda x:find_state_fips(x.ad_bg_fips, state_fips), axis=1)

100%|██████████| 35940827/35940827 [2:39:56<00:00, 3745.18it/s]  
100%|██████████| 35940827/35940827 [2:46:23<00:00, 3600.07it/s]  


In [40]:
len(food_valid_adi_no_nan_no_invalid.bg_state.unique())

51

unique `bg_fips`

In [15]:
len(food_valid_adi_no_nan_no_invalid.bg_fips.unique())

232735

Saved the preprocessed `food_valid` (all of (pantry, BG) pairs) where each BG have numeric and non-missing national ADI percentiles. WILL BE MAINLY USED FOR FUTURE ANALYSIS.

In [36]:
#food_valid_adi_no_nan_no_invalid.to_pickle('../data/food_bg20_adi_st.pkl')

# Data Statistics

- Number of food pantries per state: 34475
- Number of unique BGs: 232,735

In [15]:
num_unique_pantries = len(food_valid_adi_no_nan_no_invalid.address.unique())
print(f"number of FPs in total: {num_unique_pantries}")

num_unique_bgs = len(food_valid_adi_no_nan_no_invalid.bg_fips.unique())
print(f"number of unique BGs: {num_unique_bgs}")

number of FPs in total: 34475
number of unique BGs: 232735
