## Necessary imports

In [2]:
#import necessary stuff
import pandas as pd
import requests
import os
from dotenv import load_dotenv
import json

## Use the JSON file containing Google Trends locations

Retrieved from this link: https://trends.google.com/trends/api/explore/pickers/geo?hl=en-US

In [5]:
# Load the Google Trends geo-picker tree. The context manager guarantees the
# file handle is closed even if JSON parsing fails.
with open('resources/google_trends-locations.json', 'r') as google_trends_locations_file_reader:
    google_trends_locations = json.load(google_trends_locations_file_reader)

## Focus only on the locations in the US

In [6]:
def get_subtree_with_name(json_obj, name):
    """Return the first node (pre-order, depth-first) whose 'name' equals `name`.

    Args:
        json_obj: a tree node, i.e. a dict with a 'name' key and an optional
            'children' list of nodes of the same shape.
        name: the node name to look for.

    Returns:
        The matching node itself (not a copy), or None when no node matches.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped (visited) in their original left-to-right order.
    pending = [json_obj]
    while pending:
        node = pending.pop()
        if node['name'] == name:
            return node
        if 'children' in node:
            pending.extend(reversed(node['children']))
    return None

In [7]:
only_US = get_subtree_with_name(google_trends_locations, 'United States')
# Fail fast: get_subtree_with_name returns None when nothing matches, which
# would otherwise be written out as `null` and break the later cells.
if only_US is None:
    raise ValueError("'United States' was not found in the Google Trends locations tree")
# save it
with open('resources/google_trends-locations-US.json', 'w') as outfile:
    json.dump(only_US, outfile)

## Focus only on the Metro Areas

In [45]:
class MetroArea:
    """A metro area identified by its id.

    Two instances are equal (and hash the same) when their ids are equal,
    regardless of name, so a set of MetroArea objects de-duplicates by id.
    """

    def __init__(self, name, id):
        self.name = name  # display name of the metro area
        self.id = id      # identifier used for equality and hashing

    def __eq__(self, other):
        # Defer to Python's default handling for foreign types instead of
        # raising AttributeError on `other.id`.
        if not isinstance(other, MetroArea):
            return NotImplemented
        return self.id == other.id

    def __hash__(self):
        # hash() keeps this valid for any hashable id (e.g. strings), while
        # staying consistent with __eq__.
        return hash(self.id)

    def __str__(self):
        return f'(name: {self.name}, id: {self.id})'

    def __repr__(self):
        return self.__str__()

In [46]:
# These are critical locations that have only one metro area, where the metro
# area name is not the same as the state name.
# Maps the state-level name to (metro area name, metro area id).
# The ids must be distinct: metro areas are de-duplicated by id, so a repeated
# id silently drops one of the entries.
google_trends_critical_locations = {
    'Utah': ('Salt Lake City UT', 770),
    'District of Columbia': ('Washington DC (Hagerstown MD)', 511),
    'Rhode Island': ('Providence RI-New Bedford MA', 521),
}

In [47]:
def get_leaves_as_tuples(json_obj):
    """Collect every leaf of a locations tree as a list of MetroArea objects.

    A leaf is a node without a 'children' key. Its 'id' is converted to int;
    when that is not possible (missing or non-numeric id), the leaf is looked
    up by name in `google_trends_critical_locations` instead.

    Args:
        json_obj: a tree node (dict with 'name', optionally 'id'/'children').

    Returns:
        A list of MetroArea instances, in left-to-right leaf order.

    Raises:
        KeyError: if a leaf has no usable numeric id and its name has no entry
            in `google_trends_critical_locations`.
    """
    if 'children' in json_obj:
        leaves = []
        for child in json_obj['children']:
            leaves.extend(get_leaves_as_tuples(child))
        return leaves

    try:
        return [MetroArea(json_obj['name'], int(json_obj['id']))]
    except (KeyError, TypeError, ValueError) as exc:
        # Only the expected "no numeric id" failures are handled here, so that
        # unrelated errors (and KeyboardInterrupt) are no longer swallowed.
        leaf_name = json_obj.get('name')
        if leaf_name not in google_trends_critical_locations:
            raise KeyError(
                f'Leaf {leaf_name!r} has no numeric id and no entry in '
                f'google_trends_critical_locations'
            ) from exc
        return [MetroArea(*google_trends_critical_locations[leaf_name])]

In [58]:
# Gather all leaf metro areas, drop duplicates through a set, and build one
# table row per remaining metro area from its instance attributes.
leaves = get_leaves_as_tuples(only_US)
leaves = [*set(leaves)]
metro_areas_table = pd.DataFrame([vars(metro_area) for metro_area in leaves])
metro_areas_table

Unnamed: 0,name,id
0,Baltimore MD,512
1,Flint-Saginaw-Bay City MI,513
2,Buffalo NY,514
3,Cincinnati OH,515
4,Erie PA,516
...,...,...
205,Savannah GA,507
206,Pittsburgh PA,508
207,Ft. Wayne IN,509
208,Cleveland-Akron (Canton) OH,510


## Match the counties with metro areas and find median household income per metro area

### Load the county correspondence table
The correspondence table is retrieved from this link: https://sites.google.com/view/jacob-schneider/resources

In [59]:
# Read the county <-> Google Trends metro area crosswalk CSV.
# Per the displayed columns, each county row carries its AFFGEOID, a metro
# code/name (trends_geocode / trends_geoname), a split_county flag and an
# optional second metro (trends_geocode2 / trends_geoname2).
correspondence_table = pd.read_csv('resources/trends_metro_counties_crosswalk.csv')
correspondence_table

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,county_name,trends_geocode,trends_geoname,split_county,trends_geocode2,trends_geoname2
0,1,1,161526,0500000US01001,1001,Autauga,698,Montgomery (Selma) AL,0,,
1,1,3,161527,0500000US01003,1003,Baldwin,686,Mobile AL-Pensacola (Ft. Walton Beach) FL,0,,
2,1,5,161528,0500000US01005,1005,Barbour,522,Columbus GA,0,,
3,1,7,161529,0500000US01007,1007,Bibb,630,Birmingham AL,0,,
4,1,9,161530,0500000US01009,1009,Blount,630,Birmingham AL,0,,
...,...,...,...,...,...,...,...,...,...,...,...
3137,56,37,1609192,0500000US56037,56037,Sweetwater,770,Salt Lake City UT,0,,
3138,56,39,1605083,0500000US56039,56039,Teton,758,Idaho Falls-Pocatello ID,0,,
3139,56,41,1605084,0500000US56041,56041,Uinta,770,Salt Lake City UT,0,,
3140,56,43,1605085,0500000US56043,56043,Washakie,767,Casper-Riverton WY,0,,


### Load the estimate tables
Tables are loaded from the census-data folder

In [63]:
# Load the three per-county income-bracket tables. They are consulted later in
# this order of preference: one-year, one-year supplement, five-year.
one_year_estimate_table = pd.read_csv('../census-data/data/one_year_estimate_table.csv')
one_year_supplement_estimate_table = pd.read_csv('../census-data/data/one_year_supplement_estimate_table.csv')
five_year_estimate_table = pd.read_csv('../census-data/data/five_year_estimate_table.csv')

In [64]:
# Preview: one row per county (Name, Geo_id) followed by one column per income bracket
one_year_estimate_table

Unnamed: 0,Name,Geo_id,0 - 9999,10000 - 14999,15000 - 24999,25000 - 34999,35000 - 49999,50000 - 74999,75000 - 99999,100000 - 149999,150000 - 199999,200000 or more
0,"Shiawassee County, Michigan",0500000US26155,1383.074,1411.300,2286.306,2088.724,5729.878,5701.652,3754.058,4120.996,1157.266,564.520
1,"Washtenaw County, Michigan",0500000US26161,9395.379,4772.256,8202.315,10290.177,15211.566,25054.344,17299.428,26396.541,12527.172,19685.556
2,"Scott County, Minnesota",0500000US27139,1327.848,331.962,2600.369,1881.118,4702.795,7413.818,7745.780,11563.343,7358.491,10346.149
3,"Wright County, Minnesota",0500000US27171,1149.720,836.160,2247.180,2351.700,4964.700,6637.020,7995.780,15050.880,6741.540,4285.320
4,"Harrison County, Mississippi",0500000US28047,6741.592,3817.528,7066.488,9097.088,10640.344,15351.336,10559.120,11452.584,3086.512,3411.408
...,...,...,...,...,...,...,...,...,...,...,...,...
836,"El Dorado County, California",0500000US06017,3520.723,2022.543,4119.995,3370.905,5468.357,11610.895,9887.988,13858.165,6966.537,14082.892
837,"Fresno County, California",0500000US06019,23875.804,14196.424,27102.264,26456.972,38072.228,56785.696,39040.166,47428.962,26779.618,23230.512
838,"Humboldt County, California",0500000US06023,3421.408,2979.936,5076.928,5849.504,7891.312,10153.856,7780.944,6566.896,2428.096,2979.936
839,"Imperial County, California",0500000US06025,3923.618,2296.752,5694.031,3349.430,6890.256,8230.028,4784.900,9187.008,1291.923,2248.903


In [65]:
# Preview: note that this table uses coarser income brackets (e.g. '0 - 19999')
# than the one-year and five-year tables
one_year_supplement_estimate_table

Unnamed: 0,Name,Geo_id,0 - 19999,20000 - 39999,40000 - 59999,60000 - 99999,100000 - 149999,150000 - 199999,200000 or more
0,"Autauga County, Alabama",0500000US01001,2022,4535,4063,5974,2957,1851,1103
1,"Baldwin County, Alabama",0500000US01003,10650,15532,18693,22631,13627,7722,5250
2,"Barbour County, Alabama",0500000US01005,1650,2713,1456,1675,927,74,485
3,"Bibb County, Alabama",0500000US01007,1943,2310,602,1470,1023,295,607
4,"Blount County, Alabama",0500000US01009,2943,4518,4579,4935,3270,707,993
...,...,...,...,...,...,...,...,...,...
1900,"Vega Alta Municipio, Puerto Rico",0500000US72143,4621,3243,1587,1564,334,302,0
1901,"Vega Baja Municipio, Puerto Rico",0500000US72145,8353,4348,3059,2372,399,325,286
1902,"Villalba Municipio, Puerto Rico",0500000US72149,2527,1837,926,386,134,0,135
1903,"Yabucoa Municipio, Puerto Rico",0500000US72151,5742,4706,999,925,0,0,0


In [66]:
# Preview: same bracket columns as the one-year table
five_year_estimate_table

Unnamed: 0,Name,Geo_id,0 - 9999,10000 - 14999,15000 - 24999,25000 - 34999,35000 - 49999,50000 - 74999,75000 - 99999,100000 - 149999,150000 - 199999,200000 or more
0,"Autauga County, Alabama",0500000US01001,1336.658,991.714,2651.757,1832.515,2737.993,3665.030,2888.906,3535.676,1034.832,905.478
1,"Baldwin County, Alabama",0500000US01003,4370.444,4034.256,6471.619,8404.700,11766.580,14119.896,11514.439,13027.285,4874.726,5547.102
2,"Barbour County, Alabama",0500000US01005,1361.012,708.472,1715.248,876.268,1165.250,1519.486,680.506,848.302,205.084,242.372
3,"Bibb County, Alabama",0500000US01007,798.490,515.389,762.195,660.569,813.008,1408.246,1132.404,813.008,203.252,159.698
4,"Blount County, Alabama",0500000US01009,2141.705,975.430,2332.550,2374.960,3074.725,3795.695,2290.140,2629.420,1039.045,572.535
...,...,...,...,...,...,...,...,...,...,...,...,...
3216,"Renville County, Minnesota",0500000US27129,325.728,386.048,482.560,506.688,784.160,1405.456,892.736,796.224,265.408,199.056
3217,"Roseau County, Minnesota",0500000US27135,178.200,273.240,457.380,534.600,795.960,1395.900,944.460,926.640,255.420,184.140
3218,"Sherburne County, Minnesota",0500000US27141,950.939,524.656,1442.804,2000.251,3180.727,5705.634,5148.187,8624.033,2885.608,2360.952
3219,"Steele County, Minnesota",0500000US27147,682.456,415.408,1083.028,1275.896,1973.188,2655.644,2299.580,2789.168,934.668,741.800


### Find the median household income per metro area

In [113]:
# we are assuming uniform distribution of the income in a given bin/interval
# we could also apply more sophisticated methods (E.g. https://journals.sagepub.com/doi/epub/10.1177/0081175015599807)
def interpolate_median(intervals, top_bracket_upper_bound=700000):
    """Approximate the median income from binned counts.

    Each bin is treated as uniformly distributed over its own range, so bins
    from different bracket schemes (overlapping ranges) can be mixed freely.

    Args:
        intervals: mapping from a bracket description to a count. Descriptions
            are either 'LOW - HIGH' (e.g. '10000 - 14999') or open-ended
            'LOW or more' (e.g. '200000 or more').
        top_bracket_upper_bound: assumed upper limit for open-ended brackets;
            also bounds the search range for the median.

    Returns:
        The approximated median rounded to 2 decimals, or NaN when the total
        count is not positive (no data to take a median of).

    Raises:
        ValueError: if an open-ended bracket starts at or above
            `top_bracket_upper_bound`.
    """
    open_ended_suffix = 'or more'

    # convert a description of an interval to a valid range
    def get_range(desc):
        if desc.endswith(open_ended_suffix):
            lower = int(desc[:-len(open_ended_suffix)].strip())
            if lower >= top_bracket_upper_bound:
                raise ValueError(
                    f'top_bracket_upper_bound ({top_bracket_upper_bound}) must exceed '
                    f'the lower bound of {desc!r}'
                )
            # fix the upper bound of the open-ended bracket as something reasonable
            return (lower, top_bracket_upper_bound)
        return tuple(map(int, desc.replace(' ', '').split('-')))

    # fraction (clamped to [0, 1]) of the interval that lies below the candidate median
    def locate(m, interval_range):
        lower, upper = interval_range[0], interval_range[1]
        if upper == lower:
            # degenerate bracket: treat it as a point mass instead of dividing by zero
            return 1 if m >= lower else 0
        return max(0, min(1, (m - lower) / (upper - lower)))

    def get_cnt_smaller(m, intervals_with_range):
        return sum([
            max(0, locate(m, interval_range) * pop_cnt)
            for interval_range, pop_cnt in intervals_with_range
        ])

    # convert intervals with descriptions into intervals with ranges
    intervals_with_range = [
        (get_range(desc), pop_cnt)
        for desc, pop_cnt in intervals.items()
    ]

    # all the households in the given intervals
    cnt_all = sum([pop_cnt for _, pop_cnt in intervals_with_range])
    if not cnt_all > 0:
        # no data: a median of 0 would be misleading, so report "not available"
        return float('nan')

    # do binary search to find the median
    l, r = 0, top_bracket_upper_bound + 1
    for _ in range(30):
        m = (l + r) / 2
        # find the number of households with income less than or equal to m
        cnt_smaller = get_cnt_smaller(m, intervals_with_range)
        # if our candidate median should be larger
        if cnt_smaller < cnt_all / 2:
            l = m
        # or should be smaller
        else:
            r = m

    # return the approximated median
    return round((l + r) / 2, 2)

In [124]:
def update_all_intervals(all_intervals, county_intervals, divide):
    """Add one county's bracket counts onto the running per-bracket totals.

    Args:
        all_intervals: dict mapping bracket description -> accumulated count;
            updated in place.
        county_intervals: iterable of (bracket description, count) pairs.
        divide: 0/False to add the full count, 1/True to add only half of it
            (each count is divided by `1 + divide`).

    Returns:
        The same `all_intervals` dict that was passed in.
    """
    for bracket, households in county_intervals:
        # make sure the bracket exists before accumulating into it
        running_total = all_intervals.setdefault(bracket, 0)
        # if divide is 1, then we split the population evenly
        all_intervals[bracket] = running_total + households / (1 + divide)
    return all_intervals

def get_intervals_for_county(table, affgeoid):
    """Return the bracket counts of one county as (bracket, rounded count) pairs.

    Args:
        table: DataFrame with a 'Geo_id' column; every column from the third
            one onwards is read as a bracket column.
        affgeoid: value to match against 'Geo_id'.

    Returns:
        A list of (column name, count rounded to the nearest integer) tuples,
        taken from the first matching row, in column order.
    """
    matching_rows = table[table['Geo_id'] == affgeoid]
    # the first two columns are skipped; the rest are the bracket columns
    bracket_columns = matching_rows.iloc[:, 2:].columns

    intervals = []
    for bracket in bracket_columns:
        first_value = matching_rows[bracket].values[0]
        intervals.append((bracket, round(first_value)))
    return intervals

# The county 'Valdez-Cordova' only belongs to the 2019 ACS 5-year estimate and
# is absent from the loaded tables, so its intervals are hardcoded here as
# 3177 times each bracket's share (3177 is presumably the total household count).
# link: https://data.census.gov/cedsci/table?q=S1901&g=0500000US02261&tid=ACSST5Y2019.S1901
_VALDEZ_CORDOVA_GEO_ID = '0500000US02261'
_VALDEZ_CORDOVA_INTERVALS = [
    ('0 - 9999', 3177 * .046),
    ('10000 - 14999', 3177 * .028),
    ('15000 - 24999', 3177 * .040),
    ('25000 - 34999', 3177 * .087),
    ('35000 - 49999', 3177 * .108),
    ('50000 - 74999', 3177 * .174),
    ('75000 - 99999', 3177 * .155),
    ('100000 - 149999', 3177 * .181),
    ('150000 - 199999', 3177 * .075),
    ('200000 or more', 3177 * .108),
]


def get_median_income_interpolation(metro_row):
    """Approximate the median household income of one metro area.

    Collects every county whose primary or secondary metro code equals the
    metro area's id, accumulates the counties' income-bracket counts and
    interpolates the median from the combined distribution.

    Args:
        metro_row: a row (mapping) with an 'id' key holding the metro area id.

    Returns:
        Whatever `interpolate_median` returns for the accumulated brackets.
    """
    metro_id = metro_row['id']
    belongs_to_metro = (
        (correspondence_table['trends_geocode'] == metro_id)
        | (correspondence_table['trends_geocode2'] == metro_id)
    )
    counties = correspondence_table[belongs_to_metro]

    # estimate tables in order of preference; the first one containing the county wins
    estimate_tables = (
        one_year_estimate_table,
        one_year_supplement_estimate_table,
        five_year_estimate_table,
    )

    all_intervals = {}
    for _, county_row in counties.iterrows():

        affgeoid = county_row['AFFGEOID']

        county_intervals = None
        for table in estimate_tables:
            if affgeoid in table['Geo_id'].values:
                county_intervals = get_intervals_for_county(table, affgeoid)
                break

        if county_intervals is None:
            if affgeoid == _VALDEZ_CORDOVA_GEO_ID:
                print(f'Cannot find county {county_row["county_name"]} with geo id {affgeoid} in any of the tables, thus we use hardcoded values from 2019 acs 5-year estimate')
                county_intervals = _VALDEZ_CORDOVA_INTERVALS
            else:
                # The hardcoded distribution is specific to Valdez-Cordova;
                # applying it to another county would fabricate data, so the
                # county is left out and reported instead.
                print(f'Cannot find county {county_row["county_name"]} with geo id {affgeoid} in any of the tables and no hardcoded fallback exists, skipping it')
                continue

        # here, we set divide parameter to 1 if the county is split into two metro areas
        # since we do not know how much of the county belongs to each metro area
        # we assume that the county is split evenly
        all_intervals = update_all_intervals(all_intervals, county_intervals, county_row['split_county'] == 1)

    return interpolate_median(all_intervals)

In [None]:
# Interpolate the median income of every metro area, following the row order
# of metro_areas_table.
metro_rows = (row for _, row in metro_areas_table.iterrows())
median_income_interpolated = list(map(get_median_income_interpolation, metro_rows))

In [127]:
# extend the metro areas table with median income column
# (median_income_interpolated was built by iterating this table's rows in
# order, so the values line up with the rows positionally; the table is
# modified in place)
metro_areas_table['median_income'] = median_income_interpolated
metro_areas_table

Unnamed: 0,name,id,median_income
0,Baltimore MD,512,85820.27
1,Flint-Saginaw-Bay City MI,513,53514.59
2,Buffalo NY,514,61238.27
3,Cincinnati OH,515,70220.94
4,Erie PA,516,56730.36
...,...,...,...
205,Savannah GA,507,61071.21
206,Pittsburgh PA,508,65944.17
207,Ft. Wayne IN,509,62515.75
208,Cleveland-Akron (Canton) OH,510,61762.68


## Save the table

In [128]:
# save the metro areas table to the data dir
# (exist_ok=True creates the directory only when missing, without a separate
# existence check that could race with another process)
os.makedirs('data', exist_ok=True)

metro_areas_table.to_csv('data/metro_areas_table.csv', index=False)