In [1]:
import folium
import pandas as pd
import geopandas as gpd
import censusgeocode as cg

In [2]:
# Function to use census geocode and return geoid and name
def get_geo_info(address:str, city:str, state:str, zip_code:str, geography:str) -> tuple:
    """
    Get census geocode information for an address.

    Args:
        address: address string
        city: city string
        state: state string
        zip_code: zip code string
        geography: level of geography to return

    Returns:
        Tuple of the geocoded geoid and name for geography based on address information.
        Example: get_geo_info(address='57 Summit Avenue', city='Sharon', state='MA', zip_code='02067', geography='Counties')
    """
    result = cg.address(address, city = city, state = state, zip = zip_code)
    
    if result:
        geoid = result[0]['geographies'][geography][0]['GEOID']
        name = result[0]['geographies'][geography][0]['NAME']
        #print(geoid, name)
        return geoid, name
    else:
        #print('No result found')
        return None, None

In [3]:
# Read the ZCTA to county relationship file
zcta_to_county_url = 'https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_county20_natl.txt'
zcta_to_county = pd.read_table(zcta_to_county_url, dtype = {'GEOID_ZCTA5_20':str, 'GEOID_COUNTY_20':str}, sep = '|')

# Filter only the top row for each zip code
zcta_to_county.sort_values(by=['GEOID_ZCTA5_20', 'AREALAND_PART'], ascending=[True, False], inplace=True)
zcta_to_county_top1 = zcta_to_county.groupby('GEOID_ZCTA5_20').head(1)
zcta_to_county_top1.head()

Unnamed: 0,OID_ZCTA5_20,GEOID_ZCTA5_20,NAMELSAD_ZCTA5_20,AREALAND_ZCTA5_20,AREAWATER_ZCTA5_20,MTFCC_ZCTA5_20,CLASSFP_ZCTA5_20,FUNCSTAT_ZCTA5_20,OID_COUNTY_20,GEOID_COUNTY_20,NAMELSAD_COUNTY_20,AREALAND_COUNTY_20,AREAWATER_COUNTY_20,MTFCC_COUNTY_20,CLASSFP_COUNTY_20,FUNCSTAT_COUNTY_20,AREALAND_PART,AREAWATER_PART
903,221704300000000.0,601,ZCTA5 00601,166847909.0,799292.0,G6350,B5,S,2759082215444,72001,Adjuntas Municipio,172725726,1051789,G4020,H1,A,164781682,799292
905,221704300000000.0,602,ZCTA5 00602,78546713.0,4428428.0,G6350,B5,S,27590582517512,72003,Aguada Municipio,79923637,38025989,G4020,H1,A,78530159,4428428
907,221704300000000.0,603,ZCTA5 00603,88957333.0,6276536.0,G6350,B5,S,2759082344115,72005,Aguadilla Municipio,94618010,101127672,G4020,H1,A,88747846,6276536
909,221704300000000.0,606,ZCTA5 00606,114825382.0,12487.0,G6350,B5,S,2759094623292,72093,Maricao Municipio,94851744,12487,G4020,H1,A,94466099,12487
913,221704300000000.0,610,ZCTA5 00610,96129350.0,4310530.0,G6350,B5,S,27590585582224,72011,Añasco Municipio,101747429,14607647,G4020,H1,A,93009966,4310530


In [4]:
# Load the counties shapefile from the census bureau
counties_url = "https://www2.census.gov/geo/tiger/TIGER2024/COUNTY/tl_2024_us_county.zip"
counties = gpd.read_file(counties_url)

# Simplify county boundaries to save memory
counties.loc[:, 'geometry'] = counties['geometry'].simplify(tolerance=0.01)

counties.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,GEOIDFQ,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,31,39,835841,31039,0500000US31039,Cuming,Cuming County,6,H1,G4020,,,,A,1477563042,10772508,41.9158651,-96.7885168,"POLYGON ((-96.55551 42.08996, -96.55517 41.742..."
1,53,69,1513275,53069,0500000US53069,Wahkiakum,Wahkiakum County,6,H1,G4020,,,,A,680980773,61564428,46.2946377,-123.4244583,"POLYGON ((-123.72755 46.2645, -123.72656 46.38..."
2,35,11,933054,35011,0500000US35011,De Baca,De Baca County,6,H1,G4020,,,,A,6016818941,29090018,34.3592729,-104.3686961,"POLYGON ((-104.89337 34.08894, -104.89202 34.6..."
3,31,109,835876,31109,0500000US31109,Lancaster,Lancaster County,6,H1,G4020,339.0,30700.0,,A,2169269508,22850511,40.7835474,-96.6886584,"POLYGON ((-96.46363 40.52301, -96.91264 40.523..."
4,31,129,835886,31129,0500000US31129,Nuckolls,Nuckolls County,6,H1,G4020,,,,A,1489645201,1718484,40.1764918,-98.0468422,"POLYGON ((-98.27402 40.00266, -98.27357 40.350..."


In [5]:
# Read the dataset from the url zip file
whisard_url = "https://enfxfr.dol.gov/data_catalog/WHD/whd_whisard_20250405.csv.zip"
whisard = pd.read_csv(whisard_url, dtype = {'zip_cd':str}, compression='zip')
print("rows before filtering: ", len(whisard))

# Filter the dataset for only child labor violations since Jan 1, 2020
whisard_cl = whisard[((whisard['flsa_cl_violtn_cnt'] > 0) | (whisard['flsa_cl_minor_cnt'] > 0)) & (whisard['findings_end_date'] >= '2020-01-01')]
print("rows after filtering: ", len(whisard_cl))

# Add the majority county based on ZCTA to county relationship file
whisard_cl.loc[:,'zip_cd'] = whisard_cl['zip_cd'].str.zfill(5)
whisard_cl = pd.merge(whisard_cl, zcta_to_county_top1[['GEOID_ZCTA5_20', 'GEOID_COUNTY_20']], left_on='zip_cd', right_on='GEOID_ZCTA5_20', how='left')

whisard_cl_missing_geo = whisard_cl[whisard_cl['GEOID_COUNTY_20'].isnull()]
print("rows with missing county", len(whisard_cl_missing_geo))

# Use the census geocoder API to get geocode information for county
# whisard_cl[['county_id', 'county_name']] = whisard_cl.apply(lambda x: get_geo_info(x['street_addr_1_txt'], x['cty_nm'], x['st_cd'], x['zip_cd'], 'Counties'), axis=1, result_type='expand')

# Group the rows by column
whisard_cl_by_county = whisard_cl[['GEOID_COUNTY_20', 'flsa_cl_violtn_cnt', 'flsa_cl_minor_cnt', 'naics_code_description']].groupby('GEOID_COUNTY_20').agg({
    'flsa_cl_violtn_cnt': 'sum',
    'flsa_cl_minor_cnt': 'sum',
    'naics_code_description': lambda x: ', '.join(set(x))
})

whisard_cl_by_county = counties.merge(whisard_cl_by_county, left_on='GEOID', right_on='GEOID_COUNTY_20', how='left')
whisard_cl_by_county.loc[:,['flsa_cl_violtn_cnt', 'flsa_cl_minor_cnt']] = whisard_cl_by_county[['flsa_cl_violtn_cnt', 'flsa_cl_minor_cnt']].fillna(0)
whisard_cl_by_county.loc[:,'naics_code_description'] = whisard_cl_by_county['naics_code_description'].fillna('N/A')

whisard_cl_by_county = whisard_cl_by_county.loc[:,['GEOID', 'NAMELSAD', 'flsa_cl_violtn_cnt', 'flsa_cl_minor_cnt', 'naics_code_description', 'geometry']]

# Show final dataset
whisard_cl_by_county.head()

  whisard = pd.read_csv(whisard_url, dtype = {'zip_cd':str}, compression='zip')


rows before filtering:  357269
rows after filtering:  3684
rows with missing county 21


Unnamed: 0,GEOID,NAMELSAD,flsa_cl_violtn_cnt,flsa_cl_minor_cnt,naics_code_description,geometry
0,31039,Cuming County,0.0,0.0,,"POLYGON ((-96.55551 42.08996, -96.55517 41.742..."
1,53069,Wahkiakum County,0.0,0.0,,"POLYGON ((-123.72755 46.2645, -123.72656 46.38..."
2,35011,De Baca County,0.0,0.0,,"POLYGON ((-104.89337 34.08894, -104.89202 34.6..."
3,31109,Lancaster County,70.0,67.0,"Marketing Research and Public Opinion Polling,...","POLYGON ((-96.46363 40.52301, -96.91264 40.523..."
4,31129,Nuckolls County,2.0,1.0,Supermarkets and Other Grocery (except Conveni...,"POLYGON ((-98.27402 40.00266, -98.27357 40.350..."


In [6]:
# Write dataset to geojson file
whisard_cl_by_county.to_file('C:/Python3/notebooks/whisard_cl_by_county.js', driver="GeoJSON")