# Data preparation

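This notebook builds a Parquet file for use in further analyses. It combines the per-technology broadband availability archives (`.zip`) found in `data/` with the `unserved.csv` and `underserved.csv` location lists.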

In [44]:
import os, zipfile, itables
import pandas as pd

In [11]:
# Collect the zip archives in the data directory. os.listdir returns entries in
# arbitrary order, so sort them to keep the load order (and therefore the row
# order of the combined frame) reproducible across runs and machines.
files = sorted(name for name in os.listdir('data') if name.endswith('.zip'))

In [12]:
# Show the archive names that will be loaded, as a sanity check.
files

['bdc_53_Cable_fixed_broadband_123122-20230926.zip',
 'bdc_53_Copper_fixed_broadband_123122-20230926.zip',
 'bdc_53_FibertothePremises_fixed_broadband_123122-20230926.zip',
 'bdc_53_GSOSatellite_fixed_broadband_123122-20230926.zip',
 'bdc_53_LBRFixedWireless_fixed_broadband_123122-20230926.zip',
 'bdc_53_LicensedFixedWireless_fixed_broadband_123122-20230926.zip',
 'bdc_53_NGSOSatellite_fixed_broadband_123122-20230926.zip',
 'bdc_53_UnlicensedFixedWireless_fixed_broadband_123122-20230926.zip']

In [58]:
# Read the first member of every zip archive into a DataFrame and stack the
# results into a single availability table.
dfColumnHints = {
    # Read block GEOIDs as strings; later cells slice them by character position.
    'block_geoid': str
}

# Collect the per-archive frames and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously loaded row on each pass.
localDfs = []
for file in files:
    # print(file)
    # The context managers close both the archive and the extracted member,
    # even if parsing the CSV raises.
    with zipfile.ZipFile(os.path.join('data', file)) as archive:
        with archive.open(archive.filelist[0].filename) as f:
            localDf = pd.read_csv(f, dtype=dfColumnHints)
    localDfs.append(localDf)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no archives were found.
waAvailabilityData = pd.concat(localDfs, ignore_index=True) if localDfs else pd.DataFrame()

# Location lists that later cells match against the availability data.
wsboUnserved = pd.read_csv(os.path.join('data', 'unserved.csv'))
wsboUnderserved = pd.read_csv(os.path.join('data', 'underserved.csv'))

bdc_53_Cable_fixed_broadband_123122-20230926.zip
bdc_53_Copper_fixed_broadband_123122-20230926.zip
bdc_53_FibertothePremises_fixed_broadband_123122-20230926.zip
bdc_53_GSOSatellite_fixed_broadband_123122-20230926.zip
bdc_53_LBRFixedWireless_fixed_broadband_123122-20230926.zip
bdc_53_LicensedFixedWireless_fixed_broadband_123122-20230926.zip
bdc_53_NGSOSatellite_fixed_broadband_123122-20230926.zip
bdc_53_UnlicensedFixedWireless_fixed_broadband_123122-20230926.zip


In [134]:
# Reduce the availability table to the two columns that identify a location
# and the census block it sits in.
locationColumns = ['location_id', 'block_geoid']
waLocations = waAvailabilityData[locationColumns]

In [135]:
# Collapse the table to one row per distinct (location_id, block_geoid) pair
# and derive the county GEOID from the first five characters of the block GEOID.
# .assign builds a new frame, so the column is not written onto a slice of
# another DataFrame (which raises SettingWithCopyWarning here and in the later
# cells that add columns to waLocations).
# The original row labels are kept on purpose; the index is not reset.
waLocations = (
    waLocations
    .drop_duplicates()
    .assign(county_geoid=lambda df: df['block_geoid'].str[0:5])
)

In [136]:
# Preview the location table with its derived county GEOID column.
waLocations

Unnamed: 0,location_id,block_geoid,county_geoid
0,1015880838,530659514022053,53065
1,1073176227,530350910012001,53035
2,1073185639,530350907004015,53035
3,1056655277,530499502005010,53049
4,1073186253,530350909022006,53035
...,...,...,...
12535029,1310087661,530730002032000,53073
12535061,1049483490,530479709001124,53047
12535205,1310144322,530730103011029,53073
12535621,1312148589,530050118021061,53005


In [137]:
# Preview the location ids loaded from unserved.csv.
wsboUnserved

Unnamed: 0,location_id
0,1410015656
1,1409662696
2,1342736839
3,1342719613
4,1337285868
...,...
236129,1105695176
236130,1049493621
236131,1015918336
236132,1056645894


In [138]:
# True for every location whose id appears in the list loaded from unserved.csv.
waLocations['wsboUnserved'] = waLocations['location_id'].isin(wsboUnserved['location_id'])

In [139]:
# True for every location whose id appears in the list loaded from underserved.csv.
waLocations['wsboUnderserved'] = waLocations['location_id'].isin(wsboUnderserved['location_id'])

In [140]:
# A location counts as served when it is in neither the underserved nor the
# unserved list.
waLocations['wsboServed'] = ~(waLocations['wsboUnderserved'] | waLocations['wsboUnserved'])

In [141]:
# Sanity check: join the three flags into a "served,underserved,unserved"
# label per location and count how often each combination occurs.
(
    waLocations['wsboServed'].astype('str')
    .add(',')
    .add(waLocations['wsboUnderserved'].astype('str'))
    .add(',')
    .add(waLocations['wsboUnserved'].astype('str'))
    .value_counts()
)

True,False,False    2206330
False,False,True     236134
False,True,False      85434
Name: count, dtype: int64

In [145]:
# Classify every location by the best fiber offer reported for it.
# Start from the lowest label; locations with a qualifying offer are upgraded below.
waLocations['fiberserved'] = 'Unserved'

# Rows with technology code 50 are treated as the fiber records.
# NOTE(review): presumably the FCC BDC code for fiber to the premises; confirm.
waFiberData = waAvailabilityData[waAvailabilityData['technology'] == 50]

# Tiers are applied in ascending order so that the higher tier overwrites the
# lower one for locations that satisfy both speed thresholds.
for minDownload, minUpload, tierLabel in ((25, 3, 'Underserved'), (100, 20, 'Served')):
    meetsTier = (
        (waFiberData['max_advertised_download_speed'] >= minDownload)
        & (waFiberData['max_advertised_upload_speed'] >= minUpload)
    )
    waFiberLocations = waFiberData.loc[meetsTier, 'location_id'].reset_index(drop=True)
    waLocations.loc[waLocations['location_id'].isin(waFiberLocations), 'fiberserved'] = tierLabel

In [123]:
# waLocations.loc[waLocations.location_id == 1015880838, 'fiberserved'] = 'foo'

In [164]:
# Count how many locations fall under each fiberserved label.
waLocations.fiberserved.value_counts()

fiberserved
Unserved       1771906
Served          755517
Underserved        475
Name: count, dtype: int64

In [165]:
# Preview the availability rows that were kept as fiber records (technology == 50).
waFiberData

Unnamed: 0,frn,provider_id,brand_name,location_id,technology,max_advertised_download_speed,max_advertised_upload_speed,low_latency,business_residential_code,state_usps,block_geoid,h3_res8_id
4096571,13775002,330082,Washington Broadband,1305943452,50,1000,1000,1,X,WA,530770030023074,882889ae1dfffff
4096572,16557159,460043,iFIBER Communications,1296193912,50,1000,1000,1,X,WA,530079610022001,8828d681ddfffff
4096573,16557159,460043,iFIBER Communications,1296194166,50,1000,1000,1,X,WA,530079613042012,8828d68083fffff
4096574,16557159,460043,iFIBER Communications,1296195531,50,1000,1000,1,X,WA,530079613042015,8828d680e3fffff
4096575,16557159,460043,iFIBER Communications,1296195723,50,1000,1000,1,X,WA,530079610024006,8828d681d3fffff
...,...,...,...,...,...,...,...,...,...,...,...,...
4970278,18626853,130228,CenturyLink,1367018855,50,940,940,1,X,WA,530330061002020,8828d542a9fffff
4970279,18626853,130228,CenturyLink,1367020132,50,940,940,1,X,WA,530330268021006,8828d55291fffff
4970280,18626853,130228,CenturyLink,1367041969,50,940,940,1,X,WA,530330270003002,8828d55283fffff
4970281,18626853,130228,CenturyLink,1367059337,50,940,940,1,X,WA,530330032023002,8828d54703fffff


# Unused Stuff

In [70]:
# Outer-join the location table with the unserved list on their shared columns;
# the added _merge column records which side each row came from.
unservedWaLocations = pd.merge(
    waLocations,
    wsboUnserved,
    how='outer',
    indicator=True,
)

In [76]:
# Drop rows that exist only in the location table, keeping every row that has
# a match in the unserved list.
notLeftOnly = unservedWaLocations['_merge'].ne('left_only')
unservedWaLocations = unservedWaLocations[notLeftOnly]