In [3]:
import geopandas as gp
import pandas as pd
import numpy as np
from datetime import datetime

from tobler.area_weighted import area_interpolate

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor

from elasticsearch import Elasticsearch, helpers
ES_DEV = Elasticsearch(['YOUR ES HOST'], http_auth=('ES LOGIN', 'ES PASS'), timeout=30)
%load_ext autotime



time: 249 µs (started: 2022-04-01 03:31:21 -04:00)


In [4]:
# 5 cols
NTIA_INIT_COLS = [
'numISPfiber', 'numISPother', 'numISPwireless',
'MaxConsumerDown98', 'MaxConsumerUp98',
]
# 7 cols
NTIA_END_COLS = [
'numISPfiber', 'numISPother', 'numISPwireless',
'MaxConsumerDown98', 'MaxConsumerUp98',
'speedCatNtia', 'speedSourceNtia',
]
# 11 cols
OOKLA_INIT_COLS = [
'maxDownloadMbpsOokla', 'maxUploadMbpsOokla',
'meanDownloadMbpsOokla', 'meanUploadMbpsOokla',
'medDownloadMbpsOokla', 'medUploadMbpsOokla',
'minDownloadMbpsOokla', 'minUploadMbpsOokla',
'latencyOokla',
'numDeviceOokla', 'numTestOokla',
]
# 13 cols
OOKLA_END_COLS = [
'maxDownloadMbpsOokla', 'maxUploadMbpsOokla',
'meanDownloadMbpsOokla', 'meanUploadMbpsOokla',
'medDownloadMbpsOokla', 'medUploadMbpsOokla',
'minDownloadMbpsOokla', 'minUploadMbpsOokla',
'latencyOokla',
'numDeviceOokla', 'numTestOokla',
'speedCatOokla', 'speedSourceOokla',
]
# 8 cols: max, min, device, test
MLAB_INIT_COLS = [
'maxDownloadMbpsMlab', 'maxUploadMbpsMlab',
'minDownloadMbpsMlab', 'minUploadMbpsMlab',
'numDeviceDownloadMlab', 'numDeviceUploadMlab',
'numTestDownloadMlab', 'numTestUploadMlab',
]
# 14 cols
MLAB_MID_COLS = [
'maxDownloadMbpsMlab', 'maxUploadMbpsMlab',
'minDownloadMbpsMlab', 'minUploadMbpsMlab',
'numDeviceDownloadMlab', 'numDeviceUploadMlab',
'numTestDownloadMlab', 'numTestUploadMlab',
'meanDownloadMbpsMlab', 'meanUploadMbpsMlab',
'medDownloadMbpsMlab', 'medUploadMbpsMlab',
'latencyMlab', 'lossrateMlab',
] 
# 16 cols
MLAB_END_COLS = [
'maxDownloadMbpsMlab', 'maxUploadMbpsMlab',
'minDownloadMbpsMlab', 'minUploadMbpsMlab',
'numDeviceDownloadMlab', 'numDeviceUploadMlab',
'numTestDownloadMlab', 'numTestUploadMlab',
'meanDownloadMbpsMlab', 'meanUploadMbpsMlab',
'medDownloadMbpsMlab', 'medUploadMbpsMlab',
'latencyMlab', 'lossrateMlab',
'speedCatMlab','speedSourceMlab',
]

# 49 COLS
MLAB_PREDICTION_COLS = [
    'CMC', 'Education', 'Health', 
    'POP2019', 'Public Admin', 'age65overper', 'asianper', 'bachelorper', 
    'blackper', 'hh2020', 'hu2020', 'landareaSqmi', 'lengthMile', 
    'maxadownFiber', 'maxadownOther', 'maxadownWireless', 'maxadupFiber', 'maxadupOther', 'maxadupWireless',
    'mhincome', 'nativeper', 'nocomputerper_ct', 'nointernetper', 'nointernetper_ct', 
    'numISPcomm', 'numISPresi', 'num_household', 'num_household_ct', 'num_housingunit', 'otherraceper', 
    'parcelNumAgri', 'parcelNumCommer', 'parcelNumInfra', 'parcelNumResi', 
    'parcelNumRem', 'parcelNumValid', 'parcelNumTotal',
    'parcelBuildingCount', 'parcelBuildingFootprint',
    "cafiiLocation", 'pop2020', 'povertybelow15', 'povertybelow15_ct', 
    'povertybelow20_ct', 'povertyper', 'povertyper_ct', 
    'rdofLocation', 'rdofReserve', 'whiteper'] 


SPEED_TEST_INDEX_BASE_COLS = ['GEOID', 'statefips']

QUARTER_PREFIX_COLS = set(NTIA_END_COLS + OOKLA_END_COLS + MLAB_END_COLS + ['speedRankReadyRaw'])


CBG_median_groups = [
    'CMC', 'MaxConsumerDown98', 'MaxConsumerUp98', 
    'age65overper', 'asianper', 'bachelorper', 'blackper', 
    'cafiiLocation', 
    'latencyOokla', 'maxDownloadMbpsOokla', 'maxUploadMbpsOokla', 
    'maxadownFiber', 'maxadownOther', 'maxadownWireless', 'maxadupFiber', 
    'maxadupOther', 'maxadupWireless', 'meanDownloadMbpsOokla', 'meanUploadMbpsOokla', 
    'medDownloadMbpsOokla', 'medUploadMbpsOokla', 'mhincome', 
    'minDownloadMbpsOokla', 'minUploadMbpsOokla', 'nativeper', 
    'nocomputerper_ct', 'nointernetper', 'nointernetper_ct', 
    'otherraceper', 'povertybelow15', 'povertybelow15_ct', 'povertybelow20_ct', 'povertyper', 'povertyper_ct', 
    'rdofLocation', 'rdofReserve', 'whiteper']

CBG_sum_groups = [
    'Education', 'Health', 'POP2019', 'Public Admin', 'hh2020', 'hu2020', 
    'landareaSqmi', 'lengthMile', 'numDeviceOokla', 
    'numISPcomm', 'numISPfiber', 'numISPother', 'numISPresi', 'numISPwireless', 
    'numTestOokla', 'num_household', 'num_household_ct', 'num_housingunit', 
    'parcelBuildingCount', 'parcelBuildingFootprint', 'parcelNumAgri', 
    'parcelNumCommer', 'parcelNumInfra', 'parcelNumRem', 'parcelNumResi', 
    'parcelNumTotal', 'parcelNumValid', 'pop2020']


time: 2.05 ms (started: 2022-04-01 03:31:24 -04:00)


In [5]:
SF52 = { '01': 'AL', '02': 'AK', '04': 'AZ', '05': 'AR', 
                      '06': 'CA', '08': 'CO', '09': 'CT', '10': 'DE', 
                      '11': 'DC', '12': 'FL', '13': 'GA', '15': 'HI', 
                      '16': 'ID', '17': 'IL', '18': 'IN', '19': 'IA', 
                      '20': 'KS', '21': 'KY', '22': 'LA', '23': 'ME', 
                      '24': 'MD', '25': 'MA', '26': 'MI', '27': 'MN', 
                      '28': 'MS', '29': 'MO', '30': 'MT', '31': 'NE', 
                      '32': 'NV', '33': 'NH', '34': 'NJ', '35': 'NM', 
                      '36': 'NY', '37': 'NC', '38': 'ND', '39': 'OH', 
                      '40': 'OK', '41': 'OR', '42': 'PA', '44': 'RI', 
                      '45': 'SC', '46': 'SD', '47': 'TN', '48': 'TX', 
                      '49': 'UT', '50': 'VT', '51': 'VA', '53': 'WA', 
                      '54': 'WV', '55': 'WI', '56': 'WY', '72': 'PR'}

SF52R = {v: k for (k,v) in SF52.items()}

STATE_LENGTH, COUNTY_LENGTH, CT_LENGTH, CBG_LENGTH, CB_LENGTH = 2, 5, 11, 12, 15
CRS_TIGER, CRS_OOKLA, CRS_COORDS = 4269, 4326, 3857
# 3857: WGS 84 / Pseudo-Mercator -- Spherical Mercator, Google Maps, OpenStreetMap, Bing, ArcGIS, ESRI


time: 898 µs (started: 2022-04-01 03:31:26 -04:00)


### OVERVIEW: for each quarter:
- HAVE (training dataset): 12% CBG
- NEED (prediction dataset): 88% CBG

### Get raw Mlab data from BigQuery
- query bigquery @ project: https://console.cloud.google.com/bigquery?project=measurement-lab
    - INSIDE bigquery-sql folder: find speeds, and numDevice, numTest separately because numDevice/numTest SQL is more simple (less processing steps)
    - Afterward, merge the Speed dataframe with the numDeviceNumTest dataframe.
- NTIA bigquery: https://console.cloud.google.com/bigquery?project=measurement-lab&ws=!1m5!1m4!1m3!1smeasurement-lab!2sbquxjob_4b16ab25_17c9bd44f76!3sUS&j=bq:US:bquxjob_4b16ab25_17c9bd44f76&page=queryresults
- using nhatn1507@gmail acc (which subscribed to MLAB mailing list) for free
- save result to Google Drive, download to and read files from this VM

### Time-series origin: 2021Q1

### Impute CBG (need 100%) from CB 

In [124]:
# IMPORTANT: QUARTER must be a quarter at or after '2021Q1'
# QUARTER = '2021Q1' # ....Change to 2021Q3, or 2021Q4, etc. on following runs
QUARTER = '2021Q4'
QUARTER

'2021Q4'

time: 1.62 ms (started: 2022-04-01 07:07:33 -04:00)


In [125]:
all_cbg = []
for sf in SF52:
    index_name = f'bossdata{sf}'
    abbrv = SF52[sf]
    
    # PART A - CENSUS BLOCK LEVEL
    # 100% filled
    CB_ookla_ntia = pd.read_csv(f"ookla_ntia/{QUARTER}_CB_{sf}.csv")
    # fill 100%
    CB_mlab_pred_df = pd.read_csv(f'Elasticsearch/mlab_prediction_{index_name}.csv')
    CB_mlab_pred_df['cafiiLocation'].fillna(0, inplace=True)
    for col in set(CB_mlab_pred_df):
        CB_mlab_pred_df[col].fillna(CB_mlab_pred_df[col].median(), inplace = True)
    # NOTE: both CB_ookla_ntia and CB_mlab_pred_df have GEOID col with int64 type     
    CB_df = CB_ookla_ntia.merge(CB_mlab_pred_df, on='GEOID', how='inner')
    # print(CB_ookla_ntia.shape, CB_mlab_pred_df.shape, CB_df.shape, CB_df.isnull().sum().sum())
    
    if CB_df.isnull().sum().sum() > 0:
        print("ALERT 1: contain nulls")
    if len(CB_df.select_dtypes('number')) != len(CB_df):
        print("ALERT 2: not all columns are numeric type")
        
    CB_df.GEOID = CB_df.GEOID.astype(str).str.zfill(CB_LENGTH)
    CB_df['GEOID_cbg'] = CB_df.GEOID.str[:CBG_LENGTH] 
    CBG_count_before = len(CB_df.GEOID_cbg.unique())
    
    # PART B - CENSUS BLOCK GROUP LEVEL
    CBG_df_medians = CB_df[CBG_median_groups + ['GEOID_cbg']].groupby('GEOID_cbg').median()
    CBG_df_sums = CB_df[CBG_sum_groups + ['GEOID_cbg']].groupby('GEOID_cbg').sum()
    
    if ((not (len(CBG_df_medians) == len(CBG_df_sums) == len(set(CB_df.GEOID_cbg)))) or
        (len(set(CBG_df_medians.index) - set(CBG_df_sums.index)) + len(set(CBG_df_sums.index) - set(CBG_df_medians.index)) != 0)
       ):
        print("ALERT 3: diff groupby GEOID_cbg")   

    CBG_df = CBG_df_medians.join(CBG_df_sums, how='left')
    del CB_df, CB_ookla_ntia, CB_mlab_pred_df, CBG_df_medians, CBG_df_sums
    
    # UPDATE CBG_df with data from ookla CBG
    ookla_cbg = pd.read_csv(f"ookla_state_tiles/{QUARTER}_CBG_{sf}.csv")
    ookla_cbg.GEOID_cbg = ookla_cbg.GEOID_cbg.astype(str).str.zfill(CBG_LENGTH)
    ookla_cbg.set_index('GEOID_cbg', inplace=True)
    
    # speedSourceOokla is not numeric
    if set(ookla_cbg) - set(CBG_df) != {'speedSourceOokla'}:
        print("ALERT 4: ookla_cbg should not contain more cols than CBG_df!")   
    
    # UPDATE with ookla actual CBG speeds
    CBG_df.update(ookla_cbg)
    if CBG_df.isnull().sum().sum() != 0:
        print(f"ALERT 5: {CBG_df.isnull().sum().sum()=}", )
        
    CBG_count_after = len(CBG_df)
    if CBG_count_before != CBG_count_after:
        print("ALERT 6: CBG before and after should be the same: ", CBG_count_before, CBG_count_after)
    
    all_cbg.append(CBG_df)
    
all_cbg = pd.concat(all_cbg, ignore_index=False)
# # NUMBER OF CBG in bossdata* = 220,334 !
QUARTER, all_cbg.shape, all_cbg.isnull().sum().sum()  # ALWAYS = ((220334, 65), 0)

('2021Q4', (220334, 65), 0)

time: 1min 40s (started: 2022-04-01 07:07:44 -04:00)


In [None]:
#### RUN ONCE: CBG Tiger geom: add lat and lon and geometry columns to all_cbg
TIGER_CBG_52 = []
for sf in SF52:
    # TIGER data has CRS = <Geographic 2D CRS: EPSG:4269> : 
    census_year = 2019
    tiger_cbg = gp.read_file(f"https://www2.census.gov/geo/tiger/TIGER{census_year}/BG/tl_{census_year}_{sf}_bg.zip"
                            )[['GEOID', 'geometry']].to_crs(CRS_COORDS)
    # centroid could be inaccurate, but the GPS coords are only used as proxy in the RF model
    tiger_cbg['lat'] = tiger_cbg['geometry'].centroid.x
    tiger_cbg['lon'] = tiger_cbg['geometry'].centroid.y
    tiger_cbg['GEOID_cbg'] = tiger_cbg['GEOID'].astype(str).str.zfill(CBG_LENGTH)
    # tiger_cbg = pd.DataFrame(tiger_cbg.drop(columns=['geometry', 'GEOID']))
    TIGER_CBG_52.append(tiger_cbg[['GEOID_cbg', 'lat', 'lon', 'geometry']])

TIGER_CBG_52 = pd.concat(TIGER_CBG_52, ignore_index=True)
%store TIGER_CBG_52
print('NUMBER OF CBG from TIGER CENSUS2019: ', TIGER_CBG_52.shape) # (220333, 4)


In [126]:
all_cbg = TIGER_CBG_52.merge(all_cbg, on='GEOID_cbg', how='right')

# Median fills for lat, lon
for col in sorted(set(all_cbg)):
    if col != 'geometry' and all_cbg[col].isnull().sum():
        all_cbg[col].fillna(all_cbg[col].median(), inplace=True)
        
s = all_cbg.isnull().sum()
if s.sum(): 
    # 73 CBG have no geometry, but it's ok 
    print("ALERT: all_cbg has nulls", s[s>0])

ALERT: all_cbg has nulls geometry    73
dtype: int64
time: 466 ms (started: 2022-04-01 07:09:25 -04:00)


In [100]:
# NOTE: bossdata* has 220,334 CBG, while 2019TIGER has 220333 CBG
# TODO later: need a resolution for CBG <> CB , bossdata <> TIGER2019
len(set(TIGER_CBG_52.GEOID_cbg) - set(all_cbg.index)), len(set(all_cbg.index) - set(TIGER_CBG_52.GEOID_cbg))

(220333, 220334)

time: 211 ms (started: 2022-04-01 05:54:15 -04:00)


In [10]:
## Combine speed + device dataframes: FloorCap, and remove invalid latency and lossrate
def merge_mlab_dfs(speed_df, device_df, idCol):
    new_mlab_mappings = {
        'download_MIN'   :  'minDownloadMbpsMlab',
        'download_MED'   :  'medDownloadMbpsMlab',
        'download_AVG'   :  'meanDownloadMbpsMlab',
        'download_MAX'   :  'maxDownloadMbpsMlab',
        'upload_MIN'     :  'minUploadMbpsMlab',
        'upload_MED'     :  'medUploadMbpsMlab',
        'upload_AVG'     :  'meanUploadMbpsMlab',
        'upload_MAX'     :  'maxUploadMbpsMlab',
        'num_test_down'  :  'numTestDownloadMlab',
        'num_device_down':  'numDeviceDownloadMlab',
        'num_test_up'    :  'numTestUploadMlab',
        'num_device_up'  :  'numDeviceUploadMlab',
        'latency'        :  'latencyMlab',
        'lossrate'       :  'lossrateMlab', 
    }    
    merged_df = pd.merge(speed_df, device_df, how='left', on=idCol).rename(columns = new_mlab_mappings)
    # Change of unit to percentage
    merged_df['lossrateMlab'] = merged_df['lossrateMlab'] * 100 
    # Fill missing values, mostly numTest/device
    for col in new_mlab_mappings.values():
        merged_df[col].fillna(merged_df[col].median(), inplace=True)
        
    # CLEAN DATA: remove rows with invalid latency or lossrate
    
    # 0.5 ms < latency <= 300ms
    merged_df = merged_df[(merged_df.latencyMlab > 0.5) & (merged_df.latencyMlab <= 300)]
    # All test instances where a unit’s packet loss exceeded 10% were removed
    merged_df = merged_df[(merged_df.lossrateMlab <= 10)]
    
    # Quantile-based Flooring and Capping
    for col in new_mlab_mappings.values():
        floor_rate = 0.01
        # set higher floor_rate for these values (due to non-uniform distributions of tests and latency)
        if col in ['numTestDownloadMlab', 'numDeviceDownloadMlab', 'numTestUploadMlab',
                   'numDeviceUploadMlab', 'latencyMlab',]: 
            floor_rate = 0.05
        cap_rate = 1 - floor_rate
        floor_value = merged_df[col].quantile(floor_rate) 
        cap_value = merged_df[col].quantile(cap_rate)
        
        # # Show the number of rows floored/capped
        # print(merged_df.shape, merged_df[merged_df[col] < floor_value].shape, merged_df[merged_df[col] > cap_value].shape)
        # Skewness value explains the extent to which the data is normally distributed. 
        # print("*** COL = ", col, ", initial skew: ", merged_df[col].skew())
        merged_df[col] = np.where(
            merged_df[col] < floor_value, floor_value, merged_df[col])
        merged_df[col] = np.where(
            merged_df[col] > cap_value, cap_value, merged_df[col])
        # print("Improved skew: ", merged_df[col].skew())
    
    s = merged_df.isnull().sum()
    if s.sum(): print(s[s>0])
    return merged_df


time: 1.3 ms (started: 2022-04-01 03:40:25 -04:00)


In [127]:
# COUNTY
county_speed = pd.read_csv(f'mlab_speeds/{QUARTER}-county-speed.csv')
county_speed = county_speed.rename(columns = {'GEOID' : 'GEOID_c'})
county_speed['GEOID_c'] = county_speed['GEOID_c'].astype(str).str.zfill(COUNTY_LENGTH)
county_device = pd.read_csv(f'mlab_speeds/{QUARTER}-county-device.csv')
county_device = county_device.rename(columns = {'GEOID' : 'GEOID_c'})
county_device['GEOID_c'] = county_device['GEOID_c'].astype(str).str.zfill(COUNTY_LENGTH)
mlab_county = merge_mlab_dfs(county_speed, county_device, 'GEOID_c') # MERGE 
# add TIGER geom to mlab_county
tiger_county = gp.read_file('https://www2.census.gov/geo/tiger/TIGER2019/COUNTY/tl_2019_us_county.zip')[['GEOID', 'geometry']].to_crs(CRS_COORDS)
tiger_county['GEOID_c'] = tiger_county['GEOID'].astype(str).str.zfill(COUNTY_LENGTH)
tiger_county = tiger_county[['GEOID_c', 'geometry']]
mlab_county = tiger_county.merge(mlab_county, on='GEOID_c', how='left')

del tiger_county, county_speed, county_device

# There are a few TIGER counties without Mlab speeds: impute
for col in set(mlab_county) - set(['geometry', 'GEOID_c']):
    mlab_county[col].fillna(mlab_county[col].median(), inplace=True)

s = mlab_county.isnull().sum()
if s.sum(): 
    print(s[s>0]) 
if set(all_cbg).intersection(set(mlab_county)) != {'geometry'}:
    print("ALERT: all_cbg should not have any mlab speed cols")
if mlab_county.crs != all_cbg.crs:
    print("ALERT: Check crs values")


time: 9.78 s (started: 2022-04-01 07:09:25 -04:00)


In [128]:
### Area-interpolate COUNTY >> CBG (mlab #test #devices)
device_cols = ['numTestDownloadMlab', 'numDeviceDownloadMlab', 
                               'numTestUploadMlab', 'numDeviceUploadMlab', ]
# 2.5min: AREA INTERPOLATION #device and #test
county_to_cbg_area_interpolation = area_interpolate(
    source_df = mlab_county,  
    target_df = all_cbg,  
    extensive_variables = device_cols,
    # categorical_variables = [],
)

time: 2min 26s (started: 2022-04-01 07:09:35 -04:00)


In [129]:
all_cbg_NEW = pd.concat([all_cbg,
          county_to_cbg_area_interpolation[device_cols].round(0).astype(int)],
          axis = 1)

# Copy Mlab max/min speeds
minmax_cols = [ 'maxDownloadMbpsMlab', 'maxUploadMbpsMlab', 
               'minDownloadMbpsMlab', 'minUploadMbpsMlab',]

all_cbg_NEW['GEOID_c'] = all_cbg_NEW.GEOID_cbg.str[:COUNTY_LENGTH]
all_cbg_NEW = all_cbg_NEW.merge(mlab_county[minmax_cols + ['GEOID_c']], 
                                      how='left', on='GEOID_c').drop(columns='geometry')

# TURN GEOID cols to INTEGER type in order for RF model to work
all_cbg_NEW[['GEOID_cbg', 'GEOID_c']] = all_cbg_NEW[['GEOID_cbg', 'GEOID_c']].astype(int)

for col in minmax_cols:
    all_cbg_NEW[col].fillna(all_cbg_NEW[col].median(), inplace=True)

s = all_cbg_NEW.isnull().sum()
if s.sum(): print(f"ALERT: {s[s>0]}")
# always = (220334, 77) QoQ
all_cbg_NEW.shape # same number of CBG, same number of columns: (220334, 77)

(220334, 77)

time: 487 ms (started: 2022-04-01 07:12:02 -04:00)


### Mlab CBG = Mlab training data

In [130]:
cbg_speed = pd.read_csv(f'mlab_speeds/{QUARTER}-cbg-speed.csv')
cbg_speed = cbg_speed.rename(columns = {'GEOID' : 'GEOID_cbg'})
cbg_speed['GEOID_cbg'] = cbg_speed['GEOID_cbg'].astype(str).str.zfill(CBG_LENGTH)
cbg_device = pd.read_csv(f'mlab_speeds/{QUARTER}-cbg-device.csv')
cbg_device = cbg_device.rename(columns = {'GEOID' : 'GEOID_cbg'})
cbg_device['GEOID_cbg'] = cbg_device['GEOID_cbg'].astype(str).str.zfill(CBG_LENGTH)
# MERGE 
mlab_cbg_combined = merge_mlab_dfs(cbg_speed, cbg_device, 'GEOID_cbg')

del cbg_speed, cbg_device

mlab_cbg_combined_ids = set(mlab_cbg_combined.GEOID_cbg)
# RF model only works with Numeric type
mlab_cbg_combined['GEOID_cbg'] = mlab_cbg_combined['GEOID_cbg'].astype(int)

s = mlab_cbg_combined.isnull().sum()
if s.sum():
    print(f"ALERT mlab_cbg_combined has nulls {s[s>0]}")
if set(mlab_cbg_combined) - set(mlab_cbg_combined.select_dtypes('number')):
    print(f"ALERT mlab_cbg_combined should contain only numberic cols")

time: 151 ms (started: 2022-04-01 07:12:02 -04:00)


In [131]:
mlab_free_df = all_cbg_NEW.drop(MLAB_END_COLS, axis=1, errors='ignore')
if set(mlab_free_df.select_dtypes('object')):
    print("ALERT: mlab_free_df has non-numeric cols", set(mlab_free_df.select_dtypes('object')))
    
## INNER JOIN: even if there are cbg with mlab data, remove them if no ES/demographic/ookla data available
cbg_training_df = mlab_cbg_combined.merge(mlab_free_df, how='inner', on='GEOID_cbg') 
del mlab_free_df

s = cbg_training_df.isnull().sum()
if s.sum():
    print(f"ALERT cbg_training_df has nulls {s[s>0]}")

# 2021Q1: (26596, 83) (220334, 77) ; 2021Q2: (26973, 83) (220334, 77)
# 2021Q3: (26578, 83) (220334, 77) ; 2021Q4: (26742, 83) (220334, 77)
print("SIZE OF training v.s. prediction set ", cbg_training_df.shape, all_cbg_NEW.shape)
 

SIZE OF training v.s. prediction set  (26742, 83) (220334, 77)
time: 137 ms (started: 2022-04-01 07:12:02 -04:00)


In [132]:
# MORE CHECKS
need_to_predict_cols = {'meanDownloadMbpsMlab','meanUploadMbpsMlab',
                     'medDownloadMbpsMlab','medUploadMbpsMlab', 
                        'latencyMlab', 'lossrateMlab'}
if set(cbg_training_df) - set(all_cbg_NEW) != need_to_predict_cols:
    print("ALERT: cbg_training_df is missing cols")
if cbg_training_df.shape[1] != cbg_training_df.select_dtypes('number').shape[1]:
    print("ALERT: cbg_training_df has non-numeric cols")   
    
# CONFIRM there are no views lurking: we don't want to accidentally update all_cbg_NEW (PREDICTION) with cbg_training_df (TRAINING) later. 
print(cbg_training_df._is_view, cbg_training_df._is_copy) # (False, None)
print(all_cbg_NEW._is_view, all_cbg_NEW._is_copy) # (False, None)

False None
False None
time: 12.5 ms (started: 2022-04-01 07:12:03 -04:00)


In [133]:
CBG_TRAINING_COLS = ['CMC', 'Education', 'GEOID_c', 'GEOID_cbg', 'Health', 'MaxConsumerDown98', 'MaxConsumerUp98', 'POP2019', 'Public Admin', 'age65overper', 'asianper', 'bachelorper', 'blackper', 'cafiiLocation', 'hh2020', 'hu2020', 'landareaSqmi', 'lat', 'latencyMlab', 'latencyOokla', 'lengthMile', 'lon', 'lossrateMlab', 'maxDownloadMbpsMlab', 'maxDownloadMbpsOokla', 'maxUploadMbpsMlab', 'maxUploadMbpsOokla', 'maxadownFiber', 'maxadownOther', 'maxadownWireless', 'maxadupFiber', 'maxadupOther', 'maxadupWireless', 'meanDownloadMbpsMlab', 'meanDownloadMbpsOokla', 'meanUploadMbpsMlab', 'meanUploadMbpsOokla', 'medDownloadMbpsMlab', 'medDownloadMbpsOokla', 'medUploadMbpsMlab', 'medUploadMbpsOokla', 'mhincome', 'minDownloadMbpsMlab', 'minDownloadMbpsOokla', 'minUploadMbpsMlab', 'minUploadMbpsOokla', 'nativeper', 'nocomputerper_ct', 'nointernetper', 'nointernetper_ct', 'numDeviceDownloadMlab', 'numDeviceOokla', 'numDeviceUploadMlab', 'numISPcomm', 'numISPfiber', 'numISPother', 'numISPresi', 'numISPwireless', 'numTestDownloadMlab', 'numTestOokla', 'numTestUploadMlab', 'num_household', 'num_household_ct', 'num_housingunit', 'otherraceper', 'parcelBuildingCount', 'parcelBuildingFootprint', 'parcelNumAgri', 'parcelNumCommer', 'parcelNumInfra', 'parcelNumRem', 'parcelNumResi', 'parcelNumTotal', 'parcelNumValid', 'pop2020', 'povertybelow15', 'povertybelow15_ct', 'povertybelow20_ct', 'povertyper', 'povertyper_ct', 'rdofLocation', 'rdofReserve', 'whiteper']
len(CBG_TRAINING_COLS) # 83

83

time: 2.11 ms (started: 2022-04-01 07:12:03 -04:00)


In [134]:
# Save current quarter training_df
if set(cbg_training_df) != set(CBG_TRAINING_COLS):
    print("ALERT: CHECK cols of cbg_training_df")
    
cbg_training_df.to_csv(f'mlab_speeds/{QUARTER}-TRAINING-CBG.csv', index=False)

time: 996 ms (started: 2022-04-01 07:12:03 -04:00)


## ROLLING QUARTER: addition

In [135]:
year, qrtr = QUARTER.split('Q')
year, qrtr = int(year), int(qrtr)

all_prev_quarters = []
if year >= 2021 and qrtr in [1,2,3,4]:
    # all four quarters of past years
    for y in range(2021, year):
        for q in range(1, 5):
            all_prev_quarters.append(str(y)+'Q'+str(q))
    # quarters of current year  
    for q in range(1, qrtr):
        all_prev_quarters.append(str(year)+'Q'+str(q)) 
        
print(QUARTER, all_prev_quarters)

2021Q4 ['2021Q1', '2021Q2', '2021Q3']
time: 903 µs (started: 2022-04-01 07:12:04 -04:00)


In [136]:
additional_cbg_training = []
for PREV_QUARTER in all_prev_quarters:
    df = pd.read_csv(f'mlab_speeds/{PREV_QUARTER}-TRAINING-CBG.csv')
    if set(df) != set(CBG_TRAINING_COLS):
        print(f"ALERT: check cols of {PREV_QUARTER} training df")
    additional_cbg_training.append(df)

if len(additional_cbg_training):    
    additional_cbg_training = pd.concat(additional_cbg_training, ignore_index=False)
    if cbg_training_df.GEOID_cbg.dtype != additional_cbg_training.GEOID_cbg.dtype:
        print("ALERT: GEOID_cbg dtype mismatched")    
    cbg_training_df_combined = pd.concat([cbg_training_df, additional_cbg_training], ignore_index=True)
else:
    cbg_training_df_combined = cbg_training_df.copy()

time: 612 ms (started: 2022-04-01 07:12:04 -04:00)


In [137]:
# 2021Q3: ((26578, 83), (53569, 83), (80147, 83))
# 2021Q4: ((26742, 83), (80147, 83), (106889, 83))
cbg_training_df.shape, additional_cbg_training.shape, cbg_training_df_combined.shape

((26742, 83), (80147, 83), (106889, 83))

time: 1.49 ms (started: 2022-04-01 07:12:04 -04:00)


### PREDICTIONS with RANDOM FOREST models
- NOTE: For meanSpeed and medSpeed: The Random Forest MLAB models's predictions are CONSERVATIVE (i.e. most PREDICTED VALUES concentrate (have low skew) and are OPTIMISTIC (have higher mean and medians)



In [49]:
def evaluate_reg_model(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    # print('Model Performance', model)
    print('Average Error: {:0.4f} Mbps.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    # print('Mean Absolute Error MAE:', metrics.mean_absolute_error(test_labels, predictions))  
    model_test_pred = pd.DataFrame({'Observed': test_labels.flatten(), 'Predicted': predictions.flatten()})
    # print(model_test_pred.sample(frac=0.1, random_state=1).head(6))
    return accuracy

def show_feature_importances(model, predictors):
    # Get numerical feature importances
    importances = list(model.feature_importances_)
    # List of tuples with variable and importance
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(predictors, importances) if importance > 0.1]
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
    # The feature and importances 
    for pair in feature_importances:
        print('Variable: {} Importance: {}'.format(*pair))
        
def build_rf_model(training_df, exclude_attributes, response):
    scaler =  StandardScaler()
    predictors = sorted(list(set(training_df) - set(exclude_attributes)))

    X = training_df[predictors].values
    y = training_df[response].values 
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    # Build the model: SLOW
    rf_model = RandomForestRegressor(n_estimators=200, random_state = 0) 
    
    # Normalization after train_test_split
    X_train_norm = scaler.fit_transform(X_train)
    rf_model.fit(X_train_norm, y_train)

    X_test_norm = scaler.transform(X_test) 
    # evaluation PREDICTIONS on test data
    evaluate_reg_model(rf_model, X_test_norm, y_test)
    # Feature importances
    show_feature_importances(rf_model, predictors)

    return rf_model, scaler, predictors
        

time: 1.31 ms (started: 2022-04-01 04:40:21 -04:00)


### Build models: latency, lossrate, meanUp, meanDown

In [None]:
# 50 min to complete building 6 models
all_model_results = []

for cur_response in ['latencyMlab', 'lossrateMlab', 
        'meanUploadMbpsMlab', 'meanDownloadMbpsMlab', 
         'medDownloadMbpsMlab', 'medUploadMbpsMlab',]:
    
    exclude_attributes = {
        'meanDownloadMbpsMlab', 'meanUploadMbpsMlab',
        'medDownloadMbpsMlab', 'medUploadMbpsMlab', 'latencyMlab', 'lossrateMlab', } 
    if cur_response in ['medDownloadMbpsMlab', 'medUploadMbpsMlab',]:
        # use Mlab [newly predicted] mean speeds as predictors
        exclude_attributes = {
        'medDownloadMbpsMlab', 'medUploadMbpsMlab',  'latencyMlab', 'lossrateMlab',} 
    print("Building RF model for ", cur_response)  
    # BEFORE in mlab_init: train on cbg_training_df
    # cur_model, cur_scaler, cur_predictor = build_rf_model(cbg_training_df, exclude_attributes, predictor_col)
    # NOW in mlab_rolling: train on cbg_training_df_combined
    cur_model, cur_scaler, cur_predictor = build_rf_model(cbg_training_df_combined, exclude_attributes, cur_response)
    all_model_results.append([cur_response, cur_model, cur_scaler, cur_predictor])


Building RF model for  latencyMlab


In [None]:
# 2021Q1 (init), vs 2021Q2, Q3, Q4 RESULTS = 
# 'latencyMlab',  accuracy=65%, 68%, 69%, 
#  'lossrateMlab', Average Error: 1.1758 Mbps, 1.09 Mbps, 1.07 Mbps, 
# 'meanUploadMbpsMlab', accuracy=60%, 61%, 61%, 
#  'meanDownloadMbpsMlab', accuracy=70%, 72%, 73%
# 'medDownloadMbpsMlab', accuracy=90%, 88%, 89%
#  'medUploadMbpsMlab', accuracy=80%, 80%, 80%


In [58]:
# Number of features seen during fit.
for (cur_response, cur_model, cur_scaler, cur_predictors) in all_model_results:
    print(cur_response, cur_model.n_features_in_, cur_scaler.n_features_in_, len(cur_predictor))
# latencyMlab 77 77 79
# lossrateMlab 77 77 79
# meanUploadMbpsMlab 77 77 79
# meanDownloadMbpsMlab 77 77 79
# medDownloadMbpsMlab 79 79 79
# medUploadMbpsMlab 79 79 79    

latencyMlab 77 77 79
lossrateMlab 77 77 79
meanUploadMbpsMlab 77 77 79
meanDownloadMbpsMlab 77 77 79
medDownloadMbpsMlab 79 79 79
medUploadMbpsMlab 79 79 79
time: 1.01 ms (started: 2022-04-01 05:15:57 -04:00)


## Making predictions

In [None]:
all_cbg_NEW1 = all_cbg_NEW.copy()

In [63]:
all_cbg_NEW = all_cbg_NEW1.copy()

time: 44.8 ms (started: 2022-04-01 05:25:56 -04:00)


In [None]:
for cur_response, cur_model, cur_scaler, cur_predictors in all_model_results:
    X_pred = all_cbg_NEW[cur_predictors].values
    X_pred_norm = cur_scaler.transform(X_pred)
    all_cbg_NEW[cur_response] = cur_model.predict(X_pred_norm).round(2)

    print(cur_response, "COMPARISON training vs prediction (mean, 50%)")
    print(cbg_training_df[cur_response].describe()[['mean', '50%']].values, 'v.s.', 
          all_cbg_NEW[cur_response].describe()[['mean', '50%']].values)

# 2021Q1 references: 
# latencyMlab COMPARISON training vs prediction (mean, 50%)
# [27.53841466 24.81      ] v.s. [27.32264022 26.47      ]
# lossrateMlab COMPARISON training vs prediction (mean, 50%)
# [2.23052333 1.66      ] v.s. [3.21274674 3.38      ]
# meanUploadMbpsMlab COMPARISON training vs prediction (mean, 50%)
# [29.16358677 17.66      ] v.s. [100.70669588 103.79      ]
# meanDownloadMbpsMlab COMPARISON training vs prediction (mean, 50%)
# [83.77329782 79.24      ] v.s. [144.66818875 153.47      ]
# medDownloadMbpsMlab COMPARISON training vs prediction (mean, 50%)
# [58.05855318 54.03      ] v.s. [117.51646006 129.78      ]
# medUploadMbpsMlab COMPARISON training vs prediction (mean, 50%)
# [18.55867215 10.07      ] v.s. [79.64160007 78.84      ]


In [None]:
all_cbg_NEW.shape # ALWAYS (220334, 83)

In [None]:
# CLEAN DATA: invalid values are replaced with NaN, and fill with medians later
# in most cases, when the latency and lossrate models make good predictions, then: 
# all the latencies are valid: 0.5 ms < latency <= 300ms
all_cbg_NEW.loc[(all_cbg_NEW['latencyMlab'] <= 0.5) | (all_cbg_NEW['latencyMlab'] > 300),
               ['latencyMlab']] = np.nan
# All test instances where a unit’s packet loss <= 10% 
all_cbg_NEW.loc[(all_cbg_NEW['lossrateMlab'] > 10), ['lossrateMlab']] = np.nan

# Check skew. Since, most important metrics medDown and medUp have good skew,
# there's no need for quantile-based capping/flooring
for col in ['medDownloadMbpsMlab', 'medUploadMbpsMlab']:
    if abs(all_cbg_NEW[col].skew()) > 3:
        print(f"ALERT: {col} has high skew: {all_cbg_NEW[col].skew()}")

# Check null
s = all_cbg_NEW.isnull().sum()
if s.sum(): print(s[s>0]) 

In [None]:
# Create speedSourceMlab
all_cbg_NEW['speedSourceMlab'] = 'bigQueryCountyRFregressionAtCBG' # 88%
cbg_training_df['speedSourceMlab'] = 'bigQueryAtCBG' # 12% 

# IMPORTANT: overwrite newly predicted all_cbg_NEW values using available cbg_training_df
all_cbg_NEW.set_index("GEOID_cbg", inplace=True)
cbg_training_df.set_index("GEOID_cbg", inplace=True)
all_cbg_NEW.update(cbg_training_df)

if cbg_training_df.shape[1] - all_cbg_NEW.shape[1]:
    print("ALERT: all_cbg_NEW <> cbg_training_df has diff. columns LENGTH")
if set(cbg_training_df) - set(all_cbg_NEW):
    print("ALERT: all_cbg_NEW <> cbg_training_df has diff. columns NAMES") 


s = all_cbg_NEW.isnull().sum()
if s.sum(): 
    print(f"ALERT all_cbg_NEW has nulls, {s[s>0]}")
    
# FILL nulls, just in case -- very unlikely
for col in set(all_cbg_NEW.select_dtypes('number')):
    all_cbg_NEW[col].fillna(all_cbg_NEW[col].median(), inplace=True)

In [None]:
all_cbg_NEW.reset_index(inplace=True)
all_cbg_NEW['GEOID_cbg'] = all_cbg_NEW['GEOID_cbg'].astype(str).str.zfill(CBG_LENGTH)
path = f'mlab_speeds/{QUARTER}-COMPLETE-CBG.csv'
all_cbg_NEW.to_csv(path, index=False)
print(f"SAVED completed CBG-level speed df to {path}")

In [None]:
all_cbg_NEW.shape # after added speedSourceMlab col, always = (220334, 84) QoQ

## FINALLY, from MLAB CBG to CB

In [92]:
# REUSE cat_OOKLA_speed_conditions for speedCatMlab!
def cat_OOKLA_speed_conditions(down, up, latency):
    if down < 25 or up < 3:
        return 0
    if down < 100 or up < 20 or latency > 100: 
        return 1
    return 2
cat_speed_OOKLA_vectorize = np.vectorize(cat_OOKLA_speed_conditions)

time: 504 µs (started: 2022-04-01 05:34:49 -04:00)


In [None]:
mlab_cbg_complete = all_cbg_NEW[['GEOID_cbg', 'speedSourceMlab'] + MLAB_MID_COLS].copy(deep = True)
for sf in SF52:
# for sf in ['02', '10']:
    abbrv = SF52[sf]
    # speedCatNtia was calculated (never got modified) in gen_ookla.py
    state_cb_upload = pd.read_csv(f"ookla_ntia/{QUARTER}_CB_{sf}.csv")
    if set(state_cb_upload) != set(OOKLA_END_COLS+NTIA_END_COLS+['GEOID']):
        print("ALERT: Check column names of state_cb_upload")

    state_cb_upload['GEOID'] = state_cb_upload['GEOID'].astype(str).str.zfill(CB_LENGTH)
    state_cb_upload['GEOID_cbg'] = state_cb_upload['GEOID'].str[:CBG_LENGTH]

    # print(abbrv, sf, "Without MLAB fields: ", state_cb_upload.shape) 
    s = state_cb_upload.isnull().sum()
    if s.sum(): 
        print("ALERT: state_cb_upload has nulls ", s[s>0])
    
    state_cb_upload = state_cb_upload.merge(mlab_cbg_complete, how='left', on='GEOID_cbg').drop(columns='GEOID_cbg')

    # maybe later: ANALYSIS 
    # # - speed change (mlab/ookla) and speedRankReady change from 2021Q3 to 2021Q4
    # # - states/counties/CBG with top speeds, and top speed improvement/degrade

    # Update speedCatOokla, just in case
    state_cb_upload['speedCatOokla']= cat_speed_OOKLA_vectorize(
        state_cb_upload['medDownloadMbpsOokla'], state_cb_upload['medUploadMbpsOokla'],  state_cb_upload['latencyOokla'])
    # Generate new speedCatMlab col
    state_cb_upload['speedCatMlab']= cat_speed_OOKLA_vectorize(
        state_cb_upload['medDownloadMbpsMlab'], state_cb_upload['medUploadMbpsMlab'],  state_cb_upload['latencyMlab'])
    # Generate new speedRankReadyRaw col
    def cat_speedRank_conditions(catNtia, catMlab, catOokla):
        if catNtia == 0:                                     return 'UnservedDef'         # (0, -, -)
        if catMlab == 0 and catOokla == 0:                   return 'UnservedLikely'      # (1+, 0, 0)
        if catMlab == 0 or catOokla == 0:                    return 'UnservedArguably'    # (1+, 0, 1+) (1+, 1+, 0)
        if catNtia == 2 and catMlab == 2 and catOokla == 2:  return 'Served'              # (2, 2, 2)
        if catNtia == 1:                                     return 'UnderservedDef'      # (1,1+,1+) 
        if catNtia == 2 and catMlab == 1 and catOokla == 1:  return 'UnderservedLikely'   # (2,1,1)
        if catNtia == 2 and catMlab + catOokla == 3:         return 'UnderservedArguably' # (2,1,2) (2,2,1)
        print("ALERT: SHOULD NOT REACH HERE!")
        return        
    cat_speedRank_vectorize = np.vectorize(cat_speedRank_conditions)
    state_cb_upload['speedRankReadyRaw']= cat_speedRank_vectorize(state_cb_upload.speedCatNtia, state_cb_upload.speedCatMlab, state_cb_upload.speedCatOokla)

    # numTest v.s. numDevice check
    # # RARE CASE: when devices > tests, maybe due to (1) Ookla's bad raw data, or (2) our Quantile-based Flooring and Capping process independently created this scenario
    # # FIX: set devices = tests
    state_cb_upload['numDeviceUploadMlab'] = np.where(state_cb_upload['numDeviceUploadMlab'] > state_cb_upload['numTestUploadMlab'], 
            state_cb_upload['numTestUploadMlab'], state_cb_upload['numDeviceUploadMlab'])    
    state_cb_upload['numDeviceDownloadMlab'] = np.where(state_cb_upload['numDeviceDownloadMlab'] > state_cb_upload['numTestDownloadMlab'], 
            state_cb_upload['numTestDownloadMlab'], state_cb_upload['numDeviceDownloadMlab'])    
        
    # print(abbrv, sf, "With MLAB fields + speedRankReadyRaw: ", state_cb_upload.shape) 
    s = state_cb_upload.isnull().sum()
    if s.sum(): 
        print("ALERT: state_cb_upload has nulls ", s[s>0])
    
    if set(state_cb_upload) != set(['GEOID', 'speedRankReadyRaw'] + 
                                   NTIA_END_COLS + OOKLA_END_COLS + MLAB_END_COLS):
        print("ALERT: check columns in the final state_cb_upload")
        
    path = f'speed_ready_upload/{QUARTER}_{sf}.csv'
    state_cb_upload.to_csv(path, index=False)
    print(abbrv, sf, f"Saved {state_cb_upload.shape=} to {path}")


In [None]:
# RESULT CHECK
s = """AL 01 Saved state_cb_upload.shape=(252266, 38) to speed_ready_upload/2021Q1_01.csv
AK 02 Saved state_cb_upload.shape=(45292, 38) to speed_ready_upload/2021Q1_02.csv
AZ 04 Saved state_cb_upload.shape=(241666, 38) to speed_ready_upload/2021Q1_04.csv
AR 05 Saved state_cb_upload.shape=(186211, 38) to speed_ready_upload/2021Q1_05.csv
CA 06 Saved state_cb_upload.shape=(710145, 38) to speed_ready_upload/2021Q1_06.csv
CO 08 Saved state_cb_upload.shape=(201062, 38) to speed_ready_upload/2021Q1_08.csv
CT 09 Saved state_cb_upload.shape=(67578, 38) to speed_ready_upload/2021Q1_09.csv
DE 10 Saved state_cb_upload.shape=(24115, 38) to speed_ready_upload/2021Q1_10.csv
DC 11 Saved state_cb_upload.shape=(6507, 38) to speed_ready_upload/2021Q1_11.csv
FL 12 Saved state_cb_upload.shape=(484481, 38) to speed_ready_upload/2021Q1_12.csv
GA 13 Saved state_cb_upload.shape=(291086, 38) to speed_ready_upload/2021Q1_13.csv
HI 15 Saved state_cb_upload.shape=(25016, 38) to speed_ready_upload/2021Q1_15.csv
ID 16 Saved state_cb_upload.shape=(149842, 38) to speed_ready_upload/2021Q1_16.csv
IL 17 Saved state_cb_upload.shape=(451554, 38) to speed_ready_upload/2021Q1_17.csv
IN 18 Saved state_cb_upload.shape=(267071, 38) to speed_ready_upload/2021Q1_18.csv
IA 19 Saved state_cb_upload.shape=(216007, 38) to speed_ready_upload/2021Q1_19.csv
KS 20 Saved state_cb_upload.shape=(238600, 38) to speed_ready_upload/2021Q1_20.csv
KY 21 Saved state_cb_upload.shape=(161672, 38) to speed_ready_upload/2021Q1_21.csv
LA 22 Saved state_cb_upload.shape=(204447, 38) to speed_ready_upload/2021Q1_22.csv
ME 23 Saved state_cb_upload.shape=(69518, 38) to speed_ready_upload/2021Q1_23.csv
MD 24 Saved state_cb_upload.shape=(145247, 38) to speed_ready_upload/2021Q1_24.csv
MA 25 Saved state_cb_upload.shape=(157508, 38) to speed_ready_upload/2021Q1_25.csv
MI 26 Saved state_cb_upload.shape=(329885, 38) to speed_ready_upload/2021Q1_26.csv
MN 27 Saved state_cb_upload.shape=(259777, 38) to speed_ready_upload/2021Q1_27.csv
MS 28 Saved state_cb_upload.shape=(171778, 38) to speed_ready_upload/2021Q1_28.csv
MO 29 Saved state_cb_upload.shape=(343565, 38) to speed_ready_upload/2021Q1_29.csv
MT 30 Saved state_cb_upload.shape=(132288, 38) to speed_ready_upload/2021Q1_30.csv
NE 31 Saved state_cb_upload.shape=(193352, 38) to speed_ready_upload/2021Q1_31.csv
NV 32 Saved state_cb_upload.shape=(84538, 38) to speed_ready_upload/2021Q1_32.csv
NH 33 Saved state_cb_upload.shape=(48837, 38) to speed_ready_upload/2021Q1_33.csv
NJ 34 Saved state_cb_upload.shape=(169588, 38) to speed_ready_upload/2021Q1_34.csv
NM 35 Saved state_cb_upload.shape=(168609, 38) to speed_ready_upload/2021Q1_35.csv
NY 36 Saved state_cb_upload.shape=(350169, 38) to speed_ready_upload/2021Q1_36.csv
NC 37 Saved state_cb_upload.shape=(288987, 38) to speed_ready_upload/2021Q1_37.csv
ND 38 Saved state_cb_upload.shape=(133769, 38) to speed_ready_upload/2021Q1_38.csv
OH 39 Saved state_cb_upload.shape=(365344, 38) to speed_ready_upload/2021Q1_39.csv
OK 40 Saved state_cb_upload.shape=(269118, 38) to speed_ready_upload/2021Q1_40.csv
OR 41 Saved state_cb_upload.shape=(196621, 38) to speed_ready_upload/2021Q1_41.csv
PA 42 Saved state_cb_upload.shape=(421545, 38) to speed_ready_upload/2021Q1_42.csv
RI 44 Saved state_cb_upload.shape=(25181, 38) to speed_ready_upload/2021Q1_44.csv
SC 45 Saved state_cb_upload.shape=(181908, 38) to speed_ready_upload/2021Q1_45.csv
SD 46 Saved state_cb_upload.shape=(88360, 38) to speed_ready_upload/2021Q1_46.csv
TN 47 Saved state_cb_upload.shape=(240116, 38) to speed_ready_upload/2021Q1_47.csv
TX 48 Saved state_cb_upload.shape=(914231, 38) to speed_ready_upload/2021Q1_48.csv
UT 49 Saved state_cb_upload.shape=(115406, 38) to speed_ready_upload/2021Q1_49.csv
VT 50 Saved state_cb_upload.shape=(32580, 38) to speed_ready_upload/2021Q1_50.csv
VA 51 Saved state_cb_upload.shape=(285762, 38) to speed_ready_upload/2021Q1_51.csv
WA 53 Saved state_cb_upload.shape=(195574, 38) to speed_ready_upload/2021Q1_53.csv
WV 54 Saved state_cb_upload.shape=(135218, 38) to speed_ready_upload/2021Q1_54.csv
WI 55 Saved state_cb_upload.shape=(253096, 38) to speed_ready_upload/2021Q1_55.csv
WY 56 Saved state_cb_upload.shape=(86204, 38) to speed_ready_upload/2021Q1_56.csv
PR 72 Saved state_cb_upload.shape=(77189, 38) to speed_ready_upload/2021Q1_72.csv"""
l = [k.split(')')[0].split('(')[1].split(',')[0] for k in s.split("Saved state_cb_upload.shape=")[1:]]
# 11155486 (11,155,486)! ALL census blocks present! ok!
sum(int(x) for x in l)

#### NOTE: Cols count: explained
###### before <> after: state_cb_upload.merge(mlab_cbg_complete,...)
- state_cb_upload = f"ookla_ntia/{QUARTER}_CB_{sf}.csv" 
    - = 13 NTIA + 7 Ookla + GEOID_cbg + GEOID = 22 cols
- merge with mlab_cbg_complete (MLAB_END_COLS - speedCatMlab + GEOID_cbg) = 37 
    - minus GEOID_cbg = 36
    - plus speedCatMlab + speedRankReadyRaw = 38 = final column count 
    - = GEOID + speedRankReadyRaw + NTIA_END_COLS#7 + OOKLA_END_COLS#13 + MLAB_END_COLS#16


### Final step: Run upload_speed_test.py: Check results, upload to speed_test
- IMPORTANT: update QUARTER inside upload_speed_test.py
- by running: $ bash speed/cmds.bash &  (comment other commands in cmds.bash as needed)
- NOTE: when complete uploading, check docs count for speed_test index = total number of census block in SF52: 11,155,486

In [None]:
# CHECK UPLOAD COUNTS
cmds_log_string = """
"""
s1 = cmds_log_string.split('Completed uploading')[1:]
s2 = [int(k.split(' records to ')[0].strip()) for k in s1]
len(s2), sum(s2) # (52, 11-155-486 YES!)

In [158]:
# CHECK DOWNLOAD COUNTS
s = """
"""
s1 = s.split('Saved ntia_df ')[1:]
s2 = [int(k.split(') to  Elasticsearch/')[0].split(', ')[0][1:]) for k in s1]
len(s2), sum(s2) # (52, 11-155-486 YES!)

time: 2.53 ms (started: 2022-04-01 12:34:01 -04:00)
