In [1]:
import pandas as pd

In [2]:
# Load the microbusiness density training data; the row_id column is discarded
mbd = pd.read_csv("mbd_train.csv").drop(columns=['row_id'])

In [3]:
# Split the descriptive county/state columns off into a lookup table keyed by
# cfips, keeping one row per distinct (cfips, county, state) combination.
cfips_index = (
    mbd.loc[:, ['cfips', 'county', 'state']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# NOTE: `mbd` is modified in place here, so re-running this cell without first
# reloading `mbd` raises KeyError (county/state are already gone).
mbd.drop(columns=['county', 'state'], inplace=True)

In [4]:
# Sanity check of the lookup table: print (rows, columns), then preview the first rows
print(cfips_index.shape)
cfips_index.head()

(3135, 3)


Unnamed: 0,cfips,county,state
0,1001,Autauga County,Alabama
1,1003,Baldwin County,Alabama
2,1005,Barbour County,Alabama
3,1007,Bibb County,Alabama
4,1009,Blount County,Alabama


In [27]:
# Inspect the long-format table: print (rows, columns), then preview the first rows
print(mbd.shape)
mbd.head()

(122265, 4)


Unnamed: 0,cfips,first_day_of_month,microbusiness_density,active
0,1001,2019-08-01,3.007682,1249
1,1001,2019-09-01,2.88487,1198
2,1001,2019-10-01,3.055843,1269
3,1001,2019-11-01,2.993233,1243
4,1001,2019-12-01,2.993233,1243


In [6]:
# Reshape the long table to wide form: one row per cfips and one column per
# (measure, month) pair.

# `df` is only an alias of `mbd`, so the conversion below also turns
# `mbd['first_day_of_month']` into datetime (kept for any code relying on it).
df = mbd

# Convert the 'first_day_of_month' column to datetime type
df['first_day_of_month'] = pd.to_datetime(df['first_day_of_month'])

# Pivot both measures at once. pivot() raises ValueError if any
# (cfips, first_day_of_month) pair appears more than once.
# NOTE(review): the recorded preview shows the active_* values as floats
# (e.g. 1401.0) although `active` is integer in the long table -- confirm
# whether an integer cast is wanted downstream.
df_pivoted = df.pivot(index='cfips', columns='first_day_of_month', values=['microbusiness_density', 'active'])

# Flatten the (measure, month) column MultiIndex into "<measure>_<YYYY-MM-DD>" labels
df_pivoted.columns = [f'{measure}_{month.strftime("%Y-%m-%d")}' for measure, month in df_pivoted.columns]

# Restore 'cfips' as a regular column
df_pivoted = df_pivoted.reset_index()

# The pivot result already is the per-cfips table. The previous inner merge with
# df['cfips'].drop_duplicates() joined on the only column of the right-hand
# frame, so it could neither add columns nor drop rows. Take a copy instead so
# `mbd_cfips_grouped` and `df_pivoted` remain independent objects, as before.
mbd_cfips_grouped = df_pivoted.copy()


In [7]:
# Check the wide layout (pivoted with cfips as the row key): print (rows, columns), then preview
print(mbd_cfips_grouped.shape)
mbd_cfips_grouped.head()

(3135, 79)


Unnamed: 0,cfips,microbusiness_density_2019-08-01,microbusiness_density_2019-09-01,microbusiness_density_2019-10-01,microbusiness_density_2019-11-01,microbusiness_density_2019-12-01,microbusiness_density_2020-01-01,microbusiness_density_2020-02-01,microbusiness_density_2020-03-01,microbusiness_density_2020-04-01,...,active_2022-01-01,active_2022-02-01,active_2022-03-01,active_2022-04-01,active_2022-05-01,active_2022-06-01,active_2022-07-01,active_2022-08-01,active_2022-09-01,active_2022-10-01
0,1001,3.007682,2.88487,3.055843,2.993233,2.993233,2.96909,2.909326,2.933231,3.000167,...,1401.0,1417.0,1418.0,1433.0,1408.0,1422.0,1461.0,1455.0,1463.0,1472.0
1,1003,7.239156,7.290936,7.425439,7.426071,7.470274,7.413655,7.282522,7.30961,7.568799,...,13247.0,13401.0,13610.0,13668.0,13545.0,14573.0,14686.0,14545.0,14289.0,14320.0
2,1005,1.073138,0.995794,1.160149,1.000628,1.000628,1.027229,1.022314,1.032144,1.086209,...,235.0,239.0,237.0,236.0,235.0,236.0,241.0,237.0,239.0,244.0
3,1007,1.310777,1.305176,1.254761,1.254761,1.265965,1.253638,1.248041,1.264831,1.270428,...,216.0,220.0,225.0,223.0,222.0,227.0,236.0,230.0,234.0,229.0
4,1009,1.544148,1.575892,1.546415,1.573625,1.555485,1.573217,1.536999,1.541526,1.577744,...,778.0,789.0,798.0,783.0,776.0,787.0,813.0,815.0,822.0,813.0


In [8]:
# Calculate microbusiness density statistics for each county

df = mbd_cfips_grouped

# Pick out the pivoted columns whose label begins with "microbusiness_density_"
mbd_columns = [label for label in df if label.startswith('microbusiness_density_')]

# Row-wise (axis=1) aggregates across the selected density columns;
# the dict order determines the column order of the result.
density_by_month = df[mbd_columns]
statistics = {
    'mbd_mean': density_by_month.mean(axis=1),
    'mbd_min': density_by_month.min(axis=1),
    'mbd_max': density_by_month.max(axis=1),
    'mbd_median': density_by_month.median(axis=1),
    'mbd_variance': density_by_month.var(axis=1),
    'mbd_std': density_by_month.std(axis=1),
}

# Assemble the table and index it by cfips
mbd_stats = pd.DataFrame(statistics).set_index(df['cfips'])


In [9]:
# Display the density statistics table (indexed by cfips)
mbd_stats

Unnamed: 0_level_0,mbd_mean,mbd_min,mbd_max,mbd_median,mbd_variance,mbd_std
cfips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1001,3.164535,2.884870,3.463856,3.193804,0.026933,0.164113
1003,7.889811,7.239156,8.573463,7.909001,0.133555,0.365452
1005,1.102521,0.982994,1.232074,1.102025,0.005975,0.077298
1007,1.270834,1.186877,1.343183,1.265965,0.001589,0.039863
1009,1.676951,1.536999,1.852060,1.700081,0.009240,0.096125
...,...,...,...,...,...,...
56037,3.119993,2.640386,3.540377,3.163303,0.060228,0.245414
56039,26.337935,25.209639,27.720806,26.090586,0.634601,0.796619
56041,3.946930,3.364125,4.612016,3.954258,0.139415,0.373384
56043,2.904894,2.569905,3.256737,2.977668,0.054972,0.234460


In [25]:
# The five rows of mbd_stats with the largest density variance.
# nlargest(5) returns at most five rows, so the frame is shown as-is.
top_var = mbd_stats.nlargest(n=5, columns='mbd_variance')
top_var

Unnamed: 0_level_0,mbd_mean,mbd_min,mbd_max,mbd_median,mbd_variance,mbd_std
cfips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32510,87.196625,10.847511,224.53825,11.628876,8658.042524,93.048603
46127,19.630615,5.092314,284.34003,5.515983,3788.672082,61.55219
56033,73.51041,30.183279,227.75665,82.235069,1648.237929,40.598497
16021,46.730638,10.954705,96.812622,56.363235,1191.42931,34.517087
32017,40.140351,3.789673,72.88575,67.930374,1150.56217,33.919938


In [26]:
# The five rows of mbd_stats with the smallest density variance.
# nsmallest(5) returns at most five rows, so the frame is shown as-is.
bottom_var = mbd_stats.nsmallest(n=5, columns='mbd_variance')
bottom_var

Unnamed: 0_level_0,mbd_mean,mbd_min,mbd_max,mbd_median,mbd_variance,mbd_std
cfips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
51081,0.164327,0.143384,0.186645,0.164626,0.00016,0.01263
48427,0.527867,0.495833,0.556357,0.529088,0.000162,0.01274
13101,0.073227,0.063837,0.100067,0.064516,0.000225,0.015007
39037,1.849626,1.81106,1.885354,1.849395,0.000346,0.018606
29181,0.814338,0.765477,0.861162,0.81478,0.000356,0.018863


In [22]:
# Calculate statistics of the active counts for each county

df = mbd_cfips_grouped

# Pick out the pivoted columns whose label begins with "active_"
active_columns = [label for label in df if label.startswith('active_')]

# Row-wise (axis=1) aggregates across the selected active columns;
# the dict order determines the column order of the result.
active_by_month = df[active_columns]
statistics = {
    'active_mean': active_by_month.mean(axis=1),
    'active_min': active_by_month.min(axis=1),
    'active_max': active_by_month.max(axis=1),
    'active_median': active_by_month.median(axis=1),
    'active_variance': active_by_month.var(axis=1),
    'active_std': active_by_month.std(axis=1),
}

# Assemble the table and index it by cfips
active_stats = pd.DataFrame(statistics).set_index(df['cfips'])

In [23]:
# Display the active-count statistics table (indexed by cfips)
active_stats

Unnamed: 0_level_0,active_mean,active_min,active_max,active_median,active_variance,active_std
cfips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1001,1331.717949,1198.0,1472.0,1341.0,6115.313090,78.200467
1003,13064.538462,11464.0,14686.0,13247.0,739530.623482,859.959664
1005,222.102564,200.0,244.0,222.0,172.726046,13.142528
1007,226.769231,212.0,240.0,226.0,50.919028,7.135757
1009,742.230769,679.0,822.0,753.0,1944.024291,44.091091
...,...,...,...,...,...,...
56037,1003.641026,860.0,1132.0,1022.0,5962.130904,77.214836
56039,4953.538462,4750.0,5163.0,4943.0,13520.834008,116.279121
56041,573.666667,488.0,674.0,574.0,3053.175439,55.255547
56043,177.410256,159.0,197.0,180.0,152.932524,12.366589


In [12]:
# Load census_starter.csv into `census` and preview the first rows
census = pd.read_csv("census_starter.csv")
census.head()

Unnamed: 0,pct_bb_2017,pct_bb_2018,pct_bb_2019,pct_bb_2020,pct_bb_2021,cfips,pct_college_2017,pct_college_2018,pct_college_2019,pct_college_2020,...,pct_it_workers_2017,pct_it_workers_2018,pct_it_workers_2019,pct_it_workers_2020,pct_it_workers_2021,median_hh_inc_2017,median_hh_inc_2018,median_hh_inc_2019,median_hh_inc_2020,median_hh_inc_2021
0,76.6,78.9,80.6,82.7,85.5,1001,14.5,15.9,16.1,16.7,...,1.3,1.1,0.7,0.6,1.1,55317,58786.0,58731,57982.0,62660.0
1,74.5,78.1,81.8,85.1,87.9,1003,20.4,20.7,21.0,20.2,...,1.4,1.3,1.4,1.0,1.3,52562,55962.0,58320,61756.0,64346.0
2,57.2,60.4,60.5,64.6,64.6,1005,7.6,7.8,7.6,7.3,...,0.5,0.3,0.8,1.1,0.8,33368,34186.0,32525,34990.0,36422.0
3,62.0,66.1,69.2,76.1,74.6,1007,8.1,7.6,6.5,7.4,...,1.2,1.4,1.6,1.7,2.1,43404,45340.0,47542,51721.0,54277.0
4,65.8,68.5,73.0,79.6,81.0,1009,8.7,8.1,8.6,8.9,...,1.3,1.4,0.9,1.1,0.9,47412,48695.0,49358,48922.0,52830.0


In [13]:
# Load VF_indcom_cfips_Q123.csv into `indcom`
indcom = pd.read_csv("VF_indcom_cfips_Q123.csv")

In [14]:
# NOTE: DataFrame.size is the total element count (rows * columns), not the
# (rows, columns) tuple that the .shape checks in the other cells print.
print(indcom.size)
indcom.head()

575120


Unnamed: 0,cfips,county,state,groupflag,total_pop_21,total_pop_20,orders_rank_aug19,orders_rank_sep19,orders_rank_oct19,orders_rank_nov19,...,top3industries_jun22,top3industries_jul22,top3industries_aug22,top3industries_sep22,top3industries_oct22,top3industries_nov22,top3industries_dec22,top3industries_jan23,top3industries_feb23,top3industries_mar23
0,1001.0,Autauga,AL,30k - 225k pop,58239.0,55639.0,668.0,511.0,723.0,624.0,...,"fashion, beauty, realestate","fashion, beauty, realestate","fashion, charity_or_pol, realestate","fashion, realestate, retail","fashion, retail, sports","fashion, realestate, retail","fashion, retail, realestate","fashion, realestate, retail","fashion, realestate, retail","realestate, fashion, sports"
1,1003.0,Baldwin,AL,30k - 225k pop,227131.0,218289.0,804.0,780.0,831.0,899.0,...,"beauty, restaurant, hobby","hobby, beauty, transportation","beauty, transportation, hobby","beauty, auto, transportation","auto, hobby, beauty","beauty, fitness_wellness, transportation","beauty, outdoors, transportation","beauty, transportation, outdoors","beauty, charity_or_pol, transportation","hobby, realestate, outdoors"
2,1005.0,Barbour,AL,< 30k pop,25259.0,25026.0,1001.0,684.0,883.0,1118.0,...,"artdesign, events","events, homesvcs","events, fitness_wellness","events, artdesign","fitness_wellness, health_med",homesvcs,"homesvcs, fashion, fitness_wellness","events, health_med","events, financial","events, financial"
3,1007.0,Bibb,AL,< 30k pop,22412.0,22374.0,123.0,96.0,113.0,199.0,...,"homesvcs, beauty, health_med","beauty, health_med, fitness_wellness","beauty, fashion, health_med","beauty, fashion, retail","beauty, fashion, health_med","beauty, fashion, homedecor","homedecor, fashion, fitness_wellness","fitness_wellness, fashion, health_med","retail, events, fashion","fitness_wellness, health_med, artdesign"
4,1009.0,Blount,AL,30k - 225k pop,58884.0,57755.0,65.0,108.0,203.0,121.0,...,"photography, hobby, profsvcs","photography, profsvcs, artdesign","profsvcs, photography, beauty","profsvcs, music, artdesign","hobby, profsvcs, events","hobby, profsvcs, beauty","hobby, profsvcs, music","hobby, profsvcs, music","photography, profsvcs, beauty","artdesign, photography, beauty"
