In [1]:
import numpy as np
import pandas as pd
import os

### 1. Load BLS raw data

In [2]:
years = ['1990', '2000', '2010', '2020']
NAICS = ['11', '21', '31', '51', '52', '54']

# a file is wanted when characters 12-14 of its name are a NAICS code followed by a space
# NOTE(review): '31' produces no match in the recorded run (no 'df31_*' keys appear in the
# dictionary printed further down) -- presumably those files are labelled differently; confirm.
wanted = [code + ' ' for code in NAICS]

# walk the year folders in order and keep the matching file names,
# alphabetically sorted within each folder
files = [
    file
    for year in years
    for file in sorted(os.listdir('../BLS_raw/' + year + '.annual.by_industry'))
    if file[12:15] in wanted
]

In [3]:
df_list = []
names = []

# Read every selected file exactly once and record its dictionary key in the
# same pass, so names and dataframes cannot drift out of alignment.
for file in files:
    year = file[:4]     # first four characters are used as the year in the key
    code = file[12:14]  # two characters used as the NAICS code in the key
    if year not in years:
        continue

    # read area_fips as text so the .str filters below apply to every row
    temp_df = pd.read_csv('../BLS_raw/' + year + '.annual.by_industry/' + file,
                          dtype={'area_fips': str})

    # rows whose area_fips ends in '000' or starts with 'C', 'US' or '7'
    # (presumably state totals, metro/combined areas, national rows and
    # territories -- confirm against the BLS area code list)
    mask_non50 = ((temp_df['area_fips'].str[-3:] == '000') |
                  (temp_df['area_fips'].str[:1] == 'C') |
                  (temp_df['area_fips'].str[:2] == 'US') |
                  (temp_df['area_fips'].str[:1] == '7'))
    temp_df = temp_df[~mask_non50] # apply Kurt's filter within loop

    df_list.append(temp_df)
    names.append('df' + code + '_' + year)

# store dataframes in dictionary, keyed 'df<NAICS>_<year>'
d = dict(zip(names, df_list))
d.keys()

dict_keys(['df11_1990', 'df21_1990', 'df51_1990', 'df52_1990', 'df54_1990', 'df11_2000', 'df21_2000', 'df51_2000', 'df52_2000', 'df54_2000', 'df11_2010', 'df21_2010', 'df51_2010', 'df52_2010', 'df54_2010', 'df11_2020', 'df21_2020', 'df51_2020', 'df52_2020', 'df54_2020'])

### 2. Process raw data

In [4]:
# remove rows whose area_fips ends in '999' (undefined counties) from every table
for name, frame in d.items():
    is_defined = frame['area_fips'].str[-3:] != '999'
    d[name] = frame[is_defined]
    print(d[name].shape)

(1111, 43)
(667, 43)
(2197, 43)
(3023, 43)
(2440, 43)
(1156, 43)
(692, 43)
(2301, 43)
(3135, 43)
(2755, 43)
(3293, 43)
(2523, 43)
(5292, 43)
(3975, 43)
(4641, 43)
(3299, 43)
(2492, 43)
(5199, 43)
(3834, 43)
(4733, 43)


In [5]:
# collapse each table to one row per county by summing annual_avg_emplvl
# (combines the public and private sector rows within a county)
for name in list(d):
    per_county = d[name].groupby(by=['area_fips', 'area_title'], as_index=False)
    d[name] = per_county['annual_avg_emplvl'].sum()
    print(d[name].shape)

(1099, 3)
(667, 3)
(2061, 3)
(2301, 3)
(2003, 3)
(1141, 3)
(692, 3)
(2086, 3)
(2316, 3)
(2097, 3)
(3071, 3)
(2515, 3)
(3081, 3)
(3130, 3)
(3121, 3)
(3091, 3)
(2486, 3)
(3066, 3)
(3125, 3)
(3122, 3)


In [6]:
# load typology (US Census geographical classifications) and regional data
typ = pd.read_csv('typology/typology.csv')

# turn county codes into 5-character, zero-padded FIPS strings
typ['fips'] = typ['fips'].astype(str).str.zfill(5)

# right-join each BLS table onto typ so that every county listed in the
# typology file is retained, whether or not BLS reported it
for name, frame in d.items():
    d[name] = frame.merge(typ, how='right', left_on='area_fips', right_on='fips')

# columns written to disk
final_cols = ['fips', 'name', 'State', 'State Code', 'Region', 'Division',
              'Region_alt', 'msa', 'type_bea20', 'annual_avg_emplvl']

# one CSV per table, e.g. key 'df11_1990' -> my_naics/naics_11_1990.csv
for name, frame in d.items():
    out_path = 'my_naics/naics_' + name[2:] + '.csv'
    frame[final_cols].to_csv(out_path, index_label=False)

In [10]:
# load new NAICS CSVs
codes = ['11', '21', '51', '52', '54']
path = 'my_naics/naics_'

# Preview of the four-decade merge. df is overwritten on every pass, so only
# the table for the last code in `codes` is left to display below.
for code in codes:
    df90, df00, df10, df20 = [pd.read_csv(path + code + '_' + yr + '.csv')
                              for yr in ('1990', '2000', '2010', '2020')]

    # pair up the decades first, then join the two halves on fips
    temp1 = pd.merge(df90, df00, how='inner', on='fips', suffixes=['_90', '_00'])
    temp2 = pd.merge(df10, df20, how='inner', on='fips', suffixes=['_10', '_20'])
    df = pd.merge(temp1, temp2, how='inner', on='fips')
df

Unnamed: 0,fips,name_90,State_90,State Code_90,Region_90,Division_90,Region_alt_90,msa_90,type_bea20_90,annual_avg_emplvl_90,...,annual_avg_emplvl_10,name_20,State_20,State Code_20,Region_20,Division_20,Region_alt_20,msa_20,type_bea20_20,annual_avg_emplvl_20
0,1001,Autauga County,Alabama,AL,South,East South Central,South,"Montgomery, AL",Metro,109.0,...,239.0,Autauga County,Alabama,AL,South,East South Central,South,"Montgomery, AL",Metro,0.0
1,1003,Baldwin County,Alabama,AL,South,East South Central,South,"Daphne-Fairhope-Foley, AL",Metro,453.0,...,1652.0,Baldwin County,Alabama,AL,South,East South Central,South,"Daphne-Fairhope-Foley, AL",Metro,2312.0
2,1005,Barbour County,Alabama,AL,South,East South Central,South,"Eufaula, AL-GA",Micro,,...,0.0,Barbour County,Alabama,AL,South,East South Central,South,"Eufaula, AL-GA",Micro,7.0
3,1007,Bibb County,Alabama,AL,South,East South Central,South,"Birmingham-Hoover, AL",Metro,29.0,...,48.0,Bibb County,Alabama,AL,South,East South Central,South,"Birmingham-Hoover, AL",Metro,75.0
4,1009,Blount County,Alabama,AL,South,East South Central,South,"Birmingham-Hoover, AL",Metro,118.0,...,199.0,Blount County,Alabama,AL,South,East South Central,South,"Birmingham-Hoover, AL",Metro,241.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3138,56037,Sweetwater County,Wyoming,WY,West,Mountain,Interior Northwest,"Rock Springs, WY",Micro,239.0,...,573.0,Sweetwater County,Wyoming,WY,West,Mountain,Interior Northwest,"Rock Springs, WY",Micro,431.0
3139,56039,Teton County,Wyoming,WY,West,Mountain,Interior Northwest,"Jackson, WY-ID",Micro,334.0,...,816.0,Teton County,Wyoming,WY,West,Mountain,Interior Northwest,"Jackson, WY-ID",Micro,1070.0
3140,56041,Uinta County,Wyoming,WY,West,Mountain,Interior Northwest,"Evanston, WY",Micro,131.0,...,305.0,Uinta County,Wyoming,WY,West,Mountain,Interior Northwest,"Evanston, WY",Micro,232.0
3141,56043,Washakie County,Wyoming,WY,West,Mountain,Interior Northwest,,Rural,77.0,...,150.0,Washakie County,Wyoming,WY,West,Mountain,Interior Northwest,,Rural,79.0


### 3. Calculate employment change columns

In [24]:
# load new NAICS CSVs
codes = ['11', '21', '51', '52', '54']
path = 'my_naics/naics_'

# two-digit year suffixes, oldest first, and the employment column for each
decades = ['90', '00', '10', '20']
empl_cols = ['annual_avg_emplvl_' + yy for yy in decades]


def rate_chg(df, year1, year2, chg):
    """Add column `chg` to `df` with the change from column `year1` to `year2`.

    Where `year1` is 0 the plain difference (year2 - year1) is stored, because
    a relative change is undefined for a zero base; everywhere else the value
    is (year2 - year1) / year1. Results are rounded to 4 decimals.
    `df` is modified in place and nothing is returned.

    NOTE(review): rows with a zero base therefore hold an absolute head-count
    change while all other rows hold a ratio -- confirm this mix is intended.
    """
    diff = df[year2] - df[year1]
    df[chg] = np.where(df[year1] == 0, diff, diff / df[year1]).round(4)


# use big loop to update all previously processed NAICS files
for code in codes:
    df90 = pd.read_csv(path + code + '_1990.csv')
    df00 = pd.read_csv(path + code + '_2000.csv')
    df10 = pd.read_csv(path + code + '_2010.csv')
    df20 = pd.read_csv(path + code + '_2020.csv')

    # merge years under NAICS code
    temp1 = df90.merge(df00, how='inner', on='fips', suffixes=['_90', '_00'])
    temp2 = df10.merge(df20, how='inner', on='fips', suffixes=['_10', '_20'])
    df = temp1.merge(temp2, how='inner', on='fips')

    # clean column names: keep fips, the 1990 copy of the descriptive columns
    # (with the '_90' suffix stripped) and the four employment columns.
    # Columns are picked by name so the result does not depend on their position.
    desc_cols = [col for col in df.columns
                 if col.endswith('_90') and col != 'annual_avg_emplvl_90']
    df = df[['fips'] + desc_cols + empl_cols]
    df = df.rename(columns={col: col[:-3] for col in desc_cols})

    # replace nulls with zeroes
    df[empl_cols] = df[empl_cols].fillna(0)

    # calculate rate of change between consecutive decades
    # (chg_emplvl_90_00, chg_emplvl_00_10, chg_emplvl_10_20)
    for start, end in zip(decades[:-1], decades[1:]):
        rate_chg(df, 'annual_avg_emplvl_' + start, 'annual_avg_emplvl_' + end,
                 'chg_emplvl_' + start + '_' + end)

    # calculate total rate of change column (1990-2020)
    rate_chg(df, 'annual_avg_emplvl_90', 'annual_avg_emplvl_20', 'chg_emplvl_90_20')

    # export file
    print(code, df.shape)
    df.to_csv('my_naics_chg/naics_' + code + '.csv', index_label=False)

11 (3143, 17)
21 (3143, 17)
51 (3143, 17)
52 (3143, 17)
54 (3143, 17)


### 4. Incorporate spatial data

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

# load US counties SHP and index it by GEOID
gdf_full = gpd.read_file('../GIS Data/usa_census_counties_2018_20m/').set_index('GEOID')

# drop the rows for AK, HI & PR (state FIPS 02, 15 and 72)
mask_non_continental = gdf_full['STATEFP'].isin(['02', '15', '72'])
gdf_continental = gdf_full[~mask_non_continental]

In [None]:
# load NAICS files with change columns
codes = ['11', '21', '51', '52', '54']
path = 'my_naics_chg/naics_'

for code in codes:
    df = pd.read_csv(path + code + '.csv')
    # restore 5-character, zero-padded FIPS strings so they match the index of gdf_continental
    df['fips'] = df['fips'].astype(str).str.zfill(5)
    df = df.set_index('fips')

    # merge with continental gdf (left join keeps every county geometry)
    gdf = gdf_continental.merge(df, how='left', left_index=True, right_index=True)

    # export to shapefile
    #gdf.to_file('SHPs/NAICS', driver ='ESRI Shapefile')

    # SPATIAL ANALYSIS
    # Map only the change columns, selected by name rather than by position,
    # so the loop does not depend on how many columns the shapefile carries.
    chg_cols = [col for col in gdf.columns if col.startswith('chg_emplvl_')]
    for col in chg_cols:
        # column names look like 'chg_emplvl_90_00': the last two parts are the years
        start, end = col.split('_')[-2:]

        ax = gdf.plot(column=col, cmap='RdYlGn',
                      edgecolor='lightgrey', linewidth=0.1,
                      legend=True, legend_kwds={'shrink': 0.6},
                      vmax=1, vmin=-1,  # colour scale fixed to -1 .. 1
                      figsize=(15,10),
                      missing_kwds={'color': 'white', 'hatch': 'XXX',
                                    'edgecolor': 'lightgrey', 'linewidth' : 0.2,
                                    'label': 'Null or No Data'})

        title = 'NAICS ' + code + ' Industry Employment Dynamics ' + start + '-' + end
        ax.set_title(title, fontsize = 13)
        ax.axis("off")

        # save figure (the title doubles as the file name)
        ax.get_figure().savefig('maps/' + title, dpi=600, bbox_inches="tight")

In [None]:
# create separate code book for cluster analysis