In [18]:
import pandas as pd             # data package
import matplotlib.pyplot as plt # graphics 
import datetime as dt
import numpy as np

import requests, io             # internet and input tools  
import zipfile as zf            # zip file tools 
import os  

import pyarrow as pa
import pyarrow.parquet as pq

import statsmodels.formula.api as smf
from linearmodels.iv import IV2SLS
from linearmodels.panel import PanelOLS

### Overview. 

This notebook downloads the county-level files from the BLS [Quarterly Census of Employment and Wages](https://www.bls.gov/cew/) and builds employment measures at the county level and monthly frequency. A couple of comments about the code:

   - In the function ``clean_bls_quarter`` there is a line to be uncommented or not, depending on whether I want a dataset with goods employment or total employment. Future enhancements of this notebook should just return one dataframe with both.
    
    
   - It can accommodate the 2016 data (and further back if modified). Currently it just uses 2017, 2018, and 2019 (the last of which only has Q1 values). See the BLS release calendar for when updates will be made.
   
### Step 1

Bring in the trade/tariff data, onto which the employment data will be merged.

In [19]:
# Load the trade/tariff panel that the employment data is later merged onto.
cwd = os.getcwd()

# os.path.join builds the path with the separator of the current OS; the
# previous hand-written "\\" separators only resolved correctly on Windows.
trade_data = pq.read_table(os.path.join(cwd, "data", "total_trade_data.parquet")).to_pandas()

# Parse the time column so it compares equal to the datetime values that the
# BLS month labels are mapped to further down.
trade_data["time"] = pd.to_datetime(trade_data["time"])

# Index by (county FIPS, month): later cells slice on these two levels and
# merge on this index.
trade_data = trade_data.set_index(["area_fips", "time"])

In [20]:
# Tariff of every county in December 2018 (cross-section on the time level).
# Taken once and reused below instead of re-slicing the full panel twice.
tariff_dec_2018 = trade_data.xs('2018-12-1', level=1).tariff

# Quartile of that tariff distribution per county, labelled 0 (lowest) .. 3 (highest).
exposure = pd.qcut(tariff_dec_2018, 4, labels = False)

# Counties that fall in the top tariff quartile.
most_exposed = exposure[exposure == 3].index.tolist()

# Mean December 2018 tariff among the most exposed counties (cell output).
tariff_dec_2018.loc[most_exposed].mean()

6.515098948465189

This is ultra-clunky and should be fixed in the future. It takes the month column names (which is how the BLS files are written) and maps them into datetime values.

In [21]:
# Lookup from each BLS monthly employment column header to the first day of
# that month in 2016, in calendar order.
empl_time_dict_16 = {
    month_name + " Employment": dt.datetime(2016, month_number, 1)
    for month_number, month_name in enumerate(
        ["January", "February", "March", "April", "May", "June",
         "July", "August", "September", "October", "November", "December"],
        start = 1,
    )
}

In [22]:
def _month_start_lookup(year, n_months = 12):
    """Map BLS month column headers to the first day of that month in ``year``.

    Parameters
    ----------
    year : int
        Calendar year the months belong to.
    n_months : int, optional
        How many months to include, counting from January. Defaults to the
        full year; use 3 for a year where only the first quarter is available.

    Returns
    -------
    dict
        ``{"January Employment": datetime(year, 1, 1), ...}`` in calendar order.
    """
    # Spelled out rather than taken from the calendar/locale so the keys always
    # match the English column headers regardless of the system locale.
    month_names = ["January", "February", "March", "April", "May", "June",
                   "July", "August", "September", "October", "November", "December"]

    return {
        name + " Employment": dt.datetime(year, month, 1)
        for month, name in enumerate(month_names[:n_months], start = 1)
    }


empl_time_dict_17 = _month_start_lookup(2017)

empl_time_dict_18 = _month_start_lookup(2018)

# Only January-March are mapped for 2019.
empl_time_dict_19 = _month_start_lookup(2019, n_months = 3)

# Column headers of a first-quarter workbook. NOTE(review): not referenced by
# any other cell visible in this notebook -- confirm before removing.
clistQ1 = ['Area\nCode','NAICS','Qtr','January Employment', 'February Employment',
       'March Employment', 'Total Quarterly Wages', 'Average Weekly Wage','Own',"Area Type"]

### Step 2: Download

This downloads the ``.zip`` files from which we can grab the data. They are all in Excel format. 

In [23]:
def fetch_bls_county_zip(year, dest_dir):
    """Download one year's county high-level BLS archive and unpack it.

    Parameters
    ----------
    year : int
        Year of the annual archive to fetch. It is a single, fairly large
        file that holds everything needed for that year.
    dest_dir : str
        Directory the archive members are extracted into.

    Returns
    -------
    zipfile.ZipFile
        The in-memory archive, so its member list can be inspected afterwards.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = "https://data.bls.gov/cew/data/files/{0}/xls/{0}_all_county_high_level.zip".format(year)

    # The timeout bounds the connect time and the gap between received bytes,
    # not the whole transfer, so a large file on a slow link still completes.
    r = requests.get(url, timeout = 60)

    # Stop on a failed download here; otherwise the error page would be handed
    # to ZipFile and surface as a confusing BadZipFile.
    r.raise_for_status()

    # convert bytes to zip file
    archive = zf.ZipFile(io.BytesIO(r.content))
    archive.extractall(dest_dir)

    return archive


bls_dir = os.path.join(cwd, "bls_files")

# Enable to also pull the 2016 files:
# bls_q2016 = fetch_bls_county_zip(2016, bls_dir)

bls_q2017 = fetch_bls_county_zip(2017, bls_dir)

bls_q2018 = fetch_bls_county_zip(2018, bls_dir)

In [24]:
# Download the 2019 county high-level archive from the BLS.
url = "https://data.bls.gov/cew/data/files/2019/xls/2019_all_county_high_level.zip"

# The timeout bounds the connect time and the gap between received bytes,
# not the whole transfer.
r = requests.get(url, timeout = 60)

# Stop on a failed download here; otherwise the error page would be handed to
# ZipFile and surface as a confusing BadZipFile.
r.raise_for_status()

# Wrap the downloaded bytes as an in-memory zip archive and unpack it into
# the bls_files folder under the working directory (portable path).
bls_q2019 = zf.ZipFile(io.BytesIO(r.content))
bls_q2019.extractall(os.path.join(cwd, "bls_files"))

In [25]:
# List the members of the 2019 archive to see which workbooks it contains.
bls_q2019.filelist

[<ZipInfo filename='allhlcn191.xlsx' compress_type=deflate external_attr=0x20 file_size=6839716 compress_size=6715278>]

   ### Step 3: Clean and Shape it
   
   Below is a function that takes in an Excel sheet and does what we want to it. Then below we will work through a for loop over all the sheets.

In [26]:
def clean_bls_quarter(excell_sheet, time_dict, naics = 10):
    """Return monthly private-sector county employment from one quarterly BLS workbook.

    Parameters
    ----------
    excell_sheet : str or file-like
        Anything ``pd.read_excel`` accepts; the ``US_St_Cn_MSA`` sheet is read.
    time_dict : dict
        Maps the month column headers of the workbook (e.g.
        ``"January Employment"``) to the datetime that month gets in the
        output index.
    naics : int, optional
        Value of the ``NAICS`` column to keep. The default ``10`` is all
        employment in all sectors; pass ``101`` for goods producing.

    Returns
    -------
    pandas.DataFrame
        A single column ``emply_month`` indexed by ``(area_fips, time)``, with
        one row per county in ``trade_data`` and month in the workbook.
        Counties missing from the workbook appear with NaN.

    Raises
    ------
    ValueError
        If a selected month column has no entry in ``time_dict``.

    Notes
    -----
    Reads the notebook-level ``trade_data`` frame to decide which counties to
    keep, so that frame must be loaded before this function is called.
    """
    # Positions of the three monthly employment columns once GEOFIPS has been
    # moved into the index: 13 = first month of the quarter, 14 = second,
    # 15 = third (January, February, March for Q1, and so forth).
    month_col_positions = [13, 14, 15]

    raw = pd.read_excel(excell_sheet, sheet_name = "US_St_Cn_MSA")

    keep = (
        (raw["Own"] == 5)                   # take only private
        & (raw["NAICS"] == naics)           # take the requested aggregate
        & (raw["Area Type"] == "County")    # take only counties
    )

    # Chained, non-in-place steps: each returns a new frame, so nothing is
    # written into a filtered view of ``raw``.
    counties = (
        raw.loc[keep]
        .rename(columns = {"Area\nCode": "GEOFIPS"})
        .assign(GEOFIPS = lambda frame: frame["GEOFIPS"].astype(int))
        .set_index("GEOFIPS")
    )

    # Align to exactly the counties present in the trade data, in that order.
    county_ids = trade_data.index.get_level_values(0).unique().astype(int).tolist()

    months = counties.reindex(county_ids).iloc[:, month_col_positions]

    # Guard the positional selection: if the layout differs from what the
    # positions assume, fail loudly instead of emitting mislabelled months.
    unknown = [col for col in months.columns if col not in time_dict]
    if unknown:
        raise ValueError(
            "Columns selected as monthly employment are missing from time_dict: {0}".format(unknown)
        )

    # Wide (one column per month) -> long (one row per county and month).
    tidy = months.reset_index().melt("GEOFIPS")

    # Turn the month header into its datetime; every label is known at this point.
    tidy["variable"] = tidy["variable"].map(time_dict)

    tidy = (
        tidy.rename(columns = {"variable": "time", "value": "emply_month", "GEOFIPS": "area_fips"})
        # Cast area_fips to string before it becomes an index level.
        # NOTE(review): presumably this matches the string keys of
        # trade_data's area_fips level so the later merge lines up -- confirm.
        .assign(area_fips = lambda frame: frame["area_fips"].astype(str))
        .set_index(["area_fips", "time"])
    )

    return tidy

Then given the function above, work through the file list. 

In [27]:
# One entry per BLS release to stack:
# (two-digit year used in the workbook name, quarters to read, month lookup).
bls_releases = [
    # Enable after downloading the 2016 archive:
    # ("16", ["1", "2", "3", "4"], empl_time_dict_16),
    ("17", ["1", "2", "3", "4"], empl_time_dict_17),
    ("18", ["1", "2", "3", "4"], empl_time_dict_18),
    ("19", ["1"], empl_time_dict_19),
]

# Collect the cleaned quarters in a list and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop re-copies all earlier rows on every pass.
quarterly_frames = []

for year_suffix, quarters, month_lookup in bls_releases:

    for quarter in quarters:

        # Workbooks are named allhlcn<yy><q>.xlsx inside the bls_files folder.
        file_name = os.path.join(cwd, "bls_files", "allhlcn" + year_suffix + quarter + ".xlsx")

        quarterly_frames.append(clean_bls_quarter(file_name, month_lookup))

df = pd.concat(quarterly_frames)

Then just check some stuff, reshape, and save for the analysis part. Note how this works (again, clunky): if you want goods employment, uncomment that line; if you want total employment, use the other one.

In [28]:
# Peek at the last rows of the stacked monthly employment frame.
df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,emply_month
area_fips,time,Unnamed: 2_level_1
9009,2019-03-01,315131.0
9011,2019-03-01,91395.0
9013,2019-03-01,26452.0
9015,2019-03-01,31210.0
9999,2019-03-01,40906.0


In [29]:
# Order the employment rows by county and then by month.
df = df.sort_values(by = ["area_fips", "time"])

In [30]:
# Attach monthly employment to the trade panel by matching on the index of
# both frames. This is a left join, so every trade_data row is kept and the
# time span of the result follows trade_data -- be mindful of the period
# covered when changing either input.
trade_employ = pd.merge(
    trade_data,
    df,
    how = "left",
    left_index = True,
    right_index = True,
)

In [32]:
# Save the merged trade + employment panel for the analysis step. The index
# levels are turned back into ordinary columns before writing.
#
# Goods-only variant (use when the employment data was built from NAICS 101):
# file_path = os.path.join(os.getcwd(), "data", "trade_employment_goods.parquet")

# os.path.join keeps the path valid on any OS; the previous "\\" separators
# only resolved correctly on Windows.
file_path = os.path.join(os.getcwd(), "data", "trade_employment_all.parquet")

pq.write_table(pa.Table.from_pandas(trade_employ.reset_index()), file_path)

In [33]:
# Pairwise correlations between the numeric columns of the merged panel.
# Numeric columns are selected explicitly: older pandas silently skipped
# non-numeric columns in corr(), while pandas >= 2.0 raises on them.
# NOTE(review): `fips` looks like the non-numeric column here -- confirm.
trade_employ.select_dtypes(include = ["number", "bool"]).corr()

Unnamed: 0,total_exp_pc,china_exp_pc,tariff,emplvl_2017,total_employment,emply_month
total_exp_pc,1.0,0.705856,0.288051,0.04176,-0.027994,-0.028684
china_exp_pc,0.705856,1.0,0.168035,-0.008813,-0.050778,-0.052891
tariff,0.288051,0.168035,1.0,-0.016043,-0.07393,-0.072093
emplvl_2017,0.04176,-0.008813,-0.016043,1.0,0.896478,0.896232
total_employment,-0.027994,-0.050778,-0.07393,0.896478,1.0,0.998848
emply_month,-0.028684,-0.052891,-0.072093,0.896232,0.998848,1.0


In [35]:
# Inspect the final 25 rows of the merged trade/employment panel.
trade_employ.tail(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_exp_pc,china_exp_pc,tariff,emplvl_2017,fips,total_employment,emply_month
area_fips,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9999,2017-07-01,3.877345,0.48635,0.001085,10.0,9999,37362.0,45139.0
9999,2017-08-01,4.350422,0.517713,0.001085,10.0,9999,37362.0,45383.0
9999,2017-09-01,4.24436,0.404123,0.001085,10.0,9999,37362.0,45743.0
9999,2017-10-01,3.867837,0.276036,0.001085,10.0,9999,37362.0,46457.0
9999,2017-11-01,4.041468,0.395534,0.001085,10.0,9999,37362.0,46585.0
9999,2017-12-01,4.238751,0.61033,0.001085,10.0,9999,37362.0,46132.0
9999,2018-01-01,3.376915,0.215411,0.001085,10.0,9999,37362.0,40965.0
9999,2018-02-01,3.679109,0.274159,0.001085,10.0,9999,37362.0,41184.0
9999,2018-03-01,4.782102,0.469788,0.001085,10.0,9999,37362.0,41307.0
9999,2018-04-01,3.82559,0.285198,0.001085,10.0,9999,37362.0,40636.0
