In [1]:
import pandas as pd             # data package
import matplotlib.pyplot as plt # graphics 
import datetime as dt
import numpy as np
from census import Census # This is new...

import requests, io             # internet and input tools  
import zipfile as zf            # zip file tools 
import os  

#import weightedcalcs as wc
#import numpy as np

import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
print("")
print("**********************************************************************************")
print("Downloading and processing BLS file")
print("")

url = "https://data.bls.gov/cew/data/files/2017/csv/2017_annual_singlefile.zip"
# Annual "single file" for 2017. It's big, but has every series used below.

r = requests.get(url)
# Stop here with a clear HTTP error if the download failed; otherwise an error
# page would only surface later as a confusing BadZipFile exception.
r.raise_for_status()

# Wrap the downloaded bytes so they can be opened as an in-memory zip archive.
bls_sf = zf.ZipFile(io.BytesIO(r.content))
print('Type of zipfile object:', type(bls_sf))

# Only these columns are read from the CSV.
clist = ['area_fips', 'own_code', 'industry_code', 'agglvl_code', 'size_code',
       'year', 'disclosure_code', 'annual_avg_estabs',
       'annual_avg_emplvl', 'total_annual_wages','avg_annual_pay']

# Read the first member of the archive; the context manager closes the member
# handle once pandas has consumed it.
# NOTE(review): the recorded run emitted a pandas warning at this point and later
# outputs show fips such as 2013 without a leading zero, so area_fips is presumably
# parsed with mixed int/str types. Passing an explicit dtype would change downstream
# results -- TODO confirm before changing.
with bls_sf.open(bls_sf.namelist()[0]) as bls_csv:
    df = pd.read_csv(bls_csv, usecols=clist)


**********************************************************************************
Downloading and processing BLS file

Type of zipfile object: <class 'zipfile.ZipFile'>


  interactivity=interactivity, compiler=compiler, result=result)


The cell below then cleans things up. The most important piece is `NAICS_county_level`, which selects the NAICS aggregation level together with the county aggregation level. The website describing these codes is here:

[https://data.bls.gov/cew/doc/titles/agglevel/agglevel_titles.htm](https://data.bls.gov/cew/doc/titles/agglevel/agglevel_titles.htm)

In [3]:
NAICS_county_level = 75
# Aggregation-level code that selects county rows at the 3 digit NAICS level.

# Keep county / 3 digit NAICS rows that are privately owned (own_code 5).
is_county_naics3 = df.agglvl_code == NAICS_county_level
is_private = df.own_code == 5
df_county = df[is_county_naics3 & is_private].copy()

# Drop Puerto Rico (72), Virgin Islands (78), Alaska (02) and Hawaii (15) by state prefix.
# NOTE(review): later outputs still show Alaska fips (e.g. 2013), so this filter
# presumably misses rows whose area_fips is not a string -- confirm. The analysis
# is run with them included; they are meant to be dropped when mapping.
state_prefix = df_county.area_fips.str[0:2]
df_county = df_county[~state_prefix.isin(["72", "78", "02", "15"])]

# Second character of the industry code as an integer; rows where it is 0 are
# treated as "super industries" and removed. Not clear this matters given the
# conditioning above.
df_county = df_county.assign(sup_ind=df_county.industry_code.str[1].astype(int))
df_county = df_county[df_county["sup_ind"] > 0]

df_county = df_county.assign(area_fips=df_county.area_fips.astype(str))

# National employment per industry, summed over the county rows kept above.
df_national = (
    df_county.groupby("industry_code")
    .agg({"annual_avg_emplvl": "sum"})
    .reset_index()
    .rename(columns={"annual_avg_emplvl": "nat_emplvl"})
)

Let's compute annual employment. 

In [4]:
# Sanity check: total annual average employment over the filtered county rows.
df_county.annual_avg_emplvl.sum()

115756851

which matches well with FRED (https://fred.stlouisfed.org/series/USPRIV) in 2017 (off by a couple million)

### Read in Trade Data and Merge

In [13]:
# Trade data keyed by 3 digit NAICS code; naics3 is read as a string so it can
# be matched against the string industry codes.
imports_by_naics = pd.read_csv(
    ".//data//imports_by_naics.csv", dtype={"naics3": str}
).set_index("naics3")

# 2017 trade by 3 digit NAICS code, used to build the fixed trade weights.
dftrade_17_naics3 = pd.read_csv(
    ".//data//2017_imports_by_naics.csv", dtype={"naics3": str}
).set_index("naics3")

In [14]:
# Preview the 2017 trade values indexed by naics3.
dftrade_17_naics3.head()


Unnamed: 0_level_0,2017_china_trade
naics3,Unnamed: 1_level_1
111,616881500.0
112,37346220.0
113,236869800.0
114,2323795000.0
211,1397256.0


In [15]:
# Attach 2017 China trade to each national industry row. Any columns left over
# from an earlier run of this cell are dropped first, so re-running it does not
# create suffixed duplicates (2017_china_trade_x / _y).
df_national = df_national.drop(
    columns=["2017_china_trade", "trd_wts"], errors="ignore"
).merge(dftrade_17_naics3["2017_china_trade"],
        left_on = "industry_code", right_index = True, how = "left")

# Industries with no trade record get zero trade. Assign the result back rather
# than calling replace(..., inplace=True) on the selected column, which does not
# update the parent frame under pandas Copy-on-Write.
df_national["2017_china_trade"] = df_national["2017_china_trade"].fillna(0)

# Each industry's share of total 2017 China trade.
df_national["trd_wts"] = (df_national["2017_china_trade"]/df_national["2017_china_trade"].sum())

Then check to make sure that the trade weights sum up to one.

In [16]:
# The trade weights are shares of the total, so this should display 1.0.
df_national.trd_wts.sum()

1.0

---

### Step 3 Merge trade data with the county data

This is the most time-consuming step (in terms of computation time). Start with the county data set, `groupby` county, then apply a function which will create (i) time-varying exports (constructed with the 2017 weights) and (ii) time-varying tariffs (also constructed using the 2017 weights).

The final want is a big dataframe that has county, time, export exposure and tariff exposure. 

In [17]:
# Progress banner marking the start of the county-level construction step.
print("")
print("**********************************************************************************")
print("Constructing County-Level Tariffs and Exports")
print("")


**********************************************************************************
Constructing County-Level Tariffs and Exports



In [18]:
# One group per county (area_fips); each group is later passed to create_trade_weights.
grp = df_county.groupby("area_fips")

# Let's just look at one of the groups...
# (left disabled; uncomment to inspect a single county's rows)

#grp.get_group("1001").head()

Below are the two key functions that deliver this. Basically it does the following: 

- Take a group at county level, merge it with the national level data set, so the resulting `df` has the county and nation.

- Create the weights. 

- Then merge it with the exports, this will now be a df with exports varying over time, but with the fixed weights associated with each entry.

- Then aggregate the national exports by NAICS by the county level weights, giving a county level time series of exports.

---

**Updates**

- The tariff measure does the following: fix a county, take employment in industry $s$ and divide by total county employment, then sum up tariffs across industries with the weights being the county-level share. The idea here is that if all employment in a county is in soy, then the "effective" tariff that the county faces is the soy tariff.

In equation terms, $c$ is the county, $s$ is the industry, and $n$ (used below) is the nation.

$\tau_{c,t} = \sum_{s\in S}\frac{L_{c,s}}{L_{c,S}} \tau_{s,t}$

Note that below, I make one further adjustment to make sure that $L_{c,S}$ is for all employment, not just the sum across $L_{c,s}$


- The export measure: take a county's employment in industry $s$ and divide by **national**-level employment in industry $s$. Then a county's exports are the sum across industries, weighted by the county's share of national employment in each industry. The idea here is that if a county has all national-level employment in an industry, all of that industry's exports will be assigned to that county.

$\mbox{EX}_{c,t} = \frac{1}{L_{c,S,2017}}\sum_{s\in S}\frac{L_{c,s,2017}}{L_{n,s,2017}} \mbox{EX}_{s,t}$

and then I divide by total employment in the county to have a county per worker measure. This is done for exports to China and then export in total. Note that below, I make one further adjustment to make sure that $L_{c,S}$ is for all employment, not just the sum across $L_{c,s}$




In [22]:
def create_trade_weights(df):
    """Build one county's time series of trade exposure and tariff measures.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of one ``area_fips`` group. The columns ``industry_code``,
        ``annual_avg_emplvl`` and ``area_fips`` are read.

    Returns
    -------
    pandas.DataFrame
        One row per ``time`` value, holding the columns produced by
        ``trade_by_naics`` plus ``fips`` and ``total_employment``.

    Notes
    -----
    Reads the module-level frames ``df_national`` and ``imports_by_naics``.
    """
    national_cols = df_national[["nat_emplvl", "industry_code", "trd_wts"]]

    # Outer join so every NAICS code from either side is lined up; codes the
    # county lacks come through with missing county employment.
    county_nation = df.merge(national_cols, how="outer", on="industry_code")

    # County employment as a share of national employment, per industry.
    county_nation["emp_wts"] = county_nation.annual_avg_emplvl / county_nation.nat_emplvl

    # Pair every (NAICS code, time) trade record with this county's weights.
    # This is an inner join, so NAICS codes without trade data drop out here.
    weight_cols = ["emp_wts", "trd_wts", "industry_code", "annual_avg_emplvl"]
    trade_with_weights = imports_by_naics.merge(
        county_nation[weight_cols], left_index=True, right_on="industry_code"
    )

    # Aggregate across NAICS codes within each time period, then remove the
    # inner index level that apply() adds, leaving the frame indexed by time.
    by_period = trade_with_weights.groupby("time").apply(trade_by_naics)
    by_period = by_period.droplevel(1)

    by_period["fips"] = df["area_fips"].astype(str).iloc[0]

    # Employment summed over all merged industry rows, not only the traded ones.
    by_period["total_employment"] = county_nation.annual_avg_emplvl.sum()

    return pd.DataFrame(by_period)
    
    

In [23]:
def trade_by_naics(df):
    """Collapse one time period's NAICS rows into county-level measures.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single time period with the columns ``annual_avg_emplvl``,
        ``emp_wts``, ``china_trade``, ``total_trade`` and ``tariff_trd_w_avg``.

    Returns
    -------
    pandas.DataFrame
        A single row with ``total_imp_pc``, ``china_imp_pc``, ``tariff`` and
        ``emplvl_2017`` (the employment summed over the rows passed in).
    """
    traded_emp = df["annual_avg_emplvl"].sum()
    per_worker = 1 / traded_emp
    emp_share = df["emp_wts"]

    # Trade weighted by the county's share of national employment in each
    # industry, then scaled to a per-worker figure.
    total_imp_pc = per_worker * (df["total_trade"] * emp_share).sum()
    china_imp_pc = per_worker * (df["china_trade"] * emp_share).sum()

    # Local employment share weighted tariff: if everyone in the area works in
    # soy, the area faces the soybean tariff.
    tariff = (df["annual_avg_emplvl"] * df["tariff_trd_w_avg"] / traded_emp).sum()

    measures = {
        "total_imp_pc": [total_imp_pc],
        "china_imp_pc": [china_imp_pc],
        "tariff": [tariff],
        "emplvl_2017": traded_emp,
    }

    return pd.DataFrame(measures)

Then apply the function to the county groups

In [24]:
# Run create_trade_weights on every county group and stack the per-county
# frames; the displayed result is indexed by (area_fips, time).
trade_county = grp.apply(create_trade_weights)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  
  


And we are done and output the file to where we want it

Unnamed: 0_level_0,Unnamed: 1_level_0,total_imp_pc,china_imp_pc,tariff,emplvl_2017,fips,total_employment
area_fips,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
48123,2019-09-01,114485.752818,46449.517478,32.989823,600.0,48123,4137.0
48123,2019-10-01,115237.179352,40010.055528,32.989823,600.0,48123,4137.0
48123,2019-11-01,99380.664946,33064.010855,32.989823,600.0,48123,4137.0
48123,2019-12-01,103693.084538,34645.488358,32.989823,600.0,48123,4137.0
48123,2020-01-01,115467.020595,41316.863047,32.989434,600.0,48123,4137.0
36035,2019-09-01,97199.727453,45363.001413,32.940729,1231.0,36035,9568.0
36035,2019-10-01,98404.233825,39292.346525,32.940729,1231.0,36035,9568.0
36035,2019-11-01,83226.331981,32433.032949,32.940729,1231.0,36035,9568.0
36035,2019-12-01,84177.689456,33662.187227,32.940729,1231.0,36035,9568.0
36035,2020-01-01,95201.626814,39860.880563,32.940651,1231.0,36035,9568.0


**One more adjustment.** Notice that in the function, when we are merging, we are dropping all the NAICS codes without trade. So these measures (total trade, China trade, and tariffs) are only conditional on being traded. This only matters insofar as the denominator, the ``df["annual_avg_emplvl"].sum()``, is concerned.

To make the adjustment then, we multiply the employment measure in the denominator and then divide through by the ``total_employment`` measure. 

In [26]:
# Rescale each measure from "per worker in traded industries" to "per worker in
# all industries": multiply by traded employment, divide by total employment.
# NOTE: running this cell more than once applies the rescaling repeatedly.
traded_share = trade_county["emplvl_2017"] / trade_county["total_employment"]

for measure in ("tariff", "china_imp_pc", "total_imp_pc"):
    trade_county[measure] = traded_share * trade_county[measure]

In [27]:
# Show the 25 rows with the highest tariff, ties ordered by emplvl_2017 (both descending).
trade_county.sort_values(by = ["tariff","emplvl_2017"], ascending = False).head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_imp_pc,china_imp_pc,tariff,emplvl_2017,fips,total_employment
area_fips,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2013,2020-01-01,4254.824381,207.345253,30.66934,1895.0,2013,1944.0
2013,2019-09-01,4076.358774,199.916696,30.669323,1895.0,2013,1944.0
2013,2019-10-01,4403.587918,173.825751,30.669323,1895.0,2013,1944.0
2013,2019-11-01,4053.388117,162.770589,30.669323,1895.0,2013,1944.0
2013,2019-12-01,4117.081888,192.625473,30.669323,1895.0,2013,1944.0
2013,2020-02-01,3780.13368,155.089665,29.051986,1895.0,2013,1944.0
2013,2020-03-01,4708.596839,119.131824,29.051986,1895.0,2013,1944.0
2013,2019-06-01,4117.486294,177.746891,27.434596,1895.0,2013,1944.0
2013,2019-07-01,4432.215964,189.09461,27.434596,1895.0,2013,1944.0
2013,2019-08-01,4284.659687,192.24698,27.434594,1895.0,2013,1944.0


In [28]:
# Read the Census API key from the environment rather than storing it in the
# notebook; a key committed to a shared file should be treated as exposed.
my_api_key = os.environ.get("CENSUS_API_KEY")
if not my_api_key:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable to your Census API key "
        "before running this cell."
    )

c = Census(my_api_key)
# This creates an object c which has methods associated with it.
# Try c.<tab> to see the options.

code = ("NAME","B01001_001E","B19013_001E")
# Variables requested; B01001_001E and B19013_001E are renamed below to
# 2017_population and 2017_income respectively.

county_2017 = pd.DataFrame(c.acs5.get(code,
                                         {'for': 'county:*'}, year=2017))
                                         # 'county:*' is a wild card requesting every county.

county_2017 = county_2017.rename(columns = {"B01001_001E":"2017_population", "B19013_001E":"2017_income"})

# Integer county identifier built by concatenating the state and county code
# strings; it is used as the merge key against the integer area_fips.
county_2017["GEOFIPS"] = (county_2017["state"] + county_2017["county"]).astype(int)

county_2017["2017_population"] = county_2017["2017_population"].astype(float)

county_2017["2017_income"] = county_2017["2017_income"].astype(float)

county_2017 = county_2017.set_index(["GEOFIPS"])

In [29]:
# Bring area_fips/time back as columns, add an integer copy of area_fips to
# match the integer GEOFIPS index of county_2017, left-join income and
# population, then restore the (area_fips, time) index.
trade_county = trade_county.reset_index()

trade_county = trade_county.assign(
    int_area_fips=trade_county["area_fips"].astype(int)
)

census_cols = county_2017[["2017_income","2017_population"]]

trade_county = trade_county.merge(
    census_cols, left_on="int_area_fips", right_index=True, how="left"
).set_index(["area_fips", "time"])

In [31]:
# Preview the merged frame, now including 2017_income and 2017_population.
trade_county.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,total_imp_pc,china_imp_pc,tariff,emplvl_2017,fips,total_employment,int_area_fips,2017_income,2017_population
area_fips,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10001,2015-01-01,455.305454,25.407092,0.751445,2843.0,10001,29514.0,10001,57647.0,173145.0
10001,2015-02-01,404.659655,20.680831,0.751448,2843.0,10001,29514.0,10001,57647.0,173145.0
10001,2015-03-01,501.924814,26.873311,0.751466,2843.0,10001,29514.0,10001,57647.0,173145.0
10001,2015-04-01,491.740587,25.616221,0.751465,2843.0,10001,29514.0,10001,57647.0,173145.0
10001,2015-05-01,459.0108,26.146841,0.751465,2843.0,10001,29514.0,10001,57647.0,173145.0


In [32]:
# Build the output path with the platform's separator. Hard-coded backslashes
# only form a directory path on Windows; elsewhere they would create a file
# literally named ".\data\imports_trade_data_2020.parquet" in the working directory.
file_path = os.path.join(".", "data", "imports_trade_data_2020.parquet")

# Write the frame with area_fips and time as ordinary columns rather than an index.
pq.write_table(pa.Table.from_pandas(trade_county.reset_index()), file_path)

  'start': level._start,
  'stop': level._stop,
  'step': level._step
