In [31]:
import pandas as pd             # data package
import matplotlib.pyplot as plt # graphics 
import datetime as dt
import numpy as np
from census import Census # This is new...

import requests, io             # internet and input tools  
import zipfile as zf            # zip file tools 
import os  

#import weightedcalcs as wc
#import numpy as np

import pyarrow as pa
import pyarrow.parquet as pq

This file creates the trade file behind the [Phase One Tracker](https://www.tradewartracker.com/). It proceeds in several steps.

1. Grabs the trade data

2. Constructs the relevant Phase One product categories and the associated goals/targets.

3. Maps the data to the county level.

In [32]:
trade_type = "exports"

# Census API key, appended to every request URL as the `key` query parameter.
# Set the CENSUS_API_KEY environment variable to use your own key; the author's
# posted key stays as the fallback so the notebook still runs unchanged.
# If you will be doing more with this, please get your own key!
my_key = "&key=" + os.environ.get("CENSUS_API_KEY", "34e40301bda77077e24c859c6c6c0b721ad73fc7")

---
#### Grab the trade data using the Census Bureau's API

In [33]:
def census_trade(url, trade_type, country, product_level):
    """Run one Census international-trade API query and tidy the result.

    Parameters
    ----------
    url : str
        Complete request URL (endpoint, query string and key).
    trade_type : str
        "imports" or "exports". Selects which value and commodity columns of
        the response are used (CON_VAL_MO / I_COMMODITY for imports,
        ALL_VAL_MO / E_COMMODITY for exports).
    country : str
        Label for the trading partner; the value column is named
        ``<country>_<trade_type>``, e.g. ``china_exports``.
    product_level : str
        Name given to the commodity-code column, e.g. ``"hs6"``.

    Returns
    -------
    pandas.DataFrame
        The response rows with ``time`` parsed as a datetime, the trade value
        as float, the commodity code as text, and the raw value, commodity and
        ``COMM_LVL`` columns removed.

    Raises
    ------
    ValueError
        If ``trade_type`` is neither "imports" nor "exports".
    requests.HTTPError
        If the API responds with an error status.
    """
    # (value column, commodity column) used by the API for each trade flow.
    source_columns = {
        "imports": ("CON_VAL_MO", "I_COMMODITY"),
        "exports": ("ALL_VAL_MO", "E_COMMODITY"),
    }

    # Fail before downloading anything rather than returning a frame that
    # still carries the raw API column names.
    if trade_type not in source_columns:
        raise ValueError(
            "trade_type must be 'imports' or 'exports', got " + repr(trade_type)
        )

    value_col, commodity_col = source_columns[trade_type]

    r = requests.get(url)

    print(r)

    # Turn an HTTP error into an explicit exception instead of a confusing
    # JSON decoding failure further down.
    r.raise_for_status()

    payload = r.json()  # decode the body once; the first entry holds the labels

    df = pd.DataFrame(payload[1:], columns=payload[0])

    df["time"] = pd.to_datetime(df["time"], format="%Y-%m")
    # This is so I can call this correctly...

    trade_label = country + "_" + trade_type

    df[trade_label] = df[value_col].astype(float)

    df[product_level] = df[commodity_col].astype(str)

    return df.drop([value_col, commodity_col, "COMM_LVL"], axis=1)

In [44]:
# Fields requested from the HS endpoint: the commodity code and its monthly export value.
end_use = "hs?get=E_COMMODITY,ALL_VAL_MO"

# Assemble the request in one pass: endpoint, fields, key, start month,
# product level (HS6) and partner country code (5700, labelled "china" below).
# NOTE(review): the time filter is written with a double "=" ("time==from+...");
# the recorded output shows the API accepted it - confirm against the API docs.
url = "".join([
    "https://api.census.gov/data/timeseries/intltrade/exports/",
    end_use,
    my_key,
    "&time==from+2013-01",
    "&COMM_LVL=HS6",
    "&CTY_CODE=5700",
])

df = census_trade(url, trade_type, "china", "hs6")

# The four-digit heading is the first four characters of the six-digit code.
df["hs4"] = df["hs6"].str.slice(0, 4)

# Most recent month present in the download.
df["time"].max()

<Response [200]>


Timestamp('2020-05-01 00:00:00')

In [45]:
# Peek at the first rows to check the columns produced by census_trade and the added hs4 code.
df.head()

Unnamed: 0,time,CTY_CODE,china_exports,hs6,hs4
0,2013-01-01,5700,4806.0,841090,8410
1,2013-01-01,5700,287000.0,481031,4810
2,2013-01-01,5700,311650.0,481160,4811
3,2013-01-01,5700,63701.0,481620,4816
4,2013-01-01,5700,40265.0,490290,4902


---
#### Bring in the Phase One Product list

In [46]:
# Product list stored in data/annex-6-1.csv. The hs4 code is read as text so that
# it matches the text hs4 column built from the trade data.
# os.path.join keeps the path working on macOS/Linux as well as Windows.
dfproducts = pd.read_csv(os.path.join("data", "annex-6-1.csv"), dtype = {"hs4": str})

In [47]:
# Attach the product-list columns to every trade row by hs4 code. The left join
# keeps trade rows with no match in the list, and the `_merge` indicator column
# records which rows matched.
df_phaseone = pd.merge(df, dfproducts, how = "left", left_on = "hs4", right_on = "hs4", indicator = True)

In [48]:
# Concordance table with an `hs6` and a `naics` column, read from the data folder.
# os.path.join keeps the path working on macOS/Linux as well as Windows.
concordance = pq.read_table(os.path.join("data", "alt_concordance.parquet")).to_pandas()

# Lookup from hs6 code to NAICS code (if an hs6 code repeats, the last entry wins).
dict_concordance = dict(zip(concordance.hs6,concordance.naics)) 

df_phaseone["naics"] = df_phaseone["hs6"].map(dict_concordance)

df_phaseone["naics3"] = df_phaseone["naics"].str[0:3]
# The NAICS codes are for mapping the data to the county level data
# The NAICS codes are for mapping the data to the county level data

In [49]:
# Trade rows with no match in the product list have no category; give them an
# explicit label so they form their own group in the aggregations below.
df_phaseone["high_catagory"] = df_phaseone["high_catagory"].fillna("not in aggreement")

The output file ``phaseone-tradedata.parquet`` is the main file used in the ``phase-one-plots.ipynb`` notebook.

In [50]:
# Save the merged trade data for the plotting notebook.
# os.path.join keeps the path working on macOS/Linux as well as Windows.
out_file = os.path.join("data", "phaseone-tradedata.parquet")

pq.write_table(pa.Table.from_pandas(df_phaseone), out_file)

#### This then constructs the benchmark and goal measures

In [51]:
# Index by month so that years can be selected with .loc["2017"] and similar below.
df_phaseone = df_phaseone.set_index("time")

In [52]:
# Benchmark year: total 2017 exports within each top-level category.
grp = df_phaseone.loc["2017"].groupby("high_catagory")

benchmarks = grp[["china_exports"]].sum().rename(columns = {"china_exports": "2017 Values"})

In [53]:
# Same totals for the 2020 months present in the download.
grp = df_phaseone.loc["2020"].groupby("high_catagory")

current = grp[["china_exports"]].sum().rename(columns = {"china_exports": "2020 Values"})

In [54]:
# Put the 2017 and 2020 totals side by side, matched on the category index.
benchmarks = pd.merge(benchmarks, current, left_index = True, right_index = True)

In [55]:
# These are the goals from the AGREEMENT, keyed by category label so that the
# assignment does not depend on the row order of `benchmarks`.
phase_one_goals = {
    "1. Manufactured Goods": 32900000000,
    "2. Agriculture": 12500000000,
    "3. Energy": 18500000000,
}

# Categories without a goal (the "not in aggreement" row) are left as NaN.
# Building the column as float avoids writing NaN into an integer column.
benchmarks["Goals"] = pd.Series(phase_one_goals, dtype = float).reindex(benchmarks.index)

Then the ``phaseone-goals.parquet`` file is the data file used to create the bar graph.

In [56]:
# Save the benchmark/goal table for the bar graph.
# os.path.join keeps the path working on macOS/Linux as well as Windows.
out_file = os.path.join("data", "phaseone-goals.parquet")

pq.write_table(pa.Table.from_pandas(benchmarks), out_file)

In [57]:
# Show the finished table: 2017 totals, 2020 totals and the goal for each category.
benchmarks

Unnamed: 0_level_0,2017 Values,2020 Values,Goals
high_catagory,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1. Manufactured Goods,50437860000.0,19479850000.0,32900000000.0
2. Agriculture,21256170000.0,5585255000.0,12500000000.0
3. Energy,15922060000.0,3505937000.0,18500000000.0
not in aggreement,50264670000.0,13310000000.0,


---
#### The code below then performs the projection of the data down to the county level

The first step is to group on NAICS. Note that, because of how this is constructed, only Phase One covered products are included.

In [58]:
# Display the rows from 2017 onward (the frame is indexed by month at this point).
df_phaseone.loc["2017":]

Unnamed: 0_level_0,CTY_CODE,china_exports,hs6,hs4,description,low_catagory,high_catagory,_merge,naics,naics3
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-01-01,5700,753280.0,200899,2008,"Fruit, nuts and other edible parts of plants, ...",Other agricultural commodities,2. Agriculture,both,311421,311
2017-01-01,5700,361441.0,151800,1518,"Animal or vegetable fats, oils and their fract...",Other agricultural commodities,2. Agriculture,both,311613,311
2017-01-01,5700,289709.0,152000,1520,"Glycerol (glycerine), whether or not pure; gly...",Other agricultural commodities,2. Agriculture,both,325611,325
2017-01-01,5700,36631.0,152110,1521,"Vegetable waxes (other than triglycerides), be...",Other agricultural commodities,2. Agriculture,both,325998,325
2017-01-01,5700,2285893.0,160239,1602,"Other prepared or preserved meat, meat offal o...",Meat,2. Agriculture,both,311615,311
...,...,...,...,...,...,...,...,...,...,...
2020-05-01,5700,23320264.0,852351,8523,"Discs, tapes, solid-state non-volatile storage...",Electrical equipment and mac.hioery,1. Manufactured Goods,both,334613,334
2020-05-01,5700,341171.0,852352,8523,"Discs, tapes, solid-state non-volatile storage...",Electrical equipment and mac.hioery,1. Manufactured Goods,both,334413,334
2020-05-01,5700,32764.0,852359,8523,"Discs, tapes, solid-state non-volatile storage...",Electrical equipment and mac.hioery,1. Manufactured Goods,both,334613,334
2020-05-01,5700,327850.0,852380,8523,"Discs, tapes, solid-state non-volatile storage...",Electrical equipment and mac.hioery,1. Manufactured Goods,both,334614,334


In [59]:
# Only 2017 onward is needed for the county projection.
df_phaseone = df_phaseone.loc["2017":]

# (1) All exports: summed by month and three-digit NAICS, then re-indexed by
#     naics3 with the month kept as an ordinary column.
grp = df_phaseone.groupby(["time","naics3"])

exports_by_naics = (
    grp.agg({"china_exports": "sum"})
    .reset_index()
    .set_index(["naics3"])
)

##################################################################

# (2) The same aggregation, restricted to rows that belong to a Phase One
#     category, with the value column renamed so both can sit side by side.
grp = df_phaseone.loc[df_phaseone["high_catagory"].ne("not in aggreement")].groupby(["time","naics3"])

exports_phaseone = (
    grp.agg({"china_exports": "sum"})
    .reset_index()
    .set_index(["naics3"])
    .rename(columns = {"china_exports": "phase_one_exports"})
)

##################################################################

# Combine the two on (naics3, month). The default inner join keeps only the
# industry/month pairs that appear in both tables.
exports_by_naics = exports_by_naics.merge(exports_phaseone, 
                                          left_on = ["naics3", "time"], right_on = ["naics3", "time"])

In [60]:
exports_by_naics.tail()

Unnamed: 0_level_0,time,china_exports,phase_one_exports
naics3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
337,2020-05-01,4748678.0,0.0
339,2020-05-01,357396485.0,308669410.0
910,2020-05-01,121597022.0,4654176.0
930,2020-05-01,10236239.0,4171017.0
990,2020-05-01,101889086.0,284077.0


The following functions do the following: 

1. Grab the BLS QCEW file for 2017

2. Then create the exports per worker measure at the county level. 

In [61]:
def download_bls():
    """Download the 2017 BLS QCEW annual file and build county and national employment tables.

    The annual "single file" zip is fetched from data.bls.gov and the first
    member of the archive is read, keeping only the columns used here.

    Returns
    -------
    df_county : pandas.DataFrame
        Rows at aggregation level 75 (county, 3-digit NAICS) with
        ``own_code == 5`` (private), excluding ``area_fips`` values starting
        with 72, 78, 02 or 15 and industry codes whose second character is "0".
        Holds the selected QCEW columns plus the helper column ``sup_ind``.
    df_national : pandas.DataFrame
        Columns ``industry_code`` and ``nat_emplvl``: ``annual_avg_emplvl``
        summed over the rows of ``df_county`` for each industry.

    Raises
    ------
    requests.HTTPError
        If the download does not return a success status.
    """
    print("")
    print("**********************************************************************************")
    print("Downloading and processing BLS file")
    print("")

    url = "https://data.bls.gov/cew/data/files/2017/csv/2017_annual_singlefile.zip"
    # This will read in the annual, single file. It's big, but has all we want...

    r = requests.get(url)

    # Stop here on an HTTP error instead of failing later with a bad-zip error.
    r.raise_for_status()

    clist = ['area_fips', 'own_code', 'industry_code', 'agglvl_code', 'size_code',
       'year', 'disclosure_code', 'annual_avg_estabs',
       'annual_avg_emplvl', 'total_annual_wages','avg_annual_pay']

    # convert bytes to zip file
    with zf.ZipFile(io.BytesIO(r.content)) as bls_sf:
        print('Type of zipfile object:', type(bls_sf))

        # area_fips and industry_code are identifiers, not numbers. Reading them
        # as text keeps leading zeros ("01001") and guarantees the .str prefix
        # tests below apply to every row; with inferred dtypes, numeric-looking
        # codes can come back as integers and slip past those tests.
        with bls_sf.open(bls_sf.namelist()[0]) as csv_file:
            df = pd.read_csv(csv_file, usecols= clist,
                             dtype = {"area_fips": str, "industry_code": str})

    ########################################################################

    NAICS_county_level = 75
    # This is the code that will select only counties at the 3 digit NAICS level

    # Two-character area_fips prefixes to leave out: Puerto Rico (72), Alaska (02)
    # and Hawaii (15) per the original note; 78 is presumably the U.S. Virgin Islands.
    excluded_prefixes = ["72", "78", "02", "15"]

    keep = (
        (df.agglvl_code == NAICS_county_level)
        & (df.own_code == 5)                                  # Only grab private stuff
        & ~df.area_fips.str[0:2].isin(excluded_prefixes)
    )

    df_county = df[keep].copy()

    df_county["sup_ind"] = df_county.industry_code.str[1].astype(int)
    # sometimes there are super industries floating around we want to drop them.
    # not clear if this matters with the conditioning all ready

    df_county = df_county[df_county["sup_ind"] > 0]

    # National employment by industry, summed over the counties kept above.
    df_national = df_county.groupby("industry_code").agg({"annual_avg_emplvl": "sum"})

    df_national = df_national.reset_index()

    df_national = df_national.rename({"annual_avg_emplvl":"nat_emplvl"}, axis = 1)

    return df_county, df_national

In [62]:
def create_trade_weights(df):
    """Build the monthly export measures for one county.

    ``df`` holds one county's rows (one per industry code). The function reads
    the module-level ``df_national`` and ``exports_by_naics`` tables and calls
    ``trade_by_naics`` once per month.

    Returns a DataFrame indexed by ``time`` containing the columns produced by
    ``trade_by_naics`` plus ``fips`` (the county's ``area_fips`` as text) and
    ``total_employment`` (the county's summed ``annual_avg_emplvl``).
    """
    # Line the county's industries up against the national list. The outer join
    # keeps every industry code that appears on either side.
    national_emp = df_national[["nat_emplvl", "industry_code"]]
    county_naics = df.merge(national_emp, how = "outer",
                            left_on = "industry_code", right_on = "industry_code")

    # Weight = the county's share of national employment in each industry.
    county_naics["emp_wts"] = county_naics.annual_avg_emplvl / county_naics.nat_emplvl

    # Attach the weights to the trade data: one row per (industry, month), since
    # exports_by_naics is indexed by the NAICS code.
    weight_cols = ["emp_wts", "industry_code", "annual_avg_emplvl"]
    trade_with_weights = exports_by_naics.merge(county_naics[weight_cols],
                                                left_index = True, right_on = "industry_code")

    # Aggregate across industries within each month, then drop the inner index
    # level left behind by the per-group frames.
    monthly = trade_with_weights.groupby("time").apply(trade_by_naics).droplevel(1)

    # Label every row with the county code (taken from the first input row) and
    # the county's total employment.
    monthly["fips"] = df["area_fips"].astype(str).iloc[0]
    monthly["total_employment"] = county_naics.annual_avg_emplvl.sum()

    return pd.DataFrame(monthly)
    

In [63]:
def trade_by_naics(df):
    """Collapse one group of industry rows into per-worker export measures.

    ``df`` needs the columns ``annual_avg_emplvl``, ``emp_wts``,
    ``china_exports`` and ``phase_one_exports``. Returns a one-row DataFrame
    with ``china_exp_pc``, ``china_pho_pc`` and ``emplvl_2017``.
    """
    # Employment summed over the rows in this group; reused for both measures.
    county_emp = df["annual_avg_emplvl"].sum()
    per_worker = 1/county_emp

    # Trade in each industry times the employment weight, summed across the rows.
    weighted_all = (df["china_exports"]*df["emp_wts"]).sum()
    weighted_pho = (df["phase_one_exports"]*df["emp_wts"]).sum()

    # Dividing by the summed employment turns the weighted totals into per-worker values.
    return pd.DataFrame({
        "china_exp_pc": [per_worker*weighted_all],
        "china_pho_pc": [per_worker*weighted_pho],
        "emplvl_2017": county_emp,
    })

In [None]:
df_county, df_national = download_bls()

grp = df_county.groupby("area_fips")

# Build the county-by-month table one county at a time. Iterating over the groups
# always hands create_trade_weights the complete county frame, including the
# `area_fips` column it reads; relying on GroupBy.apply to pass the grouping
# column along is deprecated as of pandas 2.2. The result has the same
# (area_fips, time) index that apply produced.
trade_county = pd.concat(
    {fips: create_trade_weights(county) for fips, county in grp},
    names = ["area_fips"],
)

# Rescale both per-worker measures by the ratio of the employment used inside
# trade_by_naics to the county's total employment.
emp_share = trade_county["emplvl_2017"]/trade_county["total_employment"]

trade_county["china_exp_pc"] = emp_share*trade_county["china_exp_pc"]

trade_county["china_pho_pc"] = emp_share*trade_county["china_pho_pc"]


**********************************************************************************
Downloading and processing BLS file

Type of zipfile object: <class 'zipfile.ZipFile'>


In [None]:
# First rows of the county-by-month table built above.
trade_county.head()

In [None]:
# NOTE(review): this repeats the display in the previous cell; one of the two can be deleted.
trade_county.head()

And we are set. The only remaining step is to add some information from the Census.

In [None]:
# Census API key: taken from the CENSUS_API_KEY environment variable when it is
# set, otherwise the author's posted key so the notebook still runs unchanged.
my_api_key = os.environ.get("CENSUS_API_KEY", '34e40301bda77077e24c859c6c6c0b721ad73fc7')

c = Census(my_api_key)
# This will create an object c which has methods associated with it.

code = ("NAME","B01001_001E","B19013_001E")
# The two variable codes are renamed to 2017_population and 2017_income below.

county_2017 = pd.DataFrame(c.acs5.get(code, 
                                         {'for': 'county:*'}, year=2017))
                                         # We specify county then the wild card to get every county
        
county_2017 = county_2017.rename(columns = {"B01001_001E":"2017_population", "B19013_001E":"2017_income"})

# State code + county code gives the county identifier; stored as an integer so it
# can be matched against the integer county code built from the trade table.
county_2017["GEOFIPS"] = (county_2017["state"] + county_2017["county"]).astype(int)

county_2017["2017_population"] = county_2017["2017_population"].astype(float)

county_2017["2017_income"] = county_2017["2017_income"].astype(float)

# The ACS reports unavailable estimates as large negative annotation codes
# (e.g. -666666666). Income cannot be negative, so treat those as missing.
county_2017["2017_income"] = county_2017["2017_income"].where(county_2017["2017_income"] >= 0)

county_2017 = county_2017.set_index(["GEOFIPS"])

In [None]:
# Bring the index levels back as columns, add an integer copy of the county code
# to match the integer GEOFIPS index of county_2017, attach income and population
# (left join: counties without a match keep NaN), then restore the
# (area_fips, time) index.
trade_county = (
    trade_county
    .reset_index()
    .assign(int_area_fips = lambda frame: frame["area_fips"].astype(int))
    .merge(county_2017[["2017_income","2017_population"]],
           left_on = "int_area_fips", right_index = True, how = "left")
    .set_index(["area_fips", "time"])
)

In [None]:
# Check the final table: export measures plus the 2017 income and population columns.
trade_county.head()

In [None]:
# Save the county-level table for the map notebook.
# os.path.join keeps the path working on macOS/Linux as well as Windows.
file_path = os.path.join("data", "phase_one_county.parquet")

pq.write_table(pa.Table.from_pandas(trade_county), file_path)

Then the ``phase_one_county.parquet`` file is the main file used in the ``phase-one-map.ipynb`` notebook.