In [138]:
import pandas as pd             # data package
import matplotlib.pyplot as plt # graphics 
import datetime as dt
import numpy as np

import requests, io             # internet and input tools  
import zipfile as zf            # zip file tools 
import os  

#import weightedcalcs as wc
#import numpy as np

import pyarrow as pa
import pyarrow.parquet as pq

### An Alternative Concordance Mapping

This notebook constructs a better concordance to go from hs6 codes to naics codes. The issue (as raised to me by another researcher) is that while the hs10 to naics concordance is one to one (an hs10 code is mapped into one naics), if you simply truncate down to the hs6 then the concordance becomes one to many (for one hs6 code there are multiple naics codes).

In the main body of ``countylevel_tariffs_and_exports`` this was ignored and the rules dictionaries were applied to create the mapping (so first an hs6 code would be mapped into a naics code; if, in the creation of the dictionary, it saw the same hs6 code going to a different naics code, it simply overwrote the old one). 

The code below provides an alternative solution. What it does is that it first merges the hs10 export data with the census concordance. Then the rule is: for one hs6 code, if there are multiple naics codes, take the one with the largest amount of trade associated with it. 

Ultimately this does not seem to matter at all for my results, but it is a detail to get right and be mindful of. 

#### First Read in the HS10 Data

In [139]:
# The Census API key, formatted as the "&key=..." query parameter that gets
# appended to the request url. Set the CENSUS_API_KEY environment variable to
# use your own key; if it is not set, the key I have posted here is used.
# I'm nice and I have it posted. If you will be doing more with this
# please get your own key!
my_key = "&key=" + os.environ.get("CENSUS_API_KEY", "34e40301bda77077e24c859c6c6c0b721ad73fc7")


In [140]:
end_use = "hs?get=E_COMMODITY,ALL_VAL_MO"

url = "https://api.census.gov/data/timeseries/intltrade/exports/"
url = url + end_use + my_key + "&time==from+2017-01" + "&COMM_LVL=HS10"
# note the change in the commodity level

r = requests.get(url)

r.raise_for_status()
# Stop here, with the HTTP status, if the API rejected the request. Otherwise
# the failure only shows up further down as a hard-to-read JSON or column error

payload = r.json()
# Parse the response body once. The first entry is the labels, the remaining
# entries are the rows of data

df_all_trade = pd.DataFrame(payload[1:], columns = payload[0])
# This then converts it to a dataframe, using the labels as column names

df_all_trade.time = pd.to_datetime(df_all_trade.time, format="%Y-%m")
# This is so I can call this correctly...

df_all_trade["total_trade"] = df_all_trade.ALL_VAL_MO.astype(float)
# The values arrive as text, so make a numeric copy to aggregate on

df_all_trade.E_COMMODITY = df_all_trade.E_COMMODITY.astype(str)
# Keep the commodity code as text, it is an identifier and not a number

df_all_trade.head(10)

Unnamed: 0,E_COMMODITY,ALL_VAL_MO,time,COMM_LVL,total_trade
0,206490010,6235559,2018-05-01,HS10,6235559.0
1,6212200010,265561,2018-03-01,HS10,265561.0
2,7213200000,2954764,2018-03-01,HS10,2954764.0
3,9114300000,22462,2018-10-01,HS10,22462.0
4,2844302010,0,2018-09-01,HS10,0.0
5,206490010,4501786,2017-01-01,HS10,4501786.0
6,2844302010,232488,2017-01-01,HS10,232488.0
7,2844302010,0,2018-08-01,HS10,0.0
8,6212200010,52286,2018-08-01,HS10,52286.0
9,206490010,3251276,2019-03-01,HS10,3251276.0


In [141]:
# Put the month on the index so that a whole year can be sliced off with .loc
df_all_trade = df_all_trade.set_index("time")

In [142]:
# Total 2017 exports for each commodity code: take the 2017 rows off the
# datetime index, then add up total_trade within each E_COMMODITY
dftrade_17 = (
    df_all_trade.loc["2017"]
    .groupby("E_COMMODITY")[["total_trade"]]
    .sum()
)

In [143]:
# Preview the 2017 totals, one row per commodity code (E_COMMODITY is the index)
dftrade_17.head()

Unnamed: 0_level_0,total_trade
E_COMMODITY,Unnamed: 1_level_1
101210000,260550827.0
101290000,188067222.0
101300000,3274174.0
101900000,1814439.0
102210010,4038807.0


In [144]:
# (rows, columns): the number of rows is the number of distinct commodity codes in the 2017 data
dftrade_17.shape

(9151, 1)

#### Then Read in the Census Concordance

In [145]:
# Location of the census export concordance file
url = "https://www.census.gov/foreign-trade/reference/codes/concordance/expconcord17.xls"

# Read the commodity and naics codes as text rather than numbers, then
# truncate the commodity code down to get the hs8 (first 8 characters) and
# the hs6 (first 6 characters)
df_concordance = pd.read_excel(url, dtype = {"commodity": str, "naics": str}).assign(
    hs8 = lambda conc: conc.commodity.str[0:8],
    hs6 = lambda conc: conc.commodity.str[0:6],
)

# Note: the simple alternative, dict(zip(df_concordance.hs6, df_concordance.naics)),
# would keep only the last naics code seen for each hs6 code, so it is not
# used here; the cells below build the hs6 to naics mapping instead

In [146]:
#df_concordance[["hs6","naics3"]].head()

#first = df_concordance.groupby("hs6")["naics"].count().max()

#second = df_concordance.groupby("naics")["hs6"].count().max()

#test = df_concordance.groupby("hs6")["naics"].count()

#test.sum()

#### Then Merge the two

I'm going to do a right merge to preserve the structure of the original census concordance. 

In [147]:
# Attach the 2017 export total to each row of the concordance, matching the
# commodity code on the index of dftrade_17 to the "commodity" column.
# how = "right" keeps every concordance row (those without a match get a
# missing total_trade) and indicator = True adds the _merge column
trade_conc = pd.merge(
    dftrade_17,
    df_concordance[["hs6", "naics", "commodity"]],
    how = "right",
    left_index = True,
    right_on = "commodity",
    indicator = True,
)

In [148]:
trade_conc.head()

Unnamed: 0,total_trade,hs6,naics,commodity,_merge
0,260550827.0,10121,112920,101210000,both
1,188067222.0,10129,112920,101290000,both
2,3274174.0,10130,112920,101300000,both
3,1814439.0,10190,112920,101900000,both
4,4038807.0,10221,11211X,102210010,both


#### Then Groupby and Assign Naics Codes

I will group by hs6; then for each hs6 code there may be multiple naics codes. In these cases, the idea is to assign the naics code to the hs6 code based on which one has the most trade.

In [149]:
def concordance_group(df):
    """Choose the one naics code that an hs6 group gets assigned to.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows belonging to a single hs6 code. It must have the columns
        ``naics`` and ``total_trade``.

    Returns
    -------
    pandas.DataFrame
        A one-row frame with the columns ``num_naics`` (the number of rows in
        the group) and ``naics`` (the chosen code).

    Notes
    -----
    The chosen code is the naics of the row with the largest non-missing
    ``total_trade``. If the group has one row, or if none of its rows has any
    trade recorded, the naics of the first row is used.
    """

    num_naics = df.shape[0]
    # This is the number of rows in the group. Note that it counts the rows
    # and not the distinct naics codes, so it can be larger than the number
    # of different naics codes. It is kept this way so that the output stays
    # comparable with previously saved versions of the concordance

    trade = df["total_trade"]

    if num_naics == 1 or trade.isna().all():
        # if only one (like most), just grab the naics code. Same thing if
        # there is no trade at all to rank on, since idxmax has nothing to
        # pick from when every value is missing

        max_naics = df["naics"].iloc[0]

    else:
        # if many, then grab the max trade flows. The lookup is done by
        # position (not by index label), so that repeated index labels can
        # not return more than one row

        best_position = trade.reset_index(drop = True).idxmax()

        max_naics = df["naics"].iloc[best_position]

    foo = {"num_naics": [num_naics],
          "naics": [max_naics]}

    return pd.DataFrame(foo)

In [150]:
# Group the merged concordance by hs6 code, so the rule can be applied one hs6 code at a time
grp = trade_conc.groupby("hs6")

In [158]:
# Look at the rows for one hs6 code to see what a group looks like
grp.get_group("030211")

Unnamed: 0,total_trade,hs6,naics,commodity,_merge
148,2373449.0,30211,112511,302110010,both
149,516374.0,30211,114111,302110090,both


This provides a good example of the issue. There are two hs10 codes (0302110010, 0302110090) and two different naics codes (112511, 114111), but the same hs6 code 030211. So how do we go from hs6 to naics? I'll do it by taking the one with the most trade, so 030211 will go to 112511. 

In [157]:
# Try the rule on that one group: it returns the number of rows and the naics code that was picked
concordance_group(grp.get_group("030211"))

Unnamed: 0,num_naics,naics
0,2,112511


In [152]:
# Apply the rule to every hs6 group. Each call returns a one-row dataframe,
# and apply stacks them with the hs6 code as the outer level of the index
mc = grp.apply(concordance_group)

In [153]:
# Move the index levels back into ordinary columns, so that hs6 is a column again
mc = mc.reset_index()

In [154]:
# Preview the new concordance: one naics code for each hs6 code
mc.head()

Unnamed: 0,hs6,level_1,num_naics,naics
0,10121,0,1,112920
1,10129,0,1,112920
2,10130,0,1,112920
3,10190,0,1,112920
4,10221,0,4,11211X


In [155]:
# Count the naics entries across all hs6 codes. The rule returns one row per
# hs6 group, so this is the total number of hs6 to naics assignments
mc.groupby("hs6")["naics"].count().sum()

5376

In [156]:
# Save the new concordance as data/alt_concordance.parquet under the current
# working directory. os.path.join uses the right separator for the operating
# system; the hard-coded "\\" only gave a valid path on Windows
file_path = os.path.join(os.getcwd(), "data", "alt_concordance.parquet")

os.makedirs(os.path.dirname(file_path), exist_ok = True)
# Make sure the data folder is there, otherwise the write below fails

pq.write_table(pa.Table.from_pandas(mc), file_path)