In [97]:
import pandas as pd             # data package
import matplotlib.pyplot as plt # graphics 
import datetime as dt

# these are new 
import requests, io             # internet and input tools  
import zipfile as zf            # zip file tools 
import shutil                   # file management tools 
import os                       # operating system tools (check files)

In [5]:
url = "https://data.bls.gov/cew/data/files/2017/csv/2017_annual_singlefile.zip"
# This will read in the annual, single file. It's big, but has all we want...

r = requests.get(url) 

In [6]:
# convert bytes to zip file  
mlz = zf.ZipFile(io.BytesIO(r.content)) 
print('Type of zipfile object:', type(mlz))

Type of zipfile object: <class 'zipfile.ZipFile'>


In [7]:
mlz.namelist()

['2017.annual.singlefile.csv']

In [15]:
clist = ['area_fips', 'own_code', 'industry_code', 'agglvl_code', 'size_code',
       'year', 'qtr', 'disclosure_code', 'annual_avg_estabs',
       'annual_avg_emplvl', 'total_annual_wages', 'taxable_annual_wages',
       'annual_contributions', 'annual_avg_wkly_wage', 'avg_annual_pay']

# These are the columns we care about and will grab

[https://data.bls.gov/cew/doc/titles/area/area_titles.htm](https://data.bls.gov/cew/doc/titles/area/area_titles.htm)

In [238]:
df = pd.read_csv(mlz.open(mlz.namelist()[0]), usecols= clist)

  interactivity=interactivity, compiler=compiler, result=result)


In [17]:
df.head()

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,annual_avg_estabs,annual_avg_emplvl,total_annual_wages,taxable_annual_wages,annual_contributions,annual_avg_wkly_wage,avg_annual_pay
0,1000,0,10,50,0,2017,A,,124881,1936819,89088710816,14933112889,219228261,885,45997
1,1000,1,10,51,0,2017,A,,1208,53131,4339038631,0,0,1571,81668
2,1000,1,102,52,0,2017,A,,1208,53131,4339038631,0,0,1571,81668
3,1000,1,1021,53,0,2017,A,,610,11173,716001109,0,0,1232,64083
4,1000,1,1022,53,0,2017,A,,2,12,369309,0,0,584,30354


In [18]:
df.columns

Index(['area_fips', 'own_code', 'industry_code', 'agglvl_code', 'size_code',
       'year', 'qtr', 'disclosure_code', 'annual_avg_estabs',
       'annual_avg_emplvl', 'total_annual_wages', 'taxable_annual_wages',
       'annual_contributions', 'annual_avg_wkly_wage', 'avg_annual_pay'],
      dtype='object')

Now what we want to do is to create a national dataset for which we can merge on the county....

In [165]:

df_nation_naics = df[(df.area_fips == 1000) & (df.industry_code.str.len() == 4) & 
                     (df.own_code == 5)].copy()

# Grab just the national, just the 3 digit naics codes, and only private...

df_nation_naics["sup_ind"] = df_nation_naics.industry_code.str[1].astype(int)

# Then figure out only the naics codes as there are like "super industry" stuff 
# that we will want to throw out. The second digit for this stuff is always 0

In [166]:
df_nation_naics.head()

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,annual_avg_estabs,annual_avg_emplvl,total_annual_wages,taxable_annual_wages,annual_contributions,annual_avg_wkly_wage,avg_annual_pay,sup_ind
798,1000,5,1011,53,0,2017,A,,1815,17999,949424301,185054327,3675387,1014,52748,0
799,1000,5,1012,53,0,2017,A,,9645,85262,4453218100,971090634,24965514,1004,52230,0
800,1000,5,1013,53,0,2017,A,,5447,263487,14543812862,2536465538,40603852,1061,55197,0
802,1000,5,1021,53,0,2017,A,,32410,375653,15220515239,3465334796,39913767,779,40518,0
803,1000,5,1022,53,0,2017,A,,1969,20807,1226186264,195420767,3728064,1133,58932,0


In [167]:
national_private_employment = df_nation_naics[df_nation_naics["sup_ind"] > 0].annual_avg_emplvl.sum()

print("national employment", national_private_employment)

national employment 1569628


In [169]:
df_nation_naics[df_nation_naics["sup_ind"] > 0].shape

(301, 16)

---

### Now lets merge this with the trade data

In [170]:
my_key = "&key=34e40301bda77077e24c859c6c6c0b721ad73fc7"

end_use = "naics?get=NAICS,CTY_CODE,ALL_VAL_MO,CTY_NAME"

url = "https://api.census.gov/data/timeseries/intltrade/exports/" + end_use + my_key + "&time==from+2017-01" + "&COMM_LVL=NA4"

url = url + "&CTY_CODE=5700"

In [171]:
r = requests.get(url) 

r

<Response [200]>

In [172]:
dftrade = pd.DataFrame(r.json()[1:]) # This then converts it to a dataframe
# Note that the first entry is the labels

dftrade.columns = r.json()[0]

dftrade.time = pd.to_datetime(dftrade.time, format="%Y-%m")
# This is so I can call this correctly...

dftrade.ALL_VAL_MO = dftrade.ALL_VAL_MO.astype(float)

dftrade.head(10)

Unnamed: 0,NAICS,CTY_CODE,ALL_VAL_MO,CTY_NAME,time,COMM_LVL,CTY_CODE.1
0,1111,5700,1931313000.0,CHINA,2017-01-01,NA4,5700
1,1112,5700,1071322.0,CHINA,2017-01-01,NA4,5700
2,1113,5700,14805220.0,CHINA,2017-01-01,NA4,5700
3,1114,5700,1512023.0,CHINA,2017-01-01,NA4,5700
4,1119,5700,167360800.0,CHINA,2017-01-01,NA4,5700
5,1121,5700,1172263.0,CHINA,2017-01-01,NA4,5700
6,1122,5700,2740.0,CHINA,2017-01-01,NA4,5700
7,1124,5700,391783.0,CHINA,2017-01-01,NA4,5700
8,1125,5700,25995.0,CHINA,2017-01-01,NA4,5700
9,1129,5700,1046655.0,CHINA,2017-01-01,NA4,5700


In [173]:
dftrade.set_index("time", inplace = True)

In [174]:
df17naics_trade = dftrade.loc["2017"].groupby("NAICS").agg({"ALL_VAL_MO":"sum"})
# Alot going on here, grab 2017, groupby NAICS code, then compute the sum. 
# So for a given NAICS code, this will be summing accross all observations,
# which in this case is across months. Thus this is annual exports.

In [175]:
df17naics_trade.head()

Unnamed: 0_level_0,ALL_VAL_MO
NAICS,Unnamed: 1_level_1
1111,13626270000.0
1112,47501720.0
1113,451978900.0
1114,16806110.0
1119,1665841000.0


Then merge it with the national level NAICS. Note the groupby operation above leaves the index as the naics code left is on the industry code. Default here is inner, need to think about if I want to carry around zeros.

In [176]:
df_nation_naics = df_nation_naics.merge(df17naics_trade, left_on = "industry_code", right_index = True)


In [179]:
df_nation_naics.shape

(101, 17)

In [177]:
print("national employment", national_private_employment)
print("Potential China Export Employment",test.annual_avg_emplvl.sum())
print("Share of Employment Potentially Exposed",test.annual_avg_emplvl.sum()/national_private_employment)

national employment 1569628
Potential China Export Employment 278064
Share of Employment Potentially Exposed 0.17715280308455253


In [178]:
df_nation_naics.head()

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,annual_avg_estabs,annual_avg_emplvl,total_annual_wages,taxable_annual_wages,annual_contributions,annual_avg_wkly_wage,avg_annual_pay,sup_ind,ALL_VAL_MO
811,1000,5,1111,56,0,2017,A,,25,97,3480843,866894,10984,691,35947,1,13626270000.0
819,1000,5,1112,56,0,2017,A,,23,359,15601310,3600157,201532,835,43438,1,47501720.0
823,1000,5,1113,56,0,2017,A,,12,82,2104643,851903,5561,494,25692,1,451978900.0
828,1000,5,1114,56,0,2017,A,,118,2019,71582219,19538367,304625,682,35460,1,16806110.0
834,1000,5,1119,56,0,2017,A,,126,736,23039200,6841437,72860,602,31307,1,1665841000.0


### Get the county level data setup to line up with trade and national data...

What I want to do now is rename and probably drop a bunch of this stuff. Then merge it on the national df on the industry code. A think I need to figure out is to only have the county fips codes

In [239]:
df.area_fips = df.area_fips.astype(str)

In [245]:
df_county = df[(df.area_fips.str[-2:] != "00") & (df.area_fips.str[0] != "C") & (df.area_fips.str[0:2] != "US")
              & (df.area_fips.str[0:2] != "72") & (df.area_fips.str[0:2] != "78")
              & (df.area_fips.str[0:2] != "02") & (df.area_fips.str[0:2] != "15")].copy()

# Grab just the national, just the 3 digit naics codes, and only private...

#df_nation_naics["sup_ind"] = df_nation_naics.industry_code.str[1].astype(int)


# https://data.bls.gov/cew/doc/titles/agglevel/agglevel_titles.htm may be able to select on...

In [246]:
df_county = df_county[(df_county.industry_code.str.len() == 4) & (df_county.own_code == 5)]

In [247]:
df_county["sup_ind"] = df_county.industry_code.str[1].astype(int)

In [248]:
df_county = df_county[df_county["sup_ind"] > 0]

In [250]:
df_county.tail(10)

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,annual_avg_estabs,annual_avg_emplvl,total_annual_wages,taxable_annual_wages,annual_contributions,annual_avg_wkly_wage,avg_annual_pay,sup_ind
2948376,56999,5,7223,76,0,2017,A,N,1,0,0,0,0,0,0,2
2948379,56999,5,7225,76,0,2017,A,N,0,0,0,0,0,0,0,2
2948384,56999,5,8111,76,0,2017,A,N,1,0,0,0,0,0,0,1
2948387,56999,5,8112,76,0,2017,A,,3,20,2055991,530200,3340,1985,103230,1
2948391,56999,5,8113,76,0,2017,A,,7,15,1344183,455382,11672,1783,92702,1
2948395,56999,5,8129,76,0,2017,A,N,1,0,0,0,0,0,0,1
2948399,56999,5,8131,76,0,2017,A,N,1,0,0,0,0,0,0,1
2948402,56999,5,8133,76,0,2017,A,,7,36,906687,447726,8042,481,25012,1
2948405,56999,5,8134,76,0,2017,A,N,1,0,0,0,0,0,0,1
2948408,56999,5,8139,76,0,2017,A,N,2,0,0,0,0,0,0,1


In [252]:
df_county.shape

(452504, 16)

In [255]:
300*3181

954300

In [262]:
df_county.agglvl_code.unique()

array([76], dtype=int64)

In [263]:
df_county[df_county.area_fips == "56999"].head()

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,annual_avg_estabs,annual_avg_emplvl,total_annual_wages,taxable_annual_wages,annual_contributions,annual_avg_wkly_wage,avg_annual_pay,sup_ind
2947849,56999,5,2111,76,0,2017,A,N,10,0,0,0,0,0,0,1
2947855,56999,5,2123,76,0,2017,A,N,2,0,0,0,0,0,0,1
2947861,56999,5,2131,76,0,2017,A,,25,733,57901384,25383942,1299542,1519,79010,1
2947868,56999,5,2211,76,0,2017,A,N,2,0,0,0,0,0,0,2
2947873,56999,5,2361,76,0,2017,A,N,1,0,0,0,0,0,0,3
