## Purpose
Take the occupation employment statistics and map them onto the zip code level using the principal building activity (PBA) code from CBECS. The end result is the number of workers in a given zip code in a given commercial building type.

In [1]:
import pandas as pd
import sqlite3
import numpy as np

Read in employment data

In [2]:
# ------------------------------------------- #
# Open the SQLite database holding the occupation employment data; `con` is
# passed to the read_sql queries in the following cells.
# NOTE(review): sqlite3.connect creates an empty database when the path does
# not exist, so a wrong path only surfaces later as a missing-table error.
con = sqlite3.connect('Datasets/oes_may12_demo.db')
# ------------------------------------------- #

Get occupation data at the "major" occupation level. Here we also divvy up the total number of employees among zip codes: for each occupation code, every zip code within a given MSA is given an equal share of that MSA's employees.

In [3]:
# Load every row tagged as a "major" occupation group.
major_sql = "select * from data where OCC_GROUP = 'major'"
df = pd.read_sql(major_sql, con, index_col='index')

# Non-null `id` count per (MSA, occupation) pair; the notebook treats this as
# the number of zip codes sharing that MSA's employment figure.
df_zip_count = (
    df.groupby(['MSA No.', 'OCC_CODE'])[['id']]
    .count()
    .rename(columns={'id': 'nzip'})
)

# Attach the count to each row and split TOT_EMP evenly across those rows.
df = df.join(df_zip_count, on=['MSA No.', 'OCC_CODE'])
df['EMP'] = df['TOT_EMP'] / df['nzip']

In [4]:
# Display (rows, columns) of the major-occupation table as a quick sanity check.
df.shape

(724570, 11)

Get occupation data at the "total" occupation level. Here we also divvy up the total number of employees among zip codes: every zip code within a given MSA is given an equal share of that MSA's employees.

In [5]:
# Load the rows carrying the all-occupations ("total") employment figure.
total_sql = "select * from data where OCC_GROUP = 'total'"
df_tot = pd.read_sql(total_sql, con, index_col='index')

# Non-null `id` count per (MSA, occupation) pair; the notebook treats this as
# the number of zip codes sharing that MSA's employment figure.
df_tot_zip_count = (
    df_tot.groupby(['MSA No.', 'OCC_CODE'])[['id']]
    .count()
    .rename(columns={'id': 'nzip'})
)

# Attach the count to each row and split TOT_EMP evenly across those rows.
df_tot = df_tot.join(df_tot_zip_count, on=['MSA No.', 'OCC_CODE'])
df_tot['EMP'] = df_tot['TOT_EMP'] / df_tot['nzip']

Use this field for distributing employees amongst PBA types

In [6]:
# Name of the column used as the worker count when splitting across PBAs;
# 'EMP' is the per-row employment share computed in the cells above.
field = 'EMP'

Read in the dataframe with my mappings of job classifications onto PBA

In [7]:
# The spreadsheet's first column becomes the index (later matched against
# OCC_CODE). When the truth table is filled, every remaining column except the
# first is read as a PBA code for that job class.
job = pd.read_excel('Datasets/job_class.xlsx',index_col=0) # Job class can be mapped with certain building types

  warn(msg)


Read in csv file with description of PBA types

In [8]:
# The first CSV column becomes the index; that index supplies the PBA column
# labels of the truth table built below.
pba_id = pd.read_csv('Datasets/pba_id.csv',index_col=0)

Create an empty truth table with job types as rows and PBAs as columns. This will show our mapping of which jobs are found in which PBAs

In [9]:
# Start with an all-False job-by-PBA table; cells are switched to True later
# for each PBA a job class is mapped to.
n_jobs, n_pbas = len(job), len(pba_id)
all_false = np.zeros((n_jobs, n_pbas), dtype=bool)
tbl = pd.DataFrame(all_false, index=job.index, columns=pba_id.index)

Read in CBECS data

In [10]:
# ---------------------------------------------------------------------- #
# CBECS building records (2012 file). To use the 2018 data instead, read
# 'Datasets/cbecs_servers_cleaned_2018_dummy.csv'.
cbecs_columns = ['PUBID', 'FINALWT', 'NWKER', 'PBA', 'CENDIV']
cb = pd.read_csv('Datasets/cbecs_servers_cleaned_2012_dummy.csv')

# Keep only the columns used downstream.
cb = cb[cbecs_columns]
# ---------------------------------------------------------------------- #

Create a new column to get number of workers for a given building in CBECS

In [11]:
# Per-record product of FINALWT and NWKER; this is the quantity summed by
# census division and PBA in the pivot below.
cb['FINALWT_NWKER'] = cb['FINALWT']*cb['NWKER']

Do a pivot table to get the number of workers in a given PBA by census division

In [12]:
# Sum FINALWT_NWKER for every (census division, PBA) pair; combinations that
# never occur in the data come out as NaN.
# The string 'sum' selects pandas' built-in aggregation. Passing np.sum emits a
# FutureWarning on recent pandas versions, which plan to stop mapping the
# callable to the built-in.
cb_piv = cb.pivot_table(
    index='CENDIV',
    columns='PBA',
    values='FINALWT_NWKER',
    aggfunc='sum',
)

Replace NaNs w/ 0s

In [13]:
# Census division / PBA combinations absent from CBECS count as zero workers.
cb_piv = cb_piv.fillna(0.)

There are some PBAs that are missing in cb_piv. For completeness add those PBAs back in and assign them values of 0.

In [14]:
# PBA codes known to the truth table but absent from the pivot.
missing_pbas = set(tbl.columns) - set(cb_piv.columns)
no_vals = list(missing_pbas)

# Give each missing PBA an all-zero column so both tables cover the same codes.
for pba_code in no_vals:
    cb_piv[pba_code] = 0.


In [15]:
# Select the pivot's columns in the truth table's PBA order so the two tables
# line up column for column.
cb_piv = cb_piv.loc[:,tbl.columns]

Load in table to map state name to census division

In [16]:
geo = pd.read_csv('Datasets/geo_mapping.csv')

# Lookup table: state abbreviation -> census division number.
# NOTE(review): assumes one row per state abbreviation; duplicates would
# multiply rows in the joins that use this table.
state_and_division = geo.loc[:, ['state abbrev', 'cdiv num']]
state_to_cdiv = state_and_division.set_index('state abbrev')

Add census division to occ_data

In [17]:
# Tag both occupation tables with the census division of each row's STATE.
occ_data_big, occ_data_big_tot = (
    occupations.join(state_to_cdiv, on='STATE') for occupations in (df, df_tot)
)

Keep only valid data: rows whose state was matched to a census division (`cdiv num` is not NaN)

In [18]:
# Rows whose STATE had no match in state_to_cdiv carry NaN in 'cdiv num' after
# the join; this mask is True only where a census division was found.
# notna() states that intent directly instead of relying on NaN != NaN.
m = occ_data_big['cdiv num'].notna()

In [19]:
# Apply the mask: only rows with a census division remain.
occ_data = occ_data_big.loc[m]

Fill out truth table

In [20]:
# The first column of `job` is skipped; the remaining ones are read as PBA codes.
pba_code_columns = job.columns[1:]
for job_code in job.index:
    # Non-missing cells give the PBA codes mapped to this job class.
    mapped_pbas = job.loc[job_code, pba_code_columns].dropna().astype(int).values
    tbl.loc[job_code, mapped_pbas] = True

Use truth table to fill out a mask based on job code in occ_data

In [21]:
# Look up each row's truth-table entry by OCC_CODE, then drop the key so only
# the PBA columns remain.
occ_codes = occ_data.loc[:, ['OCC_CODE']]
occ_mask = occ_codes.join(tbl, on='OCC_CODE').drop(columns='OCC_CODE')

Use cb_piv to fill out a dataframe with number of workers in a given census division

In [22]:
# Look up each row's census-division totals from cb_piv, then drop the key so
# only the PBA columns remain.
row_divisions = occ_data.loc[:, ['cdiv num']]
occ_finalwt = row_divisions.join(cb_piv, on='cdiv num').drop(columns='cdiv num')

Multiply the number of workers by the mask and normalize so that each row sums to 1

In [23]:
# Keep the division-level worker totals only for PBAs the job is mapped to.
masked_workers = occ_mask * occ_finalwt

# Divide every row by its own sum so the PBA shares add up to 1. Rows whose sum
# is zero become all-NaN.
row_totals = masked_workers.sum(axis=1)
occ_mask_norm = masked_workers.div(row_totals, axis=0)

Now multiply by the number of workers in the OCC data

In [24]:
# Scale each row's PBA shares by that row's worker count (the `field` column).
workers_per_row = occ_data[field]
occ_by_pba = occ_mask_norm.mul(workers_per_row, axis='index')

Drop any empty rows

In [25]:
# Discard rows in which every PBA column is NaN.
occ_by_pba = occ_by_pba.dropna(axis=0, how='all')

Flatten out the dataframe

In [26]:
# Wide -> long: one entry per (PBA column, row label) pair.
stacked = occ_by_pba.unstack()

# Zero and missing allocations carry no workers, so drop them.
nonzero = stacked.replace(0, np.nan).dropna()

# Move both index levels into columns. The rename expects the PBA level to
# surface as 'level_0' and the row labels as 'index'.
column_names = {'level_0': 'pba', 'index': 'occ_id', 0: 'nwrkrs'}
occ_by_pba_flat = nonzero.reset_index().rename(columns=column_names)

Add state, zip code, and census div to dataframe

In [27]:
# occ_id holds occ_data row labels, so joining on it pulls in each row's
# state, zip code and census division.
geography = occ_data.loc[:, ['STATE', 'ZIP CODE', 'cdiv num']]
occ = occ_by_pba_flat.join(geography, on='occ_id')

Groupby zip and pba to get the number of workers in a given zip code for a given PBA

In [28]:
# Add up workers over all occupations within each (state, zip, PBA) group and
# turn the group keys back into ordinary columns.
zip_pba_keys = ['STATE', 'ZIP CODE', 'pba']
workers_by_zip_pba = occ.groupby(zip_pba_keys)['nwrkrs'].sum()
occ = workers_by_zip_pba.reset_index()

Okay, now we are going to get the geographic distribution of all workers in the OCC data.

In [29]:
# Rows whose STATE had no match in state_to_cdiv carry NaN in 'cdiv num' after
# the join; this mask is True only where a census division was found.
# notna() states that intent directly instead of relying on NaN != NaN.
m2 = occ_data_big_tot['cdiv num'].notna()

In [30]:
# Apply the mask: keep only the "total" rows that have a census division.
occ_data2 = occ_data_big_tot.loc[m2]

In [31]:
# Reduce the "total" rows to state, zip code and the chosen worker-count column.
all_workers = occ_data2.loc[:,['STATE','ZIP CODE',field]]

In [32]:
# Use the same worker-count column name as the per-PBA table.
all_workers = all_workers.rename(columns={field: 'nwrkrs'})

Assign to PBA 1 (which currently shouldn't be included in the dataset)

In [33]:
# Label every all-occupations row with PBA code 1 so it can sit in the same
# table as the per-PBA rows.
all_workers['pba'] = 1

In [34]:
# Put the columns in the same order as `occ` ahead of the concatenation.
all_workers = all_workers[['STATE','ZIP CODE','pba','nwrkrs']]

Add this to the occ dataframe

In [35]:
# Append the all-workers rows beneath the per-PBA rows; ignore_index gives the
# combined table a fresh 0..n-1 index.
occ = pd.concat([occ,all_workers],ignore_index=True)

Add census division to the dataframe

In [36]:
# Look up each row's census division number from its STATE.
occ = occ.join(state_to_cdiv,on='STATE')

In [37]:
# Order rows by census division, then by PBA.
occ = occ.sort_values(by=['cdiv num', 'pba'])

Get total number of workers for each census div and pba

In [38]:
# Total workers per (census division, PBA), stored under its own column name
# so it can sit next to the per-zip counts.
occ_tot = (
    occ.groupby(['cdiv num', 'pba'])[['nwrkrs']]
    .sum()
    .rename(columns={'nwrkrs': 'nwrkrs_tot'})
)

Add columns to occ dataframe

In [39]:
# Attach the matching (census division, PBA) total to every row as nwrkrs_tot.
occ = occ.join(occ_tot,on =['cdiv num','pba'])

Divide the number of workers (nwrkrs) in each zip and pba by the total number of workers in that census division and pba.

In [40]:
# Share of the (census division, PBA) total that falls in this row's zip code.
occ['frac'] = occ['nwrkrs']/occ['nwrkrs_tot']

In [41]:
# Renumber rows 0..n-1 after the sort; the old index is discarded.
occ = occ.reset_index(drop=True)

Write out our csv file that has the fraction of employees in a given zip and pba relative to the number of workers in that census division and pba.

In [42]:
# ------------------------------------------- #
# Persist the final table. The DataFrame index is written as well (the to_csv
# default), so the file starts with an unnamed index column.
occ.to_csv('Datasets/employment_by_zip_and_pba_2012.csv')
# ------------------------------------------- #

In [43]:
# occ[(occ['cdiv num'] == 3) & (occ['pba'] == 4)].frac.sum()

In [44]:
# Preview the first rows of the table that was written out.
occ.head()

Unnamed: 0,STATE,ZIP CODE,pba,nwrkrs,cdiv num,nwrkrs_tot,frac
0,MA,1001,1,3049.361702,1,5570366.0,0.000547
1,MA,1002,1,3049.361702,1,5570366.0,0.000547
2,MA,1003,1,3049.361702,1,5570366.0,0.000547
3,MA,1007,1,3049.361702,1,5570366.0,0.000547
4,MA,1008,1,3049.361702,1,5570366.0,0.000547


In [45]:
# List the distinct PBA codes present in the final table.
sorted(occ.pba.unique())

[1, 2, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 23, 24, 25, 26, 91]