In [46]:
import pandas as pd
import numpy as np
import glob

In [2]:
# Directory holding the jobs CSV files.
# NOTE(review): this is a machine-specific absolute path; point it at your own copy of the data.
path = r'C:\Users\MTMAINZE\projects\covid-recovery\data\jobs-files' # use your path

# glob makes no ordering guarantee, so sort the matches to keep the row order
# of `frame` (and of anything written out from it) stable between runs.
all_files = sorted(glob.glob(path + "/*.csv"))
if not all_files:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list; fail early and name the directory that was actually searched.
    raise FileNotFoundError("No CSV files found in " + path)

# Read each file with its first row as the header and no column used as the index.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Stack all files row-wise into a single frame with a fresh 0..n-1 index.
frame = pd.concat(li, axis=0, ignore_index=True)

In [3]:
# Row and column counts of the concatenated frame.
frame.shape

(123325, 9)

In [5]:
# Preview the first rows of the concatenated frame.
frame.head()

Unnamed: 0,Area Name,Area Bucket,Occupation Name,Occupation Bucket,Area,Occupation,Year,Jobs,Resident Workers
0,"Atkinson County, GA","Atkinson County, GA",Chief Executives,Chief Executives,13003,11-1011,2019,3.12885,4.122558
1,"Baker County, GA","Baker County, GA",Chief Executives,Chief Executives,13007,11-1011,2019,0.544435,0.996549
2,"Banks County, GA","Banks County, GA",Chief Executives,Chief Executives,13011,11-1011,2019,5.586603,10.732003
3,"Atkinson County, GA","Atkinson County, GA",General and Operations Managers,General and Operations Managers,13003,11-1021,2019,41.005093,57.276707
4,"Baker County, GA","Baker County, GA",General and Operations Managers,General and Operations Managers,13007,11-1021,2019,8.249552,14.886872


In [6]:
# Remove rows that are exact duplicates across every column, keeping the first occurrence.
frame = frame.drop_duplicates()

In [7]:
# Shape after drop_duplicates; compare with the shape shown before it to see how many rows were removed.
frame.shape

(123325, 9)

In [57]:
# bring in the onet codes
# Passing a list of sheet names parses the workbook once and returns a dict of
# DataFrames keyed by sheet name, instead of re-opening the file for each sheet.
sheets = pd.read_excel('covid-recovery-data.xlsx',
                       sheet_name=['soc-onet-xwalk', 'physical-prox', 'disease-exposure'])
socs = sheets['soc-onet-xwalk']
physical = sheets['physical-prox']
disease = sheets['disease-exposure']

In [58]:
# One O*NET code per SOC is enough for this analysis, so keep only the first
# crosswalk row for each SOC.
socs = socs.drop_duplicates(subset='SOC', keep='first')

In [59]:
# Attach the crosswalk columns to every jobs row. The left join keeps rows whose
# Occupation has no match in the crosswalk; their crosswalk columns are NaN.
data = frame.merge(socs, how='left', left_on='Occupation', right_on='SOC')

In [60]:
# Narrow the merged frame to these columns, in this order.
keep_cols = ['Area Name', 'SOC', 'O*NET', 'Occupation Name', 'Area', 'Jobs', 'Resident Workers']
data = data.loc[:, keep_cols]

In [61]:
# Relabel the area and count columns; all other columns keep their names.
column_labels = {
    'Area Name': 'County',
    'Area': 'County FIPS',
    'Jobs': '2019 Jobs',
    'Resident Workers': '2019 Resident Workers',
}
data = data.rename(columns=column_labels)

In [62]:
# Keep only the index value and the O*NET join key.
physical = physical.loc[:, ['Physical Proximity Index', 'O*NET']]

In [63]:
# Left-join the physical proximity index onto each row by O*NET code.
data = data.merge(physical, how='left', on='O*NET')

In [64]:
# Keep only the index value and the O*NET join key.
disease = disease.loc[:, ['Disease Exposure Index', 'O*NET']]

In [66]:
# Left-join the disease exposure index onto each row by O*NET code.
data = data.merge(disease, how='left', on='O*NET')

In [69]:
# Label rows whose Physical Proximity Index is 75 or higher as 'Frequent'.
# A missing (NaN) index compares False, so those rows are labelled 'Not Frequent'.
is_frequent_proximity = data['Physical Proximity Index'] >= 75
data['Frequent Physical Proximity'] = np.where(is_frequent_proximity, 'Frequent', 'Not Frequent')

In [71]:
# Label rows whose Disease Exposure Index is 75 or higher as 'Frequent'.
# A missing (NaN) index compares False, so those rows are labelled 'Not Frequent'.
is_frequent_exposure = data['Disease Exposure Index'] >= 75
data['Frequent Disease Exposure'] = np.where(is_frequent_exposure, 'Frequent', 'Not Frequent')

In [115]:
# Row-level table restricted to the identifying columns and the two count columns.
jobsData = data[['County','SOC','O*NET','Occupation Name','County FIPS','2019 Jobs','2019 Resident Workers']]

# Sum jobs and resident workers per county within each frequency category.
# The value columns are selected with a list: selecting them with a bare tuple
# of names (['a', 'b',]) was deprecated in pandas 1.0 and raises in pandas 2.x.
value_cols = ['2019 Jobs', '2019 Resident Workers']
physProxSums = data.groupby(['County', 'County FIPS', 'Frequent Physical Proximity'])[value_cols].sum().reset_index()
diseaseSums = data.groupby(['County', 'County FIPS', 'Frequent Disease Exposure'])[value_cols].sum().reset_index()

In [116]:
# Preview the per-county sums by physical proximity category.
physProxSums.head()

Unnamed: 0,County,County FIPS,Frequent Physical Proximity,2019 Jobs,2019 Resident Workers
0,"Appling County, GA",13001,Frequent,1384.703634,1541.872067
1,"Appling County, GA",13001,Not Frequent,6253.344558,6342.886618
2,"Atkinson County, GA",13003,Frequent,310.03826,651.607254
3,"Atkinson County, GA",13003,Not Frequent,2355.319565,3019.94086
4,"Bacon County, GA",13005,Frequent,839.507591,867.838385


In [117]:
def _spread_by_category(sums, category_col, value_col):
    """Pivot one value column so each category in `category_col` becomes its own column.

    Rows are keyed by County / County FIPS. The index is moved back into
    ordinary columns and the column-axis name left by the pivot is cleared.
    """
    wide = pd.pivot_table(sums, index=['County', 'County FIPS'], columns=category_col, values=value_col)
    return wide.reset_index().rename_axis(None, axis=1)


diseaseFreqJobs = _spread_by_category(diseaseSums, "Frequent Disease Exposure", '2019 Jobs')
diseaseFreqRes = _spread_by_category(diseaseSums, "Frequent Disease Exposure", '2019 Resident Workers')
physProxFreqRes = _spread_by_category(physProxSums, "Frequent Physical Proximity", '2019 Resident Workers')
physProxFreqJobs = _spread_by_category(physProxSums, "Frequent Physical Proximity", '2019 Jobs')

In [118]:
# Replace the generic 'Frequent' / 'Not Frequent' pivot columns with labels that
# name both the measure and the exposure type, so the tables can be merged
# without column-name collisions.
diseaseFreqJobs = diseaseFreqJobs.rename(columns={
    'Frequent': 'Jobs Frequent Disease Exposure',
    'Not Frequent': 'Jobs Infrequent Disease Exposure',
})

diseaseFreqRes = diseaseFreqRes.rename(columns={
    'Frequent': 'Resident Workers Frequent Disease Exposure',
    'Not Frequent': 'Resident Workers Infrequent Disease Exposure',
})

physProxFreqJobs = physProxFreqJobs.rename(columns={
    'Frequent': 'Jobs Frequent Physical Proximity',
    'Not Frequent': 'Jobs Infrequent Physical Proximity',
})

physProxFreqRes = physProxFreqRes.rename(columns={
    'Frequent': 'Resident Workers Frequent Physical Proximity',
    'Not Frequent': 'Resident Workers Infrequent Physical Proximity',
})

In [123]:
# Start the county summary: disease-exposure jobs joined with disease-exposure
# resident workers, keeping counties present in either table.
countySum = diseaseFreqJobs.merge(diseaseFreqRes, how='outer', on=['County', 'County FIPS'])

In [127]:
# Add the physical-proximity job counts, keeping counties present in either table.
countySum = countySum.merge(physProxFreqJobs, how='outer', on=['County', 'County FIPS'])

In [128]:
# Add the physical-proximity resident-worker counts, keeping counties present in either table.
countySum = countySum.merge(physProxFreqRes, how='outer', on=['County', 'County FIPS'])

In [138]:
# Export the county summary and the row-level table as CSV (without the index
# column) so they can be loaded into any visualization tool.
for table, out_name in ((countySum, 'countySum.csv'), (data, 'countyJobs.csv')):
    table.to_csv(out_name, index=False)