In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

import numpy as np

import geopandas as gpd
import fiona

import datetime

In [2]:
sheets_dict = pd.read_excel('ui_claims_county_industry.xlsx', sheet_name=None,skiprows=2,skipfooter=3)

df = pd.DataFrame()
for name, sheet in sheets_dict.items():
    sheet['County'] = name + 'County'
    sheet = sheet.rename(columns=lambda x: x.split('\n')[-1])
    df = df.append(sheet)

df.reset_index(inplace=True, drop=True)

In [3]:
del df['Unnamed: 2']
del df['Unnamed: 4']

In [4]:
df = df.melt(id_vars=["County", "NAICS", "INDUSTRY NAME"], 
             var_name="Date", 
             value_name="Claims")

df['Claims'] = df['Claims'].replace(',', '', regex=True)

df['Claims'] = df['Claims'].astype(int)

In [116]:
warns = pd.read_excel('warnLogs_200720.xlsx')
warns['County'] = warns['County'] + ' County'
warns['County'] = warns['County'].str.strip()

In [117]:
codes = pd.read_excel('../../data/geographies/regional-commissions.xlsx', usecols='A,B,C')
codes['County'] = codes['County'] + ' County'
codes['County'] = codes['County'].astype(str)

msa = pd.read_excel('../../data/geographies/cbsa.xlsx', sheet_name='cbsa', usecols='A,D,H,I')
msa = msa.loc[msa['State'] == 'Georgia']

In [118]:
df = pd.merge(df,codes,how='left',on='County')


In [119]:
df = pd.merge(df,msa,how='left',on='County')
warns = pd.merge(warns,codes,how='left',on='County')
warns = pd.merge(warns,msa,how='left',on='County')

In [120]:
today = np.datetime64('today')
lastYear = today - 365

warnsCurr = warns[(warns['Date'] > '2020-03-01') & (warns['Date'] <= today)]
warnsCurr['Period'] = 'Current Period'
warnsLast = warns[(warns['Date'] > '2019-03-01') & (warns['Date'] <= lastYear)]
warnsLast['Period'] = 'Last Period'
warnPeriods = warnsCurr.append([warnsLast])

warnPeriods = warnPeriods[['ID','Company name','Est. Impact','Date','Period']]

warns = pd.merge(warns,warnPeriods,how='left',on=['ID','Company name','Est. Impact','Date'])

warns['Period'].fillna('NA',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [121]:
warns.head(2)

Unnamed: 0,ID,Company name,City,ZIP,County,Est. Impact,LWDA,Date,area_code,Regional Commission,CBSA Code,CBSA Title,State,Period
0,GA201900061,"St. Partners, LLC",Gainesville,30504,Hall County,220,2.0,2019-12-21,13139.0,Georgia Mountains,23580.0,"Gainesville, GA",Georgia,
1,GA201900063,"Mount Vernon Mills, Inc.",Alto,30510,Banks County,600,2.0,2020-03-09,13011.0,Georgia Mountains,,,,Current Period


In [123]:
test = warns.groupby(['County','Period']).size().reset_index()
test.sort_values(['County','Period'], inplace=True)

In [None]:
# groupby county and total warns in each of the above periods
# groupby msa and total warns in each of the above periods
# groupby regional commission and total warns in each of the bove periods
# get pct difference between periods for each gography

In [9]:
labor = pd.read_excel('county_laborForce.xlsx')
labor.dropna(subset=['title'], inplace=True)
labor = labor.loc[labor['Month'] == 3.0]
labor['Date'] = pd.to_datetime(labor.Month.astype(int).astype(str) + '-7' + '-2020', format = '%m-%d-%Y')
labor['Date'] = labor['Date'].dt.strftime('%m/%d/%Y')
labor['title'] = labor['title'].replace(', GA', '', regex=True)

In [10]:
marchLabor = labor[['title','Date','emp','unemp','laborforce']]

In [11]:
marchLabor.rename(columns={'title'      : 'County',
                           'emp'        : 'MarchEmployed',
                           'unemp'      : 'MarchUnemployed',
                           'laborforce' : 'MarchLaborForce'}, inplace=True)
marchLabor.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [12]:
# create separate frame of total claims by county, not industry
countyClaims = df.groupby(['area_code','County','Date']).agg({'Claims':'sum'}).reset_index()

In [13]:
countyClaims.drop_duplicates(inplace=True)

In [14]:
countyClaims = pd.merge(countyClaims,marchLabor,how='left',on=['County','Date'])

countyClaims.MarchEmployed.fillna(method='ffill', inplace=True)
countyClaims.MarchUnemployed.fillna(method='ffill', inplace=True)
countyClaims.MarchLaborForce.fillna(method='ffill', inplace=True)

countyClaims['CumClaims'] = countyClaims.groupby(['area_code','County'])['Claims'].apply(lambda x: x.cumsum())

countyClaims['Employed'] = countyClaims.MarchEmployed - countyClaims.CumClaims
countyClaims['Unemployed'] = countyClaims.MarchUnemployed + countyClaims.CumClaims
countyClaims['PctChngEmployed'] = ( ( countyClaims.Employed - countyClaims.MarchEmployed ) / countyClaims.MarchEmployed ) * 100
countyClaims.PctChngEmployed = countyClaims.PctChngEmployed.round(1)

In [15]:
# do this for the whole state
marchLaborState = marchLabor.groupby('Date') \
                            .agg({'MarchEmployed':'sum','MarchUnemployed' : 'sum','MarchLaborForce' : 'sum'}) \
                            .reset_index()

stateClaims = countyClaims[['County','Date','Claims']]

stateClaims = stateClaims.groupby('Date').agg({'Claims':'sum'}).reset_index()

stateClaims = pd.merge(stateClaims, marchLaborState, how='left', on='Date')

stateClaims.MarchEmployed.fillna(method='ffill', inplace=True)
stateClaims.MarchUnemployed.fillna(method='ffill', inplace=True)
stateClaims.MarchLaborForce.fillna(method='ffill', inplace=True)

In [16]:
stateClaims['State'] = 'Georgia'

In [17]:
stateClaims['CumClaims'] = stateClaims.groupby(['State'])['Claims'].apply(lambda x: x.cumsum())

In [18]:
stateClaims['Employed'] = stateClaims.MarchEmployed - stateClaims.CumClaims
stateClaims['Unemployed'] = stateClaims.MarchUnemployed + stateClaims.CumClaims
stateClaims['PctChngEmployed'] = ( ( stateClaims.Employed - stateClaims.MarchEmployed ) / stateClaims.MarchEmployed ) * 100
stateClaims.PctChngEmployed = stateClaims.PctChngEmployed.round(1)

In [19]:
stateClaims = stateClaims[['State','Date','Claims','MarchEmployed','MarchUnemployed','MarchLaborForce',
                           'CumClaims','Employed','Unemployed','PctChngEmployed']]

In [20]:
# now do the same with msa claims

In [21]:
marchLaborMsa = pd.merge(marchLabor, msa, how='left', on='County')

In [22]:
marchLaborMsa.dropna(subset=['CBSA Code'], inplace=True)

In [23]:
marchLaborMsa = marchLaborMsa.groupby(['CBSA Title','Date']) \
                             .agg({'MarchEmployed':'sum','MarchUnemployed' : 'sum','MarchLaborForce' : 'sum'}) \
                             .reset_index()

In [24]:
msaClaims = countyClaims[['County','Date','Claims']]
msaClaims = pd.merge(msaClaims, msa, how='left', on='County')

msaClaims.dropna(subset=['CBSA Code'], inplace=True)

msaClaims = msaClaims.groupby(['CBSA Title','Date']) \
                             .agg({'Claims':'sum'}) \
                             .reset_index()

msaClaims = pd.merge(msaClaims,marchLaborMsa,how='left',on=['CBSA Title','Date'])


msaClaims.MarchEmployed.fillna(method='ffill', inplace=True)
msaClaims.MarchUnemployed.fillna(method='ffill', inplace=True)
msaClaims.MarchLaborForce.fillna(method='ffill', inplace=True)

msaClaims['CumClaims'] = msaClaims.groupby(['CBSA Title'])['Claims'].apply(lambda x: x.cumsum())

msaClaims['Employed'] = msaClaims.MarchEmployed - msaClaims.CumClaims
msaClaims['Unemployed'] = msaClaims.MarchUnemployed + msaClaims.CumClaims
msaClaims['PctChngEmployed'] = ( ( msaClaims.Employed - msaClaims.MarchEmployed ) / msaClaims.MarchEmployed ) * 100
msaClaims.PctChngEmployed = msaClaims.PctChngEmployed.round(1)

In [25]:
# now repeat with regional commissions

In [26]:
marchLaborRc = pd.merge(marchLabor, codes, how='left', on='County')

In [27]:
marchLaborRc = marchLaborRc.groupby(['Regional Commission','Date']) \
                             .agg({'MarchEmployed':'sum','MarchUnemployed' : 'sum','MarchLaborForce' : 'sum'}) \
                             .reset_index()

In [28]:
rcClaims = countyClaims[['County','Date','Claims']]
rcClaims = pd.merge(rcClaims, codes, how='left', on='County')

rcClaims = rcClaims.groupby(['Regional Commission','Date']) \
                             .agg({'Claims':'sum'}) \
                             .reset_index()

rcClaims = pd.merge(rcClaims,marchLaborRc,how='left',on=['Regional Commission','Date'])


rcClaims.MarchEmployed.fillna(method='ffill', inplace=True)
rcClaims.MarchUnemployed.fillna(method='ffill', inplace=True)
rcClaims.MarchLaborForce.fillna(method='ffill', inplace=True)

rcClaims['CumClaims'] = rcClaims.groupby(['Regional Commission'])['Claims'].apply(lambda x: x.cumsum())

rcClaims['Employed'] = rcClaims.MarchEmployed - rcClaims.CumClaims
rcClaims['Unemployed'] = rcClaims.MarchUnemployed + rcClaims.CumClaims
rcClaims['PctChngEmployed'] = ( ( rcClaims.Employed - rcClaims.MarchEmployed ) / rcClaims.MarchEmployed ) * 100
rcClaims.PctChngEmployed = rcClaims.PctChngEmployed.round(1)

In [29]:
del countyClaims['area_code']

msaClaims.rename(columns={'CBSA Title' : 'Area'}, inplace=True)
rcClaims.rename(columns={'Regional Commission' : 'Area'}, inplace=True)
countyClaims.rename(columns={'County' : 'Area'}, inplace=True)
stateClaims.rename(columns={'State' : 'Area'}, inplace=True)

In [30]:
# stack on top of each other and write out as a single data frame for line and bar charts

In [31]:
data = stateClaims.append([countyClaims,rcClaims,msaClaims])

In [32]:
data.reset_index(drop=True, inplace=True)

In [33]:
# now make a flat file for mapbox

In [34]:
countyClaims=countyClaims.groupby('Area').apply(lambda x: x.reset_index(drop=True)).drop('Area',axis=1).reset_index()
dateMax = countyClaims.level_1.max()
countyClaims = countyClaims.loc[countyClaims['level_1'] == dateMax]
countyClaims.reset_index(drop=True, inplace=True)
countyShape = gpd.read_file("spatial/ga-counties.geojson")
countyClaims = pd.merge(countyClaims, codes, how='left', left_on='Area', right_on='County')
countyClaims.area_code = countyClaims.area_code.astype(str)
countyShape = pd.merge(countyShape, countyClaims, how='left', left_on='GEOID', right_on='area_code')
countyShapes = countyShape[['area_code','County','CumClaims','MarchEmployed','MarchUnemployed',
                            'MarchLaborForce','Employed','Unemployed','PctChngEmployed','geometry']]

In [35]:
countyShapes = gpd.GeoDataFrame(countyShapes, geometry='geometry')
countyPoints = countyShapes.copy()
countyPoints['geometry'] = countyPoints['geometry'].centroid

In [36]:
# now get the KPIs you need in a json file
dataK = data.copy()
dataK=dataK.groupby('Area').apply(lambda x: x.reset_index(drop=True)).drop('Area',axis=1).reset_index()
dateMax = dataK.level_1.max()
dataK = dataK.loc[dataK['level_1'] == dateMax]

In [37]:
dataK.sort_values('Claims', ascending=False, inplace=True)

In [38]:
dataK['LaborForce'] = dataK.Employed + dataK.Unemployed
dataK['Rate'] = ( dataK.Unemployed / dataK.LaborForce ) * 100
dataK.Rate = dataK.Rate.round(1)

In [39]:
dataK = dataK[['Area','MarchEmployed','CumClaims','Employed','Unemployed','PctChngEmployed','LaborForce','Rate']]

In [40]:
dataK.set_index('Area', inplace=True)

In [41]:
dataK.MarchEmployed = dataK.MarchEmployed.astype(int)
dataK.Employed = dataK.Employed.astype(int)
dataK.Unemployed = dataK.Unemployed.astype(int)
dataK.LaborForce = dataK.LaborForce.astype(int)

In [42]:
# convert kpi to strings with correct format for display purposes
dataK['MarchEmployed'] = dataK['MarchEmployed'].map('{:,.0f}'.format)
dataK['CumClaims'] = dataK['CumClaims'].map('{:,.0f}'.format)
dataK['Employed'] = dataK['Employed'].map('{:,.0f}'.format)
dataK['Unemployed'] = dataK['Unemployed'].map('{:,.0f}'.format)
dataK['LaborForce'] = dataK['LaborForce'].map('{:,.0f}'.format)
dataK['PctChngEmployed'] = dataK['PctChngEmployed'].map('{:,.1f}'.format) + '%'
dataK['Rate'] = dataK['Rate'].map('{:,.1f}'.format) + '%'

In [44]:
# write files
# this is the county polygon file for fill layers in mapbox
countyShapes.to_file("../application/app-data/mapbox/countyShapes.geojson", driver='GeoJSON')
# this is the county point file for circle layers in mapbox
countyPoints.to_file("../application/app-data/mapbox/countyPoints.geojson", driver='GeoJSON')
# this is for the github file to draw the d3 charts
data.to_csv('../application/app-data/uiClaims.csv', index=False)
# this is for the warn logs by city for a point layer within the application

warns.to_excel('../application/app-data/warnsClean.xlsx', index=False)
# this is for the json object to use as dynamically generated KPIs
dataK.to_json(orient='index')

'{"Georgia":{"MarchEmployed":"4,936,482","CumClaims":"1,793,861","Employed":"3,142,621","Unemployed":"2,017,514","PctChngEmployed":"-36.3%","LaborForce":"5,160,135","Rate":"39.1%"},"Atlanta-Sandy Springs-Alpharetta, GA":{"MarchEmployed":"2,995,413","CumClaims":"1,137,152","Employed":"1,858,261","Unemployed":"1,267,450","PctChngEmployed":"-38.0%","LaborForce":"3,125,711","Rate":"40.5%"},"ARC":{"MarchEmployed":"2,360,403","CumClaims":"921,503","Employed":"1,438,900","Unemployed":"1,025,152","PctChngEmployed":"-39.0%","LaborForce":"2,464,052","Rate":"41.6%"},"Fulton County":{"MarchEmployed":"538,690","CumClaims":"240,324","Employed":"298,366","Unemployed":"265,534","PctChngEmployed":"-44.6%","LaborForce":"563,900","Rate":"47.1%"},"Gwinnett County":{"MarchEmployed":"475,385","CumClaims":"176,481","Employed":"298,904","Unemployed":"195,535","PctChngEmployed":"-37.1%","LaborForce":"494,439","Rate":"39.5%"},"Northwest":{"MarchEmployed":"405,999","CumClaims":"153,244","Employed":"252,755","Une