In [1]:
import pandas as pd
import numpy as np
import DB_Utilities
DBTools = DB_Utilities.DBTools()  # instantiate the class
from  File_Utilities import FileTools
# Directory FileTools reads from / writes to. A raw string keeps the backslash
# literal: "\d" is not a valid escape sequence and newer Python versions warn
# about it. The resulting value is unchanged.
FileTools.MYDIR = r".\data"

# When True, get_file_by_year() re-fetches the census zip files before loading.
redownload = False

In [3]:
def get_file_by_year(year):
    """Load the county, state and US County Business Patterns tables for a year.

    When the module-level ``redownload`` flag is true, the three zip archives
    for ``year`` (``co``, ``st`` and ``us``) are first fetched from census.gov
    and unzipped through ``FileTools``. All downloads finish before any file
    is loaded. The three ``cbpYY??.txt`` files are then read with
    ``FileTools.load_df_from_csv``.

    Parameters
    ----------
    year : str or int
        Four-digit year, e.g. ``'2020'``. Ints are accepted and converted.

    Returns
    -------
    tuple
        ``(df_county, df_state, df_us)`` exactly as returned by
        ``FileTools.load_df_from_csv``.
    """
    # The slicing below needs a string; normalise so an int year also works.
    year = str(year)
    short_year = year[2:]
    levels = ('co', 'st', 'us')  # county, state, national

    if redownload:
        base_url = f"https://www2.census.gov/programs-surveys/cbp/datasets/{year}"
        for level in levels:
            url = f"{base_url}/cbp{short_year}{level}.zip"
            print(url)
            FileTools.unzip_file(FileTools.get_file_from_url(url))

    df_county, df_state, df_us = (
        FileTools.load_df_from_csv(f'cbp{short_year}{level}.txt')
        for level in levels
    )
    return df_county, df_state, df_us

# test
# df_county, df_state, df_us = get_file_by_year('2020')



In [4]:
def convert_column_names_toLowerCase(df):
    """Lower-case every column label of ``df`` in place and return ``df``.

    The same object that was passed in is returned, so the caller's frame is
    modified as a side effect.
    """
    df.columns = [label.lower() for label in df.columns]
    return df


def munge_data(_df):
    """Normalise one raw CBP table (county, state or US level).

    Steps:
      * lower-case the column names (their case changed in 2016);
      * strip the ``-``, space and ``/`` characters from ``naics`` and store
        the length of what remains in ``naics_level``;
      * if the table has a ``fipscty`` column, drop the rows whose county code
        is 999 (statewide or unknown) and build a 5-character ``county_fips``
        from the zero-padded state and county codes.

    The column renaming and the ``naics`` / ``naics_level`` assignments are
    made on the frame that was passed in. When ``fipscty`` is present the
    returned frame is a filtered copy; otherwise it is the input frame itself.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain a string ``naics`` column (any letter case).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``naics`` is missing, or if ``fipscty`` is present without
        ``fipstate``.
    """
    # They changed the case of the column names in 2016.
    _df = convert_column_names_toLowerCase(_df)

    # Literal (non-regex) removal of the characters in the raw codes, so the
    # remaining length is the number of digits in the code.
    naics = _df['naics']
    for filler in ('-', ' ', '/'):
        naics = naics.str.replace(filler, '', regex=False)
    _df['naics'] = naics
    _df['naics_level'] = naics.str.len()

    # No county code column means this is state or US level data: nothing
    # more to do. Checking explicitly keeps genuine errors visible.
    if 'fipscty' not in _df.columns:
        return _df

    # Get rid of counties with 999 fips, they are either statewide or unknown.
    _df = _df[_df['fipscty'] != 999].copy()
    state_part = _df['fipstate'].astype(str).str.zfill(2)
    county_part = _df['fipscty'].astype(str).str.zfill(3)
    _df['county_fips'] = state_part + county_part

    return _df

In [5]:
def _add_emp_share(detail, totals, key, level):
    """Join region totals onto per-NAICS rows and compute the employment share.

    ``detail`` and ``totals`` both carry an ``emp`` column, so the merge
    produces ``emp_x`` (per-NAICS) and ``emp_y`` (region total). The ratio is
    stored as ``percent_of_<level>_emp`` and the two columns are renamed to
    ``emp_<level>_naics`` and ``emp_<level>``.
    """
    merged = pd.merge(detail, totals, on=key)
    merged[f'percent_of_{level}_emp'] = merged['emp_x'] / merged['emp_y']
    return merged.rename(
        columns={'emp_x': f'emp_{level}_naics', 'emp_y': f'emp_{level}'}
    )


def run_all(year="2020", force_run=False):
    """Return county location quotients for one year, building them if needed.

    If the parquet file ``cbp_emp_percent_by_county_state_us_<year>.gzip``
    already exists (per ``FileTools.check_file``) and ``force_run`` is false,
    it is loaded and returned. Otherwise the raw county, state and US tables
    are loaded, munged, combined and saved to that file.

    For each county/NAICS row the result holds the NAICS share of county,
    state and US employment, plus two location quotients: county share
    divided by state share, and county share divided by US share. Infinite
    values are replaced by NaN, and a ``year`` column (string) is added.

    Parameters
    ----------
    year : str or int
        Four-digit year; converted to ``str``.
    force_run : bool
        Rebuild even when the cached file exists.

    Returns
    -------
    pandas.DataFrame
    """
    year = str(year)  # file names and the 'year' column both use the string form
    print(f"Running for year {year}")

    cache_name = f'cbp_emp_percent_by_county_state_us_{year}.gzip'
    file_name = FileTools.get_full_file_path(cache_name)
    if FileTools.check_file(file_name) and not force_run:
        print(f"Loading from file : {file_name} .")
        return FileTools.load_df_from_parquet(cache_name)

    print(f"Making from scratch for : {year}")
    # Do all the work and save it.
    df_county, df_state, df_us = get_file_by_year(year)

    print('Munging data')
    df_county = munge_data(df_county)
    # We only want top level employment data at the state and us level.
    # LFO (Legal Form of Organization) codes:
    #   '-' - All Establishments
    #   C - C-Corporations and other corporate legal forms of organization
    #   Z - S-Corporations
    #   S - Sole Proprietorships
    #   P - Partnerships
    #   N - Non-Profits
    #   G - Government
    #   O - Other
    df_state = munge_data(df_state)
    df_state = df_state[df_state.lfo == '-']  # Only All Establishments
    df_us = munge_data(df_us)
    df_us = df_us[df_us.lfo == '-']  # Only All Establishments

    print('Getting region level employment')
    # naics_level == 0 selects the rows whose NAICS code was only filler
    # characters; their 'emp' is used as the region total.
    df_county_emp = df_county[df_county.naics_level == 0][['county_fips', 'emp']]
    df_state_emp = df_state[df_state.naics_level == 0][['fipstate', 'emp']]
    df_us_emp = df_us[df_us.naics_level == 0][['uscode', 'emp']]

    print('Getting county level employment by industry')
    # Get the percent of each naics code in each county
    emp_percent_by_county = _add_emp_share(df_county, df_county_emp, 'county_fips', 'county')

    print('Getting state level employment by industry')
    # Get the percent of each naics code in each state
    emp_percent_by_state = _add_emp_share(df_state, df_state_emp, 'fipstate', 'state')
    emp_percent_by_state = emp_percent_by_state[
        ['fipstate', 'naics', 'percent_of_state_emp', 'emp_state_naics', 'emp_state']
    ]

    print('Getting us level employment by industry')
    # Get the percent of each naics code in the US
    emp_percent_by_us = _add_emp_share(df_us, df_us_emp, 'uscode', 'us')
    emp_percent_by_us = emp_percent_by_us[
        ['naics', 'percent_of_us_emp', 'emp_us_naics', 'emp_us']
    ]

    print('Merging county with state and us')
    # Left joins keep every county row even when the state or US table has no
    # matching NAICS code.
    emp_percent_by_county_state = pd.merge(
        emp_percent_by_county, emp_percent_by_state,
        how='left', on=['naics', 'fipstate'],
    )
    emp_percent_by_county_state_us = pd.merge(
        emp_percent_by_county_state, emp_percent_by_us,
        how='left', on=['naics'],
    )

    print('Calculating location quotient')
    county_share = emp_percent_by_county_state_us['percent_of_county_emp']
    emp_percent_by_county_state_us['location_quotient_county_state'] = (
        county_share / emp_percent_by_county_state_us['percent_of_state_emp']
    )
    emp_percent_by_county_state_us['location_quotient_county_us'] = (
        county_share / emp_percent_by_county_state_us['percent_of_us_emp']
    )

    # A zero denominator yields +/-inf; store those as missing instead.
    emp_percent_by_county_state_us = emp_percent_by_county_state_us.replace(
        [np.inf, -np.inf], np.nan
    )
    emp_percent_by_county_state_us['year'] = year

    print('Saving to file')
    FileTools.save_df_as_parquet(emp_percent_by_county_state_us, cache_name)
    # NOTE: the database insert is currently disabled; only the message remains.
    print('Writing to dababase')
    # DBTools.truncate_and_insert_df(emp_percent_by_county_state_us, f"cbp_emp_percent_by_county_state_us_{year}")
    # DBTools.insert_df(emp_percent_by_county_state_us, f"cbp_emp_percent_by_county_state_us_{year}")

    return emp_percent_by_county_state_us

In [6]:
# Years to process, newest first: 2020 down to 2010.
years_to_get = list(range(2020, 2009, -1))
# years_to_get = [2020]
# Passed to run_all() as force_run: True rebuilds each year even if cached.
Force_Run = False

dataframes = []
for year in years_to_get:
    yearly_frame = run_all(str(year), Force_Run)
    dataframes.append(yearly_frame)

# Stack the per-year frames and show the combined shape.
all_years = FileTools.concatenate_dataframes(dataframes)
all_years.shape

Running for year 2020
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2020.gzip .
Running for year 2019
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2019.gzip .
Running for year 2018
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2018.gzip .
Running for year 2017
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2017.gzip .
Running for year 2016
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2016.gzip .
Running for year 2015
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2015.gzip .
Running for year 2014
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2014.gzip .
Running for year 2013
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2013.gzip .
Running for year 2012
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2012.gzip .
Running for year 2011
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2011.gzip .
Running for year 2010
Loading from file 

(19140419, 40)

In [7]:
# Length of the stripped NAICS code to keep; also used by the cells below.
naics_level = 2
# FileTools.save_df_as_parquet(all_years, f"cbp_lq_all_years.gzip")
level_mask = all_years['naics_level'] == naics_level
all_years_naics = all_years.loc[level_mask].copy()

# parquet is particular about mixed types, so every object column is cast to str.
object_columns = all_years_naics.select_dtypes(include=['object']).columns
for df_column in object_columns:
    all_years_naics[df_column] = all_years_naics[df_column].astype(str)

FileTools.save_df_as_parquet(all_years_naics, f"cbp_lq_{naics_level}digit_naics_all_years.gzip")

'cbp_lq_2digit_naics_all_years.gzip'

### Pivoting tables so each county is a row and all predictors are columns

In [8]:
# Read back the per-NAICS-level file written by the previous cell.
_lq_file_name = f"cbp_lq_{naics_level}digit_naics_all_years.gzip"
_cbp_lq_digit_naics_all_years_df = FileTools.load_df_from_parquet(_lq_file_name)

In [9]:
def _pivot_one_year(lq_df, year, level):
    """Build the one-row-per-county table for a single year.

    Rows of ``lq_df`` matching ``year`` and ``level`` are reduced to:
      * one column per NAICS code holding ``location_quotient_county_us``
        (averaged if a county/code pair occurs more than once, 0 where absent);
      * the per-county sums of ``qp1``, ``ap`` and ``est``;
      * a ``year`` column.
    """
    wanted = ['county_fips', 'qp1', 'ap', 'est', 'naics',
              'location_quotient_county_state', 'location_quotient_county_us']
    keep = (lq_df['year'] == year) & (lq_df['naics_level'] == level)
    year_rows = lq_df.loc[keep, wanted].copy()

    # Get some aggregations by county
    county_totals = (
        year_rows[['county_fips', 'qp1', 'ap', 'est']]
        .groupby(['county_fips'])
        .sum()
        .reset_index()
    )

    # Pivot all the naics codes up into columns
    lq_by_naics = pd.pivot_table(
        year_rows, values='location_quotient_county_us', index=['county_fips'],
        columns=['naics'], aggfunc=np.average, fill_value=0,
    )

    # Add the aggregations to the pivot table
    pivoted = lq_by_naics.merge(county_totals, on='county_fips', how='left')
    pivoted['year'] = year

    # Kept on purpose: this adds the 'index' column that appears in the
    # displayed output of the saved file, so the output schema stays the same.
    pivoted.reset_index(inplace=True)
    return pivoted


# Collect the per-year tables in a list and concatenate once; growing a
# DataFrame with concat inside the loop re-copies all earlier rows each time.
yearly_pivots = []
for i in range(2010, 2021):
# for i in range(2020,2021):
    year = str(i)  # the 'year' column holds strings
    print(f'Working on year : {year}')
    yearly_pivots.append(
        _pivot_one_year(_cbp_lq_digit_naics_all_years_df, year, naics_level)
    )

all_data_cbp = pd.concat(yearly_pivots)

# Save the data to a parquet file and to csv.
FileTools.save_df_as_parquet(all_data_cbp, f'cbp_lq_{naics_level}digit_naics_all_years_PIVOT.gzip')
FileTools.save_df_as_csv(all_data_cbp, f'cbp_lq_{naics_level}digit_naics_all_years_PIVOT.csv')

Working on year : 2010
Working on year : 2011
Working on year : 2012
Working on year : 2013
Working on year : 2014
Working on year : 2015
Working on year : 2016
Working on year : 2017
Working on year : 2018
Working on year : 2019
Working on year : 2020


'cbp_lq_2digit_naics_all_years_PIVOT.csv'

In [13]:
# For two digit data

if naics_level == 2:

    # 2-digit NAICS code (as it appears in the pivoted column labels) -> readable name.
    rename_dict = {
        '11': 'Agriculture_Forestry_Fishing_Hunting',
        '21': 'Mining_Quarrying_and_Oil_Gas_Extraction',
        '22': 'Utilities',
        '23': 'Construction',
        '31': 'Manufacturing',
        '42': 'Wholesale_Trade',
        '44': 'Retail_Trade',
        '48': 'Transportation_Warehousing',
        '51': 'Information',
        '52': 'Finance_Insurance',
        '53': 'Real_Estate_Rental_Leasing',
        '54': 'Professional_Scientific_and_Technical_Services',
        '55': 'Management_of_Companies_Enterprises',
        '56': 'Administrative_Support_Waste_Management_Remediation_Services',
        '61': 'Educational_Services',
        '62': 'Health_Care_Social_Assistance',
        '71': 'Arts_Entertainment_and_Recreation',
        '72': 'Accommodation_Food_Services',
        '81': 'Other_Services_except_Public_Administration',
        '92': 'Public_Administration',
    }

    all_data_cbp = all_data_cbp.rename(columns=rename_dict)

    for i, k in enumerate(rename_dict):
        print(i, k)

    for column_name in rename_dict.values():
        print(column_name)
        # A code with no column in the data is skipped explicitly instead of
        # being hidden behind a bare except, so other errors still surface.
        if column_name not in all_data_cbp.columns:
            continue
        # Flag is 1 where the value is above 1, otherwise 0.
        all_data_cbp[f'{column_name}_base'] = np.where(all_data_cbp[column_name] > 1, 1, 0)

    FileTools.save_df_as_parquet(all_data_cbp, f'cbp_lq_{naics_level}digit_naics_all_years_base_PIVOT.gzip')
    FileTools.save_df_as_csv(all_data_cbp, f'cbp_lq_{naics_level}digit_naics_all_years_base_PIVOT.csv')


0 11
1 21
2 22
3 23
4 31
5 42
6 44
7 48
8 51
9 52
10 53
11 54
12 55
13 56
14 61
15 62
16 71
17 72
18 81
19 92
Agriculture_Forestry_Fishing_Hunting
Mining_Quarrying_and_Oil_Gas_Extraction
Utilities
Construction
Manufacturing
Wholesale_Trade
Retail_Trade
Transportation_Warehousing
Information
Finance_Insurance
Real_Estate_Rental_Leasing
Professional_Scientific_and_Technical_Services
Management_of_Companies_Enterprises
Administrative_Support_Waste_Management_Remediation_Services
Educational_Services
Health_Care_Social_Assistance
Arts_Entertainment_and_Recreation
Accommodation_Food_Services
Other_Services_except_Public_Administration
Public_Administration


In [15]:
# Round-trip check: read the file written above and report its shape.
_base_pivot_name = f'cbp_lq_{naics_level}digit_naics_all_years_base_PIVOT.gzip'
test_all_data_cbp = FileTools.load_df_from_parquet(_base_pivot_name)
print(test_all_data_cbp.shape)

# DBTools.truncate_and_insert_df(all_data_cbp, "cbp_PIVOT")

(34312, 45)


In [16]:
# Show only the 2020 rows ('year' is stored as a string).
test_all_data_cbp.loc[test_all_data_cbp['year'] == '2020']

Unnamed: 0,index,county_fips,Agriculture_Forestry_Fishing_Hunting,Mining_Quarrying_and_Oil_Gas_Extraction,Utilities,Construction,Manufacturing,Wholesale_Trade,Retail_Trade,Transportation_Warehousing,...,Finance_Insurance_base,Real_Estate_Rental_Leasing_base,Professional_Scientific_and_Technical_Services_base,Management_of_Companies_Enterprises_base,Administrative_Support_Waste_Management_Remediation_Services_base,Educational_Services_base,Health_Care_Social_Assistance_base,Arts_Entertainment_and_Recreation_base,Accommodation_Food_Services_base,Other_Services_except_Public_Administration_base
0,0,01001,6.553056,1.751673,3.084663,0.926968,0.941877,0.467058,1.833722,0.193912,...,0,0,0,0,0,0,0,1,1,1
1,1,01003,0.155821,0.121375,0.865132,1.170328,0.752901,0.804869,1.731134,0.388414,...,0,1,0,0,0,0,0,1,1,1
2,2,01005,13.235114,2.463995,2.658154,0.241140,4.310533,0.444648,0.927560,3.123021,...,0,0,0,0,0,0,0,0,0,0
3,3,01007,8.331400,0.000000,0.780958,4.377982,1.672230,0.534321,1.004989,1.922038,...,0,0,0,0,0,0,1,0,0,0
4,4,01009,1.304897,0.000000,0.000000,1.491332,1.770292,1.316867,1.460473,0.816050,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3130,3130,56037,0.000000,42.468579,5.491809,0.900077,1.400053,0.920426,1.243615,1.399134,...,0,0,0,0,0,0,0,0,1,0
3131,3131,56039,1.224360,0.000000,0.000000,2.186666,0.132216,0.270069,0.895855,0.465927,...,0,1,0,0,0,1,0,1,1,0
3132,3132,56041,0.000000,5.759218,2.377486,5.649656,0.296068,0.459452,1.141431,1.191604,...,0,0,0,0,0,0,1,0,0,0
3133,3133,56043,0.000000,3.453445,2.523699,1.582423,1.469166,0.902132,1.171319,0.922063,...,0,1,0,0,0,0,1,0,0,1
