In [1]:
import pandas as pd
import numpy as np
import DB_Utilities
DBTools = DB_Utilities.DBTools()  # instantiate the class
from  File_Utilities import FileTools
FileTools.MYDIR = ".\data"

redownload = False

In [2]:
def get_file_by_year(year):
    if redownload:
        print(f"https://www2.census.gov/programs-surveys/cbp/datasets/{year}/cbp{year[2:]}co.zip")
        FileTools.unzip_file(FileTools.get_file_from_url(f"https://www2.census.gov/programs-surveys/cbp/datasets/{year}/cbp{year[2:]}co.zip"))

        print(f"https://www2.census.gov/programs-surveys/cbp/datasets/{year}/cbp{year[2:]}st.zip")
        FileTools.unzip_file(FileTools.get_file_from_url(f"https://www2.census.gov/programs-surveys/cbp/datasets/{year}/cbp{year[2:]}st.zip"))

        print(f"https://www2.census.gov/programs-surveys/cbp/datasets/{year}/cbp{year[2:]}us.zip")
        FileTools.unzip_file(FileTools.get_file_from_url(f"https://www2.census.gov/programs-surveys/cbp/datasets/{year}/cbp{year[2:]}us.zip"))

    # df_county = pd.read_csv(FileTools.get_full_file_path(f'cbp{year[2:]}co.txt'))
    df_county = FileTools.load_df_from_csv(f'cbp{year[2:]}co.txt')
    # df_state = pd.read_csv(FileTools.get_full_file_path(f'cbp{year[2:]}st.txt'))
    df_state = FileTools.load_df_from_csv(f'cbp{year[2:]}st.txt')
    # df_us = pd.read_csv(FileTools.get_full_file_path(f'cbp{year[2:]}us.txt'))
    df_us = FileTools.load_df_from_csv(f'cbp{year[2:]}us.txt')

    return df_county, df_state, df_us

# test
# df_county, df_state, df_us = get_file_by_year('2020')



In [3]:
def convert_column_names_toLowerCase(df):
    df.columns = [x.lower() for x in df.columns]
    return df

def munge_data(_df):

    # they changed the case of the column names in 2016. Jerks.
    _df = convert_column_names_toLowerCase(_df)
    

    _df['naics'] = _df['naics'].str.replace('-', '')
    _df['naics'] = _df['naics'].str.replace(' ', '')
    _df['naics'] = _df['naics'].str.replace('/', '')
    _df['naics_level'] = _df['naics'].str.len()


    try:
        # If there is no county_fips just don't do this. 
        # This means that this data is coming from either the state or the us.
            # Get rid of counties with 999 fips, they are either statewide or unknown.
        _df = _df[_df['fipscty'] != 999].copy()
        _df['county_fips'] = _df['fipstate'].astype(str).str.zfill(2)+_df['fipscty'].astype(str).str.zfill(3)    
    except Exception as e:
        print(e)
        pass # hahaha. not handling this error today.
    
    return _df

In [5]:
def run_all(year="2020", force_run=False):
   print(f"Running for year {year}")

   file_name = FileTools.get_full_file_path(f'cbp_emp_percent_by_county_state_us_{year}.gzip')   
   does_file_exist = FileTools.check_file(file_name)
   # if not then make it...
   if does_file_exist and not force_run:              
      print(f"Loading from file : {file_name} .")
      # emp_percent_by_county_state_us = pd.read_csv(file_name)
      emp_percent_by_county_state_us = FileTools.load_df_from_parquet(f'cbp_emp_percent_by_county_state_us_{year}.gzip')
   else:
      print(f"Making from scratch for : {year}")
      # Do all the work and save it.
      df_county, df_state, df_us =  get_file_by_year(year)

      print('Munging data')
      df_county = munge_data(df_county)
      # We only want top level employmnet data at the state and us level.
      # LFO             C       Legal Form of Organization

      #                         '-' - All Establishments                        
      #                         C - C-Corporations and other corporate legal forms of organization
      #                         Z - S-Corporations
      #                         S - Sole Proprietorships
      #                         P - Partnerships
      #                         N - Non-Profits
      #                         G - Government
      #                         O - Other

      df_state = munge_data(df_state)
      df_state = df_state[df_state.lfo == '-'] #Only All Establishments
      df_us = munge_data(df_us)
      df_us = df_us[df_us.lfo == '-'] #Only All Establishments


      print('Getting region level employment')
      # county level employment, we will join on this later as our base data
      df_county_emp = df_county[df_county.naics_level == 0][['county_fips','emp']]
      # state level employment, we need this to calculate the locaton quotient in relation to the state.
      df_state_emp = df_state[(df_state.naics_level == 0) ][['fipstate','emp']]
      # us level employment, we need this to calculate the locaton quotient in relation to the us.
      df_us_emp = df_us[(df_us.naics_level == 0) ][['uscode','emp']]


      print('Getting county level employment by industry')
      # Get the percent of each naics code in each county
      emp_percent_by_county = pd.merge(df_county, df_county_emp, on="county_fips")
      emp_percent_by_county['percent_of_county_emp'] = emp_percent_by_county['emp_x']/emp_percent_by_county['emp_y']
      emp_percent_by_county.rename(columns = {'emp_x':'emp_county_naics' , 'emp_y':'emp_county'}, inplace = True)

      print('Getting state level employment by industry')
      # Get the percent of each naics code in each state
      emp_percent_by_state = pd.merge(df_state, df_state_emp, on="fipstate")
      emp_percent_by_state['percent_of_state_emp'] = emp_percent_by_state['emp_x']/emp_percent_by_state['emp_y']
      emp_percent_by_state.rename(columns = {'emp_x':'emp_state_naics', 'emp_y':'emp_state' }, inplace = True)
      emp_percent_by_state = emp_percent_by_state[['fipstate','naics','percent_of_state_emp', 'emp_state_naics', 'emp_state']]

      print('Getting us level employment by industry')
      # Get the percent of each naics code in the US
      emp_percent_by_us = pd.merge(df_us, df_us_emp, on="uscode")
      emp_percent_by_us['percent_of_us_emp'] = emp_percent_by_us['emp_x']/emp_percent_by_us['emp_y']
      emp_percent_by_us.rename(columns = {'emp_x':'emp_us_naics', 'emp_y':'emp_us' }, inplace = True)
      emp_percent_by_us = emp_percent_by_us[['naics','percent_of_us_emp', 'emp_us_naics', 'emp_us']]
      emp_percent_by_us.head()

      print('Merging county with state and us')
      # Merge all the data together so we can calculate the location quotient.
      emp_percent_by_county_state = pd.merge(emp_percent_by_county, emp_percent_by_state, how='left', left_on=["naics", 'fipstate'], right_on=["naics", 'fipstate'])
      emp_percent_by_county_state_us = pd.merge(emp_percent_by_county_state, emp_percent_by_us, how='left', left_on=["naics"], right_on=["naics"])

      print('Calculating location quotient')
      # Calculate the location quotient for county/state and for county/us
      emp_percent_by_county_state_us['location_quotient_county_state'] = emp_percent_by_county_state_us['percent_of_county_emp']/emp_percent_by_county_state_us['percent_of_state_emp']
      emp_percent_by_county_state_us['location_quotient_county_us'] = emp_percent_by_county_state_us['percent_of_county_emp']/emp_percent_by_county_state_us['percent_of_us_emp']
      
      emp_percent_by_county_state_us.replace([np.inf, -np.inf], np.nan, inplace=True)
      emp_percent_by_county_state_us['year'] = year

      FileTools.save_df_as_parquet(emp_percent_by_county_state_us, f'cbp_emp_percent_by_county_state_us_{year}.gzip')
      # DBTools.truncate_and_insert_df(emp_percent_by_county_state_us, f"cbp_emp_percent_by_county_state_us_{year}")


   return emp_percent_by_county_state_us

In [10]:
years_to_get = [2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010]
# years_to_get = [2020]

dataframes = []
for year in years_to_get:
    dataframes.append(run_all(str(year), False))

all_years  = FileTools.concatenate_dataframes(dataframes)
all_years.shape

Running for year 2020
Loading from file : .\data\cbp_emp_percent_by_county_state_us_2020.gzip .
Running for year 2019
Making from scratch for : 2019
Munging data
'fipscty'
'fipscty'
Getting region level employment
Getting county level employment by industry
Getting state level employment by industry
Getting us level employment by industry
Merging county with state and us
Calculating location quotient
Running for year 2018
Making from scratch for : 2018
Munging data
'fipscty'
'fipscty'
Getting region level employment
Getting county level employment by industry
Getting state level employment by industry
Getting us level employment by industry
Merging county with state and us
Calculating location quotient
Running for year 2017
Making from scratch for : 2017
Munging data
'fipscty'
'fipscty'
Getting region level employment
Getting county level employment by industry
Getting state level employment by industry
Getting us level employment by industry
Merging county with state and us
Calculatin

(19140419, 40)

In [43]:
# FileTools.save_df_as_parquet(all_years, f"cbp_lq_all_years.gzip")
all_years_2digit_naics = all_years[all_years.naics_level == 2].copy()

# parquet is particular about mixed types, so we need to convert the objects to strings.
for df_column in all_years_2digit_naics.select_dtypes(include=['object']).columns:    
    all_years_2digit_naics[df_column] = all_years_2digit_naics[df_column].astype(str)

FileTools.save_df_as_parquet(all_years_2digit_naics, f"cbp_lq_2digit_naics_all_years.gzip")

naics
emp_nf
qp1_nf
ap_nf
n<5
n5_9
n10_19
n20_49
n50_99
n100_249
n250_499
n500_999
n1000
n1000_1
n1000_2
n1000_3
n1000_4
county_fips
year
empflag


'cbp_lq_2digit_naics_all_years.gzip'

In [42]:
# all_years_2digit_naics.dtypes
for df_column in all_years_2digit_naics.select_dtypes(include=['object']).columns:
    print(df_column)
    all_years_2digit_naics[df_column] = all_years_2digit_naics[df_column].astype(str)

naics
emp_nf
qp1_nf


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_years_2digit_naics[df_column] = all_years_2digit_naics[df_column].astype(str)


ap_nf
n<5
n5_9
n10_19
n20_49
n50_99
n100_249
n250_499
n500_999
n1000
n1000_1
n1000_2
n1000_3
n1000_4
county_fips
year
empflag


In [33]:
test[2739]

1123