## Our notes

In this notebook we created a data frame for each year's data set and took some time to understand the data in each one. We corrected the column names, dropped irrelevant columns, and concatenated all of the data frames so that we would have one complete data frame with every row of data from 2013-2023.

  naics_dict = {
    "11": "Agriculture, Forestry, Fishing and Hunting",
    "21": "Mining, Quarrying, and Oil and Gas Extraction",
    "22": "Utilities",
    "23": "Construction",
    "31": "Manufacturing",
    "32": "Manufacturing",
    "33": "Manufacturing",
    "42": "Wholesale Trade",
    "44": "Retail Trade",
    "45": "Retail Trade",
    "48": "Transportation and Warehousing",
    "49": "Transportation and Warehousing",
    "51": "Information",
    "52": "Finance and Insurance",
    "53": "Real Estate and Rental and Leasing",
    "54": "Professional, Scientific, and Technical Services",
    "55": "Management of Companies and Enterprises",
    "56": "Administrative and Support and Waste Management and Remediation Services",
    "61": "Educational Services",
    "62": "Health Care and Social Assistance",
    "71": "Arts, Entertainment, and Recreation",
    "72": "Accommodation and Food Services",
    "81": "Other Services (except Public Administration)",
    "92": "Public Administration"
  }

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#functions for making df, lowering columns and adding year column

def made_df(file_name):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file_name : str or path-like
        Location of the CSV file to load.

    Returns
    -------
    pandas.DataFrame or None
        The parsed file, or ``None`` when the file does not exist. In that
        case a message naming the missing file is printed.
    """
    try:
        return pd.read_csv(file_name)
    except FileNotFoundError:
        # Name the file in the message: this helper is called once per
        # yearly file, so a bare "File not found" does not say which input
        # is missing.
        print(f"File not found: {file_name}")
        return None
        
def lower_columns(data_frame):
    """Lower-case every column name of ``data_frame`` in place.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame whose column labels are rewritten.

    Returns
    -------
    pandas.Index
        The frame's column index after the rename.
    """
    lowered = data_frame.columns.str.lower()
    data_frame.columns = lowered
    return data_frame.columns

def add_year(df, year):
    """Add (or overwrite) a ``year`` column holding ``year`` on every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    year : scalar
        Value written to every row of the ``year`` column.

    Returns
    -------
    pandas.Series
        The ``year`` column of ``df`` after the assignment.
    """
    column = 'year'
    df[column] = year
    return df[column]


# Two-digit NAICS sector codes that replace_naics recognises.
_NAICS_SECTOR_CODES = frozenset({
    "11", "21", "22", "23", "31", "32", "33", "42", "44", "45", "48", "49",
    "51", "52", "53", "54", "55", "56", "61", "62", "71", "72", "81", "92",
})


def replace_naics(naics_code):
    """Reduce a NAICS code to its two-digit sector code.

    Parameters
    ----------
    naics_code : str or any value convertible with ``str()``
        A NAICS code of any length, e.g. ``"236///"`` or ``2361``.

    Returns
    -------
    str
        The first two characters of the code when they form a recognised
        sector code, otherwise the placeholder ``"UI"``.
    """
    # str() lets non-string codes (for example a column that pandas parsed
    # as integers) be sliced instead of raising TypeError.
    sector = str(naics_code)[:2]
    return sector if sector in _NAICS_SECTOR_CODES else "UI"


In [3]:
# Load every yearly file, lower-case its column names and tag it with its year.
# Each file is listed exactly once, keyed by the year it covers.
# NOTE(review): 'compidus2023.csv' is spelled differently from the other
# 'compindus' files; it is kept as-is here, confirm it matches the file on disk.
YEAR_FILES = {
    2013: 'cbp13pr_mun.csv',
    2014: 'cbp14pr_mun.csv',
    2015: 'cbp15pr_mun.csv',
    2016: 'cbp16pr_mun.csv',
    2017: 'cbp17pr_co.csv',
    2018: 'cbp_pr_mun_2018.csv',
    2019: 'cbp_pr_mun_2019.csv',
    2020: 'compindus2020.csv',
    2021: 'compindus2021.csv',
    2022: 'compindus2022.csv',
    2023: 'compidus2023.csv',
}

yearly_frames = {}
for year, file_name in YEAR_FILES.items():
    frame = made_df(file_name)
    if frame is None:
        # made_df returns None when the file is missing; stop here with a
        # clear error instead of failing on the None object in the next call.
        raise FileNotFoundError(file_name)
    lower_columns(frame)
    add_year(frame, year)
    yearly_frames[year] = frame

# Keep the per-year variable names that the following cells rely on.
(df13, df14, df15, df16, df17, df18,
 df19, df20, df21, df22, df23) = (yearly_frames[y] for y in range(2013, 2024))

# Display the year column of the last frame as a quick check.
df23['year']


0       2023
1       2023
2       2023
3       2023
4       2023
        ... 
3435    2023
3436    2023
3437    2023
3438    2023
3439    2023
Name: year, Length: 3440, dtype: int64

In [4]:
# Translate the Spanish column names of the 2020-2023 frames.
# The replacements are applied in this order to the lower-cased names; they
# are plain substring replacements, so surrounding spaces in a header are kept.
SPANISH_TO_ENGLISH = (
    ('industria', 'industry'),
    ('cty', 'town'),
    ('unidades', 'establishments'),
    ('empleo promedio', 'average_employment'),
)

for frame in (df20, df21, df22, df23):
    translated = frame.columns.str.lower()
    for spanish, english in SPANISH_TO_ENGLISH:
        # regex=False: the patterns are literal text, not regular expressions.
        translated = translated.str.replace(spanish, english, regex=False)
    frame.columns = translated


# Drop the columns that are not needed.

# Columns dropped from every 2013-2019 frame.
CBP_COMMON_DROPS = [
    'censtate', 'cencty', 'qp1_nf', 'qp1', 'ap_nf', 'ap', 'emp_nf', 'emp', 'fipstate',
    'n5_9', 'n10_19', 'n20_49', 'n50_99',
    'n100_249', 'n250_499', 'n500_999', 'n1000', 'n1000_1', 'n1000_2',
    'n1000_3', 'n1000_4',
]

# 2013-2016 name the smallest size class 'n1_4' and carry 'empflag'.
df13, df14, df15, df16 = (
    frame.drop(columns=CBP_COMMON_DROPS + ['n1_4', 'empflag'])
    for frame in (df13, df14, df15, df16)
)
# 2017 names the smallest size class 'n<5' and still carries 'empflag'.
df17 = df17.drop(columns=CBP_COMMON_DROPS + ['n<5', 'empflag'])
# 2018-2019 use 'n<5'; 'empflag' is not dropped for these years.
df18, df19 = (
    frame.drop(columns=CBP_COMMON_DROPS + ['n<5'])
    for frame in (df18, df19)
)

# The 2020-2022 headers are dropped with their surrounding spaces; the 2023
# headers are dropped without them. The names are matched exactly as written.
df20 = df20.drop(columns=[' promedio ', ' salario total ', ' average_employment '])
df21 = df21.drop(columns=[' salrio total municipio primer trimestre 2021 ', ' promedio ', ' average_employment '])
df22 = df22.drop(columns=[' salrio total municipio primr trimestre 2022 ', ' promedio ', ' average_employment '])
df23 = df23.drop(columns=['salario total municipio tercer trimestre 2022', 'promedio', 'average_employment'])

In [5]:
# Concatenate the yearly frames into one frame covering 2013-2023.

# Stack 2013-2019 and give the columns the names used by the 2020-2023 frames.
# rename() matches whole column names, so a header that merely contains the
# text "est" cannot be altered by accident (a substring replace would do that).
# The column names were already lower-cased when the files were loaded.
df13_19 = pd.concat([df13, df14, df15, df16, df17, df18, df19]).rename(
    columns={'est': 'establishments', 'fipscty': 'town'}
)

df20_23 = pd.concat([df20, df21, df22, df23])

# Row labels are not renumbered here; each frame keeps its own index values.
df13_23 = pd.concat([df13_19, df20_23], axis=0)
df13_23

Unnamed: 0,town,naics,establishments,year,municipio,industry
0,1,------,130,2013,,
1,1,23----,8,2013,,
2,1,236///,5,2013,,
3,1,2361//,5,2013,,
4,1,23611/,5,2013,,
...,...,...,...,...,...,...
3435,999,71,0,2023,No Codificado,"Arte, Entretenimiento y Recreación"
3436,999,72,0,2023,No Codificado,Alojamiento y Servicios de Alimentos
3437,999,81,0,2023,No Codificado,Otros Servicios Excepto Adm. Pública
3438,999,92,0,2023,No Codificado,Administración Pública


In [6]:
# Make the establishment counts numeric; values that cannot be parsed become NaN.
df13_23["establishments"] = pd.to_numeric(df13_23["establishments"], errors='coerce')

# Remove rows that have no NAICS code or no establishment count.
df13_23 = df13_23.dropna(subset=['naics', 'establishments'])

# Remove rows without useful data: the 'No Codificado' municipio and the
# '------' NAICS placeholder.
has_municipio = df13_23['municipio'] != 'No Codificado'
has_naics = df13_23['naics'] != '------'

# Keep the remaining rows and renumber them from zero.
df13_23 = df13_23[has_municipio & has_naics].reset_index(drop=True)

In [7]:
# Reduce every NAICS code to the value returned by replace_naics.
sector_codes = df13_23['naics'].apply(replace_naics)
df13_23['naics'] = sector_codes

In [8]:
# Drop the rows of the industries described in the original note as the
# "bottom 10", then drop the two columns that are no longer needed.
# 'UI' is the placeholder replace_naics returns for an unrecognised code.
EXCLUDED_NAICS = ['61', '51', '71', '49', '21', '55', '22', 'UI', '11', '92']

# One membership test replaces ten successive filters, each of which copied
# the frame.
df13_23 = df13_23[~df13_23['naics'].isin(EXCLUDED_NAICS)]

df13_23 = df13_23.drop(['industry', 'municipio'], axis=1)


In [10]:
# Sanity check: count the missing values that remain in each column.
null_counts = df13_23.isna().sum()
null_counts

town              0
naics             0
establishments    0
year              0
dtype: int64

In [11]:
# Save the cleaned 2013-2023 frame as a pickle file.
output_path = 'full_clean_df.pkl'
df13_23.to_pickle(output_path)