# Data Cleaning

In [46]:
import pandas as pd # pandas is a library for data manipulation and analysis
import numpy as np # numpy is a library for numerical computations
from tqdm import tqdm # tqdm is a library for adding progress bars to iterables

from src.productivity_estimation import ProductivityEstimator # Custom class for estimating productivity

## Load in the BLS data

This data is in a long format and contains many measures that we don't need for our analysis. After exploring the data, I have decided to keep only the TFP measure indexed to 2017 (2017 = 100), and only for two-digit NAICS sectors.

In [None]:
# Read the "MachineReadable" sheet of the raw BLS workbook into a pandas DataFrame.
# Later cells rely on its NAICS, Industry, Year, Measure, Units and Value columns.
bls_data = pd.read_excel("../data/raw/bls_data.xlsx", sheet_name="MachineReadable") 

In [48]:
# Pivot from long to wide format: one row per (NAICS, Industry, Year) and one column
# per (Measure, Units) pair, i.e. a two-level column index.
# aggfunc="first" keeps the first Value when a row/column combination occurs more than once.
pivoted = bls_data.pivot_table(index=["NAICS", "Industry", "Year"], columns=["Measure", "Units"], values="Value", aggfunc="first")

In [49]:
# Pull out the total factor productivity series expressed as an index (2017=100),
# move the (NAICS, Industry, Year) index back into ordinary columns and give the
# value column a short name.
tfp_naics_long = (
    pivoted.loc[:, "Total factor productivity"]["Index (2017=100)"]
    .reset_index()
    .rename(columns={"Index (2017=100)": "TFP"})
)

# Period-over-period TFP growth, computed separately for each NAICS code so that the
# first observation of every code is NaN rather than a change relative to another code.
# NOTE(review): this relies on rows being ordered by Year within each NAICS code
# (pivot_table sorts its index) — confirm there are no gaps in the years.
tfp_by_code = tfp_naics_long.groupby("NAICS")["TFP"]
tfp_naics_long["TFP_growth"] = tfp_by_code.pct_change()

In [50]:
# Keep only sector-level rows: codes of at most two characters, excluding the
# "DM" and "ND" codes.
# NOTE(review): "DM"/"ND" are presumably the durable / non-durable manufacturing
# aggregates — confirm against the BLS code list.
naics_codes = tfp_naics_long["NAICS"]
is_short_code = naics_codes.str.len() < 3
is_excluded_code = naics_codes.isin(["DM", "ND"])
tfp_naics_short = tfp_naics_long.loc[is_short_code & ~is_excluded_code].reset_index(drop=True)

In [51]:
# Preview the sector-level TFP panel (pandas truncates the display of long frames).
tfp_naics_short

Unnamed: 0,NAICS,Industry,Year,TFP,TFP_growth
0,11,"Agriculture, forestry, fishing, and hunting",1987,68.776,
1,11,"Agriculture, forestry, fishing, and hunting",1988,63.369,-0.078618
2,11,"Agriculture, forestry, fishing, and hunting",1989,66.843,0.054822
3,11,"Agriculture, forestry, fishing, and hunting",1990,70.492,0.054591
4,11,"Agriculture, forestry, fishing, and hunting",1991,71.025,0.007561
...,...,...,...,...,...
624,MN,Manufacturing sector,2019,99.675,-0.016731
625,MN,Manufacturing sector,2020,98.769,-0.00909
626,MN,Manufacturing sector,2021,102.304,0.035791
627,MN,Manufacturing sector,2022,101.062,-0.01214


In [52]:
# Build a lookup table with one row per distinct (NAICS code, industry name) pair.
sector_columns = ["NAICS", "Industry"]
naics_industry = (
    tfp_naics_short.loc[:, sector_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
naics_industry

Unnamed: 0,NAICS,Industry
0,11,"Agriculture, forestry, fishing, and hunting"
1,21,Mining
2,22,Utilities
3,23,Construction
4,42,Wholesale trade
5,51,Information
6,52,Finance and insurance
7,53,Real estate and rental and leasing
8,54,"Professional, scientific, and technical services"
9,55,Management of companies and enterprises


These NAICS codes represent the majority of the private sector economy in the US. MN represents the manufacturing NAICS codes 31-33.

## Load in the BDS data

This data is in a long format and contains only data at the 4 digit NAICS code granularity. In order to be consistent with the BLS data, I will convert it into the 2 digit NAICS code granularity and will also only keep the data for the same sectors and years as the BLS data.

In [None]:
# Load the raw BDS extract. Later cells use its `vcnaics4` (NAICS code) and `year` columns.
bds_data = pd.read_csv("../data/raw/bds_4naics.csv")

In [54]:
# Drop every column whose name contains "rate" (case-insensitive) so that only raw
# counts remain. This means the later aggregation to sector level can simply sum
# the columns; rates are re-derived afterwards.
is_rate_column = bds_data.columns.str.contains("rate", case=False)
bds_data = bds_data.loc[:, ~is_rate_column].copy()

# The first two characters of a longer NAICS code identify its 2-digit sector.
# Codes 31-33 are folded into "MN" for consistency with the BLS data.
manufacturing_recode = {code: "MN" for code in ("31", "32", "33")}
bds_data["2naics"] = (
    bds_data["vcnaics4"]
    .astype(str)
    .str[:2]
    .replace(manufacturing_recode)
)

In [55]:
# Cells that are exactly the string "D" become missing values.
# NOTE(review): "D" is presumably a disclosure-suppression flag — confirm that no
# other flag letters occur in the extract.
bds_data = bds_data.replace("D", value=np.nan)

# Convert the remaining columns to numeric dtypes so the groupby-sum adds numbers.
# "2naics" is skipped because it holds the non-numeric "MN" code. "Year" is also in
# the skip list, but at this point the column is still lowercase `year`, so it goes
# through pd.to_numeric as well.
skip_columns = ["Year", "2naics"]
columns_to_convert = [col for col in bds_data.columns if col not in skip_columns]
for col in columns_to_convert:
    try:
        bds_data[col] = pd.to_numeric(bds_data[col])
    except ValueError:
        # Best effort: report the column and leave it unconverted.
        print(f"Column {col} cannot be converted to int. It may contain non-integer values.")

In [56]:
# Aggregate the 4-digit rows to one row per (2-digit sector, year) by summing every
# column. Missing values are skipped by the sum, so a sector total covers only the
# non-missing 4-digit industries.
# The summed `vcnaics4` codes are meaningless and are dropped; the key columns are
# renamed to match the BLS frame (NAICS, Year).
bds_2naics = (
    bds_data.groupby(["2naics", "year"])
    .sum()
    .reset_index()
    .drop(columns=["vcnaics4"])
    .rename(columns={"2naics": "NAICS", "year": "Year"})
)

In [57]:
# Preview the sector-by-year BDS aggregates (pandas truncates the display of long frames).
bds_2naics

Unnamed: 0,NAICS,Year,firms,estabs,emp,denom,estabs_entry,estabs_exit,job_creation,job_creation_births,job_creation_continuers,job_destruction,job_destruction_deaths,job_destruction_continuers,net_job_creation,firmdeath_firms,firmdeath_estabs,firmdeath_emp
0,11,1978,20370,21019,185795,178800,4011.0,3698.0,57357,20570.0,36787.0,43697,17668.0,26029.0,13660,2690.0,2715.0,12047.0
1,11,1979,20037,20633,185894,186550,3612.0,3650.0,50074,19437.0,30637.0,51578,20607.0,30971.0,-1504,2618.0,2662.0,13911.0
2,11,1980,19994,20586,187244,187525,3282.0,3392.0,45609,16313.0,29296.0,46396,15232.0,31164.0,-787,2581.0,2581.0,10799.0
3,11,1981,19235,19812,179027,182600,2810.0,3489.0,41572,14777.0,26795.0,48802,19376.0,29426.0,-7230,2486.0,2489.0,11514.0
4,11,1982,17973,18614,169727,172432,2600.0,3469.0,42786,16078.0,26708.0,48092,16623.0,31469.0,-5306,2415.0,2420.0,11377.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,MN,2018,237183,273785,11867119,11778528,18299.0,18584.0,1113717,240980.0,872737.0,939197,238605.0,700592.0,174520,13696.0,13769.0,124547.0
941,MN,2019,235305,272129,12064265,11960958,17161.0,18815.0,1085270,222948.0,862322.0,877348,237753.0,639595.0,207922,13804.0,13903.0,119545.0
942,MN,2020,231324,268369,11960402,12010626,16630.0,20458.0,977357,216658.0,760699.0,1079621,256545.0,823076.0,-102264,14375.0,14470.0,127688.0
943,MN,2021,227887,265398,11658587,11810038,18251.0,20646.0,988187,234806.0,753381.0,1289836,259129.0,1030707.0,-301649,13914.0,14046.0,114600.0


As there is not a firm births column, we must infer it from the number of firms each period and the number of firm deaths.

$$
\begin{aligned}
\text{firms}_{i+1} &= \text{firms}_{i} + \text{births}_{i} - \text{deaths}_{i}\\
\text{births}_{i} &= \text{firms}_{i+1} - \text{firms}_{i} + \text{deaths}_{i}
\end{aligned}
$$

The job reallocation rate formula is given by:

$$
\text{job\_reallocation\_rate} = \text{job\_creation\_rate} + \text{job\_destruction\_rate} - |\text{net\_job\_creation\_rate}|
$$

In [58]:
# Calculate firm births from the accounting identity
#   births_t = firms_{t+1} - firms_t + deaths_t.
# The lead of `firms` is taken *within each NAICS sector*. An ungrouped shift(-1)
# would pair the last year of one sector with the first year of the next sector
# and produce a meaningless birth count for every sector's final year. With the
# grouped shift, the final year of each sector is NaN (next year's count is unknown).
# This relies on rows being ordered by Year within each sector, which the preceding
# groupby on (sector, year) provides.
next_year_firms = bds_2naics.groupby("NAICS")["firms"].shift(-1)
bds_2naics["firmbirth_firms"] = next_year_firms - bds_2naics["firms"] + bds_2naics["firmdeath_firms"]

# Calculate firm birth and death rates relative to the current number of firms
bds_2naics["firm_birth_rate"] = bds_2naics["firmbirth_firms"] / bds_2naics["firms"]
bds_2naics["firm_death_rate"] = bds_2naics["firmdeath_firms"] / bds_2naics["firms"]

# Calculate reallocation rate: gross job flows minus the absolute net flow, scaled by
# `denom`. Dividing the levels by `denom` is equivalent to combining the three rates.
gross_job_flows = bds_2naics["job_creation"] + bds_2naics["job_destruction"]
abs_net_job_flows = (bds_2naics["job_creation"] - bds_2naics["job_destruction"]).abs()
bds_2naics["job_reallocation_rate"] = (gross_job_flows - abs_net_job_flows) / bds_2naics["denom"]

In [59]:
# Reduce the frame to the merge keys, the three derived rates and employment.
columns_of_interest = [
    "NAICS",
    "Year",
    "job_reallocation_rate",
    "firm_birth_rate",
    "firm_death_rate",
    "emp",
]
bds_2naics = bds_2naics.loc[:, columns_of_interest]

## Load in the output data

In order to calculate aggregates, I will weight them by sector output. This means we need output data at the same granularity as the BLS and BDS data, which I have got from the Bureau of Economic Analysis.

In [None]:
# Load the two output tables and discard their "Line" column.
output_pre97 = pd.read_csv("../data/raw/naics_output_pre97.csv").drop(columns=["Line"])
output_post97 = pd.read_csv("../data/raw/naics_output_post97.csv").drop(columns=["Line"])

# Join the two tables side by side on the industry name. The outer join keeps
# industries that appear in only one of the two files.
output = output_pre97.merge(output_post97, how="outer", on="Industry")

In [61]:
# Reshape from wide (one column per year) to long (one row per industry-year).
output_data = output.melt(id_vars=["Industry"], var_name="Year", value_name="output")
output_data["Year"] = output_data["Year"].astype(int)

# Share of each industry in that year's total output.
# NOTE(review): the denominator sums every row of the file for the year, not only the
# sectors kept in the analysis — confirm the file holds no aggregate rows that would
# be double counted.
yearly_total_output = output_data.groupby("Year")["output"].transform("sum")
output_data["output_share"] = output_data["output"] / yearly_total_output

# Align the manufacturing label with the industry name used in the BLS data.
output_data = output_data.replace("Manufacturing", value="Manufacturing sector")

## Merging the data sets

Before we merge the data, we need to ensure that we have the same NAICS codes and years in both data sets.

In [62]:
# Restrict the BDS panel to the sectors and years that also appear in the BLS panel
# (drops extra NAICS sectors and the early years that BLS does not cover).
bds_sector_in_bls = bds_2naics["NAICS"].isin(tfp_naics_short["NAICS"])
bds_year_in_bls = bds_2naics["Year"].isin(tfp_naics_short["Year"])
bds_2naics = bds_2naics.loc[bds_sector_in_bls & bds_year_in_bls].reset_index(drop=True)

# Conversely, drop the BLS year(s) that the filtered BDS panel does not contain.
bls_year_in_bds = tfp_naics_short["Year"].isin(bds_2naics["Year"])
tfp_naics_short = tfp_naics_short.loc[bls_year_in_bds].reset_index(drop=True)

In [63]:
# Sanity check before merging: both frames should have the same number of rows.
# The column counts are expected to differ, so only the first element of each
# shape needs to match.
print(f"BDS data dimensions: {bds_2naics.shape}")
print(f"BLS data dimensions: {tfp_naics_short.shape}")

BDS data dimensions: (612, 6)
BLS data dimensions: (612, 5)


In [64]:
# Combine the three sources into one panel:
#   1. BLS TFP with the BDS dynamism measures, matched on sector code and year;
#   2. the result with the output data, matched on industry name and year.
# Both joins are inner joins, so rows without a match in the other frame are dropped.
merged_data = (
    tfp_naics_short
    .merge(bds_2naics, on=["Year", "NAICS"], how="inner")
    .merge(output_data, on=["Industry", "Year"], how="inner")
)

In [None]:
# Persist the merged BLS / BDS / output panel; index=False omits the row index from the CSV.
merged_data.to_csv("../data/productivity_and_dynamism.csv", index=False)

# Compustat data cleaning

In [None]:
compustat = pd.read_csv("../data/raw/compustat_raw.csv")

# Drop the date/format/status columns that are not used below and rename the
# identifiers to the names used in the rest of the notebook. This is done without
# inplace=True so the statement yields a new frame instead of mutating the loaded one.
compustat = (
    compustat
    .drop(columns=["datadate", "indfmt", "consol", "popsrc", "datafmt", "curcd", "costat"])
    .rename(columns={"fyear": "year", "gvkey": "company_id"})
)

# Store NAICS codes as strings without a float suffix ("3341.0" -> "3341").
# The pattern is anchored to the end of the string so that only a trailing ".0" is
# removed. Missing codes become the string "nan" and are removed by the sector filter.
compustat["naics"] = compustat["naics"].astype(str).str.replace(r"\.0$", "", regex=True)

# Prefixes of the NAICS code at each aggregation level. A code shorter than the
# requested length is kept as-is, so e.g. "5naics" can hold codes of fewer than 5 digits.
for n_digits in (2, 3, 4, 5):
    compustat[f"{n_digits}naics"] = compustat["naics"].str[:n_digits]

# Handle manufacturing sector codes (31, 32, 33 map to 'MN')
manufacturing_codes = ["31", "32", "33"]

# Filter compustat data to keep only the sectors present in the BLS data. The
# manufacturing codes are tested separately because the BLS table lists them as "MN".
in_bls_sector = compustat["2naics"].isin(naics_industry["NAICS"])
is_manufacturing = compustat["2naics"].isin(manufacturing_codes)
compustat = compustat[in_bls_sector | is_manufacturing].reset_index(drop=True)

# Map manufacturing codes to 'MN' for consistency with other datasets
compustat.loc[compustat["2naics"].isin(manufacturing_codes), "2naics"] = "MN"

# Filter out companies with fewer than 3 observations
min_required_obs = 3
obs_counts = compustat.groupby("company_id").size()
valid_companies = obs_counts[obs_counts >= min_required_obs].index

compustat = compustat[compustat["company_id"].isin(valid_companies)]

In [67]:
def winsorise(df, cols, p=0.01):
    """Drop non-positive/missing rows, then winsorise columns within industry-year cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain the columns in ``cols`` plus the grouping
        columns ``'4naics'`` and ``'year'``. The input is not modified.
    cols : list of str
        Columns to clean and winsorise.
    p : float, default 0.01
        Tail probability. Within each ('4naics', 'year') group, values below the
        ``p`` quantile are raised to it and values above the ``1 - p`` quantile
        are lowered to it.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` that keeps only rows where every column in ``cols`` is
        strictly positive (missing values fail this test too), with those columns
        clipped as described. The original index labels of the kept rows are preserved.
    """
    cleaned = df.copy()

    # Keep strictly positive observations only. NaN > 0 is False, so rows with a
    # missing value are dropped by the same comparison.
    for name in cols:
        cleaned = cleaned[cleaned[name].gt(0)]

    def _clip_to_quantiles(values):
        # Bound one group's values by that group's own p and 1-p quantiles.
        lower_bound = values.quantile(p)
        upper_bound = values.quantile(1 - p)
        return values.clip(lower=lower_bound, upper=upper_bound)

    # Winsorise each column separately within every 4-digit-NAICS-by-year cell.
    for name in cols:
        by_cell = cleaned.groupby(['4naics', 'year'])[name]
        cleaned[name] = by_cell.transform(_clip_to_quantiles)

    return cleaned

# Drop rows with non-positive or missing values in these four columns, then winsorise
# them at the default 1% tails within each ('4naics', 'year') group.
compustat = winsorise(compustat, cols=['sale', 'ppent', 'emp', 'cogs'])

In [68]:
# Save the cleaned Compustat panel; index=False omits the row index from the CSV.
compustat.to_csv("../data/compustat.csv", index=False)

In [None]:
def run_productivity_estimations(df_group):
    """Estimate productivity for one industry group and tag the result with its code.

    Parameters
    ----------
    df_group : pandas.DataFrame
        Observations for a single industry. The value of ``'5naics'`` in its
        first row is used as the industry label.

    Returns
    -------
    The ``'productivity'`` entry of ``ProductivityEstimator(df_group).estimate(...)``,
    with a ``'5naics'`` column set to the industry label. The column is assigned on
    the returned object itself, not on a copy.
    """
    # Read the label first so that a missing column or empty group fails before
    # any estimation work is done.
    group_code = df_group['5naics'].iloc[0]

    estimation_output = ProductivityEstimator(df_group).estimate(verbose=False)

    industry_productivity = estimation_output['productivity']
    industry_productivity['5naics'] = group_code
    return industry_productivity

# Run the estimation industry by industry and collect the per-industry results.
# Industries that are too small are skipped silently. Any error raised for an
# industry is reported and the loop moves on (deliberate best effort).
input_columns = ['sale', 'ppent', 'emp', 'cogs']
all_results = []
for name, group in tqdm(compustat.groupby("5naics"), desc="Processing industries"):

    try:
        # Require at least 50 rows in the industry.
        if len(group) < 50:
            continue

        # Require at least 40 rows with all four inputs present.
        # NOTE(review): if this runs on the frame returned by winsorise(), rows with
        # missing inputs are already gone, so this guard should never trigger — confirm.
        is_complete = group[input_columns].notna().all(axis=1)
        valid_rows = is_complete.sum()
        if valid_rows < 40:
            continue

        all_results.append(run_productivity_estimations(group))
    except Exception as e:
        print(f"Error processing industry {name}: {e}")
        continue

# Stack the per-industry frames. Fall back to an empty frame when nothing was estimated.
if not all_results:
    productivity_df = pd.DataFrame()
    print("No industries could be processed")
else:
    productivity_df = pd.concat(all_results, ignore_index=True)
    print(f"Successfully processed {len(all_results)} industries")

Processing industries:   0%|          | 1/822 [00:06<1:23:43,  6.12s/it]

Skipping industry 1111 - too few observations (12)


Processing industries:   1%|          | 5/822 [00:12<25:20,  1.86s/it]  

Skipping industry 11119 - too few observations (2)


Processing industries:   1%|          | 6/822 [00:17<39:45,  2.92s/it]

Skipping industry 11132 - too few observations (16)


Processing industries:   1%|          | 8/822 [00:18<23:24,  1.73s/it]

Skipping industry 1114 - too few observations (4)
Skipping industry 11141 - too few observations (48)


Processing industries:   4%|▍         | 33/822 [00:24<04:02,  3.26it/s]

Skipping industry 11192 - too few observations (10)
Skipping industry 11193 - too few observations (3)
Skipping industry 11194 - too few observations (7)
Skipping industry 11199 - too few observations (20)
Skipping industry 112 - too few observations (35)
Skipping industry 11211 - too few observations (28)
Skipping industry 11221 - too few observations (13)
Skipping industry 1123 - too few observations (3)
Skipping industry 11231 - too few observations (28)
Skipping industry 11251 - too few observations (45)
Skipping industry 11292 - too few observations (11)
Skipping industry 113 - too few observations (7)
Skipping industry 11311 - too few observations (37)
Skipping industry 11321 - too few observations (17)
Skipping industry 11331 - too few observations (39)
Skipping industry 11411 - too few observations (25)
Skipping industry 115 - too few observations (42)
Skipping industry 11511 - too few observations (6)
Skipping industry 11521 - too few observations (43)
Skipping industry 21 - t

Processing industries:   5%|▍         | 38/822 [00:24<03:29,  3.75it/s]

Skipping industry 2122 - too few observations (45)
Skipping industry 21221 - too few observations (37)


Processing industries:   6%|▌         | 49/822 [01:13<34:14,  2.66s/it]

Skipping industry 221 - too few observations (39)


Processing industries:   7%|▋         | 54/822 [01:14<18:18,  1.43s/it]

Skipping industry 2213 - too few observations (8)


Processing industries:   7%|▋         | 57/822 [01:20<24:36,  1.93s/it]

Skipping industry 22133 - too few observations (30)
Skipping industry 233 - too few observations (23)


Processing industries:   8%|▊         | 64/822 [01:50<54:17,  4.30s/it]

Skipping industry 23331 - too few observations (37)


Processing industries:   9%|▊         | 71/822 [01:50<13:36,  1.09s/it]

Skipping industry 234 - too few observations (44)
Skipping industry 23411 - too few observations (6)
Skipping industry 23412 - too few observations (6)
Skipping industry 2349 - too few observations (7)


Processing industries:   9%|▉         | 73/822 [01:56<19:08,  1.53s/it]

Skipping industry 23493 - too few observations (8)
Skipping industry 23499 - too few observations (33)


Processing industries:   9%|▉         | 75/822 [02:02<23:31,  1.89s/it]

Skipping industry 23511 - too few observations (15)
Skipping industry 23521 - too few observations (4)


Processing industries:  10%|█         | 84/822 [02:08<11:38,  1.06it/s]

Skipping industry 23542 - too few observations (18)
Skipping industry 23561 - too few observations (13)
Skipping industry 23591 - too few observations (3)
Skipping industry 23594 - too few observations (13)
Skipping industry 23595 - too few observations (5)
Skipping industry 2361 - too few observations (42)


Processing industries:  11%|█         | 87/822 [02:08<08:38,  1.42it/s]

Skipping industry 2362 - too few observations (1)
Skipping industry 23621 - too few observations (10)


Processing industries:  11%|█▏        | 94/822 [02:20<13:50,  1.14s/it]

Skipping industry 23711 - too few observations (44)


Processing industries:  12%|█▏        | 97/822 [02:39<36:48,  3.05s/it]

Skipping industry 238 - too few observations (23)
Skipping industry 23816 - too few observations (8)


Processing industries:  12%|█▏        | 100/822 [02:39<22:46,  1.89s/it]

Skipping industry 23822 - too few observations (47)
Skipping industry 23829 - too few observations (30)
Skipping industry 23831 - too few observations (23)


Processing industries:  14%|█▎        | 113/822 [03:07<37:37,  3.18s/it]

Skipping industry 31132 - too few observations (25)
Skipping industry 31133 - too few observations (9)


Processing industries:  14%|█▍        | 117/822 [03:07<15:26,  1.31s/it]

Skipping industry 3114 - too few observations (19)


Processing industries:  15%|█▍        | 120/822 [03:14<18:26,  1.58s/it]

Skipping industry 3115 - too few observations (33)


Processing industries:  15%|█▍        | 123/822 [03:14<10:29,  1.11it/s]

Skipping industry 3116 - too few observations (2)


Processing industries:  16%|█▌        | 129/822 [03:32<30:21,  2.63s/it]

Skipping industry 31183 - too few observations (13)
Skipping industry 3119 - too few observations (40)


Processing industries:  17%|█▋        | 136/822 [03:45<34:53,  3.05s/it]

Skipping industry 312 - too few observations (37)
Skipping industry 3121 - too few observations (31)


Processing industries:  18%|█▊        | 150/822 [04:06<15:19,  1.37s/it]

Skipping industry 31322 - too few observations (14)
Skipping industry 31323 - too few observations (28)
Skipping industry 31331 - too few observations (33)


Processing industries:  18%|█▊        | 152/822 [04:12<20:52,  1.87s/it]

Skipping industry 314 - too few observations (48)


Processing industries:  19%|█▉        | 155/822 [04:12<11:52,  1.07s/it]

Skipping industry 31491 - too few observations (48)


Processing industries:  19%|█▉        | 159/822 [04:24<26:15,  2.38s/it]

Skipping industry 31511 - too few observations (14)


Processing industries:  21%|██        | 169/822 [04:54<41:40,  3.83s/it]

Skipping industry 31599 - too few observations (46)
Skipping industry 316 - too few observations (46)
Skipping industry 31611 - too few observations (32)


Processing industries:  21%|██        | 174/822 [05:06<33:34,  3.11s/it]

Skipping industry 3169 - too few observations (15)


Processing industries:  22%|██▏       | 180/822 [05:18<34:51,  3.26s/it]

Skipping industry 3219 - too few observations (11)


Processing industries:  22%|██▏       | 184/822 [05:19<13:32,  1.27s/it]

Skipping industry 32192 - too few observations (2)
Skipping industry 322 - too few observations (1)


Processing industries:  24%|██▍       | 196/822 [05:50<34:45,  3.33s/it]

Skipping industry 32312 - too few observations (47)


Processing industries:  26%|██▌       | 213/822 [06:34<36:50,  3.63s/it]

Skipping industry 3254 - too few observations (39)


Processing industries:  27%|██▋       | 220/822 [06:37<07:12,  1.39it/s]

Skipping industry 3259 - too few observations (1)
Skipping industry 32591 - too few observations (28)


Processing industries:  27%|██▋       | 223/822 [06:43<14:36,  1.46s/it]

Skipping industry 326 - too few observations (10)


Processing industries:  28%|██▊       | 232/822 [07:07<30:15,  3.08s/it]

Skipping industry 3262 - too few observations (34)


Processing industries:  29%|██▊       | 236/822 [07:14<18:36,  1.91s/it]

Skipping industry 327 - too few observations (33)
Skipping industry 3271 - too few observations (43)


Processing industries:  29%|██▉       | 242/822 [07:26<16:46,  1.74s/it]

Skipping industry 3273 - too few observations (41)


Processing industries:  30%|███       | 247/822 [07:32<13:03,  1.36s/it]

Skipping industry 32733 - too few observations (27)
Skipping industry 32741 - too few observations (41)


Processing industries:  31%|███       | 251/822 [07:38<12:16,  1.29s/it]

Skipping industry 33 - too few observations (36)


Processing industries:  31%|███       | 255/822 [07:38<06:11,  1.53it/s]

Skipping industry 3311 - too few observations (11)


Processing industries:  33%|███▎      | 268/822 [08:14<20:08,  2.18s/it]

Skipping industry 3321 - too few observations (47)


Processing industries:  33%|███▎      | 272/822 [08:21<13:10,  1.44s/it]

Skipping industry 3323 - too few observations (34)


Processing industries:  34%|███▍      | 278/822 [08:32<19:32,  2.16s/it]

Skipping industry 33271 - too few observations (17)


Processing industries:  34%|███▍      | 280/822 [08:38<22:18,  2.47s/it]

Skipping industry 3328 - too few observations (19)


Processing industries:  35%|███▌      | 290/822 [08:58<24:14,  2.73s/it]

Skipping industry 3332 - too few observations (10)
Skipping industry 33321 - too few observations (45)
Skipping industry 33322 - too few observations (28)


Processing industries:  36%|███▌      | 295/822 [09:04<14:50,  1.69s/it]

Skipping industry 3333 - too few observations (5)


Processing industries:  36%|███▌      | 297/822 [09:04<10:08,  1.16s/it]

Skipping industry 3334 - too few observations (5)


Processing industries:  37%|███▋      | 306/822 [09:07<03:46,  2.27it/s]

Skipping industry 334 - too few observations (14)


Processing industries:  38%|███▊      | 309/822 [09:13<10:44,  1.26s/it]

Skipping industry 3342 - too few observations (16)


Processing industries:  39%|███▉      | 320/822 [09:16<02:56,  2.84it/s]

Skipping industry 33511 - too few observations (8)


Processing industries:  40%|████      | 332/822 [09:35<14:10,  1.74s/it]

Skipping industry 336 - too few observations (13)
Skipping industry 3361 - too few observations (9)


Processing industries:  41%|████      | 336/822 [09:41<12:18,  1.52s/it]

Skipping industry 3362 - too few observations (22)


Processing industries:  43%|████▎     | 351/822 [10:30<22:28,  2.86s/it]

Skipping industry 3369 - too few observations (2)


Processing industries:  43%|████▎     | 354/822 [10:30<11:02,  1.42s/it]

Skipping industry 337 - too few observations (21)


Processing industries:  44%|████▍     | 360/822 [10:31<04:35,  1.68it/s]

Skipping industry 3379 - too few observations (12)
Skipping industry 339 - too few observations (10)


Processing industries:  44%|████▍     | 364/822 [10:32<02:23,  3.20it/s]

Skipping industry 3399 - too few observations (28)


Processing industries:  45%|████▌     | 372/822 [10:57<20:37,  2.75s/it]

Skipping industry 421 - too few observations (44)
Skipping industry 4211 - too few observations (34)
Skipping industry 42111 - too few observations (24)


Processing industries:  46%|████▌     | 376/822 [10:57<08:30,  1.14s/it]

Skipping industry 42113 - too few observations (13)
Skipping industry 4212 - too few observations (13)
Skipping industry 42121 - too few observations (13)
Skipping industry 42122 - too few observations (48)
Skipping industry 4213 - too few observations (17)


Processing industries:  46%|████▋     | 382/822 [11:03<07:41,  1.05s/it]

Skipping industry 42133 - too few observations (7)
Skipping industry 42139 - too few observations (17)


Processing industries:  47%|████▋     | 387/822 [11:09<07:23,  1.02s/it]

Skipping industry 42144 - too few observations (7)


Processing industries:  47%|████▋     | 389/822 [11:15<11:16,  1.56s/it]

Skipping industry 42146 - too few observations (15)
Skipping industry 42149 - too few observations (39)
Skipping industry 4215 - too few observations (6)


Processing industries:  48%|████▊     | 396/822 [11:21<07:30,  1.06s/it]

Skipping industry 42162 - too few observations (10)
Skipping industry 4217 - too few observations (43)


Processing industries:  48%|████▊     | 398/822 [11:21<05:50,  1.21it/s]

Skipping industry 42172 - too few observations (9)
Skipping industry 42173 - too few observations (27)


Processing industries:  49%|████▉     | 401/822 [11:21<04:07,  1.70it/s]

Skipping industry 42182 - too few observations (38)


Processing industries:  49%|████▉     | 404/822 [11:27<07:10,  1.03s/it]

Skipping industry 42184 - too few observations (17)


Processing industries:  50%|████▉     | 407/822 [11:33<10:28,  1.51s/it]

Skipping industry 42191 - too few observations (20)
Skipping industry 42192 - too few observations (16)


Processing industries:  50%|████▉     | 410/822 [11:39<11:20,  1.65s/it]

Skipping industry 42194 - too few observations (25)


Processing industries:  50%|█████     | 412/822 [11:44<13:29,  1.97s/it]

Skipping industry 422 - too few observations (7)


Processing industries:  50%|█████     | 414/822 [11:50<14:58,  2.20s/it]

Skipping industry 42211 - too few observations (3)
Skipping industry 42212 - too few observations (48)
Skipping industry 42213 - too few observations (18)


Processing industries:  51%|█████     | 418/822 [11:56<12:35,  1.87s/it]

Skipping industry 4223 - too few observations (45)
Skipping industry 42231 - too few observations (18)
Skipping industry 42232 - too few observations (25)


Processing industries:  52%|█████▏    | 431/822 [12:08<06:06,  1.07it/s]

Skipping industry 42246 - too few observations (14)
Skipping industry 42247 - too few observations (10)
Skipping industry 42248 - too few observations (14)
Skipping industry 42249 - too few observations (46)
Skipping industry 42251 - too few observations (9)
Skipping industry 42259 - too few observations (10)
Skipping industry 42261 - too few observations (3)
Skipping industry 42269 - too few observations (12)
Skipping industry 42271 - too few observations (12)


Processing industries:  53%|█████▎    | 435/822 [12:14<07:45,  1.20s/it]

Skipping industry 4228 - too few observations (2)
Skipping industry 42281 - too few observations (37)
Skipping industry 4229 - too few observations (3)
Skipping industry 42291 - too few observations (16)
Skipping industry 42292 - too few observations (9)
Skipping industry 42293 - too few observations (10)
Skipping industry 42294 - too few observations (9)
Skipping industry 42295 - too few observations (3)
Skipping industry 42299 - too few observations (12)
Skipping industry 423 - too few observations (38)
Skipping industry 4231 - too few observations (2)
Skipping industry 42311 - too few observations (38)


Processing industries:  55%|█████▍    | 448/822 [12:15<03:05,  2.02it/s]

Skipping industry 42313 - too few observations (16)
Skipping industry 42314 - too few observations (22)
Skipping industry 42322 - too few observations (6)
Skipping industry 4233 - too few observations (17)
Skipping industry 42332 - too few observations (15)
Skipping industry 42333 - too few observations (33)
Skipping industry 42339 - too few observations (42)
Skipping industry 4234 - too few observations (3)


Processing industries:  56%|█████▋    | 463/822 [12:22<02:53,  2.07it/s]

Skipping industry 42362 - too few observations (5)


Processing industries:  57%|█████▋    | 465/822 [12:23<02:27,  2.43it/s]

Skipping industry 4237 - too few observations (32)


Processing industries:  57%|█████▋    | 469/822 [12:40<13:07,  2.23s/it]

Skipping industry 42374 - too few observations (31)
Skipping industry 4238 - too few observations (23)
Skipping industry 42381 - too few observations (40)
Skipping industry 42382 - too few observations (12)


Processing industries:  58%|█████▊    | 477/822 [12:46<07:19,  1.27s/it]

Skipping industry 42385 - too few observations (45)


Processing industries:  59%|█████▊    | 481/822 [13:03<16:49,  2.96s/it]

Skipping industry 42399 - too few observations (35)
Skipping industry 4241 - too few observations (9)
Skipping industry 42411 - too few observations (37)


Processing industries:  59%|█████▉    | 485/822 [13:09<11:46,  2.10s/it]

Skipping industry 42413 - too few observations (33)


Processing industries:  59%|█████▉    | 487/822 [13:15<13:07,  2.35s/it]

Skipping industry 4243 - too few observations (12)
Skipping industry 42431 - too few observations (3)


Processing industries:  60%|█████▉    | 491/822 [13:27<15:02,  2.73s/it]

Skipping industry 42442 - too few observations (5)
Skipping industry 42445 - too few observations (10)
Skipping industry 42447 - too few observations (10)


Processing industries:  61%|██████    | 499/822 [13:33<08:33,  1.59s/it]

Skipping industry 4246 - too few observations (20)
Skipping industry 42461 - too few observations (6)


Processing industries:  61%|██████▏   | 504/822 [13:45<10:40,  2.02s/it]

Skipping industry 4248 - too few observations (5)
Skipping industry 42482 - too few observations (28)
Skipping industry 42491 - too few observations (26)


Processing industries:  62%|██████▏   | 509/822 [13:51<07:47,  1.49s/it]

Skipping industry 42495 - too few observations (10)
Skipping industry 4251 - too few observations (3)
Skipping industry 42511 - too few observations (11)


Processing industries:  63%|██████▎   | 514/822 [13:57<06:44,  1.31s/it]

Skipping industry 51 - too few observations (9)
Skipping industry 5111 - too few observations (6)


Processing industries:  64%|██████▎   | 524/822 [14:16<07:01,  1.41s/it]

Skipping industry 512 - too few observations (3)


Processing industries:  64%|██████▍   | 528/822 [14:22<08:52,  1.81s/it]

Skipping industry 51223 - too few observations (6)
Skipping industry 51224 - too few observations (4)
Skipping industry 51225 - too few observations (21)


Processing industries:  65%|██████▍   | 532/822 [14:23<04:05,  1.18it/s]

Skipping industry 513 - too few observations (25)


Processing industries:  65%|██████▌   | 535/822 [14:23<02:59,  1.60it/s]

Skipping industry 51313 - too few observations (49)
Skipping industry 51314 - too few observations (5)


Processing industries:  66%|██████▌   | 542/822 [14:35<05:52,  1.26s/it]

Skipping industry 5133 - too few observations (15)


Processing industries:  66%|██████▋   | 546/822 [14:47<09:36,  2.09s/it]

Skipping industry 51411 - too few observations (7)


Processing industries:  67%|██████▋   | 549/822 [14:54<08:59,  1.97s/it]

Skipping industry 5151 - too few observations (8)


Processing industries:  68%|██████▊   | 556/822 [15:07<05:17,  1.20s/it]

Skipping industry 517 - too few observations (11)


Processing industries:  68%|██████▊   | 561/822 [15:14<06:43,  1.55s/it]

Skipping industry 51751 - too few observations (25)


Processing industries:  69%|██████▉   | 566/822 [15:21<07:11,  1.68s/it]

Skipping industry 519 - too few observations (27)
Skipping industry 51911 - too few observations (43)
Skipping industry 51912 - too few observations (4)


Processing industries:  69%|██████▉   | 570/822 [15:21<02:59,  1.41it/s]

Skipping industry 51919 - too few observations (6)
Skipping industry 51921 - too few observations (3)


Processing industries:  70%|██████▉   | 573/822 [15:22<02:04,  2.00it/s]

Skipping industry 522 - too few observations (25)


Processing industries:  70%|███████   | 578/822 [15:28<06:04,  1.49s/it]

Skipping industry 5222 - too few observations (44)


Processing industries:  71%|███████▏  | 586/822 [15:49<14:43,  3.74s/it]

Skipping industry 5231 - too few observations (1)


Processing industries:  72%|███████▏  | 589/822 [15:49<06:29,  1.67s/it]

Skipping industry 52313 - too few observations (21)
Skipping industry 52314 - too few observations (38)


Processing industries:  72%|███████▏  | 592/822 [15:50<03:30,  1.09it/s]

Skipping industry 52316 - too few observations (32)


Processing industries:  73%|███████▎  | 600/822 [16:15<08:38,  2.34s/it]

Skipping industry 524 - too few observations (26)


Processing industries:  73%|███████▎  | 604/822 [16:22<08:25,  2.32s/it]

Skipping industry 5242 - too few observations (9)


Processing industries:  74%|███████▍  | 607/822 [16:22<04:00,  1.12s/it]

Skipping industry 525 - too few observations (17)
Skipping industry 52511 - too few observations (16)
Skipping industry 52519 - too few observations (11)
Skipping industry 52591 - too few observations (12)


Processing industries:  75%|███████▍  | 613/822 [16:34<06:38,  1.90s/it]

Skipping industry 53 - too few observations (8)


Processing industries:  75%|███████▌  | 618/822 [16:42<05:17,  1.56s/it]

Skipping industry 53113 - too few observations (21)


Processing industries:  76%|███████▌  | 621/822 [16:54<10:05,  3.01s/it]

Skipping industry 5313 - too few observations (7)


Processing industries:  76%|███████▌  | 623/822 [16:54<06:17,  1.90s/it]

Skipping industry 53132 - too few observations (18)
Skipping industry 532 - too few observations (16)


Processing industries:  77%|███████▋  | 630/822 [16:56<02:14,  1.43it/s]

Skipping industry 53222 - too few observations (10)


Processing industries:  77%|███████▋  | 632/822 [16:56<01:36,  1.97it/s]

Skipping industry 53228 - too few observations (22)


Processing industries:  77%|███████▋  | 634/822 [16:56<01:11,  2.64it/s]

Skipping industry 53231 - too few observations (27)


Processing industries:  78%|███████▊  | 639/822 [17:15<08:49,  2.89s/it]

Skipping industry 54 - too few observations (3)
Skipping industry 541 - too few observations (5)


Processing industries:  78%|███████▊  | 643/822 [17:21<06:53,  2.31s/it]

Skipping industry 5412 - too few observations (3)


Processing industries:  78%|███████▊  | 645/822 [17:22<04:39,  1.58s/it]

Skipping industry 5413 - too few observations (18)
Skipping industry 54131 - too few observations (30)
Skipping industry 54132 - too few observations (7)


Processing industries:  79%|███████▉  | 651/822 [17:22<01:46,  1.60it/s]

Skipping industry 54141 - too few observations (7)


Processing industries:  79%|███████▉  | 653/822 [17:28<03:57,  1.40s/it]

Skipping industry 54149 - too few observations (1)


Processing industries:  80%|███████▉  | 656/822 [17:29<02:22,  1.16it/s]

Skipping industry 5416 - too few observations (40)


Processing industries:  81%|████████  | 662/822 [17:36<02:02,  1.30it/s]

Skipping industry 54169 - too few observations (37)
Skipping industry 5417 - too few observations (6)
Skipping industry 54172 - too few observations (11)


Processing industries:  81%|████████  | 665/822 [17:42<03:20,  1.28s/it]

Skipping industry 54182 - too few observations (28)
Skipping industry 54183 - too few observations (17)
Skipping industry 54184 - too few observations (36)


Processing industries:  82%|████████▏ | 670/822 [17:55<05:25,  2.14s/it]

Skipping industry 54187 - too few observations (29)


Processing industries:  82%|████████▏ | 672/822 [18:01<06:09,  2.46s/it]

Skipping industry 5419 - too few observations (20)


Processing industries:  82%|████████▏ | 675/822 [18:07<05:30,  2.25s/it]

Skipping industry 54193 - too few observations (16)
Skipping industry 54194 - too few observations (29)


Processing industries:  83%|████████▎ | 679/822 [18:13<04:24,  1.85s/it]

Skipping industry 5611 - too few observations (9)


Processing industries:  84%|████████▎ | 688/822 [18:45<10:17,  4.61s/it]

Skipping industry 56143 - too few observations (9)


Processing industries:  85%|████████▍ | 695/822 [19:09<09:16,  4.38s/it]

Skipping industry 5616 - too few observations (12)


Processing industries:  85%|████████▌ | 701/822 [19:24<05:26,  2.70s/it]

Skipping industry 56179 - too few observations (8)
Skipping industry 56191 - too few observations (32)


Processing industries:  86%|████████▌ | 706/822 [19:37<05:39,  2.93s/it]

Skipping industry 5621 - too few observations (8)


Processing industries:  86%|████████▋ | 710/822 [19:38<02:15,  1.21s/it]

Skipping industry 5622 - too few observations (3)
Skipping industry 56291 - too few observations (14)
Skipping industry 56292 - too few observations (32)
Skipping industry 56299 - too few observations (35)
Skipping industry 61 - too few observations (24)


Processing industries:  87%|████████▋ | 717/822 [19:44<02:03,  1.18s/it]

Skipping industry 61121 - too few observations (44)


Processing industries:  87%|████████▋ | 718/822 [19:45<01:48,  1.04s/it]

Skipping industry 61141 - too few observations (17)


Processing industries:  88%|████████▊ | 721/822 [19:56<03:55,  2.33s/it]

Skipping industry 6115 - too few observations (2)


Processing industries:  88%|████████▊ | 723/822 [20:02<04:08,  2.51s/it]

Skipping industry 6116 - too few observations (4)
Skipping industry 61162 - too few observations (8)
Skipping industry 61163 - too few observations (26)


Processing industries:  88%|████████▊ | 727/822 [20:02<02:05,  1.32s/it]

Skipping industry 61171 - too few observations (28)
Skipping industry 62 - too few observations (3)


Processing industries:  89%|████████▉ | 732/822 [20:21<03:57,  2.64s/it]

Skipping industry 62131 - too few observations (4)
Skipping industry 62132 - too few observations (16)
Skipping industry 62133 - too few observations (16)


Processing industries:  90%|█████████ | 740/822 [20:27<01:41,  1.24s/it]

Skipping industry 62139 - too few observations (11)
Skipping industry 6214 - too few observations (8)
Skipping industry 62141 - too few observations (20)


Processing industries:  90%|█████████ | 743/822 [20:34<02:03,  1.56s/it]

Skipping industry 6219 - too few observations (17)


Processing industries:  91%|█████████ | 746/822 [20:40<02:04,  1.64s/it]

Skipping industry 622 - too few observations (44)


Processing industries:  91%|█████████▏| 752/822 [20:47<01:02,  1.12it/s]

Skipping industry 623 - too few observations (12)
Skipping industry 62322 - too few observations (22)


Processing industries:  92%|█████████▏| 760/822 [20:47<00:16,  3.74it/s]

Skipping industry 62399 - too few observations (18)
Skipping industry 624 - too few observations (2)
Skipping industry 62411 - too few observations (30)
Skipping industry 62412 - too few observations (1)
Skipping industry 62419 - too few observations (7)
Skipping industry 71 - too few observations (23)
Skipping industry 711 - too few observations (32)
Skipping industry 7111 - too few observations (5)
Skipping industry 71111 - too few observations (24)
Skipping industry 71119 - too few observations (26)


Processing industries:  93%|█████████▎| 766/822 [20:53<00:35,  1.58it/s]

Skipping industry 7113 - too few observations (17)
Skipping industry 71131 - too few observations (38)
Skipping industry 71132 - too few observations (32)
Skipping industry 71141 - too few observations (27)
Skipping industry 71213 - too few observations (12)
Skipping industry 71219 - too few observations (1)


Processing industries:  94%|█████████▍| 774/822 [20:59<00:32,  1.48it/s]

Skipping industry 7132 - too few observations (14)


Processing industries:  95%|█████████▍| 780/822 [21:06<00:34,  1.22it/s]

Skipping industry 7139 - too few observations (39)
Skipping industry 71391 - too few observations (33)


Processing industries:  96%|█████████▌| 786/822 [21:34<01:36,  2.68s/it]

Skipping industry 7211 - too few observations (15)


Processing industries:  96%|█████████▌| 789/822 [21:52<02:22,  4.31s/it]

Skipping industry 72131 - too few observations (3)


Processing industries:  97%|█████████▋| 794/822 [21:59<01:03,  2.27s/it]

Skipping industry 72232 - too few observations (39)
Skipping industry 72241 - too few observations (34)


Processing industries:  97%|█████████▋| 797/822 [21:59<00:29,  1.16s/it]

Skipping industry 811 - too few observations (14)
Skipping industry 8111 - too few observations (2)
Skipping industry 81111 - too few observations (47)
Skipping industry 81112 - too few observations (43)


Processing industries:  98%|█████████▊| 803/822 [22:05<00:19,  1.05s/it]

Skipping industry 81131 - too few observations (6)
Skipping industry 81149 - too few observations (7)


Processing industries:  98%|█████████▊| 808/822 [22:11<00:14,  1.03s/it]

Skipping industry 8121 - too few observations (17)


Processing industries:  99%|█████████▊| 810/822 [22:12<00:09,  1.28it/s]

Skipping industry 81222 - too few observations (25)
Skipping industry 8123 - too few observations (12)
Skipping industry 81231 - too few observations (29)
Skipping industry 81232 - too few observations (15)


Processing industries:  99%|█████████▉| 815/822 [22:17<00:06,  1.00it/s]

Skipping industry 81291 - too few observations (5)


Processing industries: 100%|██████████| 822/822 [22:35<00:00,  1.65s/it]

Skipping industry 813 - too few observations (11)
Skipping industry 81391 - too few observations (5)
Skipping industry 81399 - too few observations (1)
Successfully processed 504 industries





In [70]:
# Build the firm-year markup/productivity table and save it to disk.

# Markups outside this open interval are discarded as outliers.
MARKUP_LOWER_BOUND = 0
MARKUP_UPPER_BOUND = 10

df = (
    # Left merge keeps every Compustat firm-year; rows without a match in
    # productivity_df get NaN for `a` and `alpha_m` and are removed further down.
    pd.merge(
        compustat[['company_id', 'year', 'sale', 'cogs', '5naics']],
        productivity_df[['company_id', 'year', 'a', 'alpha_m']],
        on=['company_id', 'year'],
        how='left',
    )
    # markup = alpha_m * sale / cogs
    # NOTE(review): alpha_m is presumably the estimated output elasticity from
    # ProductivityEstimator -- confirm against that class.
    .assign(markup=lambda d: d['alpha_m'] * (d['sale'] / d['cogs']))
    # Comparisons against NaN are False, so rows with a missing markup are dropped here too;
    # a markup of +/-inf (e.g. cogs == 0) also fails the bounds check.
    .loc[lambda d: (d['markup'] > MARKUP_LOWER_BOUND) & (d['markup'] < MARKUP_UPPER_BOUND)]
    # Drop rows with a missing value in any remaining column. Returning a new frame
    # (rather than inplace=True on a filtered slice) avoids SettingWithCopyWarning.
    .dropna()
)

df.to_csv("../data/markup_and_productivity.csv", index=False)