# Data Preprocessing

### Setup and Load Data

In [114]:
import pandas as pd

df = pd.read_csv('data/layoffs_panel_final.csv')
df['Date'] = pd.to_datetime(df['Date'])

print(f"Dataset shape: {df.shape}")
print(f"Companies: {df['Company'].nunique()}")
print(f"Date range: {df['Date'].min().date()} to {df['Date'].max().date()}")
print(f"\nColumns ({len(df.columns)}):")
print(df.columns.tolist())

print(f"\nTarget variable distribution:")
print(df['Layoff_Event_Binary'].value_counts())

print(f"\nMissing values:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing[missing > 0],
    'Missing_Percentage': missing_pct[missing > 0]
}).sort_values('Missing_Count', ascending=False)
missing_df_filtered = missing_df[~missing_df.index.str.contains('_lag4|_lag5', regex=True)]

display(missing_df_filtered)

Dataset shape: (13936, 65)
Companies: 268
Date range: 2020-03-01 to 2024-06-01

Columns (65):
['Company', 'Date', 'Layoff_Event_Count', 'Total_Laid_Off_Sum', 'Avg_Layoff_Percentage', 'Latest_Industry', 'Latest_Country', 'Latest_Stage', 'Latest_Funds_Raised', 'Layoff_Event_Binary', 'unemployment_rate_lag1', 'cpi_lag1', 'consumer_confidence_lag1', 'gdp_growth_rate_lag1', 'fed_funds_rate_lag1', 'avg_jobless_claims_lag1', 'sp500_index_lag1', 'inflation_rate_yoy_lag1', 'sp500_change_6mo_lag1', 'company_sec', 'fiscal_year', 'fiscal_period', 'form', 'cash_lag1', 'cost_of_revenue_lag1', 'gross_profit_lag1', 'net_income_lag1', 'operating_expenses_lag1', 'operating_income_lag1', 'rd_expense_lag1', 'revenue_lag1', 'stockholders_equity_lag1', 'total_assets_lag1', 'total_liabilities_lag1', 'current_assets_lag1', 'current_liabilities_lag1', 'retained_earnings_lag1', 'cash_lag4', 'cost_of_revenue_lag4', 'gross_profit_lag4', 'net_income_lag4', 'operating_expenses_lag4', 'operating_income_lag4', 'rd_ex

Unnamed: 0,Missing_Count,Missing_Percentage
cash_lag1,12516,89.810563
revenue_lag1,10570,75.846728
cost_of_revenue_lag1,9015,64.688576
gross_profit_lag1,6111,43.850459
operating_expenses_lag1,5757,41.310276
rd_expense_lag1,4227,30.331515
total_liabilities_lag1,2788,20.005741
current_assets_lag1,2271,16.295924
current_liabilities_lag1,2270,16.288749
operating_income_lag1,1604,11.509759


### Remove Columns with Too Many Missing Values

In [115]:
missing_pct = (df.isnull().sum() / len(df)) * 100

threshold_column = 40
cols_to_drop = sorted(missing_pct[missing_pct > threshold_column].index.tolist())

print(f"Columns with >{threshold_column}% missing values ({len(cols_to_drop)}):")
for col in cols_to_drop:
    print(f"  {col}: {missing_pct[col]:.2f}%")

df_cleaned = df.drop(columns=cols_to_drop)

print(f"\nAfter column filtering:")
print(f"  Original shape: {df.shape}")
print(f"  Cleaned shape: {df_cleaned.shape}")
print(f"  Columns removed: {len(cols_to_drop)}")
print(f"  Columns remaining: {len(df_cleaned.columns)}")

Columns with >40% missing values (15):
  cash_lag1: 89.81%
  cash_lag4: 90.57%
  cash_lag5: 90.93%
  cost_of_revenue_lag1: 64.69%
  cost_of_revenue_lag4: 67.25%
  cost_of_revenue_lag5: 68.25%
  gross_profit_lag1: 43.85%
  gross_profit_lag4: 47.10%
  gross_profit_lag5: 48.62%
  operating_expenses_lag1: 41.31%
  operating_expenses_lag4: 45.16%
  operating_expenses_lag5: 46.70%
  revenue_lag1: 75.85%
  revenue_lag4: 76.23%
  revenue_lag5: 76.27%

After column filtering:
  Original shape: (13936, 65)
  Cleaned shape: (13936, 50)
  Columns removed: 15
  Columns remaining: 50


### Filter Companies by SEC Coverage

In [126]:
base_features = [
    'operating_income',
    'net_income', 
    'current_liabilities',
    'current_assets',
    'stockholders_equity',
    'total_assets'
]

base_lag1_features = [f'{feat}_lag1' for feat in base_features]
base_lag4_features = [f'{feat}_lag4' for feat in base_features]
base_lag5_features = [f'{feat}_lag5' for feat in base_features]
all_base_features = base_lag1_features + base_lag4_features + base_lag5_features

print(f"Base features: {base_features}")
print(f"Total features for company coverage check: {len(all_base_features)}")
print(f"  - Lag1: {len(base_lag1_features)}")
print(f"  - Lag4: {len(base_lag4_features)}")
print(f"  - Lag5: {len(base_lag5_features)}")

print("\nChecking feature coverage by company...")
company_coverage = df_cleaned.groupby('Company')[all_base_features].apply(
    lambda x: x.notna().mean().mean()
)

print(f"\nCoverage distribution:")
print(f"  Mean: {company_coverage.mean():.2%}")
print(f"  Median: {company_coverage.median():.2%}")
print(f"  Min: {company_coverage.min():.2%}")
print(f"  Max: {company_coverage.max():.2%}")

print(f"\nCompanies by coverage threshold:")
for threshold in [0, 0.3, 0.5, 0.7, 0.9]:
    count = (company_coverage > threshold).sum()
    print(f"  >{threshold*100:.0f}%: {count} companies")

threshold = 0.8
companies_with_data = company_coverage[company_coverage > threshold].index.tolist()
companies_without_data = company_coverage[company_coverage <= threshold].index.tolist()

print(f"\n=== Applying {threshold*100:.0f}% threshold ===")
print(f"Companies WITH sufficient SEC data: {len(companies_with_data)}")
print(f"Companies WITHOUT sufficient SEC data: {len(companies_without_data)}")

print(f"\nSample companies WITHOUT SEC data:")
print(companies_without_data[:10])

df_filtered = df_cleaned[df_cleaned['Company'].isin(companies_with_data)].copy()

print(f"\nFiltered dataset:")
print(f"  Original shape: {df_cleaned.shape}")
print(f"  Filtered shape: {df_filtered.shape}")
print(f"  Companies: {df_filtered['Company'].nunique()}")
print(f"  Layoff events: {df_filtered['Layoff_Event_Binary'].sum():.0f}")
print(f"  Class balance: {df_filtered['Layoff_Event_Binary'].mean():.4f}")

print(f"\nSample companies kept:")
print(sorted(companies_with_data)[:10])

print(f"\nMissing values after filtering:")
missing = df_filtered.isnull().sum()
missing_pct = (missing / len(df_filtered)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing[missing > 0],
    'Missing_Percentage': missing_pct[missing > 0]
}).sort_values('Missing_Count', ascending=False)
missing_df_filtered = missing_df[~missing_df.index.str.contains('_lag4|_lag5', regex=True)]

display(missing_df_filtered)

Base features: ['operating_income', 'net_income', 'current_liabilities', 'current_assets', 'stockholders_equity', 'total_assets']
Total features for company coverage check: 18
  - Lag1: 6
  - Lag4: 6
  - Lag5: 6

Checking feature coverage by company...

Coverage distribution:
  Mean: 83.77%
  Median: 94.71%
  Min: 31.73%
  Max: 100.00%

Companies by coverage threshold:
  >0%: 268 companies
  >30%: 268 companies
  >50%: 253 companies
  >70%: 189 companies
  >90%: 145 companies

=== Applying 80% threshold ===
Companies WITH sufficient SEC data: 167
Companies WITHOUT sufficient SEC data: 101

Sample companies WITHOUT SEC data:
['23andme', 'absci', 'aeye', 'affirm', 'airbnb', 'akili interactive', 'akili labs', 'allbirds', 'amplitude', 'appharvest']

Filtered dataset:
  Original shape: (13936, 50)
  Filtered shape: (8684, 50)
  Companies: 167
  Layoff events: 345
  Class balance: 0.0397

Sample companies kept:
['10x genomics', '2u', '8x8', 'absolute software', 'acxiom', 'adaptive biotechnol

Unnamed: 0,Missing_Count,Missing_Percentage
rd_expense_lag1,1779,20.485951
total_liabilities_lag1,1294,14.900967
net_income_lag1,281,3.235836
stockholders_equity_lag1,245,2.821281
operating_income_lag1,142,1.635191
retained_earnings_lag1,120,1.381852
current_liabilities_lag1,110,1.266697
current_assets_lag1,104,1.197605
total_assets_lag1,101,1.163058


# Feature Engineering

### Create New Features

In [130]:
df_fe = df_filtered.copy()

print("1. Liquidity Ratios:")
df_fe['current_ratio_lag1'] = df_fe['current_assets_lag1'] / df_fe['current_liabilities_lag1']
print("   - current_ratio_lag1 = current_assets_lag1 / current_liabilities_lag1")

df_fe['working_capital_lag1'] = df_fe['current_assets_lag1'] - df_fe['current_liabilities_lag1']
print("   - working_capital_lag1 = current_assets_lag1 - current_liabilities_lag1")

print("\n2. Profitability Ratios:")
df_fe['roa_lag1'] = df_fe['net_income_lag1'] / df_fe['total_assets_lag1']
print("   - roa_lag1 = net_income_lag1 / total_assets_lag1")

df_fe['roe_lag1'] = df_fe['net_income_lag1'] / df_fe['stockholders_equity_lag1']
print("   - roe_lag1 = net_income_lag1 / stockholders_equity_lag1")

print("\n3. Leverage Ratios:")
df_fe['debt_to_assets_lag1'] = df_fe['total_liabilities_lag1'] / df_fe['total_assets_lag1']
print("   - debt_to_assets_lag1 = total_liabilities_lag1 / total_assets_lag1")

df_fe['debt_to_equity_lag1'] = df_fe['total_liabilities_lag1'] / df_fe['stockholders_equity_lag1']
print("   - debt_to_equity_lag1 = total_liabilities_lag1 / stockholders_equity_lag1")

print("\n4. Efficiency Ratios:")
df_fe['rd_to_assets_lag1'] = df_fe['rd_expense_lag1'] / df_fe['total_assets_lag1']
print("   - rd_to_assets_lag1 = rd_expense_lag1 / total_assets_lag1")

print("\n5. YoY Growth Indicators (with epsilon for stability):")
financial_metrics = ['net_income', 'total_assets', 'operating_income', 
                     'stockholders_equity', 'current_assets', 'current_liabilities']

epsilon = 1e-6

for metric in financial_metrics:
    lag1_col = f'{metric}_lag1'
    lag4_col = f'{metric}_lag4'
    
    if lag1_col in df_fe.columns and lag4_col in df_fe.columns:
        df_fe[f'{metric}_growth_yoy'] = ((df_fe[lag1_col] - df_fe[lag4_col]) / (df_fe[lag4_col].abs() + epsilon)) * 100
        print(f"   - {metric}_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100")

print("\n6. Quarter-over-Quarter Changes (with epsilon):")
for metric in ['net_income', 'operating_income']:
    lag1_col = f'{metric}_lag1'
    lag2_col = f'{metric}_lag2'
    
    if lag1_col in df_fe.columns:
        df_fe[lag2_col] = df_fe.groupby('Company')[lag1_col].shift(1)
        df_fe[f'{metric}_change_qoq'] = ((df_fe[lag1_col] - df_fe[lag2_col]) / (df_fe[lag2_col].abs() + epsilon)) * 100
        print(f"   - {metric}_change_qoq = ((lag1 - lag2) / (|lag2| + ε)) × 100")

print("\n7. Financial Distress Indicators:")
df_fe['negative_equity_lag1'] = (df_fe['stockholders_equity_lag1'] < 0).astype(int)
print("   - negative_equity_lag1 = 1 if stockholders_equity_lag1 < 0, else 0")

df_fe['negative_income_lag1'] = (df_fe['net_income_lag1'] < 0).astype(int)
print("   - negative_income_lag1 = 1 if net_income_lag1 < 0, else 0")

df_fe['negative_operating_income_lag1'] = (df_fe['operating_income_lag1'] < 0).astype(int)
print("   - negative_operating_income_lag1 = 1 if operating_income_lag1 < 0, else 0")

df_fe['declining_income_yoy'] = (df_fe['net_income_growth_yoy'] < 0).astype(int)
print("   - declining_income_yoy = 1 if net_income_growth_yoy < 0, else 0")

df_fe['declining_operating_income_yoy'] = (df_fe['operating_income_growth_yoy'] < 0).astype(int)
print("   - declining_operating_income_yoy = 1 if operating_income_growth_yoy < 0, else 0")

print("\n8. Ratio Changes YoY (using lag5 data):")
if 'net_income_lag5' in df_fe.columns and 'total_assets_lag5' in df_fe.columns:
    df_fe['roa_lag5'] = df_fe['net_income_lag5'] / df_fe['total_assets_lag5']
    df_fe['roa_change_yoy'] = df_fe['roa_lag1'] - df_fe['roa_lag5']
    print("   - roa_lag5 = net_income_lag5 / total_assets_lag5")
    print("   - roa_change_yoy = roa_lag1 - roa_lag5")
else:
    print("   - Skipping roa_change_yoy (lag5 data not available)")

if 'total_liabilities_lag5' in df_fe.columns and 'total_assets_lag5' in df_fe.columns:
    df_fe['debt_to_assets_lag5'] = df_fe['total_liabilities_lag5'] / df_fe['total_assets_lag5']
    df_fe['debt_to_assets_change_yoy'] = df_fe['debt_to_assets_lag1'] - df_fe['debt_to_assets_lag5']
    print("   - debt_to_assets_lag5 = total_liabilities_lag5 / total_assets_lag5")
    print("   - debt_to_assets_change_yoy = debt_to_assets_lag1 - debt_to_assets_lag5")
else:
    print("   - Skipping debt_to_assets_change_yoy (lag5 data not available)")

print("\n9. Economic Interaction Features:")
df_fe['income_growth_gdp_interaction'] = df_fe['net_income_growth_yoy'] * df_fe['gdp_growth_rate_lag1']
print("   - income_growth_gdp_interaction = net_income_growth_yoy × gdp_growth_rate_lag1")

df_fe['unemployment_income_interaction'] = df_fe['unemployment_rate_lag1'] * df_fe['net_income_growth_yoy']
print("   - unemployment_income_interaction = unemployment_rate_lag1 × net_income_growth_yoy")

df_fe['operating_income_gdp_interaction'] = df_fe['operating_income_growth_yoy'] * df_fe['gdp_growth_rate_lag1']
print("   - operating_income_gdp_interaction = operating_income_growth_yoy × gdp_growth_rate_lag1")

print("\n10. Lagged Layoff Event (MOST IMPORTANT):")
df_fe['layoff_event_lag1'] = df_fe.groupby('Company')['Layoff_Event_Binary'].shift(1)
print("   - layoff_event_lag1 = Layoff_Event_Binary shifted by 1 month")
print("   - Captures operational momentum: whether company had layoff last month")

print("\n11. Months Since Last Layoff (Operational Stability):")
df_fe = df_fe.sort_values(['Company', 'Date'])
df_fe['months_since_last_layoff'] = 0

for company in df_fe['Company'].unique():
    company_mask = df_fe['Company'] == company
    company_data = df_fe[company_mask].copy()
    
    months_counter = 0
    months_list = []
    
    for idx, row in company_data.iterrows():
        if row['Layoff_Event_Binary'] == 1:
            months_counter = 0
        else:
            months_counter += 1
        months_list.append(months_counter)
    
    df_fe.loc[company_mask, 'months_since_last_layoff'] = months_list

print("   - months_since_last_layoff = months elapsed since company's last layoff event")
print("   - Low value (1-2) = recent layoff, still restructuring")
print("   - High value = stable period, new layoff would be significant shift")

new_features = [col for col in df_fe.columns if col not in df_filtered.columns]
print(f"\n=== SUMMARY ===")
print(f"Total new features created: {len(new_features)}")

print(f"\nFeature categories:")
print(f"  - Ratios: {len([f for f in new_features if 'ratio' in f or '_to_' in f or 'roa' in f or 'roe' in f])}")
print(f"  - Growth (YoY): {len([f for f in new_features if 'growth_yoy' in f])}")
print(f"  - Growth (QoQ): {len([f for f in new_features if 'change_qoq' in f])}")
print(f"  - Distress flags: {len([f for f in new_features if 'negative' in f or 'declining' in f])}")
print(f"  - Temporal: {len([f for f in new_features if 'lag' in f or 'months_since' in f])}")

print(f"\nMissing values in new features (top 10):")
new_features_missing = df_fe[new_features].isnull().sum()
missing_pct = (new_features_missing / len(df_fe)) * 100
missing_df = pd.DataFrame({'Count': new_features_missing, 'Missing_Percentage': missing_pct})
display(missing_df[missing_df['Count'] > 0].sort_values('Count', ascending=False).head(10))

df_fe.to_csv('data/layoffs_feature_engineered.csv', index=False)
print(f"\nSaved to: data/layoffs_feature_engineered.csv")

1. Liquidity Ratios:
   - current_ratio_lag1 = current_assets_lag1 / current_liabilities_lag1
   - working_capital_lag1 = current_assets_lag1 - current_liabilities_lag1

2. Profitability Ratios:
   - roa_lag1 = net_income_lag1 / total_assets_lag1
   - roe_lag1 = net_income_lag1 / stockholders_equity_lag1

3. Leverage Ratios:
   - debt_to_assets_lag1 = total_liabilities_lag1 / total_assets_lag1
   - debt_to_equity_lag1 = total_liabilities_lag1 / stockholders_equity_lag1

4. Efficiency Ratios:
   - rd_to_assets_lag1 = rd_expense_lag1 / total_assets_lag1

5. YoY Growth Indicators (with epsilon for stability):
   - net_income_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - total_assets_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - operating_income_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - stockholders_equity_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - current_assets_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - current_liabilities_growth_yoy

Unnamed: 0,Count,Missing_Percentage
rd_to_assets_lag1,1830,21.073238
debt_to_assets_change_yoy,1581,18.205896
debt_to_assets_lag5,1519,17.491939
debt_to_equity_lag1,1401,16.133118
debt_to_assets_lag1,1294,14.900967
roa_change_yoy,703,8.095348
roa_lag5,653,7.519576
roe_lag1,488,5.61953
net_income_change_qoq,462,5.320129
income_growth_gdp_interaction,446,5.135882



Saved to: data/layoffs_feature_engineered.csv


### Remove Building Blocks and Redundant Columns

In [134]:
import pandas as pd

df_fe = pd.read_csv('data/layoffs_feature_engineered.csv')
df_fe['Date'] = pd.to_datetime(df_fe['Date'])

print(f"Original shape: {df_fe.shape}")
print(f"Original columns: {len(df_fe.columns)}")

# Columns to DROP
columns_to_drop = [
    # 1. Raw building blocks (lag1) - used to create ratios
    'net_income_lag1',
    'operating_income_lag1', 
    'rd_expense_lag1',
    'stockholders_equity_lag1',
    'total_assets_lag1',
    'total_liabilities_lag1',
    'current_assets_lag1',
    'current_liabilities_lag1',
    'retained_earnings_lag1',
    
    # 2. All lag4 columns - only used for YoY growth calculation
    'net_income_lag4',
    'operating_income_lag4',
    'rd_expense_lag4',
    'stockholders_equity_lag4',
    'total_assets_lag4',
    'total_liabilities_lag4',
    'current_assets_lag4',
    'current_liabilities_lag4',
    'retained_earnings_lag4',
    
    # 3. All lag5 columns - only used for ratio change calculation
    'net_income_lag5',
    'operating_income_lag5',
    'rd_expense_lag5',
    'stockholders_equity_lag5',
    'total_assets_lag5',
    'total_liabilities_lag5',
    'current_assets_lag5',
    'current_liabilities_lag5',
    'retained_earnings_lag5',
    
    # 4. Redundant ratios
    'working_capital_lag1',  # Redundant with current_ratio_lag1
    
    # 5. Target leakage columns
    'Layoff_Event_Count',  # Highly correlated with target
    'Total_Laid_Off_Sum',  # Highly correlated with target
    'Avg_Layoff_Percentage',  # Highly correlated with target
    
    # 6. Administrative columns
    'company_sec',  # Administrative identifier
    'fiscal_year',  # Administrative
    'fiscal_period',  # Administrative
    'form',  # Administrative
    
    # 7. Raw CPI (keep derived inflation_rate_yoy_lag1)
    'cpi_lag1'
]

# Remove columns that exist in dataframe
columns_to_drop_existing = [col for col in columns_to_drop if col in df_fe.columns]

df_cleaned = df_fe.drop(columns=columns_to_drop_existing)

print(f"\nCleaned shape: {df_cleaned.shape}")
print(f"Cleaned columns: {len(df_cleaned.columns)}")
print(f"Columns removed: {len(columns_to_drop_existing)}")

print(f"\n=== FINAL FEATURE SET ===")
print(f"\nPrimary Identifiers ({6} columns):")
identifiers = ['Company', 'Date', 'Latest_Industry', 'Latest_Country', 'Latest_Stage', 'Latest_Funds_Raised']
print(identifiers)

print(f"\nTarget Variable (1 column):")
print(['Layoff_Event_Binary'])

print(f"\nMacro/Market Features (Lagged) ({8} columns):")
macro_features = [col for col in df_cleaned.columns if col in [
    'unemployment_rate_lag1', 'consumer_confidence_lag1', 'gdp_growth_rate_lag1',
    'fed_funds_rate_lag1', 'avg_jobless_claims_lag1', 'sp500_index_lag1',
    'inflation_rate_yoy_lag1', 'sp500_change_6mo_lag1'
]]
print(macro_features)

print(f"\nFinal Ratios/Growth Features:")
ratio_growth_features = [col for col in df_cleaned.columns if any(x in col for x in [
    'ratio', 'roa', 'roe', 'debt_to', 'rd_to', 'growth_yoy', 'change_qoq', 'change_yoy'
])]
print(f"  Count: {len(ratio_growth_features)}")
print(f"  {ratio_growth_features}")

print(f"\nBinary/Temporal Features:")
binary_temporal = [col for col in df_cleaned.columns if any(x in col for x in [
    'negative', 'declining', 'layoff_event_lag1', 'months_since'
])]
print(f"  Count: {len(binary_temporal)}")
print(f"  {binary_temporal}")

print(f"\nInteraction Features:")
interaction = [col for col in df_cleaned.columns if 'interaction' in col]
print(f"  Count: {len(interaction)}")
print(f"  {interaction}")

print(f"\n=== SUMMARY ===")
print(f"Total features: {len(df_cleaned.columns)}")
print(f"  - Identifiers: {len(identifiers)}")
print(f"  - Target: 1")
print(f"  - Macro/Market: {len(macro_features)}")
print(f"  - Ratios/Growth: {len(ratio_growth_features)}")
print(f"  - Binary/Temporal: {len(binary_temporal)}")
print(f"  - Interactions: {len(interaction)}")

df_cleaned.to_csv('data/layoffs_features_cleaned.csv', index=False)
print(f"\n Saved final modeling dataset to: data/layoffs_features_cleaned.csv")

print(f"\nFinal columns:")
print(df_cleaned.columns.tolist())

Original shape: (8684, 81)
Original columns: 81

Cleaned shape: (8684, 45)
Cleaned columns: 45
Columns removed: 36

=== FINAL FEATURE SET ===

Primary Identifiers (6 columns):
['Company', 'Date', 'Latest_Industry', 'Latest_Country', 'Latest_Stage', 'Latest_Funds_Raised']

Target Variable (1 column):
['Layoff_Event_Binary']

Macro/Market Features (Lagged) (8 columns):
['unemployment_rate_lag1', 'consumer_confidence_lag1', 'gdp_growth_rate_lag1', 'fed_funds_rate_lag1', 'avg_jobless_claims_lag1', 'sp500_index_lag1', 'inflation_rate_yoy_lag1', 'sp500_change_6mo_lag1']

Final Ratios/Growth Features:
  Count: 18
  ['current_ratio_lag1', 'roa_lag1', 'roe_lag1', 'debt_to_assets_lag1', 'debt_to_equity_lag1', 'rd_to_assets_lag1', 'net_income_growth_yoy', 'total_assets_growth_yoy', 'operating_income_growth_yoy', 'stockholders_equity_growth_yoy', 'current_assets_growth_yoy', 'current_liabilities_growth_yoy', 'net_income_change_qoq', 'operating_income_change_qoq', 'roa_lag5', 'roa_change_yoy', 'deb

In [135]:
import pandas as pd
import numpy as np

df_cleaned = pd.read_csv('data/layoffs_features_cleaned.csv')
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'])

print(f"Dataset shape: {df_cleaned.shape}")
print(f"Companies: {df_cleaned['Company'].nunique()}")

print(f"\n=== Missing Values Before Imputation ===")
missing = df_cleaned.isnull().sum()
missing_pct = (missing / len(df_cleaned)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing[missing > 0],
    'Missing_Percentage': missing_pct[missing > 0]
}).sort_values('Missing_Count', ascending=False)
display(missing_df)

# Identify columns to impute (exclude identifiers and target)
cols_to_exclude = ['Company', 'Date', 'Latest_Industry', 'Latest_Country', 
                   'Latest_Stage', 'Latest_Funds_Raised', 'Layoff_Event_Binary']

cols_to_impute = [col for col in df_cleaned.columns if col not in cols_to_exclude and df_cleaned[col].isnull().sum() > 0]

print(f"\n=== Imputation Strategy ===")
print(f"Columns to impute: {len(cols_to_impute)}")
print(f"Method: Industry Median (group by Latest_Industry)")

df_imputed = df_cleaned.copy()

for col in cols_to_impute:
    print(f"\nImputing {col}...")
    missing_before = df_imputed[col].isnull().sum()
    
    # Step 1: Industry median imputation
    df_imputed[col] = df_imputed.groupby('Latest_Industry')[col].transform(
        lambda x: x.fillna(x.median())
    )
    
    # Step 2: Global median fallback for remaining NaNs
    remaining_missing = df_imputed[col].isnull().sum()
    if remaining_missing > 0:
        global_median = df_imputed[col].median()
        df_imputed[col] = df_imputed[col].fillna(global_median)
        print(f"  Industry median: {missing_before - remaining_missing} values")
        print(f"  Global median fallback: {remaining_missing} values")
    else:
        print(f"  Industry median: {missing_before} values")

print(f"\n=== Missing Values After Imputation ===")
missing_after = df_imputed.isnull().sum()
missing_after_pct = (missing_after / len(df_imputed)) * 100
missing_after_df = pd.DataFrame({
    'Missing_Count': missing_after[missing_after > 0],
    'Missing_Percentage': missing_after_pct[missing_after > 0]
}).sort_values('Missing_Count', ascending=False)

if len(missing_after_df) > 0:
    display(missing_after_df)
else:
    print("No missing values remaining!")

print(f"\n=== Imputation Summary ===")
print(f"Total columns imputed: {len(cols_to_impute)}")
print(f"Total missing values before: {missing.sum()}")
print(f"Total missing values after: {missing_after.sum()}")
print(f"Imputation success rate: {(1 - missing_after.sum() / missing.sum()) * 100:.2f}%")

df_imputed.to_csv('data/layoffs_imputed.csv', index=False)
print(f"\nSaved imputed dataset to: data/layoffs_imputed.csv")

Dataset shape: (8684, 45)
Companies: 167

=== Missing Values Before Imputation ===


Unnamed: 0,Missing_Count,Missing_Percentage
rd_to_assets_lag1,1830,21.073238
debt_to_assets_change_yoy,1581,18.205896
debt_to_assets_lag5,1519,17.491939
debt_to_equity_lag1,1401,16.133118
debt_to_assets_lag1,1294,14.900967
roa_change_yoy,703,8.095348
roa_lag5,653,7.519576
roe_lag1,488,5.61953
net_income_change_qoq,462,5.320129
unemployment_income_interaction,446,5.135882



=== Imputation Strategy ===
Columns to impute: 24
Method: Industry Median (group by Latest_Industry)

Imputing current_ratio_lag1...
  Industry median: 110 values

Imputing roa_lag1...
  Industry median: 344 values

Imputing roe_lag1...
  Industry median: 488 values

Imputing debt_to_assets_lag1...
  Industry median: 1242 values
  Global median fallback: 52 values

Imputing debt_to_equity_lag1...
  Industry median: 1349 values
  Global median fallback: 52 values

Imputing rd_to_assets_lag1...
  Industry median: 1622 values
  Global median fallback: 208 values

Imputing net_income_growth_yoy...
  Industry median: 446 values

Imputing total_assets_growth_yoy...
  Industry median: 284 values

Imputing operating_income_growth_yoy...
  Industry median: 279 values

Imputing stockholders_equity_growth_yoy...
  Industry median: 360 values

Imputing current_assets_growth_yoy...
  Industry median: 293 values

Imputing current_liabilities_growth_yoy...
  Industry median: 308 values

Imputing net

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


  Industry median: 462 values

Imputing operating_income_lag2...
  Industry median: 305 values

Imputing operating_income_change_qoq...
  Industry median: 320 values

Imputing roa_lag5...
  Industry median: 653 values

Imputing roa_change_yoy...
  Industry median: 703 values

Imputing debt_to_assets_lag5...
  Industry median: 1467 values
  Global median fallback: 52 values

Imputing debt_to_assets_change_yoy...
  Industry median: 1529 values
  Global median fallback: 52 values

Imputing income_growth_gdp_interaction...
  Industry median: 446 values

Imputing unemployment_income_interaction...
  Industry median: 446 values

Imputing operating_income_gdp_interaction...
  Industry median: 279 values

Imputing layoff_event_lag1...
  Industry median: 167 values

=== Missing Values After Imputation ===
No missing values remaining!

=== Imputation Summary ===
Total columns imputed: 24
Total missing values before: 14761
Total missing values after: 0
Imputation success rate: 100.00%

Saved impu

### Verify Imputation Results

In [136]:
print(f"Final dataset shape: {df_imputed.shape}")
print(f"Companies: {df_imputed['Company'].nunique()}")
print(f"Date range: {df_imputed['Date'].min().date()} to {df_imputed['Date'].max().date()}")

print(f"\n=== Target Distribution ===")
print(df_imputed['Layoff_Event_Binary'].value_counts())
print(f"Class balance: {df_imputed['Layoff_Event_Binary'].mean():.4f}")

print(f"\n=== Feature Summary ===")
feature_cols = [col for col in df_imputed.columns if col not in 
                ['Company', 'Date', 'Latest_Industry', 'Latest_Country', 
                 'Latest_Stage', 'Latest_Funds_Raised', 'Layoff_Event_Binary']]

print(f"Total features for modeling: {len(feature_cols)}")

print(f"\n=== Sample Statistics ===")
print("Example: roa_lag1 after imputation")
print(df_imputed['roa_lag1'].describe())

print(f"\n=== Final Confirmation ===")
print(f"Missing values in entire dataset: {df_imputed.isnull().sum().sum()}")
print(f"Ready for modeling: {'Yes' if df_imputed.isnull().sum().sum() == 0 else 'No'}")


Final dataset shape: (8684, 45)
Companies: 167
Date range: 2020-03-01 to 2024-06-01

=== Target Distribution ===
0.0    8339
1.0     345
Name: Layoff_Event_Binary, dtype: int64
Class balance: 0.0397

=== Feature Summary ===
Total features for modeling: 38

=== Sample Statistics ===
Example: roa_lag1 after imputation
count    8684.000000
mean       -0.066625
std         0.308736
min        -9.693796
25%        -0.095415
50%        -0.016773
75%         0.027251
max         3.332031
Name: roa_lag1, dtype: float64

=== Final Confirmation ===
Missing values in entire dataset: 0
Ready for modeling: Yes
