# Data Preprocessing

### Setup and Load Data

In [1]:
import pandas as pd

df = pd.read_csv('../data/layoffs_panel_final.csv')
df['Date'] = pd.to_datetime(df['Date'])

print(f"Dataset shape: {df.shape}")
print(f"Companies: {df['Company'].nunique()}")
print(f"Date range: {df['Date'].min().date()} to {df['Date'].max().date()}")
print(f"\nColumns ({len(df.columns)}):")
print(df.columns.tolist())

print(f"\nTarget variable distribution:")
print(df['Layoff_Event_Binary'].value_counts())

print(f"\nMissing values:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing[missing > 0],
    'Missing_Percentage': missing_pct[missing > 0]
}).sort_values('Missing_Count', ascending=False)
missing_df_filtered = missing_df[~missing_df.index.str.contains('_lag2|_lag4|_lag5', regex=True)]

display(missing_df_filtered)

Dataset shape: (2880, 82)
Companies: 160
Date range: 2020-01-01 to 2024-04-01

Columns (82):
['Company', 'Date', 'year', 'quarter', 'year_quarter', 'Layoff_Event_Count', 'Total_Laid_Off_Sum', 'Avg_Layoff_Percentage', 'Latest_Industry', 'Latest_Country', 'Latest_Stage', 'Latest_Funds_Raised', 'Layoff_Event_Binary', 'unemployment_rate_lag1', 'gdp_growth_rate_lag1', 'fed_funds_rate_lag1', 'cpi_lag1', 'consumer_confidence_lag1', 'avg_jobless_claims_lag1', 'inflation_rate_yoy_lag1', 'sp500_change_6mo_lag1', 'sp500_index_lag1', 'company_sec', 'fiscal_year', 'fiscal_period', 'form', 'cash_lag1', 'cost_of_revenue_lag1', 'gross_profit_lag1', 'net_income_lag1', 'operating_expenses_lag1', 'operating_income_lag1', 'rd_expense_lag1', 'revenue_lag1', 'stockholders_equity_lag1', 'total_assets_lag1', 'total_liabilities_lag1', 'current_assets_lag1', 'current_liabilities_lag1', 'retained_earnings_lag1', 'cash_lag2', 'cost_of_revenue_lag2', 'gross_profit_lag2', 'net_income_lag2', 'operating_expenses_lag2

Unnamed: 0,Missing_Count,Missing_Percentage
Avg_Layoff_Percentage,2686,93.263889
cash_lag1,2651,92.048611
revenue_lag1,2165,75.173611
cost_of_revenue_lag1,2001,69.479167
gross_profit_lag1,1218,42.291667
operating_expenses_lag1,1210,42.013889
rd_expense_lag1,727,25.243056
total_liabilities_lag1,383,13.298611
operating_income_lag1,184,6.388889
net_income_lag1,176,6.111111


### Remove Columns with Too Many Missing Values

In [2]:
missing_pct = (df.isnull().sum() / len(df)) * 100

threshold_column = 40
cols_to_drop = sorted(missing_pct[missing_pct > threshold_column].index.tolist())

print(f"Columns with >{threshold_column}% missing values ({len(cols_to_drop)}):")
for col in cols_to_drop:
    print(f"  {col}: {missing_pct[col]:.2f}%")

df_cleaned = df.drop(columns=cols_to_drop)

print(f"\nAfter column filtering:")
print(f"  Original shape: {df.shape}")
print(f"  Cleaned shape: {df_cleaned.shape}")
print(f"  Columns removed: {len(cols_to_drop)}")
print(f"  Columns remaining: {len(df_cleaned.columns)}")

Columns with >40% missing values (21):
  Avg_Layoff_Percentage: 93.26%
  cash_lag1: 92.05%
  cash_lag2: 92.15%
  cash_lag4: 92.15%
  cash_lag5: 92.43%
  cost_of_revenue_lag1: 69.48%
  cost_of_revenue_lag2: 69.55%
  cost_of_revenue_lag4: 69.65%
  cost_of_revenue_lag5: 69.72%
  gross_profit_lag1: 42.29%
  gross_profit_lag2: 42.53%
  gross_profit_lag4: 43.16%
  gross_profit_lag5: 43.72%
  operating_expenses_lag1: 42.01%
  operating_expenses_lag2: 42.08%
  operating_expenses_lag4: 42.33%
  operating_expenses_lag5: 42.67%
  revenue_lag1: 75.17%
  revenue_lag2: 75.00%
  revenue_lag4: 73.82%
  revenue_lag5: 73.30%

After column filtering:
  Original shape: (2880, 82)
  Cleaned shape: (2880, 61)
  Columns removed: 21
  Columns remaining: 61


### Filter Companies by SEC Coverage

In [3]:
base_features = [
    'operating_income',
    'net_income', 
    'current_liabilities',
    'current_assets',
    'stockholders_equity',
    'total_assets'
]

base_lag1_features = [f'{feat}_lag1' for feat in base_features]
base_lag2_features = [f'{feat}_lag2' for feat in base_features]
base_lag4_features = [f'{feat}_lag4' for feat in base_features]
base_lag5_features = [f'{feat}_lag5' for feat in base_features]
all_base_features = base_lag1_features + base_lag2_features + base_lag4_features + base_lag5_features

print(f"Base features: {base_features}")
print(f"Total features for company coverage check: {len(all_base_features)}")
print(f"  - Lag1: {len(base_lag1_features)}")
print(f"  - Lag2: {len(base_lag2_features)}")
print(f"  - Lag4: {len(base_lag4_features)}")
print(f"  - Lag5: {len(base_lag5_features)}")

print("\nChecking feature coverage by company...")
company_coverage = df_cleaned.groupby('Company')[all_base_features].apply(
    lambda x: x.notna().mean().mean()
)

print(f"\nCoverage distribution:")
print(f"  Mean: {company_coverage.mean():.2%}")
print(f"  Median: {company_coverage.median():.2%}")
print(f"  Min: {company_coverage.min():.2%}")
print(f"  Max: {company_coverage.max():.2%}")

print(f"\nCompanies by coverage threshold:")
for threshold in [0, 0.3, 0.5, 0.7, 0.9]:
    count = (company_coverage > threshold).sum()
    print(f"  >{threshold*100:.0f}%: {count} companies")

threshold = 0.8
companies_with_data = company_coverage[company_coverage > threshold].index.tolist()
companies_without_data = company_coverage[company_coverage <= threshold].index.tolist()

print(f"\n=== Applying {threshold*100:.0f}% threshold ===")
print(f"Companies WITH sufficient SEC data: {len(companies_with_data)}")
print(f"Companies WITHOUT sufficient SEC data: {len(companies_without_data)}")

print(f"\nSample companies WITHOUT SEC data:")
print(companies_without_data[:10])

df_filtered = df_cleaned[df_cleaned['Company'].isin(companies_with_data)].copy()

print(f"\nFiltered dataset:")
print(f"  Original shape: {df_cleaned.shape}")
print(f"  Filtered shape: {df_filtered.shape}")
print(f"  Companies: {df_filtered['Company'].nunique()}")
print(f"  Layoff events: {df_filtered['Layoff_Event_Binary'].sum():.0f}")
print(f"  Class balance: {df_filtered['Layoff_Event_Binary'].mean():.4f}")

print(f"\nSample companies kept:")
print(sorted(companies_with_data)[:10])

print(f"\nMissing values after filtering:")
missing = df_filtered.isnull().sum()
missing_pct = (missing / len(df_filtered)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing[missing > 0],
    'Missing_Percentage': missing_pct[missing > 0]
}).sort_values('Missing_Count', ascending=False)
missing_df_filtered = missing_df[~missing_df.index.str.contains('_lag4|_lag5', regex=True)]

display(missing_df_filtered)

Base features: ['operating_income', 'net_income', 'current_liabilities', 'current_assets', 'stockholders_equity', 'total_assets']
Total features for company coverage check: 24
  - Lag1: 6
  - Lag2: 6
  - Lag4: 6
  - Lag5: 6

Checking feature coverage by company...

Coverage distribution:
  Mean: 94.75%
  Median: 100.00%
  Min: 33.33%
  Max: 100.00%

Companies by coverage threshold:
  >0%: 160 companies
  >30%: 160 companies
  >50%: 153 companies
  >70%: 152 companies
  >90%: 132 companies

=== Applying 80% threshold ===
Companies WITH sufficient SEC data: 152
Companies WITHOUT sufficient SEC data: 8

Sample companies WITHOUT SEC data:
['bm technologies', 'capital one', 'lending club', 'mindbody', 'sage therapeutics', 'sage therapeutics copy', 'stone', 'xp']

Filtered dataset:
  Original shape: (2880, 61)
  Filtered shape: (2736, 61)
  Companies: 152
  Layoff events: 295
  Class balance: 0.1078

Sample companies kept:
['2u', '8x8', 'absolute software', 'acxiom', 'adaptive biotechnologie

Unnamed: 0,Missing_Count,Missing_Percentage
rd_expense_lag2,603,22.039474
rd_expense_lag1,590,21.564327
total_liabilities_lag2,388,14.181287
total_liabilities_lag1,383,13.998538
net_income_lag2,140,5.116959
net_income_lag1,131,4.788012
stockholders_equity_lag2,83,3.033626
stockholders_equity_lag1,79,2.887427
operating_income_lag2,65,2.375731
operating_income_lag1,58,2.119883


# Feature Engineering

### Create New Features

In [4]:
df_fe = df_filtered.copy()

print("=== FEATURE ENGINEERING ===\n")

print("1. Liquidity Ratios:")
df_fe['current_ratio_lag1'] = df_fe['current_assets_lag1'] / df_fe['current_liabilities_lag1']
print("   - current_ratio_lag1 = current_assets_lag1 / current_liabilities_lag1")

df_fe['working_capital_lag1'] = df_fe['current_assets_lag1'] - df_fe['current_liabilities_lag1']
print("   - working_capital_lag1 = current_assets_lag1 - current_liabilities_lag1")

print("\n2. Profitability Ratios:")
df_fe['roa_lag1'] = df_fe['net_income_lag1'] / df_fe['total_assets_lag1']
print("   - roa_lag1 = net_income_lag1 / total_assets_lag1")

df_fe['roe_lag1'] = df_fe['net_income_lag1'] / df_fe['stockholders_equity_lag1']
print("   - roe_lag1 = net_income_lag1 / stockholders_equity_lag1")

print("\n3. Leverage Ratios:")
df_fe['debt_to_assets_lag1'] = df_fe['total_liabilities_lag1'] / df_fe['total_assets_lag1']
print("   - debt_to_assets_lag1 = total_liabilities_lag1 / total_assets_lag1")

df_fe['debt_to_equity_lag1'] = df_fe['total_liabilities_lag1'] / df_fe['stockholders_equity_lag1']
print("   - debt_to_equity_lag1 = total_liabilities_lag1 / stockholders_equity_lag1")

print("\n4. Efficiency Ratios:")
df_fe['rd_to_assets_lag1'] = df_fe['rd_expense_lag1'] / df_fe['total_assets_lag1']
print("   - rd_to_assets_lag1 = rd_expense_lag1 / total_assets_lag1")

print("\n5. YoY Growth Indicators (4 quarters = 1 year):")
financial_metrics = ['net_income', 'total_assets', 'operating_income', 
                     'stockholders_equity', 'current_assets', 'current_liabilities']

epsilon = 1e-6

for metric in financial_metrics:
    lag1_col = f'{metric}_lag1'
    lag4_col = f'{metric}_lag4'
    
    if lag1_col in df_fe.columns and lag4_col in df_fe.columns:
        df_fe[f'{metric}_growth_yoy'] = ((df_fe[lag1_col] - df_fe[lag4_col]) / (df_fe[lag4_col].abs() + epsilon)) * 100
        print(f"   - {metric}_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100")

print("\n6. Quarter-over-Quarter Changes:")
for metric in ['net_income', 'operating_income']:
    lag1_col = f'{metric}_lag1'
    lag2_col = f'{metric}_lag2'
    
    if lag1_col in df_fe.columns and lag2_col in df_fe.columns:
        df_fe[f'{metric}_change_qoq'] = ((df_fe[lag1_col] - df_fe[lag2_col]) / (df_fe[lag2_col].abs() + epsilon)) * 100
        print(f"   - {metric}_change_qoq = ((lag1 - lag2) / (|lag2| + ε)) × 100")

print("\n7. Financial Distress Indicators:")
df_fe['negative_equity_lag1'] = (df_fe['stockholders_equity_lag1'] < 0).astype(int)
print("   - negative_equity_lag1 = 1 if stockholders_equity_lag1 < 0, else 0")

df_fe['negative_income_lag1'] = (df_fe['net_income_lag1'] < 0).astype(int)
print("   - negative_income_lag1 = 1 if net_income_lag1 < 0, else 0")

df_fe['negative_operating_income_lag1'] = (df_fe['operating_income_lag1'] < 0).astype(int)
print("   - negative_operating_income_lag1 = 1 if operating_income_lag1 < 0, else 0")

df_fe['declining_income_yoy'] = (df_fe['net_income_growth_yoy'] < 0).astype(int)
print("   - declining_income_yoy = 1 if net_income_growth_yoy < 0, else 0")

df_fe['declining_operating_income_yoy'] = (df_fe['operating_income_growth_yoy'] < 0).astype(int)
print("   - declining_operating_income_yoy = 1 if operating_income_growth_yoy < 0, else 0")

print("\n8. Ratio Changes YoY (using lag5 = 5 quarters back):")
if 'net_income_lag5' in df_fe.columns and 'total_assets_lag5' in df_fe.columns:
    df_fe['roa_lag5'] = df_fe['net_income_lag5'] / df_fe['total_assets_lag5']
    df_fe['roa_change_yoy'] = df_fe['roa_lag1'] - df_fe['roa_lag5']
    print("   - roa_lag5 = net_income_lag5 / total_assets_lag5")
    print("   - roa_change_yoy = roa_lag1 - roa_lag5")
else:
    print("   - Skipping roa_change_yoy (lag5 data not available)")

if 'total_liabilities_lag5' in df_fe.columns and 'total_assets_lag5' in df_fe.columns:
    df_fe['debt_to_assets_lag5'] = df_fe['total_liabilities_lag5'] / df_fe['total_assets_lag5']
    df_fe['debt_to_assets_change_yoy'] = df_fe['debt_to_assets_lag1'] - df_fe['debt_to_assets_lag5']
    print("   - debt_to_assets_lag5 = total_liabilities_lag5 / total_assets_lag5")
    print("   - debt_to_assets_change_yoy = debt_to_assets_lag1 - debt_to_assets_lag5")
else:
    print("   - Skipping debt_to_assets_change_yoy (lag5 data not available)")

print("\n9. Economic Interaction Features:")
df_fe['income_growth_gdp_interaction'] = df_fe['net_income_growth_yoy'] * df_fe['gdp_growth_rate_lag1']
print("   - income_growth_gdp_interaction = net_income_growth_yoy × gdp_growth_rate_lag1")

df_fe['unemployment_income_interaction'] = df_fe['unemployment_rate_lag1'] * df_fe['net_income_growth_yoy']
print("   - unemployment_income_interaction = unemployment_rate_lag1 × net_income_growth_yoy")

df_fe['operating_income_gdp_interaction'] = df_fe['operating_income_growth_yoy'] * df_fe['gdp_growth_rate_lag1']
print("   - operating_income_gdp_interaction = operating_income_growth_yoy × gdp_growth_rate_lag1")

print("\n10. Lagged Layoff Event:")
df_fe['layoff_event_lag1'] = df_fe.groupby('Company')['Layoff_Event_Binary'].shift(1)
print("   - layoff_event_lag1 = Layoff_Event_Binary shifted by 1 quarter")

print("\n11. Quarters Since Last Layoff:")
df_fe = df_fe.sort_values(['Company', 'Date'])

df_fe['last_layoff_date'] = df_fe['Date'].where(df_fe['Layoff_Event_Binary'] == 1, pd.NaT)
df_fe['last_layoff_date'] = df_fe.groupby('Company')['last_layoff_date'].ffill()

df_fe['quarters_since_last_layoff'] = ((df_fe['Date'] - df_fe['last_layoff_date']).dt.days / 91.25).round()
df_fe['quarters_since_last_layoff'] = df_fe.groupby('Company')['quarters_since_last_layoff'].shift(1)
df_fe['quarters_since_last_layoff'] = df_fe['quarters_since_last_layoff'].fillna(999)

df_fe = df_fe.drop(columns=['last_layoff_date'])

print("   - quarters_since_last_layoff = (Date - last_layoff_date) / 91.25, then lagged by 1 quarter")
print("   - Uses only information available BEFORE current quarter (no leakage)")
print("   - Fill NaN with 999 for companies with no prior layoffs")

new_features = [col for col in df_fe.columns if col not in df_filtered.columns]
print(f"\n=== SUMMARY ===")
print(f"Total new features created: {len(new_features)}")

print(f"\nFeature categories:")
print(f"  - Ratios: {len([f for f in new_features if 'ratio' in f or '_to_' in f or 'roa' in f or 'roe' in f])}")
print(f"  - Growth (YoY): {len([f for f in new_features if 'growth_yoy' in f])}")
print(f"  - Growth (QoQ): {len([f for f in new_features if 'change_qoq' in f])}")
print(f"  - Distress flags: {len([f for f in new_features if 'negative' in f or 'declining' in f])}")
print(f"  - Temporal: {len([f for f in new_features if 'lag' in f or 'quarters_since' in f])}")

print(f"\nMissing values in new features (top 10):")
new_features_missing = df_fe[new_features].isnull().sum()
missing_pct = (new_features_missing / len(df_fe)) * 100
missing_df = pd.DataFrame({'Count': new_features_missing, 'Percentage': missing_pct})
display(missing_df[missing_df['Count'] > 0].sort_values('Count', ascending=False).head(10))

df_fe.to_csv('../data/layoffs_feature_engineered.csv', index=False)
print(f"\nSaved to: ../data/layoffs_feature_engineered.csv")

=== FEATURE ENGINEERING ===

1. Liquidity Ratios:
   - current_ratio_lag1 = current_assets_lag1 / current_liabilities_lag1
   - working_capital_lag1 = current_assets_lag1 - current_liabilities_lag1

2. Profitability Ratios:
   - roa_lag1 = net_income_lag1 / total_assets_lag1
   - roe_lag1 = net_income_lag1 / stockholders_equity_lag1

3. Leverage Ratios:
   - debt_to_assets_lag1 = total_liabilities_lag1 / total_assets_lag1
   - debt_to_equity_lag1 = total_liabilities_lag1 / stockholders_equity_lag1

4. Efficiency Ratios:
   - rd_to_assets_lag1 = rd_expense_lag1 / total_assets_lag1

5. YoY Growth Indicators (4 quarters = 1 year):
   - net_income_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - total_assets_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - operating_income_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - stockholders_equity_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - current_assets_growth_yoy = ((lag1 - lag4) / (|lag4| + ε)) × 100
   - current_

Unnamed: 0,Count,Percentage
rd_to_assets_lag1,590,21.564327
debt_to_assets_change_yoy,461,16.849415
debt_to_assets_lag5,449,16.410819
debt_to_equity_lag1,422,15.423977
debt_to_assets_lag1,383,13.998538
roa_change_yoy,235,8.589181
roa_lag5,219,8.004386
roe_lag1,204,7.45614
unemployment_income_interaction,201,7.346491
net_income_growth_yoy,201,7.346491



Saved to: ../data/layoffs_feature_engineered.csv


### Drop Intermediate Features and Prepare Final Dataset

In [5]:
import pandas as pd

df_fe = pd.read_csv('../data/layoffs_feature_engineered.csv')
df_fe['Date'] = pd.to_datetime(df_fe['Date'])

print(f"Original shape: {df_fe.shape}")
print(f"Original columns: {len(df_fe.columns)}")

columns_to_drop = [
    # 1. Raw building blocks (lag1) - used to create ratios
    'net_income_lag1',
    'operating_income_lag1', 
    'rd_expense_lag1',
    'stockholders_equity_lag1',
    'total_assets_lag1',
    'total_liabilities_lag1',
    'current_assets_lag1',
    'current_liabilities_lag1',
    'retained_earnings_lag1',
    
    # 2. All lag2 columns - only used for QoQ calculation
    'net_income_lag2',
    'operating_income_lag2',
    'rd_expense_lag2',
    'stockholders_equity_lag2',
    'total_assets_lag2',
    'total_liabilities_lag2',
    'current_assets_lag2',
    'current_liabilities_lag2',
    'retained_earnings_lag2',
    
    # 3. All lag4 columns - only used for YoY growth calculation
    'net_income_lag4',
    'operating_income_lag4',
    'rd_expense_lag4',
    'stockholders_equity_lag4',
    'total_assets_lag4',
    'total_liabilities_lag4',
    'current_assets_lag4',
    'current_liabilities_lag4',
    'retained_earnings_lag4',
    
    # 4. All lag5 columns - only used for ratio change calculation
    'net_income_lag5',
    'operating_income_lag5',
    'rd_expense_lag5',
    'stockholders_equity_lag5',
    'total_assets_lag5',
    'total_liabilities_lag5',
    'current_assets_lag5',
    'current_liabilities_lag5',
    'retained_earnings_lag5',
    
    # 5. Intermediate ratio calculations
    'roa_lag5',
    'debt_to_assets_lag5',
    
    # 6. Redundant ratios
    'working_capital_lag1',
    
    # 7. Target leakage columns
    'Layoff_Event_Count',
    'Total_Laid_Off_Sum',
    
    # 8. Administrative columns
    'company_sec',
    'fiscal_year',
    'fiscal_period',
    'form',
    
    # 9. Raw CPI
    'cpi_lag1'
]

columns_to_drop_existing = [col for col in columns_to_drop if col in df_fe.columns]

df_cleaned = df_fe.drop(columns=columns_to_drop_existing)

print(f"\nCleaned shape: {df_cleaned.shape}")
print(f"Cleaned columns: {len(df_cleaned.columns)}")
print(f"Columns removed: {len(columns_to_drop_existing)}")

print(f"\n=== FINAL FEATURE SET ===")
print(f"\nPrimary Identifiers:")
identifiers = ['Company', 'Date', 'year', 'quarter', 'year_quarter', 'Latest_Industry', 'Latest_Country', 'Latest_Stage', 'Latest_Funds_Raised']
print(f"  Count: {len([col for col in identifiers if col in df_cleaned.columns])}")

print(f"\nTarget Variable:")
print(f"  Layoff_Event_Binary")

print(f"\nMacro/Market Features:")
macro_features = [col for col in df_cleaned.columns if col in [
    'unemployment_rate_lag1', 'consumer_confidence_lag1', 'gdp_growth_rate_lag1',
    'fed_funds_rate_lag1', 'avg_jobless_claims_lag1', 'sp500_index_lag1',
    'inflation_rate_yoy_lag1', 'sp500_change_6mo_lag1'
]]
print(f"  Count: {len(macro_features)}")

print(f"\nRatios/Growth Features:")
ratio_growth_features = [col for col in df_cleaned.columns if any(x in col for x in [
    'ratio', 'roa', 'roe', 'debt_to', 'rd_to', 'growth_yoy', 'change_qoq', 'change_yoy'
]) and not any(x in col for x in ['lag5'])]
print(f"  Count: {len(ratio_growth_features)}")

print(f"\nBinary/Temporal Features:")
binary_temporal = [col for col in df_cleaned.columns if any(x in col for x in [
    'negative', 'declining', 'layoff_event_lag1', 'quarters_since'
])]
print(f"  Count: {len(binary_temporal)}")

print(f"\nInteraction Features:")
interaction = [col for col in df_cleaned.columns if 'interaction' in col]
print(f"  Count: {len(interaction)}")

print(f"\n=== SUMMARY ===")
print(f"Total features: {len(df_cleaned.columns)}")

df_cleaned.to_csv('../data/layoffs_features_cleaned.csv', index=False)
print(f"\nSaved final modeling dataset to: ../data/layoffs_features_cleaned.csv")

Original shape: (2736, 90)
Original columns: 90

Cleaned shape: (2736, 44)
Cleaned columns: 44
Columns removed: 46

=== FINAL FEATURE SET ===

Primary Identifiers:
  Count: 9

Target Variable:
  Layoff_Event_Binary

Macro/Market Features:
  Count: 8

Ratios/Growth Features:
  Count: 16

Binary/Temporal Features:
  Count: 7

Interaction Features:
  Count: 3

=== SUMMARY ===
Total features: 44

Saved final modeling dataset to: ../data/layoffs_features_cleaned.csv


### Impute Missing data with Forward Fill and Industry Median Imputation

In [8]:
import pandas as pd
import numpy as np

df_cleaned = pd.read_csv('../data/layoffs_features_cleaned.csv')
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'])

print(f"Dataset shape: {df_cleaned.shape}")
print(f"Companies: {df_cleaned['Company'].nunique()}")

print(f"\n=== Missing Values Before Imputation ===")
missing = df_cleaned.isnull().sum()
missing_pct = (missing / len(df_cleaned)) * 100
missing_df = pd.DataFrame({
    'Missing_Count': missing[missing > 0],
    'Missing_Percentage': missing_pct[missing > 0]
}).sort_values('Missing_Count', ascending=False)
display(missing_df)

cols_to_exclude = ['Company', 'Date', 'year', 'quarter', 'year_quarter', 'Latest_Industry', 'Latest_Country', 
                   'Latest_Stage', 'Latest_Funds_Raised', 'Layoff_Event_Binary']

cols_to_impute = [col for col in df_cleaned.columns if col not in cols_to_exclude and df_cleaned[col].isnull().sum() > 0]

print(f"\n=== Imputation Strategy ===")
print(f"Columns to impute: {len(cols_to_impute)}")
print(f"Method: Forward Fill within Company → Industry Median → Global Median")

df_imputed = df_cleaned.copy()
df_imputed = df_imputed.sort_values(['Company', 'Date'])

for col in cols_to_impute:
    print(f"\nImputing {col}...")
    missing_before = df_imputed[col].isnull().sum()
    
    # Step 1: Forward fill within each company (uses past quarters only)
    df_imputed[col] = df_imputed.groupby('Company')[col].ffill()
    after_ffill = df_imputed[col].isnull().sum()
    ffill_filled = missing_before - after_ffill
    
    # Step 2: Industry median for remaining NaN (early quarters with no history)
    df_imputed[col] = df_imputed.groupby('Latest_Industry')[col].transform(
        lambda x: x.fillna(x.median())
    )
    after_industry = df_imputed[col].isnull().sum()
    industry_filled = after_ffill - after_industry
    
    # Step 3: Global median fallback
    remaining_missing = df_imputed[col].isnull().sum()
    if remaining_missing > 0:
        global_median = df_imputed[col].median()
        df_imputed[col] = df_imputed[col].fillna(global_median)
        print(f"  Forward fill: {ffill_filled} values")
        print(f"  Industry median: {industry_filled} values")
        print(f"  Global median: {remaining_missing} values")
    else:
        print(f"  Forward fill: {ffill_filled} values")
        print(f"  Industry median: {industry_filled} values")

print(f"\n=== Missing Values After Imputation ===")
missing_after = df_imputed.isnull().sum()
missing_after_pct = (missing_after / len(df_imputed)) * 100
missing_after_df = pd.DataFrame({
    'Missing_Count': missing_after[missing_after > 0],
    'Missing_Percentage': missing_after_pct[missing_after > 0]
}).sort_values('Missing_Count', ascending=False)

if len(missing_after_df) > 0:
    display(missing_after_df)
else:
    print("No missing values remaining!")

print(f"\n=== Imputation Summary ===")
print(f"Total columns imputed: {len(cols_to_impute)}")
print(f"Total missing values before: {missing.sum()}")
print(f"Total missing values after: {missing_after.sum()}")
print(f"Imputation success rate: {(1 - missing_after.sum() / missing.sum()) * 100:.2f}%")

df_imputed.to_csv('../data/layoffs_imputed.csv', index=False)
print(f"\nSaved imputed dataset to: ../data/layoffs_imputed.csv")

Dataset shape: (2736, 44)
Companies: 152

=== Missing Values Before Imputation ===


Unnamed: 0,Missing_Count,Missing_Percentage
rd_to_assets_lag1,590,21.564327
debt_to_assets_change_yoy,461,16.849415
debt_to_equity_lag1,422,15.423977
debt_to_assets_lag1,383,13.998538
roa_change_yoy,235,8.589181
roe_lag1,204,7.45614
income_growth_gdp_interaction,201,7.346491
net_income_growth_yoy,201,7.346491
unemployment_income_interaction,201,7.346491
layoff_event_lag1,152,5.555556



=== Imputation Strategy ===
Columns to impute: 20
Method: Forward Fill within Company → Industry Median → Global Median

Imputing current_ratio_lag1...
  Forward fill: 1 values
  Industry median: 0 values

Imputing roa_lag1...
  Forward fill: 42 values
  Industry median: 89 values

Imputing roe_lag1...
  Forward fill: 78 values
  Industry median: 126 values

Imputing debt_to_assets_lag1...
  Forward fill: 18 values
  Industry median: 347 values
  Global median: 18 values

Imputing debt_to_equity_lag1...
  Forward fill: 38 values
  Industry median: 366 values
  Global median: 18 values

Imputing rd_to_assets_lag1...
  Forward fill: 51 values
  Industry median: 467 values
  Global median: 72 values

Imputing net_income_growth_yoy...
  Forward fill: 41 values
  Industry median: 160 values

Imputing total_assets_growth_yoy...
  Forward fill: 0 values
  Industry median: 35 values

Imputing operating_income_growth_yoy...
  Forward fill: 2 values
  Industry median: 99 values

Imputing stockh

  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)



Saved imputed dataset to: ../data/layoffs_imputed.csv


### Verify Imputation Results

In [10]:
print(f"Final dataset shape: {df_imputed.shape}")
print(f"Companies: {df_imputed['Company'].nunique()}")
print(f"Quarters: {df_imputed['Date'].nunique()}")
print(f"Date range: {df_imputed['Date'].min().date()} to {df_imputed['Date'].max().date()}")

print(f"\n=== Target Distribution ===")
print(df_imputed['Layoff_Event_Binary'].value_counts())
print(f"Class balance: {df_imputed['Layoff_Event_Binary'].mean():.4f}")

print(f"\n=== Feature Summary ===")
feature_cols = [col for col in df_imputed.columns if col not in 
                ['Company', 'Date', 'year', 'quarter', 'year_quarter', 'Latest_Industry', 'Latest_Country', 
                 'Latest_Stage', 'Latest_Funds_Raised', 'Layoff_Event_Binary']]

print(f"Total features for modeling: {len(feature_cols)}")

print(f"\n=== Sample Statistics ===")
print("Example: roa_lag1 after imputation")
print(df_imputed['roa_lag1'].describe())

Final dataset shape: (2736, 44)
Companies: 152
Quarters: 18
Date range: 2020-01-01 to 2024-04-01

=== Target Distribution ===
0.0    2441
1.0     295
Name: Layoff_Event_Binary, dtype: int64
Class balance: 0.1078

=== Feature Summary ===
Total features for modeling: 34

=== Sample Statistics ===
Example: roa_lag1 after imputation
count    2736.000000
mean       -0.052829
std         0.301086
min        -9.693796
25%        -0.083536
50%        -0.010649
75%         0.034923
max         6.204819
Name: roa_lag1, dtype: float64


### Check Categorical Features

In [11]:
df_imputed = pd.read_csv('../data/layoffs_imputed.csv')
df_imputed['Date'] = pd.to_datetime(df_imputed['Date'])

categorical_cols = ['Latest_Industry', 'Latest_Country', 'Latest_Stage']

print("Categorical feature analysis:\n")
for col in categorical_cols:
    print(f"{col}:")
    print(f"  Unique values: {df_imputed[col].nunique()}")
    print(f"  Top 5 categories:\n{df_imputed[col].value_counts().head()}")
    print()

Categorical feature analysis:

Latest_Industry:
  Unique values: 24
  Top 5 categories:
Healthcare        378
Other             264
Transportation    253
Consumer          242
Hardware          199
Name: Latest_Industry, dtype: int64

Latest_Country:
  Unique values: 11
  Top 5 categories:
United States    2577
Canada             36
France             36
Poland             18
Switzerland        18
Name: Latest_Country, dtype: int64

Latest_Stage:
  Unique values: 7
  Top 5 categories:
Post-IPO          2665
Acquired            17
Private Equity      14
Series H            13
Series D             9
Name: Latest_Stage, dtype: int64



### Create One-Hot Encoded Features for Categorical Variables

In [12]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

industry_counts = df_imputed['Latest_Industry'].value_counts()
top_industries = industry_counts.head(15).index.tolist()

df_imputed['Latest_Industry_Grouped'] = df_imputed['Latest_Industry'].apply(
    lambda x: x if x in top_industries else 'Other'
)

print(f"Industry grouping:")
print(f"  Original categories: {df_imputed['Latest_Industry'].nunique()}")
print(f"  After grouping: {df_imputed['Latest_Industry_Grouped'].nunique()}")
print(f"\nGrouped distribution:\n{df_imputed['Latest_Industry_Grouped'].value_counts()}")

industry_dummies = pd.get_dummies(df_imputed['Latest_Industry_Grouped'], 
                                  prefix='Industry', drop_first=True)

stage_dummies = pd.get_dummies(df_imputed['Latest_Stage'], 
                               prefix='Stage', drop_first=True)

print(f"\nOne-hot encoded features:")
print(f"  Industry features: {industry_dummies.shape[1]}")
print(f"  Stage features: {stage_dummies.shape[1]}")
print(f"  Total new features: {industry_dummies.shape[1] + stage_dummies.shape[1]}")

Industry grouping:
  Original categories: 24
  After grouping: 15

Grouped distribution:
Other             576
Healthcare        378
Transportation    253
Consumer          242
Hardware          199
Retail            188
Security          180
Marketing         162
Finance           144
Data              108
Real Estate        72
Media              72
Food               54
Infrastructure     54
Travel             54
Name: Latest_Industry_Grouped, dtype: int64

One-hot encoded features:
  Industry features: 14
  Stage features: 6
  Total new features: 20


### Log Transform Funds Raised

In [13]:
df_imputed['Latest_Funds_Raised_Log'] = np.log1p(df_imputed['Latest_Funds_Raised'])

print("Log transformation of Latest_Funds_Raised:")
print(f"\nOriginal (Latest_Funds_Raised):")
print(df_imputed['Latest_Funds_Raised'].describe())
print(f"\nLog transformed (Latest_Funds_Raised_Log):")
print(df_imputed['Latest_Funds_Raised_Log'].describe())

Log transformation of Latest_Funds_Raised:

Original (Latest_Funds_Raised):
count      2736.000000
mean        967.758224
std        7464.556156
min           0.000000
25%           0.000000
50%           0.000000
75%         162.000000
max      121900.000000
Name: Latest_Funds_Raised, dtype: float64

Log transformed (Latest_Funds_Raised_Log):
count    2736.000000
mean        2.142908
std         3.022598
min         0.000000
25%         0.000000
50%         0.000000
75%         5.093750
max        11.710965
Name: Latest_Funds_Raised_Log, dtype: float64


### Clip All Ratio and Growth Features

In [14]:
features_to_clip = [
    # 1. High-Volatility Ratios (Leverage and Profitability)
    'roe_lag1', 
    'debt_to_equity_lag1',
    'current_ratio_lag1',
    'roa_lag1', 
    'debt_to_assets_lag1',
    'rd_to_assets_lag1',

    # 2. Financial Momentum/Growth Metrics (YoY and QoQ Change)
    'net_income_growth_yoy', 
    'total_assets_growth_yoy',
    'operating_income_growth_yoy',
    'stockholders_equity_growth_yoy',
    'current_assets_growth_yoy',
    'current_liabilities_growth_yoy',
    'net_income_change_qoq',
    'operating_income_change_qoq',

    # 3. Ratio Change Metrics
    'roa_change_yoy',
    'debt_to_assets_change_yoy',

    # 4. Interaction Terms
    'income_growth_gdp_interaction', 
    'unemployment_income_interaction',
    'operating_income_gdp_interaction'
]

print(f"Clipping {len(features_to_clip)} ratio/growth features:\n")
for col in features_to_clip:
    p1 = df_imputed[col].quantile(0.01)
    p99 = df_imputed[col].quantile(0.99)
    
    before_min = df_imputed[col].min()
    before_max = df_imputed[col].max()
    
    df_imputed[col] = df_imputed[col].clip(lower=p1, upper=p99)
    
    print(f"{col}: [{before_min:.2f}, {before_max:.2f}] -> [{p1:.2f}, {p99:.2f}]")

print(f"\nClipping complete")

Clipping 19 ratio/growth features:

roe_lag1: [-33.11, 46.15] -> [-3.54, 3.23]
debt_to_equity_lag1: [-869.72, 598.32] -> [-25.53, 26.29]
current_ratio_lag1: [0.01, 85.59] -> [0.25, 16.94]
roa_lag1: [-9.69, 6.20] -> [-0.80, 0.24]
debt_to_assets_lag1: [0.00, 12.32] -> [0.04, 1.39]
rd_to_assets_lag1: [0.00, 0.85] -> [0.01, 0.35]
net_income_growth_yoy: [-308486.80, 30400.00] -> [-3137.54, 1609.37]
total_assets_growth_yoy: [-97.10, 82858.22] -> [-40.93, 253.76]
operating_income_growth_yoy: [-87839.41, 11239.66] -> [-3764.30, 1323.85]
stockholders_equity_growth_yoy: [-7062.70, 165737.58] -> [-396.45, 1647.28]
current_assets_growth_yoy: [-94.63, 1123333.55] -> [-63.03, 1401.79]
current_liabilities_growth_yoy: [-94.43, 23689.85] -> [-60.80, 489.89]
net_income_change_qoq: [-96098.81, 57838.24] -> [-1268.07, 963.59]
operating_income_change_qoq: [-19804.23, 13063.52] -> [-782.74, 545.74]
roa_change_yoy: [-8.82, 7.59] -> [-0.54, 0.42]
debt_to_assets_change_yoy: [-16.42, 10.85] -> [-0.43, 0.45]
inc

### Combine All Features

In [15]:
exclude_cols = ['Company', 'Date', 'year', 'quarter', 'year_quarter', 
                'Latest_Industry', 'Latest_Country', 
                'Latest_Stage', 'Latest_Funds_Raised', 'Layoff_Event_Binary',
                'Latest_Industry_Grouped']

numeric_features = [col for col in df_imputed.columns if col not in exclude_cols]

df_final = pd.concat([
    df_imputed[numeric_features],
    industry_dummies,
    stage_dummies
], axis=1)

df_final['Date'] = df_imputed['Date']
df_final['Company'] = df_imputed['Company']
df_final['year'] = df_imputed['year']
df_final['quarter'] = df_imputed['quarter']
df_final['year_quarter'] = df_imputed['year_quarter']
df_final['Latest_Country'] = df_imputed['Latest_Country']
df_final['Layoff_Event_Binary'] = df_imputed['Layoff_Event_Binary']

print(f"Final feature matrix:")
print(f"  Numeric features: {len(numeric_features)}")
print(f"  Industry dummies: {industry_dummies.shape[1]}")
print(f"  Stage dummies: {stage_dummies.shape[1]}")
print(f"  Total features: {df_final.shape[1] - 7}")
print(f"\nFinal shape: {df_final.shape}")
print(f"Missing values: {df_final.isnull().sum().sum()}")

Final feature matrix:
  Numeric features: 35
  Industry dummies: 14
  Stage dummies: 6
  Total features: 55

Final shape: (2736, 62)
Missing values: 0


### Save Preprocessed Dataset

In [16]:
df_final.to_csv('../data/layoffs_modeling_ready.csv', index=False)
print(f"Saved preprocessed dataset to: ../data/layoffs_modeling_ready.csv")
print(f"Shape: {df_final.shape}")
print(f"Features for modeling: {df_final.shape[1] - 4}")

Saved preprocessed dataset to: ../data/layoffs_modeling_ready.csv
Shape: (2736, 62)
Features for modeling: 58
