### Setup and Load Data

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read the layoffs panel from disk and convert the Date column to datetimes.
df = pd.read_csv('data/layoffs_panel_final.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Overview: frame size, number of distinct companies, and the time span covered.
print(
    f"Dataset shape: {df.shape}",
    f"Companies: {df['Company'].nunique()}",
    f"Date range: {df['Date'].min().date()} to {df['Date'].max().date()}",
    f"\nColumns ({df.shape[1]}):",
    sep="\n",
)
print(df.columns.tolist())

# Class balance of the binary layoff indicator.
print("\nTarget variable distribution:")
print(df['Layoff_Event_Binary'].value_counts())

print("\nFirst few rows:")
display(df.head())

print("\nData types:")
display(df.dtypes)

# Per-column missing-value report: absolute count and share of rows, restricted
# to columns that have at least one missing value, most-missing first.
print("\nMissing values:")
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = (
    pd.DataFrame({'Missing_Count': missing, 'Missing_Percentage': missing_pct})
    .loc[missing > 0]
    .sort_values('Missing_Count', ascending=False)
)
display(missing_df)

Dataset shape: (14976, 37)
Companies: 288
Date range: 2020-03-01 to 2024-06-01

Columns (37):
['Company', 'Date', 'Layoff_Event_Count', 'Total_Laid_Off_Sum', 'Avg_Layoff_Percentage', 'Latest_Industry', 'Latest_Country', 'Latest_Stage', 'Latest_Funds_Raised', 'Layoff_Event_Binary', 'unemployment_rate', 'gdp_growth_rate', 'fed_funds_rate', 'cpi', 'inflation_rate_yoy', 'consumer_confidence', 'sp500_index', 'sp500_change_6mo', 'avg_jobless_claims', 'company_sec', 'fiscal_year', 'fiscal_period', 'form', 'cash', 'cost_of_revenue', 'gross_profit', 'net_income', 'operating_expenses', 'operating_income', 'rd_expense', 'revenue', 'stockholders_equity', 'total_assets', 'total_liabilities', 'current_assets', 'current_liabilities', 'retained_earnings']

Target variable distribution:
0.0    14423
1.0      553
Name: Layoff_Event_Binary, dtype: int64

First few rows:


Unnamed: 0,Company,Date,Layoff_Event_Count,Total_Laid_Off_Sum,Avg_Layoff_Percentage,Latest_Industry,Latest_Country,Latest_Stage,Latest_Funds_Raised,Layoff_Event_Binary,...,operating_expenses,operating_income,rd_expense,revenue,stockholders_equity,total_assets,total_liabilities,current_assets,current_liabilities,retained_earnings
0,10x genomics,2020-03-01,0.0,0.0,0.0,Healthcare,United States,Post-IPO,0.0,0.0,...,,,,,,45604000.0,78139000.0,,,
1,10x genomics,2020-04-01,0.0,0.0,0.0,Healthcare,United States,Post-IPO,0.0,0.0,...,76681000.0,-19884000.0,,71905000.0,409068000.0,610384000.0,201316000.0,429732000.0,59068000.0,-283510000.0
2,10x genomics,2020-05-01,0.0,0.0,0.0,Healthcare,United States,Post-IPO,0.0,0.0,...,76681000.0,-19884000.0,,71905000.0,409068000.0,610384000.0,201316000.0,429732000.0,59068000.0,-283510000.0
3,10x genomics,2020-06-01,0.0,0.0,0.0,Healthcare,United States,Post-IPO,0.0,0.0,...,76681000.0,-19884000.0,,71905000.0,409068000.0,610384000.0,201316000.0,429732000.0,59068000.0,-283510000.0
4,10x genomics,2020-07-01,0.0,0.0,0.0,Healthcare,United States,Post-IPO,0.0,0.0,...,148954000.0,-59304000.0,,,391269000.0,582802000.0,191533000.0,451584000.0,122180000.0,-323677000.0



Data types:


Company                          object
Date                     datetime64[ns]
Layoff_Event_Count              float64
Total_Laid_Off_Sum              float64
Avg_Layoff_Percentage           float64
Latest_Industry                  object
Latest_Country                   object
Latest_Stage                     object
Latest_Funds_Raised             float64
Layoff_Event_Binary             float64
unemployment_rate               float64
gdp_growth_rate                 float64
fed_funds_rate                  float64
cpi                             float64
inflation_rate_yoy              float64
consumer_confidence             float64
sp500_index                     float64
sp500_change_6mo                float64
avg_jobless_claims              float64
company_sec                      object
fiscal_year                     float64
fiscal_period                    object
form                             object
cash                            float64
cost_of_revenue                 float64



Missing values:


Unnamed: 0,Missing_Count,Missing_Percentage
inflation_rate_yoy,14976,100.0
sp500_change_6mo,14976,100.0
cash,13351,89.149306
revenue,11447,76.43563
cost_of_revenue,9454,63.127671
gross_profit,6516,43.509615
operating_expenses,6032,40.277778
rd_expense,4312,28.792735
total_liabilities,2746,18.336004
current_assets,2263,15.110844


### Remove Columns with >50% Missing Values

In [7]:
# Cutoff for dropping sparse columns: a column is removed when strictly more than
# this percentage of its values are missing. Kept as a single named constant so the
# filter and the printed report below always agree.
MISSING_THRESHOLD_PCT = 50

# Percentage of missing values per column of the loaded frame.
missing_pct = (df.isnull().sum() / len(df)) * 100

cols_to_drop = missing_pct[missing_pct > MISSING_THRESHOLD_PCT].index.tolist()

print(f"Columns with >{MISSING_THRESHOLD_PCT}% missing values ({len(cols_to_drop)}):")
for col in cols_to_drop:
    print(f"  {col}: {missing_pct[col]:.2f}%")

# Build a new frame instead of mutating `df`, so re-running this cell is safe
# and the original data stays available for comparison.
df_cleaned = df.drop(columns=cols_to_drop)

# Invariant: rows are untouched and exactly the listed columns were removed.
assert df_cleaned.shape == (df.shape[0], df.shape[1] - len(cols_to_drop)), (
    "unexpected shape after dropping sparse columns"
)

print(f"\nOriginal shape: {df.shape}")
print(f"Cleaned shape: {df_cleaned.shape}")
print(f"Columns removed: {len(cols_to_drop)}")
print(f"Columns remaining: {len(df_cleaned.columns)}")

print("\nRemaining columns:")
print(df_cleaned.columns.tolist())

Columns with >50% missing values (5):
  inflation_rate_yoy: 100.00%
  sp500_change_6mo: 100.00%
  cash: 89.15%
  cost_of_revenue: 63.13%
  revenue: 76.44%

Original shape: (14976, 37)
Cleaned shape: (14976, 32)
Columns removed: 5
Columns remaining: 32

Remaining columns:
['Company', 'Date', 'Layoff_Event_Count', 'Total_Laid_Off_Sum', 'Avg_Layoff_Percentage', 'Latest_Industry', 'Latest_Country', 'Latest_Stage', 'Latest_Funds_Raised', 'Layoff_Event_Binary', 'unemployment_rate', 'gdp_growth_rate', 'fed_funds_rate', 'cpi', 'consumer_confidence', 'sp500_index', 'avg_jobless_claims', 'company_sec', 'fiscal_year', 'fiscal_period', 'form', 'gross_profit', 'net_income', 'operating_expenses', 'operating_income', 'rd_expense', 'stockholders_equity', 'total_assets', 'total_liabilities', 'current_assets', 'current_liabilities', 'retained_earnings']
