## Load and Prepare Data

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read the modeling-ready table and convert the Date column to datetimes
# in a single chained expression.
df = (
    pd.read_csv('data/layoffs_modeling_ready.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# Quick sanity report: table size, covered period, and how rare the
# target's positive value is.
date_col = df['Date']
target_col = df['Layoff_Event_Binary']

print(f"Dataset shape: {df.shape}")
print(f"Date range: {date_col.min()} to {date_col.max()}")
print(f"\nTarget distribution:\n{target_col.value_counts()}")
print(f"Class balance: {target_col.mean():.4f}")

Dataset shape: (8788, 59)
Date range: 2020-03-01 00:00:00 to 2024-06-01 00:00:00

Target distribution:
0.0    8441
1.0     347
Name: Layoff_Event_Binary, dtype: int64
Class balance: 0.0395


In [39]:
# Eyeball the last rows for a single company to see how the lagged layoff
# flag sits next to the target column.
inspect_cols = ['Company', 'Date', 'layoff_event_lag1', 'Layoff_Event_Binary']
df.loc[df['Company'] == 'amazon', inspect_cols].tail(30)

Unnamed: 0,Company,Date,layoff_event_lag1,Layoff_Event_Binary
490,amazon,2022-01-01,0.0,0.0
491,amazon,2022-02-01,0.0,0.0
492,amazon,2022-03-01,0.0,0.0
493,amazon,2022-04-01,0.0,0.0
494,amazon,2022-05-01,0.0,0.0
495,amazon,2022-06-01,0.0,0.0
496,amazon,2022-07-01,0.0,0.0
497,amazon,2022-08-01,0.0,0.0
498,amazon,2022-09-01,0.0,0.0
499,amazon,2022-10-01,0.0,1.0


In [41]:
# Summarise the lagged layoff indicator.
# Only the last expression of a cell is rendered automatically, so the
# describe() table has to be displayed explicitly; otherwise it is computed
# and silently thrown away.
lag1 = df['layoff_event_lag1']
display(lag1.describe())

# dropna=False makes any missing values show up as their own row.
lag1.value_counts(dropna=False)

0.0    8443
1.0     345
Name: layoff_event_lag1, dtype: int64

## Prepare Features and Target

In [26]:
# Columns kept out of the feature matrix: the three listed before the target,
# and the target itself.
exclude_cols = ['Company', 'Date', 'Latest_Country', 'Layoff_Event_Binary']

# Every remaining column becomes a feature, in its original order.
feature_cols = []
for name in df.columns:
    if name in exclude_cols:
        continue
    feature_cols.append(name)

X = df.loc[:, feature_cols]
y = df['Layoff_Event_Binary']

n_missing = X.isnull().sum().sum()

print(f"Final features: {len(feature_cols)}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Missing values: {n_missing}")

Final features: 55
X shape: (8788, 55)
y shape: (8788,)
Missing values: 0


## Temporal Train-Test Split

In [27]:
# Chronological hold-out: rows dated before the cutoff are used for training,
# rows on or after the cutoff are reserved for evaluation.
split_date = '2023-06-01'

train_mask = df['Date'] < split_date
test_mask = df['Date'] >= split_date

X_train, X_test = X.loc[train_mask], X.loc[test_mask]
y_train, y_test = y.loc[train_mask], y.loc[test_mask]

# Dates that fall into each partition; used only for the report below.
train_dates = df.loc[train_mask, 'Date']
test_dates = df.loc[test_mask, 'Date']

print(f"Split date: {split_date}")
print(f"\nTrain set:")
print(f"  Size: {len(X_train)}")
print(f"  Positive class: {y_train.sum()} ({y_train.mean():.4f})")
print(f"  Date range: {train_dates.min().date()} to {train_dates.max().date()}")
print(f"\nTest set:")
print(f"  Size: {len(X_test)}")
print(f"  Positive class: {y_test.sum()} ({y_test.mean():.4f})")
print(f"  Date range: {test_dates.min().date()} to {test_dates.max().date()}")

Split date: 2023-06-01

Train set:
  Size: 6591
  Positive class: 218.0 (0.0331)
  Date range: 2020-03-01 to 2023-05-01

Test set:
  Size: 2197
  Positive class: 129.0 (0.0587)
  Date range: 2023-06-01 to 2024-06-01


## Scale Features

In [28]:
# Columns whose name starts with one of these prefixes, plus the lagged layoff
# flag, are treated as binary and passed through unscaled.
binary_prefixes = ('Industry_', 'Stage_', 'negative_', 'declining_')

binary_features = [
    name for name in feature_cols
    if name == 'layoff_event_lag1' or name.startswith(binary_prefixes)
]
# Everything that is not binary is continuous (original column order kept).
binary_lookup = set(binary_features)
continuous_features = [name for name in feature_cols if name not in binary_lookup]

print(f"Continuous features to scale: {len(continuous_features)}")
print(f"Binary features (no scaling): {len(binary_features)}")

# The scaler is fitted on the training rows only and then applied to the
# test rows.
scaler = StandardScaler()

X_train_continuous = scaler.fit_transform(X_train[continuous_features])
X_test_continuous = scaler.transform(X_test[continuous_features])

# Resulting column order is continuous features first, then binary features;
# the arrays carry no column names, so anything mapping columns back to names
# has to use that same order.
X_train_scaled = np.hstack((X_train_continuous, X_train[binary_features].values))
X_test_scaled = np.hstack((X_test_continuous, X_test[binary_features].values))

print(f"\nScaled train shape: {X_train_scaled.shape}")
print(f"Scaled test shape: {X_test_scaled.shape}")
print(f"Train mean: {X_train_scaled.mean():.6f}")
print(f"Train std: {X_train_scaled.std():.6f}")

Continuous features to scale: 29
Binary features (no scaling): 26

Scaled train shape: (6591, 55)
Scaled test shape: (2197, 55)
Train mean: 0.076137
Train std: 0.773054


## Baseline - Decision Tree

In [34]:
# Baseline: a depth- and leaf-size-limited decision tree with a fixed seed.
dt_params = dict(random_state=42, max_depth=10, min_samples_split=50, min_samples_leaf=20)
dt_model = DecisionTreeClassifier(**dt_params).fit(X_train_scaled, y_train)

y_train_pred_dt = dt_model.predict(X_train_scaled)
y_test_pred_dt = dt_model.predict(X_test_scaled)

# NOTE(review): the training split's positive rate is only a few percent, so
# accuracy alone is not very informative; read it together with F1 and the
# confusion matrix.
train_acc_dt = accuracy_score(y_train, y_train_pred_dt)
test_acc_dt = accuracy_score(y_test, y_test_pred_dt)
test_f1_dt = f1_score(y_test, y_test_pred_dt)
cm_dt = confusion_matrix(y_test, y_test_pred_dt)

print("Decision Tree Results:")
print(f"  Train Accuracy: {train_acc_dt:.4f}")
print(f"  Test Accuracy: {test_acc_dt:.4f}")
print(f"  Test F1-Score: {test_f1_dt:.4f}")
print(f"\nTest Confusion Matrix:")
print(cm_dt)

# Importances follow the column order of the scaled matrix, which was built
# as continuous features followed by binary features.
feature_names = continuous_features + binary_features
importances_dt = dt_model.feature_importances_

# Keep only features the tree actually used, ordered from most to least
# important.
non_zero_idx = np.flatnonzero(importances_dt > 0)
descending = np.argsort(importances_dt[non_zero_idx])[::-1]
sorted_idx = non_zero_idx[descending]

# NOTE(review): if months_since_last_layoff dominates this list, confirm it is
# derived from prior months only and does not encode the current target.
print(f"\nFeature Importances (non-zero only):")
for idx in sorted_idx:
    name, weight = feature_names[idx], importances_dt[idx]
    print(f"  {name}: {weight:.6f}")

Decision Tree Results:
  Train Accuracy: 0.9848
  Test Accuracy: 0.9536
  Test F1-Score: 0.3462

Test Confusion Matrix:
[[2068    0]
 [ 102   27]]

Feature Importances (non-zero only):
  months_since_last_layoff: 0.765187
  Latest_Funds_Raised_Log: 0.134756
  avg_jobless_claims_lag1: 0.018276
  current_ratio_lag1: 0.012266
  fed_funds_rate_lag1: 0.012053
  debt_to_equity_lag1: 0.011560
  roa_lag1: 0.008212
  stockholders_equity_growth_yoy: 0.006820
  sp500_change_6mo_lag1: 0.006156
  debt_to_assets_change_yoy: 0.004962
  roe_lag1: 0.004753
  Industry_Hardware: 0.004157
  current_assets_growth_yoy: 0.004090
  current_liabilities_growth_yoy: 0.002346
  operating_income_growth_yoy: 0.002315
  rd_to_assets_lag1: 0.001165
  inflation_rate_yoy_lag1: 0.000476
  Industry_Infrastructure: 0.000307
  unemployment_income_interaction: 0.000143
