In [None]:
### import packages ###

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import lightgbm as lgb

In [None]:
### Load the data ###

# Remote Stata (.dta) file hosted on GitHub
url = "https://github.com/pgeertsema/AIForCorporateFinance/raw/refs/heads/main/jkp_seo.dta"

# Read the file and take a copy right away; the copy is there to reduce
# fragmentation of the frame that read_stata() returns
df = pd.read_stata(url).copy()

In [None]:
### Preprocess the data ###

# Name of the column the model is trained to predict
target = 'seo_next_yr'

# Binarise the target: strictly positive values become 1, everything else
# becomes 0 (a missing value never compares as > 0, so it also becomes 0)
df[target] = (df[target] > 0).astype(int)

# Year component of the datetime column 'year', stored as nyear
df['nyear'] = df['year'].dt.year

# ff49 is stored with a categorical dtype
df['ff49'] = df['ff49'].astype('category')

# Feature list: every column positioned from "bidask" through "debt_at",
# both endpoints included (label-based slicing is inclusive)
features = list(df.loc[:, 'bidask':'debt_at'].columns)


In [None]:
### Split data into train/val/test ###

# Chronological split keyed on nyear:
#   test  = the last 5 years in the data
#   val   = the 3 years immediately before the test window
#   train = every earlier year
max_year = df['nyear'].max()
test_years = max_year - 4   # first year that belongs to the test window
val_years = max_year - 7    # first year that belongs to the validation window

year_of_row = df['nyear']
test_data  = df.loc[year_of_row >= test_years]
val_data   = df.loc[year_of_row.between(val_years, test_years, inclusive='left')]
train_data = df.loc[year_of_row < val_years]

# Feature matrix and target vector for each split
X_train = train_data[features]
y_train = train_data[target]

X_val = val_data[features]
y_val = val_data[target]

X_test = test_data[features]
y_test = test_data[target]


In [None]:
###  Train LightGBM Classifier ###

# Wrap the training and validation splits as LightGBM Datasets; the
# validation Dataset is built with the training Dataset as its reference
train_set = lgb.Dataset(X_train, label=y_train)
val_set = lgb.Dataset(X_val, label=y_val, reference=train_set)

# Model specification: binary objective, gradient-boosted decision trees,
# classification error as the evaluation metric, fixed seed
# NOTE(review): with a rare positive class, binary_error can sit at the
# majority-class rate for many rounds; consider monitoring 'auc' or
# 'binary_logloss' for early stopping - confirm intent.
params = dict(
    objective='binary',
    metric='binary_error',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.1,
    seed=42,
)

# Stop once the validation score has not improved for 50 rounds, and log the
# evaluation results every round
training_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    lgb.log_evaluation(1),
]

# Train for at most 1000 boosting rounds, evaluating on both the training
# and the validation set
model = lgb.train(
    params,
    train_set,
    num_boost_round=1000,
    valid_sets=[train_set, val_set],
    callbacks=training_callbacks,
)

[LightGBM] [Info] Number of positive: 1153, number of negative: 9678
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29119
[LightGBM] [Info] Number of data points in the train set: 10831, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.106454 -> initscore=-2.127488
[LightGBM] [Info] Start training from score -2.127488
[1]	training's binary_error: 0.106454	valid_1's binary_error: 0.0996252
Training until validation scores don't improve for 50 rounds
[2]	training's binary_error: 0.106454	valid_1's binary_error: 0.0996252
[3]	training's binary_error: 0.106454	valid_1's binary_error: 0.0996252
[4]	training's binary_error: 0.106454	valid_1's binary_error: 0.0996252
[5]	training's binary_error: 0.106269	valid_1's binary_error: 0.0996252
[6]	training's binary_error: 0.105438	valid_1's binary_error: 0.0996252
[7]	training'

In [None]:
### Evaluate the model on the test set ###

# Predicted probabilities from the model's best iteration, turned into 0/1
# labels with a 0.5 cut-off
y_pred_prob = model.predict(X_test, num_iteration=model.best_iteration)
y_pred = (y_pred_prob > 0.5).astype(int)

# Test-set accuracy and confusion matrix
test_accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Gain-based feature importance, largest first
importance = model.feature_importance(importance_type='gain')
feature_names = X_train.columns
feature_importance = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importance}
).sort_values(by='Importance', ascending=False)

# Share of positive targets in the test set, reported next to the accuracy
positive_share = y_test.mean()

# Output results
print("Test Accuracy:", test_accuracy)
print("\nConfusion Matrix:")
print(conf_matrix)

print("Fraction of target that is positive (SEO happened):", positive_share)
print("Fraction of target that is negative (SEO did not happen):", 1 - positive_share)

# Only the 20 most important features are shown
print("\nFeature Importance:")
print(feature_importance.head(20))

Test Accuracy: 0.8926489637305699

Confusion Matrix:
[[5482   24]
 [ 639   31]]
Fraction of target that is positive (SEO happened): 0.10848445595854922
Fraction of target that is negative (SEO did not happen): 0.8915155440414508

Feature Importance:
          Feature   Importance
7      chcsho_12m  1824.975616
1            ff49   613.058296
10        ret_3_1   526.613196
35     eqnetis_at   442.197598
6       div12m_me   295.240095
16       sale_gr1   269.575799
32          gp_at   268.328805
8       eqnpo_12m   257.016201
40        opex_at   256.353605
107      fincf_at   209.998903
111      eqnpo_at   180.328502
115       debt_at   177.145798
42   sale_emp_gr1   149.202299
11        ret_6_1   148.207598
41        cash_at   147.824899
106         ni_at   141.560498
14      ret_60_12   117.190500
2          dolvol   116.792100
62        sga_gr1   104.387801
114        nwc_at    93.626401


In [None]:
### Report actual SEO fractions by buckets sorted on predicted probabilities (quintiles) ###

# Pair every test-set prediction with its realised outcome. y_test.values
# drops the original index so both columns line up positionally.
test_results = pd.DataFrame({
    'predicted_prob': y_pred_prob,
    'actual_target': y_test.values
})

# Create quintiles based on predicted probabilities
# (labels=False gives integer bucket codes 0..4, lowest probabilities first)
test_results['quintile'] = pd.qcut(test_results['predicted_prob'], q=5, labels=False)

# Summarise each quintile with built-in group aggregations rather than
# groupby.apply + pd.Series: the row count stays an integer (the apply version
# coerced it to float, e.g. 1236.0) and no `include_groups` argument is needed.
outcomes_by_quintile = test_results.groupby('quintile')['actual_target']
quintile_summary = pd.DataFrame({
    'Count': outcomes_by_quintile.size(),
    'Target == 1 Percentage (%)': outcomes_by_quintile.mean() * 100
})

# Display the results
print("\nPercentage of target == 1 in each quintile:")
print(quintile_summary)



Percentage of target == 1 in each quintile:
           Count  Target == 1 Percentage (%)
quintile                                    
0         1236.0                    2.508091
1         1235.0                    3.562753
2         1235.0                    7.125506
3         1235.0                   11.174089
4         1235.0                   29.878543
