# Structure

median imputation
- 1. no column creation
    - 1.1. raw data
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
    - 1.2. ADASYN imputation
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
- 2. yes column creation
    - 2.1. raw data
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
    - 2.2. ADASYN imputation
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network

## Load the data and the packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from tensorflow import keras
from tensorflow.keras import layers
from imblearn.over_sampling import ADASYN

In [2]:
df = pd.read_csv('../../database/2016-2022.csv')

In [3]:
binary = ['unequal_voting','classified_board_system','poison_pill','operating_margin_below_3y_average']
non_ratio_variables = [
    "capex",
    "net_capex",
    "short_term_wc",
    "long_term_wc",
    "modified_wc",
    "ebitda",
    "ebit",
    "net_income",
    "net_debt",
    "ev",
    "repurchase",
    "board_size",
    "net_repurchase",
    "total_compensation_to_executives",
    "total_compensation_to_board_members",
    "dividend_to_common",
    "dividend_to_preferred"
]

df['ev_ebitda'] = np.where((df['ev'] != 0) & (df['ebitda'] != 0), df['ev'] / df['ebitda'], np.nan)
df['ev_ebit'] = np.where((df['ev'] != 0) & (df['ebit'] != 0), df['ev'] / df['ebit'], np.nan)

ratio_variables = [
    "ebitda_margin",
    "operating_margin",
    "sales_to_total_assets",
    "roe",
    "normalized_roe",
    "operating_roe",
    "operating_roic",
    "eps_adjusted_diluted",
    "ev_to_sales",
    "tobin_q_ratio",
    "pb_ratio",
    "pe_ratio",
    "fcf_to_equity",
    "sales_growth_rate",
    "dividend_per_share",
    "dividend_payout_ratio",
    "asset_to_equity",
    "cash_conversion_cycle",
    "ev_ebitda",
    "ev_ebit",
]

technical_variables = [
    "free_float_percentage",
    "rsi_14d",
    "rsi_30d",
    "volatility_30d",
    "volatility_90d",
    "volatility_180d",
    "volume_30d_average_to_outstanding",
    "insider_shares_percentage",
    "institution_ownership_percentage",
    "ceo_tenure",
    "total_return_5y",
    "total_return_4y",
    "total_return_3y",
    "total_return_2y",
    "total_return_1y",
    "total_return_6m",
    "total_return_3m",
    "employee_growth_rate",
    "fcf_yield"
]

supportive = ["bic_level_2","bic_level_3","market_cap"]
factors = binary + non_ratio_variables + ratio_variables + technical_variables

df["bic_level_2"] = df["bic_level_2"].astype('category')
df["bic_level_3"] = df["bic_level_3"].astype('category')


## 1. No column creation

### 1.1. raw data

In [4]:
medians_by_year = df.groupby('year')[factors].transform('median')
df[factors] = df[factors].fillna(medians_by_year)

In [5]:
# Data Split
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Cross-validation setup
n_splits = 5

In [6]:
# Assuming df is your DataFrame and factors is your list of columns
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors].values
y_test = test_data['targeted'].values

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result1 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result1)




      Model  Train AUC  Test AUC
0        LR   0.465153  0.468358
1        RF   0.630798  0.574929
2       XGB   0.647650  0.672888
3      LGBM   0.656241  0.633316
4  CatBoost   0.659881  0.636113
5        NN   0.921570  0.610119


### 1.2. oversampling with ADASYN

In [7]:
# Split data
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Apply ADASYN oversampling
adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)


In [8]:
# Assuming df is your DataFrame and factors is your list of columns
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result2 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result2)

      Model  Train AUC  Test AUC
0        LR   0.465153  0.468358
1        RF   0.624648  0.562094
2       XGB   0.647650  0.672888
3      LGBM   0.656241  0.633316
4  CatBoost   0.659881  0.636113
5        NN   0.929599  0.604023


### 1.3. oversampling with GAN

In [9]:
import tensorflow as tf
from tensorflow import keras

def build_generator(input_dim):
    model = keras.models.Sequential([
        keras.layers.Dense(128, activation='relu', input_dim=input_dim),
        keras.layers.Dense(input_dim, activation='sigmoid')
    ])
    return model

def build_discriminator(input_dim):
    model = keras.models.Sequential([
        keras.layers.Dense(128, activation='relu', input_dim=input_dim),
        keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

def build_gan(generator, discriminator):
    discriminator.trainable = False
    model = keras.models.Sequential([generator, discriminator])
    return model

def train_gan(X_train, epochs, batch_size=128):
    real = np.ones((batch_size, 1))
    fake = np.zeros((batch_size, 1))
    
    for epoch in range(epochs):
        # Train Discriminator
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        real_samples = X_train[idx]
        
        noise = np.random.normal(0, 1, (batch_size, input_dim))
        generated_samples = generator.predict(noise)
        
        d_loss_real = discriminator.train_on_batch(real_samples, real)
        d_loss_fake = discriminator.train_on_batch(generated_samples, fake)
        d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
        
        # Train Generator
        noise = np.random.normal(0, 1, (batch_size, input_dim))
        g_loss = gan.train_on_batch(noise, real)
        
        # Print the progress (optional)
        if epoch % 100 == 0:
            print(f"{epoch}/{epochs} [D loss: {d_loss[0]} | D accuracy: {100 * d_loss[1]}] [G loss: {g_loss[0]} | G accuracy: {100 * g_loss[1]}]")


In [10]:
# Assuming df is your DataFrame and factors is your list of columns
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors].values
y_train = train_data['targeted'].values

X_test = test_data[factors].values
y_test = test_data['targeted'].values

# Set input dimensions
input_dim = X_train.shape[1]

# Build and compile the discriminator
discriminator = build_discriminator(input_dim)
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Build and compile the gan
generator = build_generator(input_dim)
gan = build_gan(generator, discriminator)
gan.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Extract minority samples
minority_samples = X_train[y_train == 1]

# Train GAN on minority class
train_gan(minority_samples, epochs=5000)

# Generate synthetic samples
num_synthetic_samples = X_train[y_train == 0].shape[0] - minority_samples.shape[0]
if num_synthetic_samples > 0:
    noise = np.random.normal(0, 1, (num_synthetic_samples, input_dim))
    synthetic_samples = generator.predict(noise)

    # Add synthetic samples to the training data
    X_train = np.concatenate((X_train, synthetic_samples))
    y_train = np.concatenate((y_train, np.ones(num_synthetic_samples)))


0/5000 [D loss: 1811235.1116535068 | D accuracy: 57.8125] [G loss: 0.6939365863800049 | G accuracy: 53.90625]
100/5000 [D loss: 0.19237764179706573 | D accuracy: 100.0] [G loss: 1.1441360712051392 | G accuracy: 0.0]
200/5000 [D loss: 0.11597306281328201 | D accuracy: 100.0] [G loss: 1.5823863744735718 | G accuracy: 0.0]
300/5000 [D loss: 0.07409785687923431 | D accuracy: 100.0] [G loss: 1.9832196235656738 | G accuracy: 0.0]


400/5000 [D loss: 0.04991234093904495 | D accuracy: 100.0] [G loss: 2.35483455657959 | G accuracy: 0.0]
500/5000 [D loss: 0.035297028720378876 | D accuracy: 100.0] [G loss: 2.688007354736328 | G accuracy: 0.0]
600/5000 [D loss: 0.025948524475097656 | D accuracy: 100.0] [G loss: 2.9855308532714844 | G accuracy: 0.0]


700/5000 [D loss: 0.01985764503479004 | D accuracy: 100.0] [G loss: 3.2470977306365967 | G accuracy: 0.0]
800/5000 [D loss: 0.015677016228437424 | D accuracy: 100.0] [G loss: 3.479264736175537 | G accuracy: 0.0]
900/5000 [D loss: 0.012708839029073715 | D accuracy: 100.0] [G loss: 3.686002492904663 | G accuracy: 0.0]


1000/5000 [D loss: 0.010533653199672699 | D accuracy: 100.0] [G loss: 3.871366500854492 | G accuracy: 0.0]
1100/5000 [D loss: 0.008845346979796886 | D accuracy: 100.0] [G loss: 4.044560432434082 | G accuracy: 0.0]
1200/5000 [D loss: 0.008079210296273232 | D accuracy: 100.0] [G loss: 4.1347198486328125 | G accuracy: 0.0]


1300/5000 [D loss: 0.006722894497215748 | D accuracy: 100.0] [G loss: 4.316845893859863 | G accuracy: 0.0]
1400/5000 [D loss: 0.005689561367034912 | D accuracy: 100.0] [G loss: 4.4825849533081055 | G accuracy: 0.0]
1500/5000 [D loss: 0.004868496209383011 | D accuracy: 100.0] [G loss: 4.637351989746094 | G accuracy: 0.0]


1600/5000 [D loss: 0.004198482725769281 | D accuracy: 100.0] [G loss: 4.784913063049316 | G accuracy: 0.0]
1700/5000 [D loss: 0.0036428808234632015 | D accuracy: 100.0] [G loss: 4.926113128662109 | G accuracy: 0.0]
1800/5000 [D loss: 0.0031780051067471504 | D accuracy: 100.0] [G loss: 5.062216758728027 | G accuracy: 0.0]


1900/5000 [D loss: 0.002785137854516506 | D accuracy: 100.0] [G loss: 5.193727493286133 | G accuracy: 0.0]
2000/5000 [D loss: 0.002450441475957632 | D accuracy: 100.0] [G loss: 5.321407794952393 | G accuracy: 0.0]
2100/5000 [D loss: 0.0021633836440742016 | D accuracy: 100.0] [G loss: 5.4458160400390625 | G accuracy: 0.0]


2200/5000 [D loss: 0.0019153981702402234 | D accuracy: 100.0] [G loss: 5.5672454833984375 | G accuracy: 0.0]
2300/5000 [D loss: 0.0017003254033625126 | D accuracy: 100.0] [G loss: 5.686153411865234 | G accuracy: 0.0]
2400/5000 [D loss: 0.0015127169899642467 | D accuracy: 100.0] [G loss: 5.802807331085205 | G accuracy: 0.0]


2500/5000 [D loss: 0.0013485481031239033 | D accuracy: 100.0] [G loss: 5.91754150390625 | G accuracy: 0.0]
2600/5000 [D loss: 0.0012042283779010177 | D accuracy: 100.0] [G loss: 6.03048038482666 | G accuracy: 0.0]
2700/5000 [D loss: 0.0010770675726234913 | D accuracy: 100.0] [G loss: 6.141952991485596 | G accuracy: 0.0]


2800/5000 [D loss: 0.0009646481485106051 | D accuracy: 100.0] [G loss: 6.252091407775879 | G accuracy: 0.0]
2900/5000 [D loss: 0.0008650519885122776 | D accuracy: 100.0] [G loss: 6.361005783081055 | G accuracy: 0.0]
3000/5000 [D loss: 0.0007765889749862254 | D accuracy: 100.0] [G loss: 6.46879768371582 | G accuracy: 0.0]


3100/5000 [D loss: 0.0006978348246775568 | D accuracy: 100.0] [G loss: 6.575679779052734 | G accuracy: 0.0]
3200/5000 [D loss: 0.0006275734631344676 | D accuracy: 100.0] [G loss: 6.681670188903809 | G accuracy: 0.0]
3300/5000 [D loss: 0.0005648631486110389 | D accuracy: 100.0] [G loss: 6.786922454833984 | G accuracy: 0.0]


3400/5000 [D loss: 0.00050875055603683 | D accuracy: 100.0] [G loss: 6.891481399536133 | G accuracy: 0.0]
3500/5000 [D loss: 0.00045850526657886803 | D accuracy: 100.0] [G loss: 6.995428085327148 | G accuracy: 0.0]
3600/5000 [D loss: 0.00041344729834236205 | D accuracy: 100.0] [G loss: 7.098818778991699 | G accuracy: 0.0]


3700/5000 [D loss: 0.00037300196709111333 | D accuracy: 100.0] [G loss: 7.201702117919922 | G accuracy: 0.0]
3800/5000 [D loss: 0.0003366655146237463 | D accuracy: 100.0] [G loss: 7.3041582107543945 | G accuracy: 0.0]
3900/5000 [D loss: 0.00030399500974453986 | D accuracy: 100.0] [G loss: 7.406171798706055 | G accuracy: 0.0]


4000/5000 [D loss: 0.0002745950478129089 | D accuracy: 100.0] [G loss: 7.507887840270996 | G accuracy: 0.0]
4100/5000 [D loss: 0.000248111377004534 | D accuracy: 100.0] [G loss: 7.609277725219727 | G accuracy: 0.0]
4200/5000 [D loss: 0.00022425771749112755 | D accuracy: 100.0] [G loss: 7.710300445556641 | G accuracy: 0.0]


4300/5000 [D loss: 0.00020274989947210997 | D accuracy: 100.0] [G loss: 7.8111186027526855 | G accuracy: 0.0]
4400/5000 [D loss: 0.00018334691412746906 | D accuracy: 100.0] [G loss: 7.911691188812256 | G accuracy: 0.0]
4500/5000 [D loss: 0.00016583112301304936 | D accuracy: 100.0] [G loss: 8.012054443359375 | G accuracy: 0.0]


4600/5000 [D loss: 0.00015002797590568662 | D accuracy: 100.0] [G loss: 8.11221981048584 | G accuracy: 0.0]
4700/5000 [D loss: 0.00013575004413723946 | D accuracy: 100.0] [G loss: 8.21221923828125 | G accuracy: 0.0]
4800/5000 [D loss: 0.0001228500623255968 | D accuracy: 100.0] [G loss: 8.312053680419922 | G accuracy: 0.0]


4900/5000 [D loss: 0.00011119280679849908 | D accuracy: 100.0] [G loss: 8.411733627319336 | G accuracy: 0.0]


In [11]:
# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result2 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result2)

      Model  Train AUC  Test AUC
0        LR   0.979963  0.468193
1        RF   0.987332  0.603945
2       XGB   0.988157  0.631210
3      LGBM   0.988875  0.654185
4  CatBoost   0.987900  0.647191
5        NN   0.999055  0.586252


## 2.1 with column creation

In [27]:
df = pd.read_csv('../../database/2016-2022.csv')
df['ev_ebitda'] = np.where((df['ev'] != 0) & (df['ebitda'] != 0), df['ev'] / df['ebitda'], np.nan)
df['ev_ebit'] = np.where((df['ev'] != 0) & (df['ebit'] != 0), df['ev'] / df['ebit'], np.nan)

In [28]:
for col in non_ratio_variables:
    
    # 1. _percentile
    percentile_col = col + '_percentile'
    df[percentile_col] = df.groupby('year')[col].transform(lambda x: x.rank(pct=True) * 100)
    df[percentile_col].fillna(50, inplace=True)
    
    # 2. _10bins_percentile
    df['market_cap_bins'] = df.groupby('year')['market_cap'].transform(lambda x: pd.cut(x, bins=10))
    percentile_10bins_col = col + '_10bins_percentile'
    df[percentile_10bins_col] = df.groupby(['year', 'market_cap_bins'])[col].transform(lambda x: x.rank(pct=True) * 100)
    df[percentile_10bins_col].fillna(50, inplace=True)
    df.drop('market_cap_bins', axis=1, inplace=True)

    # 3. _10bins_normalized
    df['market_cap_bins'] = df.groupby('year')['market_cap'].transform(lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
    normalized_col = col + '_10bins_normalized'
    df[normalized_col] = df.groupby(['year', 'market_cap_bins'])[col].transform(lambda x: (x - x.mean()) / x.std())
    df[normalized_col].fillna(0, inplace=True)
    df.drop('market_cap_bins', axis=1, inplace=True)
    
    # 4. _div_market_cap
    div_market_cap_col = col + '_div_market_cap'
    df[div_market_cap_col] = df[col] / df['market_cap']
    
    # 5. _div_log_market_cap
    df['log_market_cap'] = np.log(df['market_cap'])
    div_log_market_cap_col = col + '_div_log_market_cap'
    df[div_log_market_cap_col] = df[col] / df['log_market_cap']
    
    for new_col in [div_market_cap_col, div_log_market_cap_col]:
        median_values = df.groupby('year')[new_col].transform('median')
        df[new_col].fillna(median_values, inplace=True)
    df.drop('log_market_cap', axis=1, inplace=True)

In [29]:
def compute_percentile(group):
    if len(group) < 10:
        return pd.Series([None] * len(group), index=group.index, dtype=float)
    return group.rank(pct=True) * 100

def normalize(group):
    if len(group) < 10:
        return pd.Series([None] * len(group), index=group.index, dtype=float)
    return (group - group.mean()) / group.std()

for col in ratio_variables:
    percentile_col = col + '_industry_peers_percentile'
    df[percentile_col] = df.groupby(['year', 'bic_level_3'])[col].transform(compute_percentile)
    mask = df[percentile_col].isna()
    df.loc[mask, percentile_col] = df[mask].groupby(['year', 'bic_level_2'])[col].transform(compute_percentile)
    df[percentile_col].fillna(50, inplace=True)
    df[percentile_col] = df[percentile_col].astype(float)
    normalized_col = col + '_industry_peers_normalized'
    df[normalized_col] = df.groupby(['year', 'bic_level_3'])[col].transform(normalize)
    mask = df[normalized_col].isna()
    df.loc[mask, normalized_col] = df[mask].groupby(['year', 'bic_level_2'])[col].transform(normalize)
    df[normalized_col].fillna(0, inplace=True)
    df[normalized_col] = df[normalized_col].astype(float)
    df[col].fillna(df.groupby('year')[col].transform('median'), inplace=True)

In [30]:
factors = []
for col in non_ratio_variables:
    factors.extend([
        col,
        f'{col}_percentile',
        f'{col}_10bins_percentile',
        f'{col}_10bins_normalized',
        f'{col}_div_market_cap',
        f'{col}_div_log_market_cap'
    ])

for col in ratio_variables:
    factors.extend([
        col,
        f'{col}_industry_peers_percentile',
        f'{col}_industry_peers_normalized'
    ])

factors = factors + binary + technical_variables
medians_by_year = df.groupby('year')[factors].transform('median')
df[factors] = df[factors].fillna(medians_by_year)

### 2.1. raw data

In [13]:
# Data Split
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Cross-validation setup
n_splits = 5

In [14]:
# Assuming df is your DataFrame and factors is your list of columns
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result3 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result3)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

      Model  Train AUC  Test AUC
0        LR   0.512744  0.500827
1        RF   0.620995  0.552784
2       XGB   0.638421  0.644899
3      LGBM   0.659770  0.671237
4  CatBoost   0.641320  0.614170
5        NN   0.942795  0.637870


### 2.2. oversampling the ADASYN

In [15]:
# Split data
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Apply ADASYN oversampling
adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)

# Cross-validation setup
n_splits = 5

In [16]:
# Assuming df is your DataFrame and factors is your list of columns
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result4 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result4)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

      Model  Train AUC  Test AUC
0        LR   0.512744  0.500827
1        RF   0.616958  0.582753
2       XGB   0.638421  0.644899
3      LGBM   0.659770  0.671237
4  CatBoost   0.641320  0.614170
5        NN   0.942475  0.649155


### 2.3. Oversampling with GAN

In [31]:
# Assuming df is your DataFrame and factors is your list of columns
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors].values
y_train = train_data['targeted'].values

X_test = test_data[factors].values
y_test = test_data['targeted'].values

# Set input dimensions
input_dim = X_train.shape[1]

# Build and compile the discriminator
discriminator = build_discriminator(input_dim)
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Build and compile the gan
generator = build_generator(input_dim)
gan = build_gan(generator, discriminator)
gan.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Extract minority samples
minority_samples = X_train[y_train == 1]

# Train GAN on minority class
train_gan(minority_samples, epochs=5000)

# Generate synthetic samples
num_synthetic_samples = X_train[y_train == 0].shape[0] - minority_samples.shape[0]
if num_synthetic_samples > 0:
    noise = np.random.normal(0, 1, (num_synthetic_samples, input_dim))
    synthetic_samples = generator.predict(noise)

    # Add synthetic samples to the training data
    X_train = np.concatenate((X_train, synthetic_samples))
    y_train = np.concatenate((y_train, np.ones(num_synthetic_samples)))


0/5000 [D loss: 1696287.5988501608 | D accuracy: 69.140625] [G loss: 0.7657411694526672 | G accuracy: 18.75]
100/5000 [D loss: 0.06961598247289658 | D accuracy: 100.0] [G loss: 2.0659985542297363 | G accuracy: 0.0]
200/5000 [D loss: 0.10171013325452805 | D accuracy: 100.0] [G loss: 1.7020870447158813 | G accuracy: 0.0]
300/5000 [D loss: 0.052814677357673645 | D accuracy: 100.0] [G loss: 2.3038339614868164 | G accuracy: 0.0]


400/5000 [D loss: 0.03044985421001911 | D accuracy: 100.0] [G loss: 2.832367420196533 | G accuracy: 0.0]
500/5000 [D loss: 0.02395021915435791 | D accuracy: 100.0] [G loss: 3.065704822540283 | G accuracy: 0.0]
600/5000 [D loss: 0.019847091287374496 | D accuracy: 100.0] [G loss: 3.244765043258667 | G accuracy: 0.0]


700/5000 [D loss: 0.01746988296508789 | D accuracy: 100.0] [G loss: 3.374666929244995 | G accuracy: 0.0]
800/5000 [D loss: 0.012682788074016571 | D accuracy: 100.0] [G loss: 3.688483476638794 | G accuracy: 0.0]
900/5000 [D loss: 0.01035401877015829 | D accuracy: 100.0] [G loss: 3.890583038330078 | G accuracy: 0.0]


1000/5000 [D loss: 0.007141088135540485 | D accuracy: 100.0] [G loss: 4.257646560668945 | G accuracy: 0.0]
1100/5000 [D loss: 0.005495513789355755 | D accuracy: 100.0] [G loss: 4.517376899719238 | G accuracy: 0.0]
1200/5000 [D loss: 0.00422805268317461 | D accuracy: 100.0] [G loss: 4.778295040130615 | G accuracy: 0.0]


1300/5000 [D loss: 0.0033484091982245445 | D accuracy: 100.0] [G loss: 5.010605335235596 | G accuracy: 0.0]
1400/5000 [D loss: 0.0027214086148887873 | D accuracy: 100.0] [G loss: 5.217206001281738 | G accuracy: 0.0]
1500/5000 [D loss: 0.0022563710808753967 | D accuracy: 100.0] [G loss: 5.403711795806885 | G accuracy: 0.0]


1600/5000 [D loss: 0.0018998015439137816 | D accuracy: 100.0] [G loss: 5.575936794281006 | G accuracy: 0.0]
1700/5000 [D loss: 0.0016182480612769723 | D accuracy: 100.0] [G loss: 5.735635757446289 | G accuracy: 0.0]
1800/5000 [D loss: 0.0013917715987190604 | D accuracy: 100.0] [G loss: 5.886181831359863 | G accuracy: 0.0]


1900/5000 [D loss: 0.0012063877657055855 | D accuracy: 100.0] [G loss: 6.028822898864746 | G accuracy: 0.0]
2000/5000 [D loss: 0.0010076407343149185 | D accuracy: 100.0] [G loss: 6.205407619476318 | G accuracy: 0.0]
2100/5000 [D loss: 0.0008065004949457943 | D accuracy: 100.0] [G loss: 6.431673049926758 | G accuracy: 0.0]


2200/5000 [D loss: 0.0006577264866791666 | D accuracy: 100.0] [G loss: 6.63521146774292 | G accuracy: 0.0]
2300/5000 [D loss: 0.0005550427595153451 | D accuracy: 100.0] [G loss: 6.805204391479492 | G accuracy: 0.0]
2400/5000 [D loss: 0.0005124448798596859 | D accuracy: 100.0] [G loss: 6.8856024742126465 | G accuracy: 0.0]


2500/5000 [D loss: 0.00037549249827861786 | D accuracy: 100.0] [G loss: 7.195940971374512 | G accuracy: 0.0]
2600/5000 [D loss: 0.0003507666988298297 | D accuracy: 100.0] [G loss: 7.26420783996582 | G accuracy: 0.0]
2700/5000 [D loss: 0.0004238366964273155 | D accuracy: 100.0] [G loss: 7.076435089111328 | G accuracy: 0.0]


2800/5000 [D loss: 0.0005153962410986423 | D accuracy: 100.0] [G loss: 6.883986473083496 | G accuracy: 0.0]
2900/5000 [D loss: 0.0004741399025078863 | D accuracy: 100.0] [G loss: 6.973051071166992 | G accuracy: 0.0]
3000/5000 [D loss: 0.00023925180721562356 | D accuracy: 100.0] [G loss: 7.65143346786499 | G accuracy: 0.0]


3100/5000 [D loss: 0.0001398955355398357 | D accuracy: 100.0] [G loss: 8.185919761657715 | G accuracy: 0.0]
3200/5000 [D loss: 0.00010069659037981182 | D accuracy: 100.0] [G loss: 8.514570236206055 | G accuracy: 0.0]
3300/5000 [D loss: 0.00012074322148691863 | D accuracy: 100.0] [G loss: 8.335569381713867 | G accuracy: 0.0]


3400/5000 [D loss: 7.044695666991174e-05 | D accuracy: 100.0] [G loss: 8.872766494750977 | G accuracy: 0.0]
3500/5000 [D loss: 4.253226506989449e-05 | D accuracy: 100.0] [G loss: 9.37598991394043 | G accuracy: 0.0]
3600/5000 [D loss: 3.0722807423444465e-05 | D accuracy: 100.0] [G loss: 9.700777053833008 | G accuracy: 0.0]


3700/5000 [D loss: 2.0997544197598472e-05 | D accuracy: 100.0] [G loss: 10.082195281982422 | G accuracy: 0.0]
3800/5000 [D loss: 1.5564024579362012e-05 | D accuracy: 100.0] [G loss: 10.37952995300293 | G accuracy: 0.0]
3900/5000 [D loss: 2.7937312552239746e-05 | D accuracy: 100.0] [G loss: 9.797554969787598 | G accuracy: 0.0]


4000/5000 [D loss: 1.4988251678005327e-05 | D accuracy: 100.0] [G loss: 10.418197631835938 | G accuracy: 0.0]
4100/5000 [D loss: 1.3397515431279317e-05 | D accuracy: 100.0] [G loss: 10.544828414916992 | G accuracy: 0.0]
4200/5000 [D loss: 1.2191231689939741e-05 | D accuracy: 100.0] [G loss: 10.624893188476562 | G accuracy: 0.0]


4300/5000 [D loss: 1.4878849469823763e-05 | D accuracy: 100.0] [G loss: 10.471817970275879 | G accuracy: 0.0]
4400/5000 [D loss: 1.178726142825326e-05 | D accuracy: 100.0] [G loss: 10.659125328063965 | G accuracy: 0.0]
4500/5000 [D loss: 1.3112798114889301e-05 | D accuracy: 100.0] [G loss: 10.552755355834961 | G accuracy: 0.0]


4600/5000 [D loss: 9.308497283200268e-06 | D accuracy: 100.0] [G loss: 10.894407272338867 | G accuracy: 0.0]
4700/5000 [D loss: 1.0368255061621312e-05 | D accuracy: 100.0] [G loss: 10.78783130645752 | G accuracy: 0.0]
4800/5000 [D loss: 7.642263881280087e-06 | D accuracy: 100.0] [G loss: 11.092445373535156 | G accuracy: 0.0]


4900/5000 [D loss: 9.992850209528115e-06 | D accuracy: 100.0] [G loss: 10.826326370239258 | G accuracy: 0.0]


In [32]:
# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result2 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


      Model  Train AUC  Test AUC
0        LR   0.978842  0.499816
1        RF   0.987630  0.600542
2       XGB   0.988000  0.625917
3      LGBM   0.989383  0.637757
4  CatBoost   0.988004  0.625380
5        NN   0.999906  0.584598
