# Structure

Imputation with kNN
- 1. no column creation
    - 1.1. raw data
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
    - 1.2. ADASYN imputation
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
- 2. yes column creation
    - 2.1. raw data
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
    - 2.2. ADASYN imputation
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network

## Load the data and the packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from tensorflow import keras
from tensorflow.keras import layers
from imblearn.over_sampling import ADASYN
from fancyimpute import KNN

(CVXPY) Oct 24 05:00:49 PM: Encountered unexpected exception importing solver CVXOPT:
ImportError("dlopen(/Users/minwukim/anaconda3/lib/python3.11/site-packages/cvxopt/base.cpython-311-darwin.so, 0x0002): Library not loaded: @rpath/liblapack.3.dylib\n  Referenced from: <E25E40AB-7857-39B9-8DE7-28B7B0E4806B> /Users/minwukim/anaconda3/lib/python3.11/site-packages/cvxopt/base.cpython-311-darwin.so\n  Reason: tried: '/Users/minwukim/anaconda3/lib/python3.11/site-packages/cvxopt/../../../liblapack.3.dylib' (no such file), '/Users/minwukim/anaconda3/lib/python3.11/site-packages/cvxopt/../../../liblapack.3.dylib' (no such file), '/Users/minwukim/anaconda3/bin/../lib/liblapack.3.dylib' (no such file), '/Users/minwukim/anaconda3/bin/../lib/liblapack.3.dylib' (no such file), '/usr/local/lib/liblapack.3.dylib' (no such file), '/usr/lib/liblapack.3.dylib' (no such file, not in dyld cache)")
(CVXPY) Oct 24 05:00:49 PM: Encountered unexpected exception importing solver GLPK:
ImportError("dlopen(/Use

In [2]:
df = pd.read_csv('../../database/2016-2022.csv')

In [3]:
binary = ['unequal_voting','classified_board_system','poison_pill','operating_margin_below_3y_average']
non_ratio_variables = [
    "capex",
    "net_capex",
    "short_term_wc",
    "long_term_wc",
    "modified_wc",
    "ebitda",
    "ebit",
    "net_income",
    "net_debt",
    "ev",
    "repurchase",
    "board_size",
    "net_repurchase",
    "total_compensation_to_executives",
    "total_compensation_to_board_members",
    "dividend_to_common",
    "dividend_to_preferred"
]

df['ev_ebitda'] = np.where((df['ev'] != 0) & (df['ebitda'] != 0), df['ev'] / df['ebitda'], np.nan)
df['ev_ebit'] = np.where((df['ev'] != 0) & (df['ebit'] != 0), df['ev'] / df['ebit'], np.nan)

ratio_variables = [
    "ebitda_margin",
    "operating_margin",
    "sales_to_total_assets",
    "roe",
    "normalized_roe",
    "operating_roe",
    "operating_roic",
    "eps_adjusted_diluted",
    "ev_to_sales",
    "tobin_q_ratio",
    "pb_ratio",
    "pe_ratio",
    "fcf_to_equity",
    "sales_growth_rate",
    "dividend_per_share",
    "dividend_payout_ratio",
    "asset_to_equity",
    "cash_conversion_cycle",
    "ev_ebitda",
    "ev_ebit",
]

technical_variables = [
    "free_float_percentage",
    "rsi_14d",
    "rsi_30d",
    "volatility_30d",
    "volatility_90d",
    "volatility_180d",
    "volume_30d_average_to_outstanding",
    "insider_shares_percentage",
    "institution_ownership_percentage",
    "ceo_tenure",
    "total_return_5y",
    "total_return_4y",
    "total_return_3y",
    "total_return_2y",
    "total_return_1y",
    "total_return_6m",
    "total_return_3m",
    "employee_growth_rate",
    "fcf_yield"
]

supportive = ["bic_level_2","bic_level_3","market_cap"]
factors = binary + non_ratio_variables + ratio_variables + technical_variables

df["bic_level_2"] = df["bic_level_2"].astype('category')
df["bic_level_3"] = df["bic_level_3"].astype('category')

# factors.append("targeted")

## 1. No column creation

### 1.1. raw data

In [4]:
knn_imputer = KNN(k=5)
df_imputed = pd.DataFrame(knn_imputer.fit_transform(df[factors]), columns=factors, index=df[factors].index)

Imputing row 1/18213 with 5 missing, elapsed time: 73.891
Imputing row 101/18213 with 2 missing, elapsed time: 73.965
Imputing row 201/18213 with 3 missing, elapsed time: 74.009
Imputing row 301/18213 with 4 missing, elapsed time: 74.045
Imputing row 401/18213 with 1 missing, elapsed time: 74.082
Imputing row 501/18213 with 0 missing, elapsed time: 74.140
Imputing row 601/18213 with 1 missing, elapsed time: 74.193
Imputing row 701/18213 with 6 missing, elapsed time: 74.231
Imputing row 801/18213 with 4 missing, elapsed time: 74.267
Imputing row 901/18213 with 1 missing, elapsed time: 74.306
Imputing row 1001/18213 with 3 missing, elapsed time: 74.355
Imputing row 1101/18213 with 2 missing, elapsed time: 74.404
Imputing row 1201/18213 with 3 missing, elapsed time: 74.449
Imputing row 1301/18213 with 6 missing, elapsed time: 74.512
Imputing row 1401/18213 with 0 missing, elapsed time: 74.549
Imputing row 1501/18213 with 15 missing, elapsed time: 74.584
Imputing row 1601/18213 with 6 miss

Imputing row 13601/18213 with 12 missing, elapsed time: 80.195
Imputing row 13701/18213 with 12 missing, elapsed time: 80.237
Imputing row 13801/18213 with 1 missing, elapsed time: 80.270
Imputing row 13901/18213 with 2 missing, elapsed time: 80.315
Imputing row 14001/18213 with 5 missing, elapsed time: 80.369
Imputing row 14101/18213 with 5 missing, elapsed time: 80.420
Imputing row 14201/18213 with 1 missing, elapsed time: 80.465
Imputing row 14301/18213 with 1 missing, elapsed time: 80.514
Imputing row 14401/18213 with 1 missing, elapsed time: 80.557
Imputing row 14501/18213 with 7 missing, elapsed time: 80.589
Imputing row 14601/18213 with 0 missing, elapsed time: 80.629
Imputing row 14701/18213 with 11 missing, elapsed time: 80.662
Imputing row 14801/18213 with 1 missing, elapsed time: 80.696
Imputing row 14901/18213 with 9 missing, elapsed time: 80.754
Imputing row 15001/18213 with 6 missing, elapsed time: 80.796
Imputing row 15101/18213 with 9 missing, elapsed time: 80.840
Imput

In [5]:
df = pd.concat([df_imputed, df[['year', 'targeted',"market_cap","bic_level_2","bic_level_3"]]], axis=1)

In [6]:
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result1 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result1)


      Model  Train AUC  Test AUC
0        LR   0.484504  0.478142
1        RF   0.609897  0.624728
2       XGB   0.638907  0.609764
3      LGBM   0.651705  0.619696
4  CatBoost   0.647292  0.627056
5        NN   0.924070  0.591044


### 1.2. oversampling with ADASYN

In [7]:
# Split data
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Apply ADASYN oversampling
adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)


In [8]:
# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result2 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result2)

      Model  Train AUC  Test AUC
0        LR   0.575810  0.513087
1        RF   0.999408  0.603214
2       XGB   0.997633  0.606136
3      LGBM   0.995711  0.639186
4  CatBoost   0.995480  0.602602
5        NN   0.998786  0.578502


## 2.1 with column creation

In [9]:
df = pd.read_csv('../../database/2016-2022.csv')
df['ev_ebitda'] = np.where((df['ev'] != 0) & (df['ebitda'] != 0), df['ev'] / df['ebitda'], np.nan)
df['ev_ebit'] = np.where((df['ev'] != 0) & (df['ebit'] != 0), df['ev'] / df['ebit'], np.nan)

In [10]:
for col in non_ratio_variables:
    
    # 1. _percentile
    percentile_col = col + '_percentile'
    df[percentile_col] = df.groupby('year')[col].transform(lambda x: x.rank(pct=True) * 100)
    df[percentile_col].fillna(50, inplace=True)
    
    # 2. _10bins_percentile
    df['market_cap_bins'] = df.groupby('year')['market_cap'].transform(lambda x: pd.cut(x, bins=10))
    percentile_10bins_col = col + '_10bins_percentile'
    df[percentile_10bins_col] = df.groupby(['year', 'market_cap_bins'])[col].transform(lambda x: x.rank(pct=True) * 100)
    df[percentile_10bins_col].fillna(50, inplace=True)
    df.drop('market_cap_bins', axis=1, inplace=True)

    # 3. _10bins_normalized
    df['market_cap_bins'] = df.groupby('year')['market_cap'].transform(lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
    normalized_col = col + '_10bins_normalized'
    df[normalized_col] = df.groupby(['year', 'market_cap_bins'])[col].transform(lambda x: (x - x.mean()) / x.std())
    df[normalized_col].fillna(0, inplace=True)
    df.drop('market_cap_bins', axis=1, inplace=True)
    
    # 4. _div_market_cap
    div_market_cap_col = col + '_div_market_cap'
    df[div_market_cap_col] = df[col] / df['market_cap']
    
    # 5. _div_log_market_cap
    df['log_market_cap'] = np.log(df['market_cap'])
    div_log_market_cap_col = col + '_div_log_market_cap'
    df[div_log_market_cap_col] = df[col] / df['log_market_cap']
    

In [11]:
def compute_percentile(group):
    if len(group) < 10:
        return pd.Series([None] * len(group), index=group.index, dtype=float)
    return group.rank(pct=True) * 100

def normalize(group):
    if len(group) < 10:
        return pd.Series([None] * len(group), index=group.index, dtype=float)
    return (group - group.mean()) / group.std()

for col in ratio_variables:
    percentile_col = col + '_industry_peers_percentile'
    df[percentile_col] = df.groupby(['year', 'bic_level_3'])[col].transform(compute_percentile)
    mask = df[percentile_col].isna()
    df.loc[mask, percentile_col] = df[mask].groupby(['year', 'bic_level_2'])[col].transform(compute_percentile)
    df[percentile_col].fillna(50, inplace=True)
    df[percentile_col] = df[percentile_col].astype(float)
    normalized_col = col + '_industry_peers_normalized'
    df[normalized_col] = df.groupby(['year', 'bic_level_3'])[col].transform(normalize)
    mask = df[normalized_col].isna()
    df.loc[mask, normalized_col] = df[mask].groupby(['year', 'bic_level_2'])[col].transform(normalize)
    df[normalized_col].fillna(0, inplace=True)
    df[normalized_col] = df[normalized_col].astype(float)


In [12]:
factors = []
for col in non_ratio_variables:
    factors.extend([
        col,
        f'{col}_percentile',
        f'{col}_10bins_percentile',
        f'{col}_10bins_normalized',
        f'{col}_div_market_cap',
        f'{col}_div_log_market_cap'
    ])

for col in ratio_variables:
    factors.extend([
        col,
        f'{col}_industry_peers_percentile',
        f'{col}_industry_peers_normalized'
    ])

factors = factors + binary + technical_variables


In [13]:
knn_imputer = KNN(k=5)
df_imputed = pd.DataFrame(knn_imputer.fit_transform(df[factors]), columns=factors, index=df[factors].index)
df = pd.concat([df_imputed, df[['year', 'targeted',"market_cap","bic_level_2","bic_level_3"]]], axis=1)

Imputing row 1/18213 with 7 missing, elapsed time: 164.340
Imputing row 101/18213 with 6 missing, elapsed time: 164.470
Imputing row 201/18213 with 7 missing, elapsed time: 164.555
Imputing row 301/18213 with 8 missing, elapsed time: 164.632
Imputing row 401/18213 with 1 missing, elapsed time: 164.705
Imputing row 501/18213 with 0 missing, elapsed time: 164.802
Imputing row 601/18213 with 1 missing, elapsed time: 164.893
Imputing row 701/18213 with 14 missing, elapsed time: 164.967
Imputing row 801/18213 with 8 missing, elapsed time: 165.039
Imputing row 901/18213 with 1 missing, elapsed time: 165.112
Imputing row 1001/18213 with 7 missing, elapsed time: 165.205
Imputing row 1101/18213 with 6 missing, elapsed time: 165.297
Imputing row 1201/18213 with 7 missing, elapsed time: 165.384
Imputing row 1301/18213 with 16 missing, elapsed time: 165.470
Imputing row 1401/18213 with 0 missing, elapsed time: 165.538
Imputing row 1501/18213 with 31 missing, elapsed time: 165.608
Imputing row 1601

Imputing row 13301/18213 with 0 missing, elapsed time: 175.414
Imputing row 13401/18213 with 1 missing, elapsed time: 175.474
Imputing row 13501/18213 with 0 missing, elapsed time: 175.527
Imputing row 13601/18213 with 24 missing, elapsed time: 175.612
Imputing row 13701/18213 with 24 missing, elapsed time: 175.680
Imputing row 13801/18213 with 1 missing, elapsed time: 175.734
Imputing row 13901/18213 with 2 missing, elapsed time: 175.815
Imputing row 14001/18213 with 13 missing, elapsed time: 175.878
Imputing row 14101/18213 with 11 missing, elapsed time: 175.961
Imputing row 14201/18213 with 1 missing, elapsed time: 176.034
Imputing row 14301/18213 with 1 missing, elapsed time: 176.101
Imputing row 14401/18213 with 3 missing, elapsed time: 176.175
Imputing row 14501/18213 with 17 missing, elapsed time: 176.225
Imputing row 14601/18213 with 0 missing, elapsed time: 176.286
Imputing row 14701/18213 with 19 missing, elapsed time: 176.336
Imputing row 14801/18213 with 1 missing, elapsed 

### 2.1. raw data

In [14]:
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result3 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result3)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

      Model  Train AUC  Test AUC
0        LR   0.524444  0.505737
1        RF   0.625340  0.558816
2       XGB   0.655281  0.657532
3      LGBM   0.650973  0.670624
4  CatBoost   0.643042  0.674861
5        NN   0.942524  0.619263


### 2.2. oversampling the ADASYN

In [15]:
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Apply ADASYN oversampling
adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result4 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result4)

      Model  Train AUC  Test AUC
0        LR   0.595549  0.562235
1        RF   0.999778  0.613273
2       XGB   0.998425  0.641217
3      LGBM   0.996706  0.619914
4  CatBoost   0.994914  0.601928
5        NN   0.999998  0.643285
