# 5. Model Training

This notebook trains each model on datasets produced by different combinations of imputation and oversampling methods:

4 imputation methods:
- Median
- KNN
- MiceForest
- GAIN

5 oversampling methods:
- Original
- Random Oversampling (labelled `ROSE` / `Rose` in the file and model names below)
- SMOTENC
- Borderline-SMOTE
- ADASYN

7 models:
- Logistic Regression
- Random Forest
- XGBoost
- LightGBM
- CatBoost
- Multilayer Perceptron
- Dropout Perceptron 

This gives 4 x 5 x 7 = 140 combinations.

In addition, XGBoost, LightGBM, and CatBoost are trained directly on the original sparse (non-imputed) data.
Therefore, we handle 140 + 3 = 143 models in total.

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import joblib

In [2]:
# Feature groups. The order inside each list, and the order in which the groups
# are concatenated into `features`, fixes the column order of the model inputs.

# Governance-related columns.
governance = [
    "ceo_is_female",
    "unequal_voting",
    "ceo_tenure",
    "board_size",
    "classified_board_system",
    "poison_pill",
    "buyback_yield",
    "dividend_payout_ratio",
    "cf_to_total_compensation_to_executives",
    "cf_to_total_compensation_to_board_members",
]

# Operating metrics, expressed as industry-peer percentiles.
operation = [
    "cf_to_capex_industry_peers_percentile",
    "net_debt_to_ebitda_industry_peers_percentile",
    "current_ratio_industry_peers_percentile",
    "ebitda_margin_industry_peers_percentile",
    "sales_to_total_assets_industry_peers_percentile",
    "employee_growth_rate_industry_peers_percentile",
    "fcf_yield_industry_peers_percentile",
    "sales_growth_rate_industry_peers_percentile",
    "cash_conversion_cycle_industry_peers_percentile",
    "interest_coverage_ratio_industry_peers_percentile",
]

# Ownership-structure columns.
ownership = [
    "free_float_percentage",
    "institution_ownership_percentage",
    "insider_shares_percentage",
]

# Technical (price/volume) indicators.
technical = [
    'rsi_14d',
    'rsi_30d',
    'volatility_30d',
    'volatility_90d',
    'volatility_180d',
    "volume_30d_average_to_outstanding",
]

# Total returns over several look-back windows, longest first.
returns = [
    'total_return_5y',
    'total_return_4y',
    'total_return_3y',
    'total_return_2y',
    'total_return_1y',
    'total_return_6m',
    'total_return_3m',
]

# Valuation metrics, expressed as industry-peer percentiles.
valuation = [
    "roe_industry_peers_percentile",
    "operating_roic_industry_peers_percentile",
    "pe_ratio_industry_peers_percentile",
    "eps_industry_peers_percentile",
    "ev_to_sales_industry_peers_percentile",
    "tobin_q_ratio_industry_peers_percentile",
    "pb_ratio_industry_peers_percentile",
    "asset_to_equity_industry_peers_percentile",
    "ev_ebitda_industry_peers_percentile",
    "ev_to_asset_industry_peers_percentile",
]

# The 0/1 columns among the governance features.
binary = [
    "ceo_is_female",
    "unequal_voting",
    "classified_board_system",
    "poison_pill",
]

# Full model input: every group, concatenated in a fixed order.
features = [*governance, *operation, *ownership, *technical, *returns, *valuation]

In [3]:
class DropoutMLPClassifier(nn.Module):
    """Feed-forward binary classifier: input -> 100 -> 50 -> 1 with dropout.

    Both hidden layers use ReLU followed by dropout (p=0.5); the single output
    unit is passed through a sigmoid, so ``forward`` returns values in [0, 1]
    with one column per row of the input.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 100)
        self.layer2 = nn.Linear(100, 50)
        self.dropout = nn.Dropout(0.5)
        self.output_layer = nn.Linear(50, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` of feature rows."""
        # The same dropout module is applied after each hidden activation.
        hidden = self.dropout(self.relu(self.layer1(x)))
        hidden = self.dropout(self.relu(self.layer2(hidden)))
        return torch.sigmoid(self.output_layer(hidden))

def train_pytorch_model(model, X_train, y_train, X_test, y_test, epochs=5000, learning_rate=1e-3):
    """Train a PyTorch binary classifier full-batch and return its predictions.

    The model is optimised in place with Adam on binary cross-entropy, using the
    whole training set as a single batch on every epoch. The model's output is
    fed straight into ``nn.BCELoss``, so it must already be a probability in
    [0, 1] of shape ``(n_samples, 1)``.

    Args:
        model: ``nn.Module`` to train; its parameters are updated in place.
        X_train: Training features (pandas object exposing ``.values``).
        y_train: Training labels (pandas object exposing ``.values``); reshaped
            to a column vector.
        X_test: Test features (pandas object exposing ``.values``).
        y_test: Unused; kept so the signature stays compatible with callers.
        epochs: Number of full-batch optimisation steps.
        learning_rate: Adam learning rate.

    Returns:
        Tuple ``(y_train_pred, y_test_pred)`` of 1-D NumPy arrays holding the
        model output for every training / test row.
    """
    X_train_tensor = torch.tensor(X_train.values.astype(np.float32))
    y_train_tensor = torch.tensor(y_train.values.astype(np.float32)).view(-1, 1)
    X_test_tensor = torch.tensor(X_test.values.astype(np.float32))

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        loss.backward()
        optimizer.step()

    # eval() switches dropout off so predictions are deterministic.
    model.eval()
    with torch.no_grad():
        y_train_pred = model(X_train_tensor).cpu().numpy()
        y_test_pred = model(X_test_tensor).cpu().numpy()

    # reshape(-1) rather than squeeze(): squeeze() would turn a single-row
    # prediction into a 0-d array, which cannot be indexed or sorted downstream.
    return y_train_pred.reshape(-1), y_test_pred.reshape(-1)


In [4]:
def evaluate_and_save(model, full_model_name, X_train, y_train, X_test, y_test, results, path):
    """Fit one model, record its metrics in ``results`` and persist it with joblib.

    PyTorch modules are trained through ``train_pytorch_model``; every other
    model is expected to expose ``fit`` and ``predict_proba``.

    Args:
        model: ``nn.Module`` or an estimator with ``fit`` / ``predict_proba``.
            It is fitted in place.
        full_model_name: Label stored in the result row and used as the file
            name (``<path><full_model_name>.joblib``).
        X_train, y_train: Training features and binary labels.
        X_test, y_test: Test features and binary labels (``y_test`` must be a
            pandas Series).
        results: List that receives one dict with the keys ``model``,
            ``auroc_train``, ``auroc_test``, ``top_30_hit_num`` and
            ``top_30_hit_rate``.
        path: Prefix prepended verbatim to the file name, so a directory must
            end with a path separator.

    Returns:
        None. ``results`` is mutated and a ``.joblib`` file is written.
    """
    import os  # local import: only needed to prepare the output folder

    if isinstance(model, nn.Module):
        # For PyTorch models, use the custom training and prediction function
        y_train_pred, y_test_pred = train_pytorch_model(model, X_train, y_train, X_test, y_test)
    else:
        # For sklearn-style models, fit the model and keep the positive-class probability
        model.fit(X_train, y_train)
        y_train_pred = model.predict_proba(X_train)[:, 1]
        y_test_pred = model.predict_proba(X_test)[:, 1]

    # Evaluate model performance
    auroc_train = roc_auc_score(y_train, y_train_pred)
    auroc_test = roc_auc_score(y_test, y_test_pred)

    # Top-30 hit rate: share of positives among the 30 highest-scored test rows.
    # With fewer than 30 test rows only that many can be selected, so the rate
    # is taken over the number actually selected instead of a fixed 30.
    top_k = min(30, len(y_test_pred))
    sorted_indices = np.argsort(y_test_pred)[-top_k:]
    # argsort yields positions, so the label index is reset before iloc lookup.
    top_30_hits = y_test.reset_index(drop=True).iloc[sorted_indices].sum()
    top_30_hit_rate = top_30_hits / top_k

    results.append({
        'model': full_model_name,
        'auroc_train': auroc_train,
        'auroc_test': auroc_test,
        'top_30_hit_num': int(top_30_hits),
        'top_30_hit_rate': top_30_hit_rate
    })

    # Save the model. The target folder is created first so a fresh checkout
    # (no ../models tree yet) does not fail inside joblib.dump.
    model_path = f"{path}{full_model_name}.joblib"
    save_dir = os.path.dirname(model_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # NOTE(review): joblib pickles the object, so loading a saved PyTorch model
    # presumably needs its class definition to be importable - confirm in the
    # notebook that loads these files.
    joblib.dump(model, model_path)

def train_and_evaluate_models(df, models, features, target_column, path, imputation, oversampling):
    """Train every model on one dataset and return a table of their metrics.

    Rows with ``training_data == 1`` form the training set and rows with
    ``training_data == 0`` form the test set.

    Each entry of ``models`` is treated as a template: a deep copy is trained
    and saved, and the entry itself is left untouched. Without the copy, the
    same PyTorch module instance would be passed to every call and would keep
    training from the weights it reached on the previous dataset.

    Args:
        df: DataFrame containing ``features``, ``target_column`` and a
            ``training_data`` flag column.
        models: Iterable of unfitted model templates.
        features: Column names used as model inputs.
        target_column: Name of the binary label column.
        path: Prefix passed to ``evaluate_and_save`` for the saved model files.
        imputation: Imputation label, used in progress output and model names.
        oversampling: Oversampling label, used in progress output and model names.

    Returns:
        DataFrame with one row per model, as collected by ``evaluate_and_save``.
    """
    import copy  # local import: used only to clone the model templates

    is_train = df['training_data'] == 1
    is_test = df['training_data'] == 0
    train_data = df[is_train]
    test_data = df[is_test]

    X_train, y_train = train_data[features], train_data[target_column]
    X_test, y_test = test_data[features], test_data[target_column]

    results = []

    combination_name = f"{imputation}_{oversampling}"
    print(f"Training {combination_name}...")

    for model in models:
        model_name = type(model).__name__
        full_model_name = f"{imputation}_{oversampling}_{model_name}"
        # Train a private copy so no fitted state leaks into the next dataset.
        fresh_model = copy.deepcopy(model)
        evaluate_and_save(fresh_model, full_model_name, X_train, y_train, X_test, y_test, results, path)

    return pd.DataFrame(results)


In [5]:
# Fixed seed so the explicitly seeded learners below give the same results
# after Restart Kernel -> Run All.
SEED = 42
# Seeds PyTorch's global RNG, which draws DropoutMLPClassifier's initial weights.
torch.manual_seed(SEED)

# Model templates, one per model family. The class name of each entry becomes
# part of the saved model's file name, so the estimator types must not change.
# Entries without an explicit seed keep their library-default seeding.
models = [
    LogisticRegression(max_iter=10000),
    RandomForestClassifier(n_estimators=100, random_state=SEED),
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    LGBMClassifier(verbose=-1),
    CatBoostClassifier(verbose=0),
    MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=5000, random_state=SEED),
    DropoutMLPClassifier(input_dim=len(features))
]



## 1. Sparse data

In [6]:
# Original dataset, loaded as-is (no imputation step is applied in this notebook).
df_sparse = pd.read_csv(
    filepath_or_buffer='../database/companies/original/2016-2023.csv',
)

  df_sparse = pd.read_csv('../database/companies/original/2016-2023.csv')


In [7]:
# Gradient-boosting models that are fitted directly on the original dataset.
gbm_models = [
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    LGBMClassifier(verbose=-1),
    CatBoostClassifier(verbose=0)
]

# Split on the `training_data` flag: 1 = training rows, 0 = test rows.
train_data = df_sparse[df_sparse['training_data'] == 1]
test_data = df_sparse[df_sparse['training_data'] == 0]

X_train = train_data[features]
y_train = train_data['targeted']
X_test = test_data[features]
y_test = test_data['targeted']

# Output folder is the same for every model, so it is set once outside the loop.
path = '../models/Original/'

# Result rows are collected in their own list; `sparse_result` is only ever the
# final DataFrame, never a list.
sparse_records = []

for model in gbm_models:
    full_model_name = f"Sparse{type(model).__name__}"
    # Use the shared helper so metrics and saving match every other model in
    # this notebook (including the integer `top_30_hit_num`).
    evaluate_and_save(model, full_model_name, X_train, y_train, X_test, y_test, sparse_records, path)

sparse_result = pd.DataFrame(sparse_records)



In [8]:
# Show train/test AUROC and the top-30 hit statistics for the sparse-data models.
sparse_result

Unnamed: 0,model,auroc_train,auroc_test,top_30_hit_num,top_30_hit_rate
0,SparseXGBClassifier,1.0,0.659513,3,0.1
1,SparseLGBMClassifier,0.999935,0.671421,3,0.1
2,SparseCatBoostClassifier,0.99746,0.692081,4,0.133333


## 2. Median Imputation

In [9]:
# Median-imputed datasets: one CSV per oversampling variant, loaded in order.
(
    df_median_original,
    df_median_rose,
    df_median_smotenc,
    df_median_borderlinesmote,
    df_median_adasyn,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../database/companies/imputation/median/median_original.csv',
        '../database/companies/imputation/median/median_ROSE.csv',
        '../database/companies/imputation/median/median_SMOTENC.csv',
        '../database/companies/imputation/median/median_BORDERLINESMOTE.csv',
        '../database/companies/imputation/median/median_ADASYN.csv',
    )
]

  df_median_original = pd.read_csv('../database/companies/imputation/median/median_original.csv')
  df_median_rose = pd.read_csv('../database/companies/imputation/median/median_ROSE.csv')
  df_median_smotenc = pd.read_csv('../database/companies/imputation/median/median_SMOTENC.csv')
  df_median_borderlinesmote = pd.read_csv('../database/companies/imputation/median/median_BORDERLINESMOTE.csv')
  df_median_adasyn = pd.read_csv('../database/companies/imputation/median/median_ADASYN.csv')


In [10]:
# Train every entry of `models` on each median-imputed dataset.
# Each oversampling variant writes its models to its own folder.
median_original_path = "../models/Median/Original/"
median_original_result = train_and_evaluate_models(
    df=df_median_original,
    models=models,
    features=features,
    target_column="targeted",
    path=median_original_path,
    imputation="Median",
    oversampling="Original",
)

median_rose_path = "../models/Median/Rose/"
median_rose_result = train_and_evaluate_models(
    df=df_median_rose,
    models=models,
    features=features,
    target_column="targeted",
    path=median_rose_path,
    imputation="Median",
    oversampling="Rose",
)

median_smotenc_path = "../models/Median/SmoteNC/"
median_smotenc_result = train_and_evaluate_models(
    df=df_median_smotenc,
    models=models,
    features=features,
    target_column="targeted",
    path=median_smotenc_path,
    imputation="Median",
    oversampling="SmoteNC",
)

median_borderlinesmote_path = "../models/Median/BorderlineSmote/"
median_borderlinesmote_result = train_and_evaluate_models(
    df=df_median_borderlinesmote,
    models=models,
    features=features,
    target_column="targeted",
    path=median_borderlinesmote_path,
    imputation="Median",
    oversampling="BorderlineSmote",
)

median_adasyn_path = "../models/Median/Adasyn/"
median_adasyn_result = train_and_evaluate_models(
    df=df_median_adasyn,
    models=models,
    features=features,
    target_column="targeted",
    path=median_adasyn_path,
    imputation="Median",
    oversampling="Adasyn",
)


Training Median_Original...
Training Median_Rose...
Training Median_SmoteNC...
Training Median_BorderlineSmote...
Training Median_Adasyn...


In [11]:
# Stack the five median-imputation result tables into one, renumbering the rows.
median_result = pd.concat(
    [
        median_original_result,
        median_rose_result,
        median_smotenc_result,
        median_borderlinesmote_result,
        median_adasyn_result,
    ],
    ignore_index=True,
)

## 3. KNN Imputation

In [12]:
# KNN-imputed datasets: one CSV per oversampling variant, loaded in order.
(
    df_KNN_original,
    df_KNN_rose,
    df_KNN_smotenc,
    df_KNN_borderlinesmote,
    df_KNN_adasyn,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../database/companies/imputation/kNN/kNN_original.csv',
        '../database/companies/imputation/kNN/kNN_ROSE.csv',
        '../database/companies/imputation/kNN/kNN_SMOTENC.csv',
        '../database/companies/imputation/kNN/kNN_BORDERLINESMOTE.csv',
        '../database/companies/imputation/kNN/kNN_ADASYN.csv',
    )
]

  df_KNN_rose = pd.read_csv('../database/companies/imputation/kNN/kNN_ROSE.csv')
  df_KNN_smotenc = pd.read_csv('../database/companies/imputation/kNN/kNN_SMOTENC.csv')
  df_KNN_borderlinesmote = pd.read_csv('../database/companies/imputation/kNN/kNN_BORDERLINESMOTE.csv')
  df_KNN_adasyn = pd.read_csv('../database/companies/imputation/kNN/kNN_ADASYN.csv')


In [13]:
# Train every entry of `models` on each KNN-imputed dataset.
# Each oversampling variant writes its models to its own folder.
KNN_original_path = "../models/KNN/Original/"
KNN_original_result = train_and_evaluate_models(
    df=df_KNN_original,
    models=models,
    features=features,
    target_column="targeted",
    path=KNN_original_path,
    imputation="KNN",
    oversampling="Original",
)

KNN_rose_path = "../models/KNN/Rose/"
KNN_rose_result = train_and_evaluate_models(
    df=df_KNN_rose,
    models=models,
    features=features,
    target_column="targeted",
    path=KNN_rose_path,
    imputation="KNN",
    oversampling="Rose",
)

KNN_smotenc_path = "../models/KNN/SmoteNC/"
KNN_smotenc_result = train_and_evaluate_models(
    df=df_KNN_smotenc,
    models=models,
    features=features,
    target_column="targeted",
    path=KNN_smotenc_path,
    imputation="KNN",
    oversampling="SmoteNC",
)

KNN_borderlinesmote_path = "../models/KNN/BorderlineSmote/"
KNN_borderlinesmote_result = train_and_evaluate_models(
    df=df_KNN_borderlinesmote,
    models=models,
    features=features,
    target_column="targeted",
    path=KNN_borderlinesmote_path,
    imputation="KNN",
    oversampling="BorderlineSmote",
)

KNN_adasyn_path = "../models/KNN/Adasyn/"
KNN_adasyn_result = train_and_evaluate_models(
    df=df_KNN_adasyn,
    models=models,
    features=features,
    target_column="targeted",
    path=KNN_adasyn_path,
    imputation="KNN",
    oversampling="Adasyn",
)


Training KNN_Original...
Training KNN_Rose...
Training KNN_SmoteNC...
Training KNN_BorderlineSmote...
Training KNN_Adasyn...


In [14]:
# Stack the five KNN-imputation result tables into one, renumbering the rows.
KNN_result = pd.concat(
    [
        KNN_original_result,
        KNN_rose_result,
        KNN_smotenc_result,
        KNN_borderlinesmote_result,
        KNN_adasyn_result,
    ],
    ignore_index=True,
)

## 4. MiceForest Imputation

In [15]:
# MiceForest-imputed datasets: one CSV per oversampling variant, loaded in order.
(
    df_MiceForest_original,
    df_MiceForest_rose,
    df_MiceForest_smotenc,
    df_MiceForest_borderlinesmote,
    df_MiceForest_adasyn,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../database/companies/imputation/MiceForest/MiceForest_original.csv',
        '../database/companies/imputation/MiceForest/MiceForest_ROSE.csv',
        '../database/companies/imputation/MiceForest/MiceForest_SMOTENC.csv',
        '../database/companies/imputation/MiceForest/MiceForest_BORDERLINESMOTE.csv',
        '../database/companies/imputation/MiceForest/MiceForest_ADASYN.csv',
    )
]

  df_MiceForest_rose = pd.read_csv('../database/companies/imputation/MiceForest/MiceForest_ROSE.csv')
  df_MiceForest_smotenc = pd.read_csv('../database/companies/imputation/MiceForest/MiceForest_SMOTENC.csv')
  df_MiceForest_borderlinesmote = pd.read_csv('../database/companies/imputation/MiceForest/MiceForest_BORDERLINESMOTE.csv')
  df_MiceForest_adasyn = pd.read_csv('../database/companies/imputation/MiceForest/MiceForest_ADASYN.csv')


In [16]:
# Train every entry of `models` on each MiceForest-imputed dataset.
# Each oversampling variant writes its models to its own folder.
MiceForest_original_path = "../models/MiceForest/Original/"
MiceForest_original_result = train_and_evaluate_models(
    df=df_MiceForest_original,
    models=models,
    features=features,
    target_column="targeted",
    path=MiceForest_original_path,
    imputation="MiceForest",
    oversampling="Original",
)

MiceForest_rose_path = "../models/MiceForest/Rose/"
MiceForest_rose_result = train_and_evaluate_models(
    df=df_MiceForest_rose,
    models=models,
    features=features,
    target_column="targeted",
    path=MiceForest_rose_path,
    imputation="MiceForest",
    oversampling="Rose",
)

MiceForest_smotenc_path = "../models/MiceForest/SmoteNC/"
MiceForest_smotenc_result = train_and_evaluate_models(
    df=df_MiceForest_smotenc,
    models=models,
    features=features,
    target_column="targeted",
    path=MiceForest_smotenc_path,
    imputation="MiceForest",
    oversampling="SmoteNC",
)

MiceForest_borderlinesmote_path = "../models/MiceForest/BorderlineSmote/"
MiceForest_borderlinesmote_result = train_and_evaluate_models(
    df=df_MiceForest_borderlinesmote,
    models=models,
    features=features,
    target_column="targeted",
    path=MiceForest_borderlinesmote_path,
    imputation="MiceForest",
    oversampling="BorderlineSmote",
)

MiceForest_adasyn_path = "../models/MiceForest/Adasyn/"
MiceForest_adasyn_result = train_and_evaluate_models(
    df=df_MiceForest_adasyn,
    models=models,
    features=features,
    target_column="targeted",
    path=MiceForest_adasyn_path,
    imputation="MiceForest",
    oversampling="Adasyn",
)


Training MiceForest_Original...
Training MiceForest_Rose...
Training MiceForest_SmoteNC...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training MiceForest_BorderlineSmote...
Training MiceForest_Adasyn...


In [17]:
# Stack the five MiceForest-imputation result tables into one, renumbering the rows.
MiceForest_result = pd.concat(
    [
        MiceForest_original_result,
        MiceForest_rose_result,
        MiceForest_smotenc_result,
        MiceForest_borderlinesmote_result,
        MiceForest_adasyn_result,
    ],
    ignore_index=True,
)

## 5. GAIN Imputation

In [18]:
# GAIN-imputed datasets: one CSV per oversampling variant, loaded in order.
(
    df_GAIN_original,
    df_GAIN_rose,
    df_GAIN_smotenc,
    df_GAIN_borderlinesmote,
    df_GAIN_adasyn,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../database/companies/imputation/GAIN/GAIN_original.csv',
        '../database/companies/imputation/GAIN/GAIN_ROSE.csv',
        '../database/companies/imputation/GAIN/GAIN_SMOTENC.csv',
        '../database/companies/imputation/GAIN/GAIN_BORDERLINESMOTE.csv',
        '../database/companies/imputation/GAIN/GAIN_ADASYN.csv',
    )
]

  df_GAIN_rose = pd.read_csv('../database/companies/imputation/GAIN/GAIN_ROSE.csv')
  df_GAIN_smotenc = pd.read_csv('../database/companies/imputation/GAIN/GAIN_SMOTENC.csv')
  df_GAIN_borderlinesmote = pd.read_csv('../database/companies/imputation/GAIN/GAIN_BORDERLINESMOTE.csv')
  df_GAIN_adasyn = pd.read_csv('../database/companies/imputation/GAIN/GAIN_ADASYN.csv')


In [19]:
# Train every entry of `models` on each GAIN-imputed dataset.
# Each oversampling variant writes its models to its own folder.
GAIN_original_path = "../models/GAIN/Original/"
GAIN_original_result = train_and_evaluate_models(
    df=df_GAIN_original,
    models=models,
    features=features,
    target_column="targeted",
    path=GAIN_original_path,
    imputation="GAIN",
    oversampling="Original",
)

GAIN_rose_path = "../models/GAIN/Rose/"
GAIN_rose_result = train_and_evaluate_models(
    df=df_GAIN_rose,
    models=models,
    features=features,
    target_column="targeted",
    path=GAIN_rose_path,
    imputation="GAIN",
    oversampling="Rose",
)

GAIN_smotenc_path = "../models/GAIN/SmoteNC/"
GAIN_smotenc_result = train_and_evaluate_models(
    df=df_GAIN_smotenc,
    models=models,
    features=features,
    target_column="targeted",
    path=GAIN_smotenc_path,
    imputation="GAIN",
    oversampling="SmoteNC",
)

GAIN_borderlinesmote_path = "../models/GAIN/BorderlineSmote/"
GAIN_borderlinesmote_result = train_and_evaluate_models(
    df=df_GAIN_borderlinesmote,
    models=models,
    features=features,
    target_column="targeted",
    path=GAIN_borderlinesmote_path,
    imputation="GAIN",
    oversampling="BorderlineSmote",
)

GAIN_adasyn_path = "../models/GAIN/Adasyn/"
GAIN_adasyn_result = train_and_evaluate_models(
    df=df_GAIN_adasyn,
    models=models,
    features=features,
    target_column="targeted",
    path=GAIN_adasyn_path,
    imputation="GAIN",
    oversampling="Adasyn",
)


Training GAIN_Original...
Training GAIN_Rose...
Training GAIN_SmoteNC...
Training GAIN_BorderlineSmote...
Training GAIN_Adasyn...


In [20]:
# Stack the five GAIN-imputation result tables into one, renumbering the rows.
GAIN_result = pd.concat(
    [
        GAIN_original_result,
        GAIN_rose_result,
        GAIN_smotenc_result,
        GAIN_borderlinesmote_result,
        GAIN_adasyn_result,
    ],
    ignore_index=True,
)

## 6. AUROC result export

In [21]:
# Combine the sparse-data results with the four imputation result tables,
# in that order, renumbering the rows.
final_result = pd.concat(
    [
        sparse_result,
        median_result,
        KNN_result,
        MiceForest_result,
        GAIN_result,
    ],
    ignore_index=True,
)

In [22]:
# Write the combined metrics table to disk. No `index` argument is passed, so
# pandas' default applies and the row index is written as the first column.
final_result.to_csv(
    path_or_buf='../models/AUROC_table.csv',
)