# Structure

Imputation with MICE with RF (MissForest)
- 1. no column creation
    - 1.1. raw data
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
    - 1.2. ADASYN imputation
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
- 2. yes column creation
    - 2.1. raw data
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
    - 2.2. ADASYN imputation
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network

## Load the data and the packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import sklearn.neighbors._base 
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from tensorflow import keras
from tensorflow.keras import layers
from imblearn.over_sampling import ADASYN
from missforest.missforest import MissForest

In [2]:
df = pd.read_csv('../../database/2016-2022_semantic_imputation.csv')

In [3]:
binary = ['unequal_voting','classified_board_system','poison_pill','operating_margin_below_3y_average']
non_ratio_variables = [
    "capex",
    "net_capex",
    "short_term_wc",
    "long_term_wc",
    "modified_wc",
    "ebitda",
    "ebit",
    "net_income",
    "net_debt",
    "ev",
    "repurchase",
    "board_size",
    "net_repurchase",
    "total_compensation_to_executives",
    "total_compensation_to_board_members",
    "dividend_to_common",
    "dividend_to_preferred"
]

df['ev_ebitda'] = np.where((df['ev'] != 0) & (df['ebitda'] != 0), df['ev'] / df['ebitda'], np.nan)
df['ev_ebit'] = np.where((df['ev'] != 0) & (df['ebit'] != 0), df['ev'] / df['ebit'], np.nan)

ratio_variables = [
    "ebitda_margin",
    "operating_margin",
    "sales_to_total_assets",
    "roe",
    "normalized_roe",
    "operating_roe",
    "operating_roic",
    "eps_adjusted_diluted",
    "ev_to_sales",
    "tobin_q_ratio",
    "pb_ratio",
    "pe_ratio",
    "fcf_to_equity",
    "sales_growth_rate",
    "dividend_per_share",
    "dividend_payout_ratio",
    "asset_to_equity",
    "cash_conversion_cycle",
    "ev_ebitda",
    "ev_ebit",
]

technical_variables = [
    "free_float_percentage",
    "rsi_14d",
    "rsi_30d",
    "volatility_30d",
    "volatility_90d",
    "volatility_180d",
    "volume_30d_average_to_outstanding",
    "insider_shares_percentage",
    "institution_ownership_percentage",
    "ceo_tenure",
    "total_return_5y",
    "total_return_4y",
    "total_return_3y",
    "total_return_2y",
    "total_return_1y",
    "total_return_6m",
    "total_return_3m",
    "employee_growth_rate",
    "fcf_yield"
]

supportive = ["bic_level_2","bic_level_3","market_cap"]
factors = binary + non_ratio_variables + ratio_variables + technical_variables

df["bic_level_2"] = df["bic_level_2"].astype('category')
df["bic_level_3"] = df["bic_level_3"].astype('category')

# factors.append("targeted")

## 1. No column creation

### 1.1. raw data

In [4]:
# Create a copy of the dataframe and select only the factors you want to impute
df_amp = df[factors].copy()

# Use MissForest from missingpy for imputation
imputer = MissForest()
df_imputed = imputer.fit_transform(df_amp)

# Convert the result back to a dataframe
df_imputed = pd.DataFrame(df_imputed, columns=factors)

# Now merge back the imputed data with the original dataframe
df = pd.concat([df_imputed, df[['year', 'targeted', "market_cap", "bic_level_2", "bic_level_3"]]], axis=1)

In [5]:
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result1 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result1)


      Model  Train AUC  Test AUC
0        LR   0.494861  0.483239
1        RF   0.619853  0.602037
2       XGB   0.635359  0.615742
3      LGBM   0.648180  0.638470
4  CatBoost   0.648393  0.626637
5        NN   0.928767  0.626257


### 1.2. oversampling with ADASYN

In [6]:
# Split data
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Apply ADASYN oversampling
adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)


In [7]:
# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result2 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result2)

      Model  Train AUC  Test AUC
0        LR   0.543917  0.484922
1        RF   0.999483  0.641493
2       XGB   0.997942  0.626848
3      LGBM   0.995800  0.612479
4  CatBoost   0.995205  0.602029
5        NN   0.998597  0.582144


## 2.1 with column creation

In [8]:
df = pd.read_csv('../../database/2016-2022_semantic_imputation.csv')
df['ev_ebitda'] = np.where((df['ev'] != 0) & (df['ebitda'] != 0), df['ev'] / df['ebitda'], np.nan)
df['ev_ebit'] = np.where((df['ev'] != 0) & (df['ebit'] != 0), df['ev'] / df['ebit'], np.nan)
df["bic_level_2"] = df["bic_level_2"].astype('category')
df["bic_level_3"] = df["bic_level_3"].astype('category')

In [9]:
for col in non_ratio_variables:
    
    # 1. _percentile
    percentile_col = col + '_percentile'
    df[percentile_col] = df.groupby('year')[col].transform(lambda x: x.rank(pct=True) * 100)
    df[percentile_col].fillna(50, inplace=True)
    
    # 2. _10bins_percentile
    df['market_cap_bins'] = df.groupby('year')['market_cap'].transform(lambda x: pd.cut(x, bins=10))
    percentile_10bins_col = col + '_10bins_percentile'
    df[percentile_10bins_col] = df.groupby(['year', 'market_cap_bins'])[col].transform(lambda x: x.rank(pct=True) * 100)
    df[percentile_10bins_col].fillna(50, inplace=True)
    df.drop('market_cap_bins', axis=1, inplace=True)

    # 3. _10bins_normalized
    df['market_cap_bins'] = df.groupby('year')['market_cap'].transform(lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
    normalized_col = col + '_10bins_normalized'
    df[normalized_col] = df.groupby(['year', 'market_cap_bins'])[col].transform(lambda x: (x - x.mean()) / x.std())
    df[normalized_col].fillna(0, inplace=True)
    df.drop('market_cap_bins', axis=1, inplace=True)
    
    # 4. _div_market_cap
    div_market_cap_col = col + '_div_market_cap'
    df[div_market_cap_col] = df[col] / df['market_cap']
    
    # 5. _div_log_market_cap
    df['log_market_cap'] = np.log(df['market_cap'])
    div_log_market_cap_col = col + '_div_log_market_cap'
    df[div_log_market_cap_col] = df[col] / df['log_market_cap']
    

In [10]:
def compute_percentile(group):
    if len(group) < 10:
        return pd.Series([None] * len(group), index=group.index, dtype=float)
    return group.rank(pct=True) * 100

def normalize(group):
    if len(group) < 10:
        return pd.Series([None] * len(group), index=group.index, dtype=float)
    return (group - group.mean()) / group.std()

for col in ratio_variables:
    percentile_col = col + '_industry_peers_percentile'
    df[percentile_col] = df.groupby(['year', 'bic_level_3'])[col].transform(compute_percentile)
    mask = df[percentile_col].isna()
    df.loc[mask, percentile_col] = df[mask].groupby(['year', 'bic_level_2'])[col].transform(compute_percentile)
    df[percentile_col].fillna(50, inplace=True)
    df[percentile_col] = df[percentile_col].astype(float)
    normalized_col = col + '_industry_peers_normalized'
    df[normalized_col] = df.groupby(['year', 'bic_level_3'])[col].transform(normalize)
    mask = df[normalized_col].isna()
    df.loc[mask, normalized_col] = df[mask].groupby(['year', 'bic_level_2'])[col].transform(normalize)
    df[normalized_col].fillna(0, inplace=True)
    df[normalized_col] = df[normalized_col].astype(float)


In [11]:
factors = []
for col in non_ratio_variables:
    factors.extend([
        col,
        f'{col}_percentile',
        f'{col}_10bins_percentile',
        f'{col}_10bins_normalized',
        f'{col}_div_market_cap',
        f'{col}_div_log_market_cap'
    ])

for col in ratio_variables:
    factors.extend([
        col,
        f'{col}_industry_peers_percentile',
        f'{col}_industry_peers_normalized'
    ])

factors = factors + binary + technical_variables


In [12]:
df_amp2 = df[factors].copy()
df_amp2.shape

(18213, 185)

In [13]:
# Create a copy of the dataframe and select only the factors you want to impute
df_amp2 = df[factors].copy()

# Use MissForest from missingpy for imputation
df_imputed = imputer.fit_transform(df_amp2)

# Convert the result back to a dataframe
df_imputed = pd.DataFrame(df_imputed, columns=factors)

# Now merge back the imputed data with the original dataframe
df = pd.concat([df_imputed, df[['year', 'targeted', "market_cap", "bic_level_2", "bic_level_3"]]], axis=1)

ValueError: Input data must be 2 dimensional and non empty.

### 2.1. raw data

In [None]:
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result3 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result3)


### 2.2. oversampling the ADASYN

In [None]:
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Apply ADASYN oversampling
adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result4 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result4)