# Structure

Imputation with GAIN
https://github.com/dongdongdongdwn/GAIN-Dovey/blob/master/README.md
- 1. no column creation
    - 1.1. raw data
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
    - 1.2. ADASYN imputation
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
- 2. yes column creation
    - 2.1. raw data
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network
    - 2.2. ADASYN imputation
        - LR, RF, XGboost, LightGBM, Catboost, Neural Network

## Load the data and the packages

In [25]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import sklearn.neighbors._base 
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from tensorflow import keras
from tensorflow.keras import layers
from imblearn.over_sampling import ADASYN
from missforest.missforest import MissForest

In [26]:
df = pd.read_csv('../../database/2016-2022.csv')

In [27]:
binary = ['unequal_voting','classified_board_system','poison_pill','operating_margin_below_3y_average']
non_ratio_variables = [
    "capex",
    "net_capex",
    "short_term_wc",
    "long_term_wc",
    "modified_wc",
    "ebitda",
    "ebit",
    "net_income",
    "net_debt",
    "ev",
    "repurchase",
    "board_size",
    "net_repurchase",
    "total_compensation_to_executives",
    "total_compensation_to_board_members",
    "dividend_to_common",
    "dividend_to_preferred"
]

df['ev_ebitda'] = np.where((df['ev'] != 0) & (df['ebitda'] != 0), df['ev'] / df['ebitda'], np.nan)
df['ev_ebit'] = np.where((df['ev'] != 0) & (df['ebit'] != 0), df['ev'] / df['ebit'], np.nan)

ratio_variables = [
    "ebitda_margin",
    "operating_margin",
    "sales_to_total_assets",
    "roe",
    "normalized_roe",
    "operating_roe",
    "operating_roic",
    "eps_adjusted_diluted",
    "ev_to_sales",
    "tobin_q_ratio",
    "pb_ratio",
    "pe_ratio",
    "fcf_to_equity",
    "sales_growth_rate",
    "dividend_per_share",
    "dividend_payout_ratio",
    "asset_to_equity",
    "cash_conversion_cycle",
    "ev_ebitda",
    "ev_ebit",
]

technical_variables = [
    "free_float_percentage",
    "rsi_14d",
    "rsi_30d",
    "volatility_30d",
    "volatility_90d",
    "volatility_180d",
    "volume_30d_average_to_outstanding",
    "insider_shares_percentage",
    "institution_ownership_percentage",
    "ceo_tenure",
    "total_return_5y",
    "total_return_4y",
    "total_return_3y",
    "total_return_2y",
    "total_return_1y",
    "total_return_6m",
    "total_return_3m",
    "employee_growth_rate",
    "fcf_yield"
]

supportive = ["bic_level_2","bic_level_3","market_cap"]
factors = binary + non_ratio_variables + ratio_variables + technical_variables

df["bic_level_2"] = df["bic_level_2"].astype('category')
df["bic_level_3"] = df["bic_level_3"].astype('category')

# factors.append("targeted")

## 1. No column creation

### 1.1. raw data

In [29]:
# Necessary packages
import numpy as np
#import tensorflow as tf
##IF USING TF 2 use following import to still use TF < 2.0 Functionalities
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()


def normalization (data, parameters=None):
  '''Normalize data in [0, 1] range.
  
  Args:
    - data: original data
  
  Returns:
    - norm_data: normalized data
    - norm_parameters: min_val, max_val for each feature for renormalization
  '''

  # Parameters
  _, dim = data.shape
  norm_data = data.copy()
  
  if parameters is None:
  
    # MixMax normalization
    min_val = np.zeros(dim)
    max_val = np.zeros(dim)
    
    # For each dimension
    for i in range(dim):
      min_val[i] = np.nanmin(norm_data[:,i])
      norm_data[:,i] = norm_data[:,i] - np.nanmin(norm_data[:,i])
      max_val[i] = np.nanmax(norm_data[:,i])
      norm_data[:,i] = norm_data[:,i] / (np.nanmax(norm_data[:,i]) + 1e-6)   
      
    # Return norm_parameters for renormalization
    norm_parameters = {'min_val': min_val,
                       'max_val': max_val}

  else:
    min_val = parameters['min_val']
    max_val = parameters['max_val']
    
    # For each dimension
    for i in range(dim):
      norm_data[:,i] = norm_data[:,i] - min_val[i]
      norm_data[:,i] = norm_data[:,i] / (max_val[i] + 1e-6)  
      
    norm_parameters = parameters    
      
  return norm_data, norm_parameters


def renormalization (norm_data, norm_parameters):
  '''Renormalize data from [0, 1] range to the original range.
  
  Args:
    - norm_data: normalized data
    - norm_parameters: min_val, max_val for each feature for renormalization
  
  Returns:
    - renorm_data: renormalized original data
  '''
  
  min_val = norm_parameters['min_val']
  max_val = norm_parameters['max_val']

  _, dim = norm_data.shape
  renorm_data = norm_data.copy()
    
  for i in range(dim):
    renorm_data[:,i] = renorm_data[:,i] * (max_val[i] + 1e-6)   
    renorm_data[:,i] = renorm_data[:,i] + min_val[i]
    
  return renorm_data


def rounding (imputed_data, data_x):
  '''Round imputed data for categorical variables.
  
  Args:
    - imputed_data: imputed data
    - data_x: original data with missing values
    
  Returns:
    - rounded_data: rounded imputed data
  '''
  
  _, dim = data_x.shape
  rounded_data = imputed_data.copy()
  
  for i in range(dim):
    temp = data_x[~np.isnan(data_x[:, i]), i]
    # Only for the categorical variable
    if len(np.unique(temp)) < 20:
      rounded_data[:, i] = np.round(rounded_data[:, i])
      
  return rounded_data


def rmse_loss (ori_data, imputed_data, data_m):
  '''Compute RMSE loss between ori_data and imputed_data
  
  Args:
    - ori_data: original data without missing values
    - imputed_data: imputed data
    - data_m: indicator matrix for missingness
    
  Returns:
    - rmse: Root Mean Squared Error
  '''
  
  ori_data, norm_parameters = normalization(ori_data)
  imputed_data, _ = normalization(imputed_data, norm_parameters)
    
  # Only for missing values
  nominator = np.sum(((1-data_m) * ori_data - (1-data_m) * imputed_data)**2)
  denominator = np.sum(1-data_m)
  
  rmse = np.sqrt(nominator/float(denominator))
  
  return rmse


def xavier_init(size):
  '''Xavier initialization.
  
  Args:
    - size: vector size
    
  Returns:
    - initialized random vector.
  '''
  in_dim = size[0]
  xavier_stddev = 1. / tf.sqrt(in_dim / 2.)
  return tf.random_normal(shape = size, stddev = xavier_stddev)
      

def binary_sampler(p, rows, cols):
  '''Sample binary random variables.
  
  Args:
    - p: probability of 1
    - rows: the number of rows
    - cols: the number of columns
    
  Returns:
    - binary_random_matrix: generated binary random matrix.
  '''
  unif_random_matrix = np.random.uniform(0., 1., size = [rows, cols])
  binary_random_matrix = 1*(unif_random_matrix < p)
  return binary_random_matrix


def uniform_sampler(low, high, rows, cols):
  '''Sample uniform random variables.
  
  Args:
    - low: low limit
    - high: high limit
    - rows: the number of rows
    - cols: the number of columns
    
  Returns:
    - uniform_random_matrix: generated uniform random matrix.
  '''
  return np.random.uniform(low, high, size = [rows, cols])       


def sample_batch_index(total, batch_size):
  '''Sample index of the mini-batch.
  
  Args:
    - total: total number of samples
    - batch_size: batch size
    
  Returns:
    - batch_idx: batch index
  '''
  total_idx = np.random.permutation(total)
  batch_idx = total_idx[:batch_size]
  return batch_idx

In [30]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import numpy as np
from tqdm import tqdm

def gain (data_x, gain_parameters):

  # Define mask matrix
  data_m = 1-np.isnan(data_x)
  
  # System parameters
  batch_size = gain_parameters['batch_size']
  hint_rate = gain_parameters['hint_rate']
  alpha = gain_parameters['alpha']
  iterations = gain_parameters['iterations']
  
  # Other parameters
  no, dim = data_x.shape
  
  # Hidden state dimensions
  h_dim = int(dim)
  
  # Normalization
  norm_data, norm_parameters = normalization(data_x)
  norm_data_x = np.nan_to_num(norm_data, 0)
  
  ## GAIN architecture   
  # Input placeholders
  # Data vector
  X = tf.placeholder(tf.float32, shape = [None, dim])
  # Mask vector 
  M = tf.placeholder(tf.float32, shape = [None, dim])
  # Hint vector
  H = tf.placeholder(tf.float32, shape = [None, dim])
  
  # Discriminator variables
  D_W1 = tf.Variable(xavier_init([dim*2, h_dim])) # Data + Hint as inputs
  D_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
  
  D_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  D_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
  
  D_W3 = tf.Variable(xavier_init([h_dim, dim]))
  D_b3 = tf.Variable(tf.zeros(shape = [dim]))  # Multi-variate outputs
  
  theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]
  
  #Generator variables
  # Data + Mask as inputs (Random noise is in missing components)
  G_W1 = tf.Variable(xavier_init([dim*2, h_dim]))  
  G_b1 = tf.Variable(tf.zeros(shape = [h_dim]))
  
  G_W2 = tf.Variable(xavier_init([h_dim, h_dim]))
  G_b2 = tf.Variable(tf.zeros(shape = [h_dim]))
  
  G_W3 = tf.Variable(xavier_init([h_dim, dim]))
  G_b3 = tf.Variable(tf.zeros(shape = [dim]))
  
  theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]
  
  ## GAIN functions
  # Generator
  def generator(x,m):
    # Concatenate Mask and Data
    inputs = tf.concat(values = [x, m], axis = 1) 
    G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1)
    G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2)   
    # MinMax normalized output
    G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3) 
    return G_prob
      
  # Discriminator
  def discriminator(x, h):
    # Concatenate Data and Hint
    inputs = tf.concat(values = [x, h], axis = 1) 
    D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1)  
    D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2)
    D_logit = tf.matmul(D_h2, D_W3) + D_b3
    D_prob = tf.nn.sigmoid(D_logit)
    return D_prob
  
  ## GAIN structure
  # Generator
  G_sample = generator(X, M)
 
  # Combine with observed data
  Hat_X = X * M + G_sample * (1-M)
  
  # Discriminator
  D_prob = discriminator(Hat_X, H)
  
  ## GAIN loss
  D_loss_temp = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) \
                                + (1-M) * tf.log(1. - D_prob + 1e-8)) 
  
  G_loss_temp = -tf.reduce_mean((1-M) * tf.log(D_prob + 1e-8))
  
  MSE_loss = \
  tf.reduce_mean((M * X - M * G_sample)**2) / tf.reduce_mean(M)
  
  D_loss = D_loss_temp
  G_loss = G_loss_temp + alpha * MSE_loss 
  
  ## GAIN solver
  D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D)
  G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G)
  
  ## Iterations
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
   
  # Start Iterations
  for it in tqdm(range(iterations)):    
      
    # Sample batch
    batch_idx = sample_batch_index(no, batch_size)
    X_mb = norm_data_x[batch_idx, :]  
    M_mb = data_m[batch_idx, :]  
    # Sample random vectors  
    Z_mb = uniform_sampler(0, 0.01, batch_size, dim) 
    # Sample hint vectors
    H_mb_temp = binary_sampler(hint_rate, batch_size, dim)
    H_mb = M_mb * H_mb_temp
      
    # Combine random vectors with observed vectors
    X_mb = M_mb * X_mb + (1-M_mb) * Z_mb 
      
    _, D_loss_curr = sess.run([D_solver, D_loss_temp], 
                              feed_dict = {M: M_mb, X: X_mb, H: H_mb})
    _, G_loss_curr, MSE_loss_curr = \
    sess.run([G_solver, G_loss_temp, MSE_loss],
             feed_dict = {X: X_mb, M: M_mb, H: H_mb})
            
  ## Return imputed data      
  Z_mb = uniform_sampler(0, 0.01, no, dim) 
  M_mb = data_m
  X_mb = norm_data_x          
  X_mb = M_mb * X_mb + (1-M_mb) * Z_mb 
      
  imputed_data = sess.run([G_sample], feed_dict = {X: X_mb, M: M_mb})[0]
  
  imputed_data = data_m * norm_data_x + (1-data_m) * imputed_data
  
  # Renormalization
  imputed_data = renormalization(imputed_data, norm_parameters)  
  
  # Rounding
  imputed_data = rounding(imputed_data, data_x)  
          
  return imputed_data

In [31]:
# 1. Preparation: Assuming you have already imported pandas as pd and other necessary libraries

# 2. Format your data
data_x = df[factors].values

# 3. Setup GAIN Parameters
gain_parameters = {
    'batch_size': 128,
    'hint_rate': 0.9,
    'alpha': 10,
    'iterations': 10000
}

# 4. Run GAIN
imputed_data = gain(data_x, gain_parameters)

# 5. Replace Missing Data in DataFrame
df[factors] = np.where(np.isnan(df[factors]), imputed_data, df[factors])


2023-10-25 15:31:21.171254: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:382] MLIR V1 optimization pass is not enabled
100%|████████████████████████████████████| 10000/10000 [00:14<00:00, 671.77it/s]


In [34]:
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result1 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result1)


2023-10-25 15:32:37.670673: W tensorflow/c/c_api.cc:305] Operation '{name:'training/Adam/decay/Assign' id:1133 op device:{requested: '', assigned: ''} def:{{{node training/Adam/decay/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](training/Adam/decay, training/Adam/decay/Initializer/initial_value)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
  updates = self.state_updates
2023-10-25 15:32:37.845721: W tensorflow/c/c_api.cc:305] Operation '{name:'loss/mul' id:935 op device:{requested: '', assigned: ''} def:{{{node loss/mul}} = Mul[T=DT_FLOAT, _has_manual_control_dependencies=true](loss/mul/x, loss/dense_40_loss/value)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either 

      Model  Train AUC  Test AUC
0        LR   0.517378  0.516439
1        RF   0.612245  0.631038
2       XGB   0.638570  0.641958
3      LGBM   0.668172  0.658351
4  CatBoost   0.659091  0.635290
5        NN   0.915842  0.608056


  updates=self.state_updates,
2023-10-25 15:32:50.999310: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_40/Sigmoid' id:907 op device:{requested: '', assigned: ''} def:{{{node dense_40/Sigmoid}} = Sigmoid[T=DT_FLOAT, _has_manual_control_dependencies=true](dense_40/BiasAdd)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


### 1.2. oversampling with ADASYN

In [35]:
# Split data
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Apply ADASYN oversampling
adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)


In [36]:
# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result2 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result2)

2023-10-25 15:34:47.396399: W tensorflow/c/c_api.cc:305] Operation '{name:'training_2/Adam/dense_41/bias/v/Assign' id:1647 op device:{requested: '', assigned: ''} def:{{{node training_2/Adam/dense_41/bias/v/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](training_2/Adam/dense_41/bias/v, training_2/Adam/dense_41/bias/v/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
  updates = self.state_updates
2023-10-25 15:34:47.687624: W tensorflow/c/c_api.cc:305] Operation '{name:'loss_1/mul' id:1398 op device:{requested: '', assigned: ''} def:{{{node loss_1/mul}} = Mul[T=DT_FLOAT, _has_manual_control_dependencies=true](loss_1/mul/x, loss_1/dense_43_loss/value)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect,

      Model  Train AUC  Test AUC
0        LR   0.566993  0.560878
1        RF   0.999660  0.645448
2       XGB   0.997908  0.607669
3      LGBM   0.996437  0.622092
4  CatBoost   0.995245  0.603411
5        NN   0.998222  0.552332


  updates=self.state_updates,
2023-10-25 15:35:13.040088: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_43/Sigmoid' id:1370 op device:{requested: '', assigned: ''} def:{{{node dense_43/Sigmoid}} = Sigmoid[T=DT_FLOAT, _has_manual_control_dependencies=true](dense_43/BiasAdd)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


## 2.1 with column creation

In [37]:
df = pd.read_csv('../../database/2016-2022.csv')
df['ev_ebitda'] = np.where((df['ev'] != 0) & (df['ebitda'] != 0), df['ev'] / df['ebitda'], np.nan)
df['ev_ebit'] = np.where((df['ev'] != 0) & (df['ebit'] != 0), df['ev'] / df['ebit'], np.nan)
df["bic_level_2"] = df["bic_level_2"].astype('category')
df["bic_level_3"] = df["bic_level_3"].astype('category')

In [38]:
for col in non_ratio_variables:
    
    # 1. _percentile
    percentile_col = col + '_percentile'
    df[percentile_col] = df.groupby('year')[col].transform(lambda x: x.rank(pct=True) * 100)
    df[percentile_col].fillna(50, inplace=True)
    
    # 2. _10bins_percentile
    df['market_cap_bins'] = df.groupby('year')['market_cap'].transform(lambda x: pd.cut(x, bins=10))
    percentile_10bins_col = col + '_10bins_percentile'
    df[percentile_10bins_col] = df.groupby(['year', 'market_cap_bins'])[col].transform(lambda x: x.rank(pct=True) * 100)
    df[percentile_10bins_col].fillna(50, inplace=True)
    df.drop('market_cap_bins', axis=1, inplace=True)

    # 3. _10bins_normalized
    df['market_cap_bins'] = df.groupby('year')['market_cap'].transform(lambda x: pd.qcut(x, 10, labels=False, duplicates='drop'))
    normalized_col = col + '_10bins_normalized'
    df[normalized_col] = df.groupby(['year', 'market_cap_bins'])[col].transform(lambda x: (x - x.mean()) / x.std())
    df[normalized_col].fillna(0, inplace=True)
    df.drop('market_cap_bins', axis=1, inplace=True)
    
    # 4. _div_market_cap
    div_market_cap_col = col + '_div_market_cap'
    df[div_market_cap_col] = df[col] / df['market_cap']
    
    # 5. _div_log_market_cap
    df['log_market_cap'] = np.log(df['market_cap'])
    div_log_market_cap_col = col + '_div_log_market_cap'
    df[div_log_market_cap_col] = df[col] / df['log_market_cap']
    

In [39]:
def compute_percentile(group):
    if len(group) < 10:
        return pd.Series([None] * len(group), index=group.index, dtype=float)
    return group.rank(pct=True) * 100

def normalize(group):
    if len(group) < 10:
        return pd.Series([None] * len(group), index=group.index, dtype=float)
    return (group - group.mean()) / group.std()

for col in ratio_variables:
    percentile_col = col + '_industry_peers_percentile'
    df[percentile_col] = df.groupby(['year', 'bic_level_3'])[col].transform(compute_percentile)
    mask = df[percentile_col].isna()
    df.loc[mask, percentile_col] = df[mask].groupby(['year', 'bic_level_2'])[col].transform(compute_percentile)
    df[percentile_col].fillna(50, inplace=True)
    df[percentile_col] = df[percentile_col].astype(float)
    normalized_col = col + '_industry_peers_normalized'
    df[normalized_col] = df.groupby(['year', 'bic_level_3'])[col].transform(normalize)
    mask = df[normalized_col].isna()
    df.loc[mask, normalized_col] = df[mask].groupby(['year', 'bic_level_2'])[col].transform(normalize)
    df[normalized_col].fillna(0, inplace=True)
    df[normalized_col] = df[normalized_col].astype(float)


In [40]:
factors = []
for col in non_ratio_variables:
    factors.extend([
        col,
        f'{col}_percentile',
        f'{col}_10bins_percentile',
        f'{col}_10bins_normalized',
        f'{col}_div_market_cap',
        f'{col}_div_log_market_cap'
    ])

for col in ratio_variables:
    factors.extend([
        col,
        f'{col}_industry_peers_percentile',
        f'{col}_industry_peers_normalized'
    ])

factors = factors + binary + technical_variables


In [41]:
# 1. Preparation: Assuming you have already imported pandas as pd and other necessary libraries

# 2. Format your data
data_x = df[factors].values

# 3. Setup GAIN Parameters
gain_parameters = {
    'batch_size': 128,
    'hint_rate': 0.9,
    'alpha': 10,
    'iterations': 10000
}

# 4. Run GAIN
imputed_data = gain(data_x, gain_parameters)

# 5. Replace Missing Data in DataFrame
df[factors] = np.where(np.isnan(df[factors]), imputed_data, df[factors])

100%|████████████████████████████████████| 10000/10000 [00:46<00:00, 212.94it/s]


### 2.1. raw data

In [42]:
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result3 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result3)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2023-10-25 15:39:20.641750: W tensorflow/c/c_api.cc:305] Operation '{name:'training_2/Adam/dense_43/bias/v/Assign' id:1669 op device:{requested: '', assigned: ''} def:{{{node training_2/Adam/dense_43/bias/v/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](training_2/Adam/dense_43/bias/v, training_2/Adam/dense_43/bias/v/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
2023-10-25 15:39

      Model  Train AUC  Test AUC
0        LR   0.557112  0.551380
1        RF   0.602393  0.564660
2       XGB   0.644680  0.626791
3      LGBM   0.666368  0.638430
4  CatBoost   0.662327  0.657538
5        NN   0.942548  0.628492


  updates=self.state_updates,
2023-10-25 15:39:36.221887: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_46/Sigmoid' id:2641 op device:{requested: '', assigned: ''} def:{{{node dense_46/Sigmoid}} = Sigmoid[T=DT_FLOAT, _has_manual_control_dependencies=true](dense_46/BiasAdd)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.


### 2.2. oversampling the ADASYN

In [43]:
train_data = df[df['year'].isin([2016, 2017, 2018, 2019, 2020])]
test_data = df[df['year'] == 2021]

X_train = train_data[factors]
y_train = train_data['targeted']

X_test = test_data[factors]
y_test = test_data['targeted']

# Apply ADASYN oversampling
adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)

# Scaling data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

def create_nn():
    model = keras.Sequential([
        layers.Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

# Models (without neural network for now)
models = {
    "LR": LogisticRegression(max_iter=10000),
    "RF": RandomForestClassifier(n_estimators=100),
    "XGB": xgb.XGBClassifier(),
    "LGBM": lgb.LGBMClassifier(),
    "CatBoost": cb.CatBoostClassifier(verbose=0, iterations=100)
}

# Setup for Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=14)

# Storage for AUC scores
train_aucs = {}
test_aucs = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred_train = cross_val_predict(model, X_train, y_train, cv=cv, method='predict_proba')[:, 1]
    y_pred_test = model.predict_proba(X_test)[:, 1]
    
    train_aucs[model_name] = roc_auc_score(y_train, y_pred_train)
    test_aucs[model_name] = roc_auc_score(y_test, y_pred_test)

# Neural Network
nn_model = create_nn()
nn_model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, verbose=0)
y_pred_train_nn = nn_model.predict(X_train_scaled)
y_pred_test_nn = nn_model.predict(X_test_scaled)
train_aucs["NN"] = roc_auc_score(y_train, y_pred_train_nn)
test_aucs["NN"] = roc_auc_score(y_test, y_pred_test_nn)

# Compile Results
result4 = pd.DataFrame({
    'Model': list(train_aucs.keys()),
    'Train AUC': list(train_aucs.values()),
    'Test AUC': list(test_aucs.values())
})
print(result4)

2023-10-25 15:45:52.403835: W tensorflow/c/c_api.cc:305] Operation '{name:'training_6/Adam/dense_47/bias/m/Assign' id:3347 op device:{requested: '', assigned: ''} def:{{{node training_6/Adam/dense_47/bias/m/Assign}} = AssignVariableOp[_has_manual_control_dependencies=true, dtype=DT_FLOAT, validate_shape=false](training_6/Adam/dense_47/bias/m, training_6/Adam/dense_47/bias/m/Initializer/zeros)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
  updates = self.state_updates
2023-10-25 15:45:52.763204: W tensorflow/c/c_api.cc:305] Operation '{name:'loss_3/mul' id:3132 op device:{requested: '', assigned: ''} def:{{{node loss_3/mul}} = Mul[T=DT_FLOAT, _has_manual_control_dependencies=true](loss_3/mul/x, loss_3/dense_49_loss/value)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect,

      Model  Train AUC  Test AUC
0        LR   0.644877  0.574362
1        RF   0.999942  0.648279
2       XGB   0.998472  0.645321
3      LGBM   0.997082  0.653208
4  CatBoost   0.994339  0.591252
5        NN   0.999982  0.622176


  updates=self.state_updates,
2023-10-25 15:46:23.687476: W tensorflow/c/c_api.cc:305] Operation '{name:'dense_49/Sigmoid' id:3104 op device:{requested: '', assigned: ''} def:{{{node dense_49/Sigmoid}} = Sigmoid[T=DT_FLOAT, _has_manual_control_dependencies=true](dense_49/BiasAdd)}}' was changed by setting attribute after it was run by a session. This mutation will have no effect, and will trigger an error in the future. Either don't modify nodes after running them or create a new session.
