# 4. Data Oversampling

In this notebook, we apply the following oversampling methods:
- random oversampling
- SMOTE(NC)
- Borderline SMOTE
- ADASYN

Note that we only apply oversampling to companies before 2022.

In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler, SMOTENC, BorderlineSMOTE, ADASYN

In [2]:
# Load the four imputed company datasets (median, kNN, MiceForest, GAIN).
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the saved output of this cell shows a warning pointing at the
# median read_csv call -- presumably a mixed-type DtypeWarning. Confirm, and
# consider pinning the offending column(s) with an explicit `dtype=` mapping.
df_median_original = pd.read_csv('../database/companies/imputation/median/median_original.csv', low_memory=False)
df_knn = pd.read_csv('../database/companies/imputation/kNN/kNN_original.csv', low_memory=False)
df_miceforest = pd.read_csv('../database/companies/imputation/MiceForest/MiceForest_original.csv', low_memory=False)
df_gain = pd.read_csv('../database/companies/imputation/GAIN/GAIN_original.csv', low_memory=False)

# The order matters: the oversampling cells zip this list positionally with
# `df_names` and `df_paths`.
df_list = [df_median_original, df_knn, df_miceforest, df_gain]

  df_median_original = pd.read_csv('../database/companies/imputation/median/median_original.csv')


In [3]:
# Feature groups used as model inputs. The order inside each list, and the order
# in which the groups are concatenated into `features`, fixes the column order.
governance = [
    "ceo_is_female",
    "unequal_voting",
    "ceo_tenure",
    "board_size",
    "classified_board_system",
    "poison_pill",
    "buyback_yield",
    "dividend_payout_ratio",
    "cf_to_total_compensation_to_executives",
    "cf_to_total_compensation_to_board_members",
]

operation = [
    "cf_to_capex_industry_peers_percentile",
    "net_debt_to_ebitda_industry_peers_percentile",
    "current_ratio_industry_peers_percentile",
    "ebitda_margin_industry_peers_percentile",
    "sales_to_total_assets_industry_peers_percentile",
    "employee_growth_rate_industry_peers_percentile",
    "fcf_yield_industry_peers_percentile",
    "sales_growth_rate_industry_peers_percentile",
    "cash_conversion_cycle_industry_peers_percentile",
    "interest_coverage_ratio_industry_peers_percentile",
]

ownership = [
    "free_float_percentage",
    "institution_ownership_percentage",
    "insider_shares_percentage",
]

technical = [
    "rsi_14d",
    "rsi_30d",
    "volatility_30d",
    "volatility_90d",
    "volatility_180d",
    "volume_30d_average_to_outstanding",
]

returns = [
    "total_return_5y",
    "total_return_4y",
    "total_return_3y",
    "total_return_2y",
    "total_return_1y",
    "total_return_6m",
    "total_return_3m",
]

valuation = [
    "roe_industry_peers_percentile",
    "operating_roic_industry_peers_percentile",
    "pe_ratio_industry_peers_percentile",
    "eps_industry_peers_percentile",
    "ev_to_sales_industry_peers_percentile",
    "tobin_q_ratio_industry_peers_percentile",
    "pb_ratio_industry_peers_percentile",
    "asset_to_equity_industry_peers_percentile",
    "ev_ebitda_industry_peers_percentile",
    "ev_to_asset_industry_peers_percentile",
]

# Columns treated as binary (0/1) flags; each one also appears in `governance`.
binary = [
    "ceo_is_female",
    "unequal_voting",
    "classified_board_system",
    "poison_pill",
]

# Full feature list: the six groups concatenated in this fixed order.
features = [*governance, *operation, *ownership, *technical, *returns, *valuation]

# Positions of the binary columns inside `features`; handed to SMOTENC as its
# `categorical_features` argument. Raises ValueError if a binary column is
# missing from `features`.
categorical_features_indices = list(map(features.index, binary))

In [4]:
# (base file name, output directory) for each imputed dataset. Entry i lines up
# with entry i of `df_list`, because the oversampling cells zip the three lists
# together positionally.
_dataset_locations = [
    ('median_original', '../database/companies/imputation/median/'),
    ('kNN_original', '../database/companies/imputation/kNN/'),
    ('MiceForest_original', '../database/companies/imputation/MiceForest/'),
    ('GAIN_original', '../database/companies/imputation/GAIN/'),
]

# Base names (no extension); "_original" is later swapped for the method suffix.
df_names = [base_name for base_name, _folder in _dataset_locations]

# Target directories (with trailing slash) the resampled CSVs are written to.
df_paths = [folder for _base_name, folder in _dataset_locations]

## 1. Random Oversampling (ROSE)

In [5]:
# Random oversampling of the training rows of every imputed dataset.
# For each dataset the result (resampled training rows followed by the original
# test rows) is written next to the source file as "<method>_ROSE.csv".
ros = RandomOverSampler(random_state=0)

for df, name, path in zip(df_list, df_names, df_paths):

    # split the training set and test set
    df_training = df[df['training_data'] == 1]
    df_testing = df[df['training_data'] == 0]

    # conduct oversampling for the training set only
    X = df_training[features]
    y = df_training["targeted"]
    X_resampled, y_resampled = ros.fit_resample(X, y)

    # convert it to a pandas dataframe holding features, label and training flag
    df_resampled_training = pd.DataFrame(X_resampled, columns=features)
    df_resampled_training["targeted"] = y_resampled
    df_resampled_training["training_data"] = 1

    # convert the non-binary values of the binary columns to the binary value.
    # This is done on the training frame *before* the concat so the test rows
    # are written out exactly as loaded, matching the Borderline SMOTE and
    # ADASYN cells (previously the thresholding also rewrote the test rows).
    for col in binary:
        if col in df_resampled_training.columns:
            df_resampled_training[col] = (df_resampled_training[col] > 0.5).astype(int)

    # concatenate it with the test data; columns that exist only in the test
    # frame are NaN for the resampled training rows
    df_resampled = pd.concat([df_resampled_training, df_testing], ignore_index=True)

    # save this as the csv file, e.g. "median_original" -> "median_ROSE.csv"
    resampled_path = path + name.replace("_original", "_ROSE") + ".csv"
    df_resampled.to_csv(resampled_path, index=False)


## 2. SMOTENC

In [6]:
# SMOTENC oversampling of the training rows of every imputed dataset.
# The sampler is told which feature positions are categorical through
# `categorical_features_indices`.
smote_nc = SMOTENC(categorical_features=categorical_features_indices, random_state=0)

for df, name, path in zip(df_list, df_names, df_paths):

    # rows flagged 1 get oversampled; rows flagged 0 are carried over unchanged
    train_rows = df.loc[df['training_data'] == 1]
    test_rows = df.loc[df['training_data'] == 0]

    X_resampled, y_resampled = smote_nc.fit_resample(
        train_rows[features],
        train_rows["targeted"],
    )

    # rebuild a frame with the feature columns plus label and training flag
    train_resampled = pd.DataFrame(X_resampled, columns=features).assign(
        targeted=y_resampled,
        training_data=1,
    )

    # resampled training rows first, original test rows after
    df_resampled = pd.concat([train_resampled, test_rows], ignore_index=True)

    # e.g. "median_original" -> "<path>median_SMOTENC.csv"
    out_file = f'{path}{name.replace("_original", "_SMOTENC")}.csv'
    df_resampled.to_csv(out_file, index=False)
    

## 3. Borderline SMOTE

In [7]:
# Borderline SMOTE oversampling of the training rows of every imputed dataset.
# For each dataset the result (resampled training rows followed by the original
# test rows) is written next to the source file as "<method>_BORDERLINESMOTE.csv".
borderline_smote = BorderlineSMOTE(random_state=0)

for df, name, path in zip(df_list, df_names, df_paths):

    # Split on the `training_data` flag, exactly like the other oversampling
    # cells. A second, year-based split (year < 2022 / year == 2022) used to
    # overwrite this one; it made the first split dead code and would drop any
    # row with year > 2022 from the output.
    # NOTE(review): assumes training_data == 1 marks the pre-2022 rows -- confirm.
    df_training = df[df['training_data'] == 1]
    df_testing = df[df['training_data'] == 0]

    # oversample the training set only
    X = df_training[features]
    y = df_training["targeted"]
    X_resampled, y_resampled = borderline_smote.fit_resample(X, y)

    # rebuild a frame with the feature columns plus label and training flag
    df_resampled_training = pd.DataFrame(X_resampled, columns=features)
    df_resampled_training["targeted"] = y_resampled
    df_resampled_training["training_data"] = 1

    # map the binary columns of the resampled rows back to {0, 1} by
    # thresholding at 0.5
    for col in binary:
        if col in df_resampled_training.columns:
            df_resampled_training[col] = (df_resampled_training[col] > 0.5).astype(int)

    # resampled training rows first, original test rows after
    df_resampled = pd.concat([df_resampled_training, df_testing], ignore_index=True)

    # e.g. "median_original" -> "median_BORDERLINESMOTE.csv"
    resampled_path = path + name.replace("_original", "_BORDERLINESMOTE") + ".csv"
    df_resampled.to_csv(resampled_path, index=False)
    

## 4. ADASYN

In [8]:
# ADASYN oversampling of the training rows of every imputed dataset.
# Output per dataset: resampled training rows followed by the original test
# rows, saved as "<method>_ADASYN.csv" in the dataset's directory.
adasyn = ADASYN(random_state=42)

for df, name, path in zip(df_list, df_names, df_paths):

    # partition by the training flag
    is_train = df['training_data'] == 1
    is_test = df['training_data'] == 0
    df_training, df_testing = df[is_train], df[is_test]

    # resample features/label of the training partition
    X, y = df_training[features], df_training["targeted"]
    X_resampled, y_resampled = adasyn.fit_resample(X, y)

    # feature columns plus label and training flag
    df_resampled_training = pd.DataFrame(X_resampled, columns=features)
    df_resampled_training["targeted"] = y_resampled
    df_resampled_training["training_data"] = 1

    # threshold the binary columns at 0.5 so they hold only 0 or 1
    binary_present = [col for col in binary if col in df_resampled_training.columns]
    for col in binary_present:
        df_resampled_training[col] = df_resampled_training[col].gt(0.5).astype(int)

    # training rows first, test rows appended unchanged
    df_resampled = pd.concat([df_resampled_training, df_testing], ignore_index=True)

    # e.g. "median_original" -> "<path>median_ADASYN.csv"
    resampled_path = f'{path}{name.replace("_original", "_ADASYN")}.csv'
    df_resampled.to_csv(resampled_path, index=False)
