In [8]:
import glob
import importlib
import pickle
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from matplotlib import gridspec
from pylab import rcParams
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_absolute_percentage_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler

#from visualization import plot_cv_indices

if not sys.warnoptions:
    warnings.simplefilter("ignore")


## Adding Features

In [85]:
# Load the labeled index data from csv
labeled_index = pd.read_csv("./data/labeled_data/quarterly_labeled_index_standardized.csv")

# Feature CSVs to attach to the index data - Q1 1998 - Q4 2019
Marco_folder = "./data/Features_Marco"
Micro_folder = "./data/Features_Micro"
csv_file_pattern = "*.csv"
# glob returns matches in arbitrary, OS-dependent order; sort them so the
# feature columns of the merged frame come out in the same order on every run.
Marco_csv_files = sorted(glob.glob(f"{Marco_folder}/{csv_file_pattern}"))
Micro_csv_files = sorted(glob.glob(f"{Micro_folder}/{csv_file_pattern}"))


def _merge_feature_files(base, csv_files, keys):
    """Left-merge one feature column per CSV file onto ``base``.

    Each file must contain the ``keys`` columns and a 'Percentage Change'
    column. That column is renamed to the file's base name (the text before
    the first '.') and joined onto ``base`` on ``keys``.

    Parameters
    ----------
    base : pandas.DataFrame
        Frame to enrich; it is not modified in place.
    csv_files : list of str
        Paths of the feature CSV files, merged in the given order.
    keys : list of str
        Column names used as join keys on both sides.

    Returns
    -------
    pandas.DataFrame
        ``base`` with one extra column per file, same number of rows.

    Raises
    ------
    KeyError
        If a file lacks one of ``keys`` or the 'Percentage Change' column.
    pandas.errors.MergeError
        If a file holds more than one row for the same key combination
        (which would otherwise silently duplicate rows of ``base``).
    """
    merged = base
    for csv_path in csv_files:
        # Path(...).name is separator-agnostic, unlike splitting on '/',
        # so the feature name is also correct for Windows-style paths.
        feature_name = Path(csv_path).name.split('.')[0]
        raw = pd.read_csv(csv_path)

        missing = [col for col in keys + ['Percentage Change'] if col not in raw.columns]
        if missing:
            raise KeyError(f"{csv_path}: missing required column(s) {missing}")

        feature = raw.rename(columns={'Percentage Change': feature_name})
        # Keep only keys + feature so unrelated columns in the file cannot
        # leak into (or collide with) the columns of the merged frame.
        merged = merged.merge(feature[keys + [feature_name]],
                              how='left',
                              on=keys,
                              validate='many_to_one')
    return merged


# Features from Marco_folder are joined on Quarter only, so each value is
# shared by every index row of that quarter.
labeled_index = _merge_feature_files(labeled_index, Marco_csv_files, ['Quarter'])
# Features from Micro_folder are joined per (Quarter, index) pair.
labeled_index = _merge_feature_files(labeled_index, Micro_csv_files, ['Quarter', 'index'])

# Save the merged DataFrame to a CSV file
labeled_index.to_csv("./data/merge_file.csv", index=False)
print(labeled_index)

     volatility      index  crash_label  price_change  volume_change  \
0     -0.628184  000001.SS            0      0.187602      -0.127074   
1     -0.628184  000001.SS            0      0.504626      -0.127074   
2     -0.625325  000001.SS            1     -0.794994      -0.127074   
3     -0.631044  000001.SS            0     -0.842731      -0.127074   
4     -0.629157  000001.SS            0     -0.082791      -0.127074   
..          ...        ...          ...           ...            ...   
875   -0.492953      ^SSMI            0     -0.799923      -0.127074   
876   -0.529507      ^SSMI            0      0.913863      -0.127074   
877   -0.458061      ^SSMI            0      0.217140      -0.127074   
878   -0.387992      ^SSMI            0     -0.010611      -0.127074   
879   -0.351613      ^SSMI            0      0.296246      -0.127074   

           date  Quarter  Crude_Oil_Index_Excess_Return_Quarterly  \
0     31/3/1998  Q1 1998                                -0.159831 

## Build the model