In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from pylab import rcParams
import pickle
import importlib
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

In [3]:
# Load the labeled index data from csv
labeled_index_df = pd.read_csv("./data/index/labeled_index.csv")

## Period Adjustment
- To predict a crash on day t, the daily price changes from each day prior to t could be used as a feature

In [4]:
# TODO
month = 3

##  Hyperparameter Tuning

In [20]:
param_grid = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'sag', 'saga']}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(x_train, y_train)
best_params = grid_search.best_params_

## Build the model

In [22]:
# Define features
x = labeled_index_df[['price_change', 'volume', 'volatility']]

# Define target variable
y = labeled_index_df['crash_label']

# Split the training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Scale the features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# Train the logistic regression model by fitting the train data into the model
model = LogisticRegression(C=best_params['C'], solver=best_params['solver'])
model.fit(x_train, y_train)

In [23]:
# Evaluate the model
y_pred = model.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9933290978398983
Confusion Matrix:
[[6252    0]
 [  42    2]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6252
           1       1.00      0.05      0.09        44

    accuracy                           0.99      6296
   macro avg       1.00      0.52      0.54      6296
weighted avg       0.99      0.99      0.99      6296



Problem: low recall and f1 score for crash label 1

## K-fold Validation

In [24]:
# Perform 10-fold cross-validation
scores = cross_val_score(model, x, y, cv=10)

# Print the cross-validation scores
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))

Cross-validation scores: [0.99237611 0.99237611 0.99237611 0.99237611 0.99237611 0.99237611
 0.99237611 0.99237611 0.99237611 0.99205845]
Average cross-validation score: 0.99


## Prediction Result

In [18]:
y_prob = model.predict_proba(x_test)
crash_prob = y_prob[:, 1]
print(f"Crash Probability: {crash_prob.mean()}")

Crash Probability: 0.0073005651712750586
