In [3]:
# Import libraries
import yfinance as yf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.colors as mcolors
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from tabulate import tabulate
import json
import datetime
from datetime import date, timedelta
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

# Set visualization style
sns.set(style='whitegrid', palette='muted', color_codes=True)

# Import additional libraries for machine learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve, log_loss, mean_squared_error, r2_score, ConfusionMatrixDisplay, roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import random
random.seed(42)
np.random.seed(42)
RANDOM_STATE = 42

import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

from plot_utils import *

# SMOTE
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=RANDOM_STATE)

In [2]:
from tensorflow.keras.models import load_model

# If saved as .h5
model = load_model('best_model.keras')
model.summary()
# If saved as SavedModel folder
# model = load_model('best_sentiment_model')


  saveable.load_own_variables(weights_store.get(inner_path))


## Data Entry

In [None]:
import pandas as pd
import numpy as np

# Example: extend dataset to 22 days for demonstration
dates = pd.date_range(start='2024-02-13', periods=22, freq='D')  # 22 consecutive days
close_prices = np.linspace(5200, 5025.7, 22)  # decreasing example prices
titles = [
    # Positive (first 11)
    "Global Stock Markets Surge on Strong Economic Data",
    "Investor Confidence Boosted as Companies Report Record Profits",
    "Tech Stocks Soar Following Breakthrough Product Launches",
    "Major Banks Report Better-than-Expected Earnings",
    "Financial Experts Optimistic About Market Recovery",
    "Central Bank Signals Supportive Monetary Policy",
    "Commodity Prices Rise on Strong Global Demand",
    "Emerging Markets See Influx of Foreign Investment",
    "Housing Market Shows Signs of Robust Growth",
    "Equity Markets Reach New Record Highs",
    "Corporate Earnings Exceed Analysts’ Expectations",

    # Negative (last 12)
    "Global Stock Markets Plunge as Major Banks Face Insolvency Crisis",
    "Investor Confidence Shattered as Top Banks Report Insolvency",
    "Stock Exchanges Tumble Amid Banking Sector Liquidity Concerns",
    "Major Banks Struggle as Market Volatility Surges",
    "Financial Experts Warn of Impending Market Downturn",
    "Central Bank Announces Emergency Rate Hike Amid Crisis",
    "Tech Stocks Collapse Following Regulatory Investigation",
    "Commodity Prices Crash as Global Demand Slows",
    "Corporate Debt Defaults Spike Amid Economic Uncertainty",
    "Currency Markets Wobble as Investor Panic Escalates",
    "Mortgage-Backed Securities Face Sudden Valuation Drop",
]


example_df = pd.DataFrame({
    'Title': titles,
    'Date': dates,
    'Close': close_prices
})

prev_21_df = example_df.iloc[:21].copy()

new_entry_df = example_df.iloc[21:].copy()

print("Previous 21 days:")
print(prev_21_df)

print("\nNew entry day:")
print(new_entry_df)


Previous 21 days:
                                                Title       Date   Close
0   Global Stock Markets Surge on Strong Economic ... 2024-02-13  5200.0
1   Investor Confidence Boosted as Companies Repor... 2024-02-14  5191.7
2   Tech Stocks Soar Following Breakthrough Produc... 2024-02-15  5183.4
3    Major Banks Report Better-than-Expected Earnings 2024-02-16  5175.1
4   Financial Experts Optimistic About Market Reco... 2024-02-17  5166.8
5     Central Bank Signals Supportive Monetary Policy 2024-02-18  5158.5
6       Commodity Prices Rise on Strong Global Demand 2024-02-19  5150.2
7   Emerging Markets See Influx of Foreign Investment 2024-02-20  5141.9
8         Housing Market Shows Signs of Robust Growth 2024-02-21  5133.6
9               Equity Markets Reach New Record Highs 2024-02-22  5125.3
10   Corporate Earnings Exceed Analysts’ Expectations 2024-02-23  5117.0
11  Global Stock Markets Plunge as Major Banks Fac... 2024-02-24  5108.7
12  Investor Confidence Shattered

### Sentiment: 

Title -> sentiment_score_finbert -> avg_sentiment_score_a_day -> 22_day_sentiment_volatility

In [111]:
# Load FinBERT (pre-trained for financial sentiment)
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finbert')
model = AutoModelForSequenceClassification.from_pretrained('ProsusAI/finbert')

def get_sentiment_score(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Custom score: Positive = +1, Neutral = 0, Negative = -1
    print(probs[0])

    score = (probs[0][0] - probs[0][1]).item()
    return score

In [112]:
tqdm.pandas()
prev_21_df['sentiment_score_finbert'] = prev_21_df['Title'].progress_apply(get_sentiment_score)
new_entry_df['sentiment_score_finbert'] = new_entry_df['Title'].progress_apply(get_sentiment_score)
print(prev_21_df)
print(new_entry_df)

 19%|█▉        | 4/21 [00:00<00:00, 31.09it/s]

tensor([0.8803, 0.0580, 0.0618], grad_fn=<SelectBackward0>)
tensor([0.9035, 0.0374, 0.0592], grad_fn=<SelectBackward0>)
tensor([0.9266, 0.0219, 0.0515], grad_fn=<SelectBackward0>)
tensor([0.9298, 0.0508, 0.0195], grad_fn=<SelectBackward0>)
tensor([0.6382, 0.0290, 0.3328], grad_fn=<SelectBackward0>)


 52%|█████▏    | 11/21 [00:00<00:00, 27.39it/s]

tensor([0.9420, 0.0182, 0.0398], grad_fn=<SelectBackward0>)
tensor([0.8041, 0.1542, 0.0417], grad_fn=<SelectBackward0>)
tensor([0.9152, 0.0201, 0.0647], grad_fn=<SelectBackward0>)
tensor([0.9495, 0.0212, 0.0293], grad_fn=<SelectBackward0>)
tensor([0.7476, 0.0953, 0.1571], grad_fn=<SelectBackward0>)
tensor([0.9266, 0.0322, 0.0412], grad_fn=<SelectBackward0>)


 67%|██████▋   | 14/21 [00:00<00:00, 26.18it/s]

tensor([0.0098, 0.9536, 0.0366], grad_fn=<SelectBackward0>)
tensor([0.0196, 0.9413, 0.0392], grad_fn=<SelectBackward0>)
tensor([0.0139, 0.9474, 0.0387], grad_fn=<SelectBackward0>)
tensor([0.0192, 0.9432, 0.0376], grad_fn=<SelectBackward0>)
tensor([0.0169, 0.9443, 0.0388], grad_fn=<SelectBackward0>)


100%|██████████| 21/21 [00:00<00:00, 23.27it/s]


tensor([0.2473, 0.4917, 0.2610], grad_fn=<SelectBackward0>)
tensor([0.0129, 0.9323, 0.0548], grad_fn=<SelectBackward0>)
tensor([0.0100, 0.9642, 0.0258], grad_fn=<SelectBackward0>)
tensor([0.0268, 0.9600, 0.0132], grad_fn=<SelectBackward0>)
tensor([0.0187, 0.9195, 0.0618], grad_fn=<SelectBackward0>)


100%|██████████| 1/1 [00:00<00:00, 24.09it/s]

tensor([0.0110, 0.9450, 0.0440], grad_fn=<SelectBackward0>)
                                                Title       Date   Close  \
0   Global Stock Markets Surge on Strong Economic ... 2024-02-13  5200.0   
1   Investor Confidence Boosted as Companies Repor... 2024-02-14  5191.7   
2   Tech Stocks Soar Following Breakthrough Produc... 2024-02-15  5183.4   
3    Major Banks Report Better-than-Expected Earnings 2024-02-16  5175.1   
4   Financial Experts Optimistic About Market Reco... 2024-02-17  5166.8   
5     Central Bank Signals Supportive Monetary Policy 2024-02-18  5158.5   
6       Commodity Prices Rise on Strong Global Demand 2024-02-19  5150.2   
7   Emerging Markets See Influx of Foreign Investment 2024-02-20  5141.9   
8         Housing Market Shows Signs of Robust Growth 2024-02-21  5133.6   
9               Equity Markets Reach New Record Highs 2024-02-22  5125.3   
10   Corporate Earnings Exceed Analysts’ Expectations 2024-02-23  5117.0   
11  Global Stock Markets Plu




In [121]:
volatility_period = 22
raw_merged_df = pd.concat([prev_21_df, new_entry_df], ignore_index=True)

daily_sentiment = raw_merged_df.groupby('Date')['sentiment_score_finbert'].mean().reset_index()
daily_sentiment.rename(columns={'sentiment_score_finbert': 'avg_sentiment_score_a_day'}, inplace=True)

daily_sentiment[f'{volatility_period}_day_sentiment_volatility'] = daily_sentiment['avg_sentiment_score_a_day'].rolling(window=volatility_period).std()

merged_df = pd.merge(raw_merged_df[['Date', 'Close']].drop_duplicates(), daily_sentiment, on='Date', how='left')

prev_21_merged = merged_df.iloc[:21].copy()
new_entry_merged = merged_df.iloc[21:].copy()

print("Previous 21 days with volatility:")
print(prev_21_merged)

print("\nNew entry day with volatility:")
print(new_entry_merged)


Previous 21 days with volatility:
         Date   Close  avg_sentiment_score_a_day  22_day_sentiment_volatility
0  2024-02-13  5200.0                   0.822283                          NaN
1  2024-02-14  5191.7                   0.866107                          NaN
2  2024-02-15  5183.4                   0.904664                          NaN
3  2024-02-16  5175.1                   0.878993                          NaN
4  2024-02-17  5166.8                   0.609170                          NaN
5  2024-02-18  5158.5                   0.923874                          NaN
6  2024-02-19  5150.2                   0.649844                          NaN
7  2024-02-20  5141.9                   0.895059                          NaN
8  2024-02-21  5133.6                   0.928305                          NaN
9  2024-02-22  5125.3                   0.652382                          NaN
10 2024-02-23  5117.0                   0.894442                          NaN
11 2024-02-24  5108.7         

### Market

In [122]:
# Calculate n-day market return
merged_df[f'daily_return'] = merged_df['Close'].pct_change() * 100
merged_df[f'{volatility_period}_day_market_volatility'] = merged_df['Close'].rolling(window=volatility_period).std()

print(merged_df)

         Date   Close  avg_sentiment_score_a_day  22_day_sentiment_volatility  \
0  2024-02-13  5200.0                   0.822283                          NaN   
1  2024-02-14  5191.7                   0.866107                          NaN   
2  2024-02-15  5183.4                   0.904664                          NaN   
3  2024-02-16  5175.1                   0.878993                          NaN   
4  2024-02-17  5166.8                   0.609170                          NaN   
5  2024-02-18  5158.5                   0.923874                          NaN   
6  2024-02-19  5150.2                   0.649844                          NaN   
7  2024-02-20  5141.9                   0.895059                          NaN   
8  2024-02-21  5133.6                   0.928305                          NaN   
9  2024-02-22  5125.3                   0.652382                          NaN   
10 2024-02-23  5117.0                   0.894442                          NaN   
11 2024-02-24  5108.7       

In [134]:
# For example, we'll make up a daily_return for the first date
merged_df.at[0, 'daily_return'] = -0.159442

### VaR and ES

In [138]:
def add_var_es_features(df, return_col='daily_return', w=volatility_period, confidence=0.05):
    df = df.copy()
    z_score = abs(stats.norm.ppf(confidence))

    # Rolling mean and std of returns
    rolling_mu = df[return_col].rolling(window=w).mean()
    rolling_sigma = df[return_col].rolling(window=w).std()

    # Parametric Gaussian VaR (1-day, % return)
    df[f'VaR_{w}'] = - (rolling_mu + z_score * rolling_sigma)

    # Expected Shortfall (ES) assuming normal distribution
    pdf_factor = np.exp(-0.5 * z_score**2) / (np.sqrt(2 * np.pi))
    cdf_factor = confidence
    df[f'ES_{w}'] = - (rolling_mu + (rolling_sigma * pdf_factor / cdf_factor))

    return df

market_sentiment_data = add_var_es_features(merged_df, return_col='daily_return', confidence=0.05)
market_sentiment_data.iloc[21:]

Unnamed: 0,Date,Close,avg_sentiment_score_a_day,22_day_sentiment_volatility,daily_return,22_day_market_volatility,VaR_22,ES_22
21,2024-03-05,5025.7,-0.934014,0.879304,-0.164879,53.896769,0.159298,0.158587


## Modelling

### CNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.utils import class_weight

def cnn_model(n, feature_cols, market_sentiment_data, cnn_results, X_test_dict, y_test_dict, lag=10, window_size=10):
    market_sentiment_data_with_lags = market_sentiment_data.copy()

    train_data = market_sentiment_data_with_lags[market_sentiment_data_with_lags['Date'] < '2022-01-01']
    test_data = market_sentiment_data_with_lags[(market_sentiment_data_with_lags['Date'] >= '2022-01-01')]
    train_df = train_data.dropna(subset=feature_cols+['future_crash']).copy()
    test_df = test_data.dropna(subset=feature_cols+['future_crash']).copy()

    # Prepare feature matrices and target vectors
    X_train = train_df[feature_cols].fillna(train_df[feature_cols].mean())
    y_train = train_df['future_crash']
    X_train_raw = X_train.values
    y_train_raw = y_train.values

    X_test = test_df[feature_cols].fillna(train_df[feature_cols].mean())
    y_test = test_df['future_crash']
    X_test_raw = X_test.values
    y_test_raw = y_test.values

    X_train_resampled, y_train_resampled = X_train_raw, y_train_raw

    # Convert to DataFrame for convenience
    X_train_resampled = pd.DataFrame(X_train_resampled, columns=X_train.columns)
    y_train_resampled = pd.Series(y_train_resampled)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_resampled)
    X_test_scaled = scaler.transform(X_test_raw)

    X_train_seq, y_train_seq, _ = create_sequences(pd.DataFrame(X_train_scaled, columns=X_train.columns), y_train_resampled, window=window_size)
    X_test_seq, y_test_seq, _ = create_sequences(pd.DataFrame(X_test_scaled, columns=X_test.columns), pd.Series(y_test_raw), window=window_size)

    # Define CNN model
    cnn_model = Sequential([
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(window_size, X_train_seq.shape[2])),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=128, kernel_size=3, activation='relu'),
        GlobalMaxPooling1D(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')  # Binary classification
    ])

    cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

    class_weights_array = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_train_seq),
        y=y_train_seq
    )
    class_weight_dict = dict(enumerate(class_weights_array))

    # Train
    cnn_model.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32, validation_split=0.2, verbose=0, 
                class_weight=class_weight_dict)

    y_prob = cnn_model.predict(X_test_seq).flatten()
    best_threshold = dynamic_threshold_calculate(y_test_seq, y_prob)
    y_pred = (y_prob > best_threshold).astype(int)

    # Evaluate
    auc_score = roc_auc_score(y_test_seq, y_prob)
    report = classification_report(y_test_seq, y_pred, output_dict=True)
    conf = confusion_matrix(y_test_seq, y_pred)
    date_seq = test_df['Date'].iloc[window_size:].reset_index(drop=True)
    
    # Save model and results
    cnn_results[n] = {
        'model': cnn_model,
        'auc_score': auc_score,
        'report': report,
        'y_test': y_test_seq,
        'y_prob': y_prob,
        'features': feature_cols,
        'confusion': conf,
        'date_seq': date_seq,
    }

    X_test_dict[n] = X_test_seq
    y_test_dict[n] = y_test_seq

    print(f"\n=== {n}-Day CNN Model ===")
    print(f"AUC: {auc_score:.3f}")
    print(classification_report(y_test_seq, y_pred))