In [12]:
import sys
import os

# Make the repository root (the parent of the current working directory) and
# its code/volatility_analysis folder importable.
parent_dir = os.path.join(os.getcwd(), '..')
project_root = os.path.abspath(parent_dir)
sys.path.extend([
    project_root,
    os.path.join(project_root, 'code', 'volatility_analysis'),
])

# Wildcard import of the pipeline library. Later cells call
# `run_volatility_pipeline` and use `np` without importing them, so both are
# presumably exported by this module -- TODO confirm and import explicitly.
from volatility_pipeline import *

# Switch autograd on globally and echo the resulting state.
import torch
torch.set_grad_enabled(True)
print(f"Gradients enabled: {torch.is_grad_enabled()}")

Gradients enabled: True


In [13]:
# --- Run configuration -------------------------------------------------------
# True  -> download headlines again via fetch_dax_news_data()
# False -> reuse the CSV written by an earlier fetch
FETCH_DATA = False

# Date range used for both the news search and the market-data download.
FROM = "2023-01-01"
TO = "2025-12-31"

# Passed to the pipeline as `cut_date` (train/test split).
CUT = "2024-10-01"

TOPIC = "BUSINESS"
MARKET_NAME = "DAX 40"

# Length, in days, of each news-search slice.
WINDOW = 7

EPOCHS = 200

# Accepted values: "improved", "simple"
LSTM_TYPE = "simple"
USE_TECHNICAL_INDICATORS = False

In [14]:
# German search terms for the news query (English gloss on the right).
keywords = [
    "dax",                  # the German stock index itself
    "inflation",            # inflation
    "zinssatz",             # interest rate
    "zölle",                # tariffs
    "wirtschaftswachstum",  # economic growth
    "rezession",            # recession
    "arbeitslosigkeit",     # unemployment
    "subventionen",         # subsidies
    "investition",          # investment
    "expansion",            # expansion
]

# Single query string: every term double-quoted, terms joined with OR.
KEYWORDS = " OR ".join(f'"{term}"' for term in keywords)

In [15]:
import pandas as pd, time, random
from pygooglenews import GoogleNews
# Fetch news data function
def fetch_dax_news_data():
    """Download Google News headlines matching KEYWORDS between FROM and TO.

    The date range is walked in consecutive slices of WINDOW days, issuing one
    search per slice with a random pause in between. The per-slice results are
    concatenated, exact duplicate rows are dropped, the trailing
    " - <news source>" suffix is removed from each title, and the result is
    written to ``../news/news_data_{FROM}_{TO}_DAX.csv``.

    Returns:
        pandas.DataFrame: columns ``date`` (timestamps with the timezone
        stripped) and ``title``.

    Raises:
        ValueError: if WINDOW is smaller than 1, because the search window
            would never advance and the loop would not terminate.
    """
    if WINDOW < 1:
        raise ValueError(f"WINDOW must be at least 1 day, got {WINDOW}")

    gn = GoogleNews(lang="de", country="DE")

    window_frames = []

    start_dt = pd.to_datetime(FROM)
    end_dt = pd.to_datetime(TO)

    while start_dt <= end_dt:
        # Inclusive slice of WINDOW days, clipped to the overall end date.
        stop_dt = min(start_dt + pd.Timedelta(days=WINDOW - 1), end_dt)

        feed = gn.search(KEYWORDS,
                         from_=start_dt.strftime("%Y-%m-%d"),
                         to_=stop_dt.strftime("%Y-%m-%d"))
        entries = feed["entries"]

        window_frames.append(pd.DataFrame({
            "date": [pd.to_datetime(e.published).tz_localize(None)
                     for e in entries],
            "title": [e.title for e in entries],
        }))

        # polite pause to avoid 429 errors
        time.sleep(random.uniform(1.0, 2.5))
        start_dt = stop_dt + pd.Timedelta(days=1)

    if window_frames:
        news_df = (pd.concat(window_frames, ignore_index=True)
                   .drop_duplicates())          # remove overlaps
    else:
        # FROM is after TO: no searches were issued and pd.concat([]) would raise.
        news_df = pd.DataFrame(columns=["date", "title"])

    # Remove only the trailing " - <news source>" suffix. The lookahead forbids
    # another " - " inside the matched tail, so the match starts at the LAST
    # separator; the former pattern r' - .+$' started at the first one and
    # truncated headlines that contain " - " themselves.
    news_df['title'] = news_df['title'].str.replace(
        r' - (?:(?! - ).)+$', '', regex=True)

    # Save to CSV
    os.makedirs("../news", exist_ok=True)
    news_df.to_csv(f"../news/news_data_{FROM}_{TO}_DAX.csv", index=False)

    return news_df

In [16]:
# Reuse the cached CSV unless a fresh download was requested via FETCH_DATA.
if not FETCH_DATA:
    # The `date` column is parsed back into timestamps on load.
    news_df = pd.read_csv(f"../news/news_data_{FROM}_{TO}_DAX.csv", parse_dates=['date'])
    print(f"Loaded {len(news_df)} news articles")
else:
    news_df = fetch_dax_news_data()
    print(f"Fetched {len(news_df)} news articles")

Loaded 11711 news articles


In [17]:
# numpy is imported explicitly: this cell previously relied on `np` leaking in
# through `from volatility_pipeline import *`.
import numpy as np
import yfinance

# Rolling window (in rows, i.e. daily bars) for the volatility estimate.
VOLATILITY_WINDOW = 21
# Annualisation factor; 252 is presumably the number of trading days per year.
ANNUALISATION_DAYS = 252

# Fetch DAX 40 market data using yfinance
dax40 = yfinance.Ticker("^GDAXI")
dax40_data = dax40.history(start=FROM, end=TO, interval="1d", auto_adjust=True)
dax40_data = dax40_data.reset_index()  # Convert index to column

# Daily simple returns and their rolling standard deviation, scaled to a year.
dax40_data["Returns"] = dax40_data["Close"].pct_change()
dax40_data["Volatility"] = (
    dax40_data["Returns"].rolling(window=VOLATILITY_WINDOW).std()
    * np.sqrt(ANNUALISATION_DAYS)
)

# The leading rows have no return / no full rolling window yet; drop them.
dax40_data = dax40_data.dropna().reset_index(drop=True)

print(f"Loaded {len(dax40_data)} days of DAX 40 market data")
dax40_data.head()

Loaded 589 days of DAX 40 market data


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Returns,Volatility
0,2023-01-31 00:00:00+01:00,15081.169922,15136.219727,14993.589844,15128.269531,57047200,0.0,0.0,0.000145,0.123464
1,2023-02-01 00:00:00+01:00,15125.120117,15222.339844,15107.830078,15180.740234,57095200,0.0,0.0,0.003468,0.122377
2,2023-02-02 00:00:00+01:00,15275.0,15520.969727,15264.30957,15509.19043,122979300,0.0,0.0,0.021636,0.122071
3,2023-02-03 00:00:00+01:00,15408.400391,15476.669922,15347.860352,15476.429688,76253700,0.0,0.0,-0.002112,0.121006
4,2023-02-06 00:00:00+01:00,15367.040039,15406.929688,15275.570312,15345.910156,54430400,0.0,0.0,-0.008433,0.123242


In [None]:
# Train and evaluate the model that uses the news data alongside market data.
# Keyword arguments are grouped by concern; the values are unchanged.
results = run_volatility_pipeline(
    # inputs
    news_df=news_df,
    stock_data=dax40_data,
    market_name=MARKET_NAME,
    # split and output location
    cut_date=CUT,
    output_dir="../news",
    # model
    lstm_type=LSTM_TYPE,
    use_technical_indicators=USE_TECHNICAL_INDICATORS,
    seq_len=10,
    # training
    epochs=EPOCHS,
    learning_rate=0.001,
    patience=100,
    verbose=True
)

Splitting data at 2024-10-01...
Sentiment model loaded on device: cuda
GPU: NVIDIA GeForce RTX 4070
Calculating enhanced sentiment scores...


In [None]:
# Baseline run: identical inputs and hyper-parameters to the sentiment run,
# with `use_sentiment=False` as the only intended difference.
results_no_sentiment = run_volatility_pipeline(
    news_df=news_df,
    stock_data=dax40_data,
    market_name=MARKET_NAME + " (No Sentiment)",
    cut_date=CUT,
    output_dir="../news",
    seq_len=10,
    epochs=EPOCHS,
    learning_rate=0.001,
    use_sentiment=False,
    verbose=True,
    use_technical_indicators=USE_TECHNICAL_INDICATORS,
    lstm_type=LSTM_TYPE,
    # Same patience as the sentiment run. It was missing here, so the baseline
    # fell back to the pipeline's default and the comparison was not
    # like-for-like.
    patience=100,
)

In [None]:
# Explicit imports: this cell previously relied on `np` coming from the
# wildcard pipeline import and reached DateFormatter through the
# `plt.matplotlib` attribute instead of importing matplotlib.dates.
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np

# Plot the absolute prediction error over the test dates for both models.
fig, ax = plt.subplots(figsize=(14, 6))
for run, label, color in (
    (results, 'With Sentiment', 'blue'),
    (results_no_sentiment, 'Without Sentiment', 'orange'),
):
    ax.plot(
        run['test_dates'],
        np.abs(run['y_actual'] - run['y_pred']),
        label=label,
        color=color,
        linewidth=2,
    )

ax.set_title('LSTM Model Absolute Error Over Time (DAX 40)', fontsize=16, fontweight='bold')
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Absolute Error', fontsize=14)
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Print every metric of both runs, one section per model, to six decimals.
print("\n=== MODEL COMPARISON ===")
for heading, run in (
    ("With Sentiment:", results),
    ("\nWithout Sentiment:", results_no_sentiment),
):
    print(heading)
    for metric_name, value in run['metrics'].items():
        print(f"  {metric_name}: {value:.6f}")