In [5]:
import pandas as pd
import duckdb

# Path to DuckDB database
db_path = r'C:\Users\btada\Documents\financial_news.db'

# Establish a single DuckDB connection
conn = duckdb.connect(database=db_path, read_only=False)

# Optimized SQL Queries (with JOIN to remove redundant filtering)
market_data_query = """
SELECT 
    md.trading_day_date, 
    md.ticker, 
    md.price, 
    md.volume 
FROM Headlines.Market_Data_Daily_Processing md
JOIN Headlines.Trading_Calendar tc 
ON md.trading_day_date = tc.trading_date;
"""

articles_trading_day_query = """
SELECT 
    mapped_trading_date, 
    ticker, 
    article_title 
FROM Headlines.Articles_Trading_Day;
"""

# Fetch Data
market_data = conn.execute(market_data_query).fetchdf()
articles_trading_day = conn.execute(articles_trading_day_query).fetchdf()

# Close connection
conn.close()

# Ensure price changes are computed correctly per stock
market_data["price_change"] = market_data.groupby("ticker")["price"].pct_change() * 100

# Identify notable price movement events (>2% change)
notable_events = market_data[abs(market_data["price_change"]) > 2]

# Merge with articles to find news coverage on those days
notable_events_articles = notable_events.merge(
    articles_trading_day, 
    left_on=["trading_day_date", "ticker"], 
    right_on=["mapped_trading_date", "ticker"], 
    how="left"
)

# Display sample results
print("\nNotable Price Movement Events with Articles")
print(notable_events_articles.head())

# Aggregate Price Movements for Modeling (Create aggregated_results)
aggregated_results = notable_events_articles.groupby(["trading_day_date", "ticker"]).agg(
    price_change=("price_change", "first"),  # Since price change is the same across duplicate rows
    volume=("volume", "first"),  # Volume is the same across duplicate rows
).reset_index()

print("\n✅ Aggregated Results – Sample Price Movements:")
print(aggregated_results.head())


  market_data["price_change"] = market_data.groupby("ticker")["price"].pct_change() * 100



Notable Price Movement Events with Articles
  trading_day_date ticker      price     volume  price_change  \
0       2024-11-21    ORI  38.220001   852850.0      2.001595   
1       2024-12-10    ORI  35.980000  1158504.0     -3.097230   
2       2024-12-17    ORI  36.430000  1479164.0     -2.853334   
3       2024-12-18    ORI  35.630001  1411634.0     -2.195990   
4       2022-01-03   NUVL  18.590000    37765.0     -2.363449   

  mapped_trading_date                                      article_title  
0          2024-11-21  With 73% ownership of the shares, Old Republic...  
1                 NaT                                                NaN  
2                 NaT                                                NaN  
3                 NaT                                                NaN  
4                 NaT                                                NaN  

✅ Aggregated Results – Sample Price Movements:
  trading_day_date ticker  price_change     volume
0       2022-01

In [7]:
# 🛠 Feature Engineering

# Count number of articles per trading day per ticker
article_counts = notable_events_articles.groupby(["trading_day_date", "ticker"]).size().reset_index(name="article_count")

# Merge article counts with notable price movement events
notable_events_articles_aggregated = notable_events_articles.merge(
    article_counts, on=["trading_day_date", "ticker"], how="left"
)

# Identify earnings-related articles
earnings_keywords = ["earnings", "q1", "q2", "q3", "q4", "report", "guidance", "miss", "beat"]
notable_events_articles_aggregated["is_earnings_related"] = notable_events_articles_aggregated["article_title"].str.contains(
    "|".join(earnings_keywords), case=False, na=False
)

# Load Loughran-McDonald Dictionary for High-Risk Words
lm_dict_path = r"C:\Users\btada\Documents\Loughran-McDonald_MasterDictionary_1993-2023.csv"
lm_dict = pd.read_csv(lm_dict_path)

high_risk_words = lm_dict.query("Negative != 0 or Uncertainty != 0 or Litigious != 0")["Word"].str.lower().tolist()

# Count high-risk words in article titles
notable_events_articles_aggregated["high_risk_word_count"] = notable_events_articles_aggregated["article_title"].apply(
    lambda x: sum(word in x.lower() for word in high_risk_words) if pd.notna(x) else 0
)

# Classify articles
def classify_article(row):
    if row["is_earnings_related"]:
        return "Earnings"
    elif row["high_risk_word_count"] > 0:
        return "High-Risk"
    else:
        return "General"

notable_events_articles_aggregated["article_classification"] = notable_events_articles_aggregated.apply(classify_article, axis=1)

# Aggregate Classification Summary
article_classification_summary = notable_events_articles_aggregated.groupby(["trading_day_date", "ticker"]).agg(
    total_articles=("article_title", "count"),
    earnings_articles=("is_earnings_related", "sum"),
    high_risk_articles=("high_risk_word_count", "sum"),
    general_articles=("article_classification", lambda x: (x == "General").sum())
).reset_index()

print("\nFeature Engineering Completed – Sample of Aggregated Classification Summary:")
print(article_classification_summary.head())



Feature Engineering Completed – Sample of Aggregated Classification Summary:
  trading_day_date ticker  total_articles  earnings_articles  \
0       2022-01-01   ACIW               0                  0   
1       2022-01-01    AEL               0                  0   
2       2022-01-01    AEO               0                  0   
3       2022-01-01     AI               0                  0   
4       2022-01-01    ALB               0                  0   

   high_risk_articles  general_articles  
0                   0                 1  
1                   0                 1  
2                   0                 1  
3                   0                 1  
4                   0                 1  


In [8]:
# 📊 Risk Score Computation

# News Risk Score: Weighted sum of high-risk, earnings, and general articles
notable_events_articles_aggregated["news_risk_score"] = (
    notable_events_articles_aggregated["high_risk_word_count"] * 3 +  # High-Risk Words → 3x weight
    notable_events_articles_aggregated["is_earnings_related"].astype(int) * 2 +  # Earnings Articles → 2x weight
    notable_events_articles_aggregated["article_count"] * 1  # General Articles → 1x weight
)

# Normalize price change to scale price impact between 0 and 10
max_price_change = notable_events_articles_aggregated["price_change"].abs().max()
notable_events_articles_aggregated["price_impact_score"] = (
    notable_events_articles_aggregated["price_change"].abs() / max_price_change
) * 10

# Final Weighted Risk Score: News (60%) + Price Impact (40%)
notable_events_articles_aggregated["final_risk_score"] = (
    notable_events_articles_aggregated["news_risk_score"] * 0.6 +
    notable_events_articles_aggregated["price_impact_score"] * 0.4
)

# Aggregate Risk Scores per Stock per Trading Day (Ensure no double counting)
risk_score_summary = notable_events_articles_aggregated.groupby(["trading_day_date", "ticker"]).agg(
    total_articles=("article_count", "sum"),
    total_high_risk_articles=("high_risk_word_count", "sum"),
    total_earnings_articles=("is_earnings_related", "sum"),
    avg_news_risk_score=("news_risk_score", "mean"),
    avg_price_impact_score=("price_impact_score", "mean"),
    avg_final_risk_score=("final_risk_score", "mean")
).reset_index()

print("\nRisk Score Computation Complete – Sample Risk Scores:")
print(risk_score_summary.head())



Risk Score Computation Complete – Sample Risk Scores:
  trading_day_date ticker  total_articles  total_high_risk_articles  \
0       2022-01-01   ACIW               1                         0   
1       2022-01-01    AEL               1                         0   
2       2022-01-01    AEO               1                         0   
3       2022-01-01     AI               1                         0   
4       2022-01-01    ALB               1                         0   

   total_earnings_articles  avg_news_risk_score  avg_price_impact_score  \
0                        0                  1.0                0.144540   
1                        0                  1.0                0.088733   
2                        0                  1.0                0.155312   
3                        0                  1.0                0.057692   
4                        0                  1.0                0.312690   

   avg_final_risk_score  
0              0.657816  
1              

### **Current Risk Score Computation – Explanation**

The **Risk Score** in this notebook is designed to measure **the potential market impact of news articles** on **a stock’s daily price movement**. It combines **textual analysis from news articles** with **actual price changes** to capture **the relationship between news sentiment and stock volatility**.

---

### **1️Components of the Risk Score**
The risk score is calculated based on **two primary factors**:

#### **1. News-Based Risk Indicators (Weighted Scoring System)**  
Each news article is evaluated based on **its content** and **classified** into:
| **Indicator**            | **Description**                                        | **Weight** |
|--------------------------|--------------------------------------------------------|------------|
| **High-Risk Words**       | Articles containing **negative, uncertainty, or legal terms** (e.g., "lawsuit", "bankrupt", "uncertain") from the **Loughran-McDonald Dictionary**. | **+3 per occurrence** |
| **Earnings Mentions**     | Articles containing **earnings-related terms** (e.g., "earnings", "report", "guidance", "miss", "beat"). | **+2 per article** |
| **General News Volume**   | **All other articles** that do not fall into the above categories. | **+1 per article** |

This results in a **news risk score** per trading day:
\[
\text{News Risk Score} = 3 \times (\text{High-Risk Words}) + 2 \times (\text{Earnings Articles}) + 1 \times (\text{General Articles})
\]

#### **2. Price Impact Normalization**
Price movement is also factored in, as **large price swings** (up or down) often indicate **high market volatility**.  
Price impact is **normalized** to a **0-10 scale** based on the **largest observed price change** across the dataset:
\[
\text{Price Impact Score} = \left(\frac{\left|\text{Price Change (\%)}\right|}{\text{Max Price Change (\%)}}\right) \times 10
\]

---

### **2Final Risk Score Formula**
The **final risk score** combines **news risk indicators** and **price impact** with **a weighted formula**:
\[
\text{Final Risk Score} = (0.6 \times \text{News Risk Score}) + (0.4 \times \text{Price Impact Score})
\]

- **60% Weight → News Risk (Content/Sentiment)**  
- **40% Weight → Price Impact (Market Reaction)**

This **weighted approach** reflects the idea that **news sentiment often drives price changes**, but **large price swings** themselves may **indicate risk** regardless of news coverage.

---

### **Summary**
| **Risk Component**         | **Purpose**                                                      | **Weighting in Final Score** |
|----------------------------|-------------------------------------------------------------------|-------------------------------|
| **News Risk Indicators**    | Evaluate **news content** using high-risk words, earnings, and volume. | **60%** |
| **Price Impact Normalization** | Capture **market volatility** based on daily price movements.   | **40%** |

The **Risk Score** aims to **quantify the relationship between market sentiment and price volatility**, providing a **composite indicator** that can be **used for modeling and prediction**.

---

### **Why This Approach?**
- **High-Risk Words & Earnings are prioritized** because **negative sentiment or financial disclosures** often **signal uncertainty**.
- **Price Impact adds market confirmation**, ensuring that **large movements** are also flagged as **risky** even if **news coverage is light**.

In [11]:
# Model Training

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# Final Modeling DataFrame: Merge Risk Scores with Price Changes from aggregated_results
risk_price_validation = risk_score_summary.merge(
    aggregated_results[["trading_day_date", "ticker", "price_change"]],
    on=["trading_day_date", "ticker"],
    how="left"
)

nltk.download('vader_lexicon')

# 1️⃣ Lagged Risk Score (Previous Day's Risk)
risk_price_validation["lagged_risk_score"] = risk_price_validation.groupby("ticker")["avg_final_risk_score"].shift(1)

# 2️⃣ Sentiment-Weighted Risk Score
# Merge back article titles to get sentiment
risk_price_validation = risk_price_validation.merge(
    notable_events_articles_aggregated[["trading_day_date", "ticker", "article_title"]],
    on=["trading_day_date", "ticker"],
    how="left"
)

sia = SentimentIntensityAnalyzer()
risk_price_validation["sentiment_score"] = risk_price_validation["article_title"].fillna("").apply(
    lambda x: sia.polarity_scores(x)["compound"]
)

risk_price_validation["adjusted_risk_score"] = (
    risk_price_validation["avg_final_risk_score"] + (-risk_price_validation["sentiment_score"] * 5)
)

# Optional: Clean up the DataFrame if you no longer need article titles
risk_price_validation.drop(columns=["article_title", "sentiment_score"], inplace=True)


# Feature Selection & Data Preparation
features = [
    "avg_final_risk_score", "adjusted_risk_score", "lagged_risk_score", 
    "total_articles", "total_high_risk_articles", "total_earnings_articles"
]
target = "price_change"

ml_data = risk_price_validation.dropna(subset=features + [target])

X = ml_data[features]
y = ml_data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define Models
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Ridge Regression": Ridge(alpha=1.0),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42, objective="reg:squarederror"),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    #"Support Vector Regression (SVR)": SVR(kernel="linear"),
    "Neural Network (MLP)": MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=200, warm_start=True, random_state=42),
}

# Train & Evaluate Models
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {"MAE": mae, "MSE": mse, "R²": r2}

# Display Model Performance
import pandas as pd
from tabulate import tabulate

model_comparison = pd.DataFrame(results).T
print("\nModel Comparison")
print(tabulate(model_comparison, headers="keys", tablefmt="psql"))


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\btada/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



Model Comparison
+----------------------+---------+----------+-----------+
|                      |     MAE |      MSE |        R² |
|----------------------+---------+----------+-----------|
| RandomForest         | 4.9505  |  87.1631 | 0.673869  |
| Ridge Regression     | 6.1935  | 266.135  | 0.0042229 |
| XGBoost              | 5.72131 |  90.057  | 0.663041  |
| Decision Tree        | 4.77659 | 133.238  | 0.501475  |
| Neural Network (MLP) | 5.97711 |  90.5466 | 0.661209  |
+----------------------+---------+----------+-----------+




### **Approach, Results, and Next Steps Summary**

---

## **Approach Summary**
This project aims to **predict daily stock price movements** by **analyzing news sentiment and price volatility**.  
A **Risk Score** was constructed to **quantify the relationship between market sentiment and stock volatility** using the following approach:

---

### **Risk Score Computation**
1. **News-Based Indicators (Weighted System)**:
   - **High-Risk Words (Loughran-McDonald Dictionary)** → **+3 per occurrence**.
   - **Earnings-Related Articles** → **+2 per article**.
   - **General News Volume** → **+1 per article**.

2. **Price Impact Normalization**:
   - **Price changes are scaled** to **0-10** based on the **largest observed daily movement**.

3. **Final Risk Score**:
   \[
   \text{Final Risk Score} = (0.6 \times \text{News Risk Score}) + (0.4 \times \text{Price Impact Score})
   \]

---

### **Feature Engineering for Modeling**
- **Lagged Risk Score** (previous day’s risk) → captures potential **momentum effects**.
- **Adjusted Risk Score (VADER Sentiment)** → adjusts the risk score based on **article sentiment polarity**.
- **Final features**:
  - `avg_final_risk_score`  
  - `adjusted_risk_score`  
  - `lagged_risk_score`  
  - `total_articles`  
  - `total_high_risk_articles`  
  - `total_earnings_articles`

---

### **Model Training**
Models Trained:
| **Model**              | **Key Characteristics** |
|------------------------|--------------------------|
| **RandomForest**        | Non-linear, robust to noise. |
| **Ridge Regression**    | Linear model with L2 regularization. |
| **XGBoost**             | Gradient boosting model; often excels in structured data. |
| **Decision Tree**        | Simple, interpretable, but prone to overfitting. |
| **Neural Network (MLP)** | Captures complex patterns but requires tuning. |

---

## **Results Summary**
| **Model**              | **MAE** | **MSE** | **R²** |
|------------------------|---------|---------|--------|
| **RandomForest**        | **4.95** | **87.16** | **0.674** |
| **Ridge Regression**    | 6.19    | 266.13  | 0.004  |
| **XGBoost**             | 5.72    | 90.06   | 0.663  |
| **Decision Tree**        | 4.78    | 133.24  | 0.501  |
| **Neural Network (MLP)** | 5.98    | 90.55   | 0.661  |

---

### **Key Takeaways:**
- **RandomForest** and **XGBoost** performed the best, achieving **R² ~ 0.67**, suggesting that **the Risk Score and sentiment features capture some meaningful relationship** with price changes.
- **Linear models like Ridge Regression performed poorly**, indicating **the relationship between news and price movements is non-linear**.
- **Neural Networks and Decision Trees performed moderately well** but **were outperformed by ensemble models like RandomForest**.

---

## **Next Steps**
### **Incorporate FinBERT Sentiment Scores**
While **VADER** sentiment scores were useful, **FinBERT** is a **finance-specific sentiment model** trained on **financial news and filings**.  
**Replacing or combining VADER with FinBERT** could **improve sentiment precision** and **further enhance the Adjusted Risk Score**.

#### **Integrate FinBERT Sentiment (Future Update):**
1. **Join `finbert_sentiment` Table** (from the ERD you shared) with the `notable_events_articles_aggregated` DataFrame.
2. **Use the sentiment polarity/score from FinBERT** to replace **or adjust the VADER sentiment score**.
3. **Update the `adjusted_risk_score` feature** to incorporate **FinBERT sentiment**.
4. **Compare Model Performance** before/after.

---

### **Article-Level Modeling (Alternative Approach)**  
Instead of **aggregating by day**, **each article becomes one observation**.  
This approach allows us to ask:  
**“Given this article title, what is the expected price change?”**

#### **Steps to Implement:**
1. **Treat Each Article as a Row**.
2. **Features**:
   - **High-Risk Word Count** (title & description).
   - **FinBERT Sentiment Score**.
   - **VADER Sentiment (optional for comparison)**.
   - **Article Metadata** (e.g., publication time, source).
   - **Earnings mention flag**.
3. **Target Variable: Price Movement Window**:
   - **Same day price change**.
   - **T+1 or T+2 price change**.
   - **Define a window** (e.g., **T to T+1 price change**).
4. **Model Training**:
   - Regression models to **predict price change** from **article features**.

---

### **Hybrid Approach (Article + Daily Signals)**
This **keeps the daily aggregation but introduces article-level features** as **additional signals**:
| **Granular Features**                        | **Example Integration**                               |
|-----------------------------------------------|--------------------------------------------------------|
| **Was there a High-Risk Article today?**      | Binary 0/1 feature.                                    |
| **Sentiment of the most negative article?**   | Min sentiment score for the day.                       |
| **How many articles had sentiment < -0.5?**   | Count of highly negative articles.                     |
| **Article Volume for the day?**               | Already present (`total_articles`).                    |

#### **Steps to Implement:**
1. **Extract these granular features during Risk Score Computation.**
2. **Add them to `risk_price_validation`** as **additional input features**.
3. **Evaluate whether these “most negative” or “high-risk” articles drive stronger market reactions**.

---

## **Summary of Next Steps**
| **Priority** | **Action**                                   | **Expected Benefit**                                         |
|--------------|----------------------------------------------|--------------------------------------------------------------|
| **High**     | **Integrate `finbert_sentiment` as a feature**| More precise **finance-specific sentiment analysis**.         |
| **Medium**   | **Hybrid Approach (Add Article-Level Granular Features)** | Capture **maximum negative sentiment or high-risk counts**.  |
| **Low**      | **Explore Full Article-Level Model**           | Granular prediction from **individual articles**, but requires **restructuring the data**.

---

## **Potential Plan Moving Forward**
| **Step**                     | **Description**                                 | **Priority** |
|------------------------------|--------------------------------------------------|--------------|
| **1. Integrate `finbert_sentiment`** | Replace or supplement VADER with FinBERT.        | **High**    |
| **2. Add Granular Features** | Track **most negative sentiment & high-risk articles** per day. | **Medium**  |
| **3. Explore Article-Level Model** | Build **article-level dataset** for price prediction. | **Low**     |
