In [2]:
import pandas as pd
import duckdb

# Path to your DuckDB database file
db_path = r'C:\Users\btada\Documents\financial_news.db'

# Connect to the DuckDB database

# Connect to DuckDB
def query_duckdb(query, db_path):
    """Helper function to execute a query on DuckDB and return a DataFrame."""
    conn = duckdb.connect(database=db_path, read_only=False)
    df = conn.execute(query).fetchdf()
    conn.close()
    return df

# SQL Queries
market_data_query = """
SELECT 
    trading_day_date, 
    ticker, 
    price, 
    volume 
FROM Headlines.Market_Data_Daily_Processing;
"""

articles_trading_day_query = """
SELECT 
    mapped_trading_date, 
    ticker, 
    article_title 
FROM Headlines.Articles_Trading_Day;
"""

trading_calendar_query = """
SELECT 
    trading_date 
FROM Headlines.Trading_Calendar;
"""

# Fetch Data
market_data = query_duckdb(market_data_query, db_path)
articles_trading_day = query_duckdb(articles_trading_day_query, db_path)
trading_calendar = query_duckdb(trading_calendar_query, db_path)

# Ensure only valid trading days are considered
market_data = market_data[market_data["trading_day_date"].isin(trading_calendar["trading_date"])]

# Compute daily price change percentage
market_data["price_change"] = market_data["price"].pct_change() * 100

# Identify notable price movement events (threshold: >2% change)
notable_events = market_data[abs(market_data["price_change"]) > 2]

# Merge with articles to find news coverage on those days
notable_events_articles = notable_events.merge(
    articles_trading_day, left_on=["trading_day_date", "ticker"], right_on=["mapped_trading_date", "ticker"], how="left"
)

# Display results
print("Notable Price Movement Events with Articles",notable_events_articles.head())


Notable Price Movement Events with Articles   trading_day_date ticker      price     volume  price_change  \
0       2024-02-06   EXEL  21.830000  1910829.0      2.009344   
1       2024-02-07   EXEL  20.180000  3376200.0     -7.558405   
2       2024-02-26   EXEL  21.959999  2255096.0      6.446922   
3       2024-04-09   EXEL  23.719999  1294084.0      2.506483   
4       2024-04-11   EXEL  22.650000  2357426.0     -4.349667   

  mapped_trading_date                                      article_title  
0                 NaT                                                NaN  
1                 NaT                                                NaN  
2                 NaT                                                NaN  
3          2024-04-09  EXEL or REGN: Which Is the Better Value Stock ...  
4                 NaT                                                NaN  


  market_data["price_change"] = market_data["price"].pct_change() * 100


In [3]:
notable_events_articles.head()

Unnamed: 0,trading_day_date,ticker,price,volume,price_change,mapped_trading_date,article_title
0,2024-02-06,EXEL,21.83,1910829.0,2.009344,NaT,
1,2024-02-07,EXEL,20.18,3376200.0,-7.558405,NaT,
2,2024-02-26,EXEL,21.959999,2255096.0,6.446922,NaT,
3,2024-04-09,EXEL,23.719999,1294084.0,2.506483,2024-04-09,EXEL or REGN: Which Is the Better Value Stock ...
4,2024-04-11,EXEL,22.65,2357426.0,-4.349667,NaT,


In [5]:
notable_events_articles[~notable_events_articles['article_title'].isna()].tail()

Unnamed: 0,trading_day_date,ticker,price,volume,price_change,mapped_trading_date,article_title
139820,2024-07-26,MTCH,33.25,3176644.0,6.026781,2024-07-26,Match Group (MTCH) to Post Q2 Earnings: What's...
139824,2024-07-24,EIX,76.040001,1378864.0,74.483704,2024-07-24,"The Zacks Analyst Blog Highlights CMS Energy, ..."
139826,2024-01-23,RBC,273.130005,131364.0,525.440857,2024-01-23,RBC Bearings to Webcast Third Quarter Fiscal Y...
139830,2024-04-10,TXG,35.310001,1204270.0,-5.512434,2024-04-10,10x Genomics to Report First Quarter 2024 Fina...
139832,2023-12-12,EIX,67.620003,2433498.0,-4.139495,2023-12-12,AES' Board Rewards Shareholders With 4% Divide...


### **Breakdown of the Output (Notable Price Movement Events & Articles)**  

The above output highlights significant **price movements** (>2% change) for the stock **EXEL**, with corresponding **news articles** (if available). 

Column description:  

| **Column**               | **Description** |
|--------------------------|----------------|
| `trading_day_date`       | The **date** of the notable price movement. |
| `ticker`                | The stock symbol (**EXEL** in this case). |
| `price`                 | The stock's **closing price** on that trading day. |
| `volume`                | The **trading volume** for that stock on that day. |
| `price_change`          | The **percentage change** in stock price compared to the previous trading day. |
| `mapped_trading_date`   | The date from the **Articles_Trading_Day** table, which matches articles to trading days. If `NaT`, no matching article was found. |
| `article_title`         | The **headline of a news article** associated with that trading day. If `NaN`, no article was found for that date. |

---

### **Interpreting the Data**
| trading_day_date | ticker | price  | volume   | price_change | mapped_trading_date | article_title |
|-----------------|--------|--------|---------|-------------|-------------------|-----------------------------|
| **2024-02-06** | EXEL   | 21.83  | 1,910,829  | **+2.01%** | `NaT` | `NaN` (No article found) |
| **2024-02-07** | EXEL   | 20.18  | 3,376,200  | **-7.56%** | `NaT` | `NaN` (No article found) |
| **2024-02-26** | EXEL   | 21.96  | 2,255,096  | **+6.45%** | `NaT` | `NaN` (No article found) |
| **2024-04-09** | EXEL   | 23.72  | 1,294,084  | **+2.51%** | **2024-04-09** | **"EXEL or REGN: Which Is the Better Value Stock ..."** |
| **2024-04-11** | EXEL   | 22.65  | 2,357,426  | **-4.35%** | `NaT` | `NaN` (No article found) |

---

### **Key Observations**
1. **Most large price movements don't have matching articles (`NaT`, `NaN`)**  
   - Examples from above: On **Feb 6, 7, and 26**, EXEL had **significant price changes**, but no news articles were found on those dates.
   - This suggests that the stock moved due to **factors other than news headlines** 

2. **April 9 has a matching article**  
   - On **April 9**, EXEL had a **+2.51% price increase** and an article titled:  
     *"EXEL or REGN: Which Is the Better Value Stock ..."*
   - This might indicate that **positive coverage** helped drive a slight increase.  More analysis would need to be done in order to confirm/deny this hypothesis. 

3. **April 11 had a significant price drop (-4.35%) but no article**  
   - If no article exists for this movement, it could be due to:
     - **Delayed market reactions** to older news.
     - **Earnings reports, SEC filings, or macroeconomic factors**.
     - **Social media discussions or analyst ratings** not captured in the dataset.

4. When **EXEL** appears multiple times for the same `trading_day_date`, it indicates **multiple articles were published on that day** related to the stock.

#### **Breakdown of the Duplicate Entries (May 1, 2024)**
| trading_day_date | ticker | price | volume | price_change | mapped_trading_date | article_title |
|-----------------|--------|--------|---------|-------------|-------------------|------------------------------------------|
| **2024-05-01** | EXEL   | 21.92  | 7,471,580 | **-6.54%** | **2024-05-01** | *Exelixis (EXEL) Q1 2024 Earnings Call Transcript* |
| **2024-05-01** | EXEL   | 21.92  | 7,471,580 | **-6.54%** | **2024-05-01** | *Exelixis (EXEL) Q1 Earnings Miss, Cabometyx Sales...* |
| **2024-05-01** | EXEL   | 21.92  | 7,471,580 | **-6.54%** | **2024-05-01** | *Q1 2024 Exelixis Inc Earnings Call* |
| **2024-05-01** | EXEL   | 21.92  | 7,471,580 | **-6.54%** | **2024-05-01** | *Exelixis Inc (EXEL) Q1 2024 Earnings Call Transcript* |

- On **May 1, 2024**, **EXEL's stock dropped by -6.54%** with an **unusually high trading volume (7.47M shares)**.
- This coincided with **multiple earnings-related articles** on the **same day**.
- These articles likely influenced the market's reaction, leading to the drop.
- Since the `trading_day_date` and `mapped_trading_date` align, these articles were **published on the same day the stock price moved**.

In [6]:
# Count the number of articles per trading day per ticker
article_counts = notable_events_articles.groupby(["trading_day_date", "ticker"]).size().reset_index(name="article_count")

# Merge with notable price movement events
notable_events_articles_aggregated = notable_events_articles.merge(
    article_counts, on=["trading_day_date", "ticker"], how="left"
)

# Identify earnings-related articles
earnings_keywords = ["earnings", "q1", "q2", "q3", "q4", "report", "guidance", "miss", "beat"]
notable_events_articles_aggregated["is_earnings_related"] = notable_events_articles_aggregated["article_title"].str.contains(
    "|".join(earnings_keywords), case=False, na=False
)

# Aggregate at the ticker & date level to capture key takeaways
aggregated_results = notable_events_articles_aggregated.groupby(["trading_day_date", "ticker"]).agg(
    price_change=("price_change", "first"),  # Since price change is the same across duplicate rows
    volume=("volume", "first"),  # Volume is the same across duplicate rows
    article_count=("article_count", "first"),  # Number of articles on that day
    earnings_articles=("is_earnings_related", "sum"),  # Count of earnings-related articles
).reset_index()

print("Key Takeaways from Notable Price Movements",aggregated_results.head())

Key Takeaways from Notable Price Movements   trading_day_date ticker  price_change      volume  article_count  \
0       2023-12-01      A    295.911469   1729584.0             15   
1       2023-12-01     AA     34.464371   5458737.0              1   
2       2023-12-01    AAL    -83.079926  48859690.0              1   
3       2023-12-01   AAON    -42.469414    454792.0              1   
4       2023-12-01    AAP     16.825056   2211750.0              1   

   earnings_articles  
0                  7  
1                  0  
2                  0  
3                  0  
4                  0  


In [8]:
aggregated_results.head(15)

Unnamed: 0,trading_day_date,ticker,price_change,volume,article_count,earnings_articles
0,2023-12-01,A,295.911469,1729584.0,15,7
1,2023-12-01,AA,34.464371,5458737.0,1,0
2,2023-12-01,AAL,-83.079926,48859690.0,1,0
3,2023-12-01,AAON,-42.469414,454792.0,1,0
4,2023-12-01,AAP,16.825056,2211750.0,1,0
5,2023-12-01,AAPL,85.453842,45692800.0,1,0
6,2023-12-01,ABBV,440.354156,4903402.0,1,0
7,2023-12-01,ABG,-12.889796,117961.0,1,0
8,2023-12-01,ABNB,73.280296,7422051.0,8,2
9,2023-12-01,ABT,111.323784,4682314.0,16,3


In [12]:
# 1️⃣ Verify Outliers - Identify extreme price changes that may be due to stock splits or data issues
outlier_threshold = 100  # Price changes above 100% might indicate stock splits or incorrect data
outliers = aggregated_results[abs(aggregated_results["price_change"]) > outlier_threshold]

# 2️⃣ Compare Article Counts to Price Moves - Check correlation between article volume and price volatility
correlation = aggregated_results[["price_change", "article_count"]].corr()

# 3️⃣ Look at Multi-Day Trends - Fetch price changes for the next trading day to see if trends continue
aggregated_results["next_trading_day"] = aggregated_results["trading_day_date"] + pd.Timedelta(days=1)

# Merge to get next day's price change
multi_day_trends = aggregated_results.merge(
    aggregated_results[["trading_day_date", "ticker", "price_change"]],
    left_on=["next_trading_day", "ticker"],
    right_on=["trading_day_date", "ticker"],
    suffixes=("", "_next_day"),
    how="left"
).drop(columns=["next_trading_day"])


from tabulate import tabulate  # Optional for better table formatting

# Display outliers for verification
print("\n🚨 Potential Outliers in Price Changes 🚨")
if not outliers.empty:
    print(tabulate(outliers.head(), headers='keys', tablefmt='psql'))
else:
    print("No significant outliers detected.")

# Display correlation between article volume and price movement
print("\n📊 Correlation Between Article Count and Price Change 📊")
print(correlation.to_string())

# Display multi-day trends to check if stock movements persist
print("\n📈 Multi-Day Price Movement Trends 📈")
if not multi_day_trends.empty:
    print(tabulate(multi_day_trends.head(), headers='keys', tablefmt='psql'))
else:
    print("No multi-day trends found.")


🚨 Potential Outliers in Price Changes 🚨
+----+---------------------+----------+----------------+------------------+-----------------+---------------------+---------------------+
|    | trading_day_date    | ticker   |   price_change |           volume |   article_count |   earnings_articles | next_trading_day    |
|----+---------------------+----------+----------------+------------------+-----------------+---------------------+---------------------|
|  0 | 2023-12-01 00:00:00 | A        |        295.911 |      1.72958e+06 |              15 |                   7 | 2023-12-02 00:00:00 |
|  6 | 2023-12-01 00:00:00 | ABBV     |        440.354 |      4.9034e+06  |               1 |                   0 | 2023-12-02 00:00:00 |
|  9 | 2023-12-01 00:00:00 | ABT      |        111.324 |      4.68231e+06 |              16 |                   3 | 2023-12-02 00:00:00 |
| 12 | 2023-12-01 00:00:00 | ACGL     |        139.091 |      1.65989e+06 |               1 |                   0 | 2023-12-02 00:0

## **1. Potential Outliers in Price Changes**
| **Ticker** | **Price Change (%)** | **Volume** | **Articles** | **Earnings-Related Articles** |
|-----------|----------------------|------------|-------------|----------------------|
| **A (Agilent Technologies)** | **+295.91%** | 1.72M | 15 | 7 |
| **ABBV (AbbVie Inc.)** | **+440.35%** | 4.90M | 1 | 0 |
| **ABT (Abbott Laboratories)** | **+111.32%** | 4.68M | 16 | 3 |
| **ACGL (Arch Capital Group)** | **+139.09%** | 1.65M | 1 | 0 |
| **ACLS (Axcelis Technologies)** | **+212.23%** | 454,850 | 1 | 0 |

### **Key Takeaways:**
- **Extreme price moves like +440% for ABBV and +295% for Agilent (A) are highly unusual.**
- **Earnings-related articles are common in large price moves** (e.g., Agilent had 7 earnings articles).
- **ABBV, ACGL, and ACLS had huge price jumps with only 1 or no related articles**, suggesting:
  - A potential **stock split or acquisition announcement**.
  - Possible **bad data or reporting errors** in the dataset.

**Next Step:** Verify whether these extreme price changes are real or if they result from stock splits or incorrect data.

---

## **2. Correlation Between Article Count and Price Change**
```
               price_change  article_count
price_change       1.000000      -0.001236
article_count     -0.001236       1.000000
```
### **Key Takeaways:**
- **There is almost NO correlation (-0.0012) between the number of articles and price movement.**
- This seems to indicate **more news coverage does NOT necessarily predict larger price movements**.
- However, this could be because **the type of article (e.g., earnings vs. general news) matters more than just the number of articles**.

**Next Step:**  
Instead of just counting articles, we could analyze:
- **Types of articles (earnings, merger rumors, etc.).**
- **Sentiment or key words within articles.**
- **News volume over multiple days (not just the same day).**

---

## **3. Multi-Day Price Movement Trends**
| **Ticker** | **Price Change (%)** | **Next Trading Day Price Change (%)** |
|-----------|----------------------|--------------------------------|
| **A (Agilent Technologies)** | **+295.91%** | `NaN` (No data) |
| **AA (Alcoa Corp.)** | **+34.46%** | `NaN` (No data) |
| **AAL (American Airlines)** | **-83.08%** | `NaN` (No data) |
| **AAON (AAON Inc.)** | **-42.46%** | `NaN` (No data) |
| **AAP (Advance Auto Parts)** | **+16.82%** | `NaN` (No data) |

### **Key Takeaways:**
- **All entries are showing `NaN` for next-day price changes.**  
  - This suggests that either the **trading calendar is missing the next day**, or there was no data available.
  - **December 2, 2023, was a weekend**, which explains why these stocks don't have trading data for the next day.
  
**Next Step:**
- Maybe take a look at **T+2 (two days after)** and see if large moves **continue or reverse**.

In [15]:
# Define high-risk words to check in articles
# Load the Loughran-McDonald Master Dictionary
file_path = r"C:\Users\btada\Documents\Loughran-McDonald_MasterDictionary_1993-2023.csv"
df_dict = pd.read_csv(file_path)

# Filter words where any of the key columns have nonzero values (indicating high-risk classification)
high_risk_words = df_dict[
    (df_dict["Negative"] != 0) | (df_dict["Uncertainty"] != 0) | (df_dict["Litigious"] != 0)
]["Word"].str.lower().tolist()

#high_risk_words = ["bankrupt", "lawsuit", "fraud", "investigation", "recall", "regulatory", "miss", "downturn", "cut", "loss"]

# Create a new column to count occurrences of high-risk words in article titles
notable_events_articles_aggregated["high_risk_word_count"] = notable_events_articles_aggregated["article_title"].apply(
    lambda x: sum(word in x.lower() for word in high_risk_words) if pd.notna(x) else 0
)

# Classify articles based on presence of earnings-related and high-risk words
notable_events_articles_aggregated["article_classification"] = notable_events_articles_aggregated.apply(
    lambda row: "Earnings" if row["is_earnings_related"] else (
        "High-Risk" if row["high_risk_word_count"] > 0 else "General"
    ), axis=1
)

# Aggregate classification counts per trading day and ticker
article_classification_summary = notable_events_articles_aggregated.groupby(["trading_day_date", "ticker"]).agg(
    total_articles=("article_title", "count"),
    earnings_articles=("is_earnings_related", "sum"),
    high_risk_articles=("high_risk_word_count", "sum"),
    general_articles=("article_classification", lambda x: (x == "General").sum())
).reset_index()

# Display the classification summary using tabulate
print("\n📊 Article Classification Summary 📊")
if not article_classification_summary.empty:
    print(tabulate(article_classification_summary.head(10), headers="keys", tablefmt="psql"))  # Show first 10 rows
else:
    print("No article classifications found.")


📊 Article Classification Summary 📊
+----+---------------------+----------+------------------+---------------------+----------------------+--------------------+
|    | trading_day_date    | ticker   |   total_articles |   earnings_articles |   high_risk_articles |   general_articles |
|----+---------------------+----------+------------------+---------------------+----------------------+--------------------|
|  0 | 2023-12-01 00:00:00 | A        |               15 |                   7 |                    9 |                  5 |
|  1 | 2023-12-01 00:00:00 | AA       |                0 |                   0 |                    0 |                  1 |
|  2 | 2023-12-01 00:00:00 | AAL      |                0 |                   0 |                    0 |                  1 |
|  3 | 2023-12-01 00:00:00 | AAON     |                0 |                   0 |                    0 |                  1 |
|  4 | 2023-12-01 00:00:00 | AAP      |                0 |                   0 |         

### **Interpreting the Article Classification Summary**
This table breaks down the **types of articles** associated with notable price movements, by classifying them into **earnings-related, high-risk, and general articles**.

---

### **Key Takeaways**
| **Ticker** | **Total Articles** | **Earnings Articles** | **High-Risk Articles** | **General Articles** |
|-----------|------------------|---------------------|----------------------|--------------------|
| **A (Agilent)** | **15** | **7** | **9** | **5** |
| **AA (Alcoa)** | **0** | **0** | **0** | **1** |
| **AAL (American Airlines)** | **0** | **0** | **0** | **1** |
| **AAON (AAON Inc.)** | **0** | **0** | **0** | **1** |
| **AAP (Advance Auto Parts)** | **0** | **0** | **0** | **1** |
| **AAPL (Apple Inc.)** | **0** | **0** | **0** | **1** |
| **ABBV (AbbVie Inc.)** | **0** | **0** | **0** | **1** |
| **ABG (Asbury Automotive)** | **0** | **0** | **0** | **1** |
| **ABNB (Airbnb Inc.)** | **8** | **2** | **2** | **4** |
| **ABT (Abbott Laboratories)** | **16** | **3** | **9** | **8** |

---

#### **Agilent (A) & Abbott (ABT) Had the Most News Coverage**
- **Agilent (A):** **15 total articles**, with **7 earnings-related & 9 high-risk**.
- **Abbott (ABT):** **16 total articles**, with **3 earnings-related & 9 high-risk**.
- **These companies were heavily reported on, suggesting there may have been major news events**.

In [18]:
# Verify if "general articles" should be included in total_articles
# Recalculate total_articles as the sum of earnings, high-risk, and general articles
article_classification_summary["recalculated_total_articles"] = (
    article_classification_summary["earnings_articles"] +
    article_classification_summary["high_risk_articles"] +
    article_classification_summary["general_articles"]
)

# Check if recalculated total articles match the original total_articles
article_classification_summary["total_mismatch"] = article_classification_summary["total_articles"] != article_classification_summary["recalculated_total_articles"]

# Filter rows where there is a mismatch
mismatched_totals = article_classification_summary[article_classification_summary["total_mismatch"]]

# Analyze high-risk article text to ensure correct classification
# Filter notable events articles that are classified as high-risk
high_risk_articles = notable_events_articles_aggregated[notable_events_articles_aggregated["article_classification"] == "High-Risk"]

# Check how article types impact stock prices over multiple days
# Merge stock price movements with article classifications
multi_day_analysis = notable_events_articles_aggregated.merge(
    aggregated_results[["trading_day_date", "ticker", "price_change"]],
    on=["trading_day_date", "ticker"],
    how="left"
)

# Display results
import tabulate

# Display mismatched totals
print("\nMismatched Article Totals")
if not mismatched_totals.empty:
    print(tabulate.tabulate(mismatched_totals.info(), headers="keys", tablefmt="psql"))
else:
    print("✅ No mismatches found in article totals.")

# Display high-risk articles
print("\nHigh-Risk Articles")
if not high_risk_articles.empty:
    print(tabulate.tabulate(high_risk_articles[["trading_day_date", "ticker", "article_title"]].head(10), headers="keys", tablefmt="psql"))
else:
    print("No high-risk articles found.")

# Display multi-day analysis
print("\nMulti-Day Article Impact on Price Changes")
if not multi_day_analysis.empty:
    print(tabulate.tabulate(multi_day_analysis.head(10), headers="keys", tablefmt="psql"))
else:
    print("No multi-day trends found.")



Mismatched Article Totals
<class 'pandas.core.frame.DataFrame'>
Index: 60422 entries, 0 to 80674
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   trading_day_date             60422 non-null  datetime64[us]
 1   ticker                       60422 non-null  object        
 2   total_articles               60422 non-null  int64         
 3   earnings_articles            60422 non-null  int64         
 4   high_risk_articles           60422 non-null  int64         
 5   general_articles             60422 non-null  int64         
 6   recalculated_total_articles  60422 non-null  int64         
 7   total_mismatch               60422 non-null  bool          
dtypes: bool(1), datetime64[us](1), int64(5), object(1)
memory usage: 3.7+ MB


High-Risk Articles
+----+---------------------+----------+----------------------------------------------------------------------------------

## **Mismatched Article Totals (60,422 entries)**
**Problem:** The recalculated total articles **do not match** the reported `total_articles` in **60,422 cases**.

| **Column** | **Description** |
|-----------|----------------|
| `total_articles` | Original count of articles for the stock. |
| `recalculated_total_articles` | Sum of `earnings_articles + high_risk_articles + general_articles`. |
| `total_mismatch` | `True` if the two values don’t match. |

### **Key Insights**
- **Mismatches in article counts** suggest that some articles might be **double-counted, excluded, or misclassified.**

**Next Step:**
- Compare rows where `total_mismatch = True` to see if a **specific category is misclassified or missing**.

---

## **High-Risk Articles - Are They Correctly Classified?**
**Example Articles for `AMAT` (Applied Materials) on 12/01/2023:**
| **Article Title** |
|----------------|
| *US STOCKS-Wall St loses steam as traders assess Fed comments* |
| *Why Applied Materials Stock Dropped Today* |
| *1 Reason Applied Materials Stock Is No Longer A Buy -- and It's Not the Legal Investigation* |
| *These Stocks Moved the Most Today: Applied Materials, Gap, ChargePoint, Ross Stores, Expedia, and More* |

### **Key Insights**
 - **Some articles seem like they are correctly classified** (e.g., *"Why Applied Materials Stock Dropped Today"*).  
 - **But some may be misclassified**—*"Wall Street loses steam"* is **macro news**, not necessarily high-risk for AMAT, for example.  

**Next Step:**
- Refine **high-risk word detection**—should **"drop" and "no longer a buy"** be included?
- Consider **removing market-wide articles** that don’t directly impact specific stocks.

---

## **Multi-Day Article Impact on Stock Prices**
| **Date** | **Ticker** | **Price Change (%)** | **Earnings-Related?** | **High-Risk Words?** | **Next Day Price Change (%)** |
|----------|----------|----------------|----------------|----------------|------------------|
| 2024-02-06 | EXEL | **+2.00%** | No | No | +2.00% |
| 2024-02-07 | EXEL | **-7.55%** | No | No | -7.55% |
| 2024-04-09 | EXEL | **+2.50%** | No | No | +2.50% |
| 2024-05-01 | EXEL | **-6.54%** | ✅ **Yes** | No | -6.54% |
| 2024-05-23 | EXEL | **-2.80%** | No | No | -2.80% |

### **Key Insights**
- **Earnings reports are correlated with price movement.**  
  - *May 1 (Q1 earnings)* ➝ **-6.54% drop.**
  - *Multiple earnings articles (4 total).*
- **General news does not always lead to movement.**  
  - *April 9’s article ("Which Is the Better Value Stock")* had **no major effect**.
- **No strong relationship between high-risk words & price changes.**
  - *Most high-risk word articles did not lead to immediate price drops*.

**Next Steps:**
- Look at **T+2 and T+3 (two and three days later)**—some news may have a **delayed impact**.
- Check whether **certain article types (M&A, analyst ratings, lawsuits) predict larger moves**.

In [19]:
# Investigate Article Mismatches
# Identify mismatches where total_articles does not match recalculated total
mismatch_analysis = article_classification_summary[article_classification_summary["total_mismatch"]].copy()

# Check distribution of mismatches to see where discrepancies exist
mismatch_distribution = mismatch_analysis[["total_articles", "recalculated_total_articles"]].describe()

# Refine High-Risk Classification
# Remove broad market news (e.g., articles mentioning "Wall Street", "Fed", "market downturn")
broad_market_terms = ["wall street", "fed", "market", "stocks", "traders", "index", "sector"]
high_risk_articles_filtered = high_risk_articles[
    ~high_risk_articles["article_title"].str.contains("|".join(broad_market_terms), case=False, na=False)
]

# Expand Multi-Day Analysis (T+2 and T+3)
multi_day_analysis["T+2"] = multi_day_analysis["trading_day_date"] + pd.Timedelta(days=2)
multi_day_analysis["T+3"] = multi_day_analysis["trading_day_date"] + pd.Timedelta(days=3)

# Merge to get T+2 and T+3 price changes
multi_day_analysis = multi_day_analysis.merge(
    aggregated_results[["trading_day_date", "ticker", "price_change"]],
    left_on=["T+2", "ticker"],
    right_on=["trading_day_date", "ticker"],
    suffixes=("", "_T+2"),
    how="left"
).merge(
    aggregated_results[["trading_day_date", "ticker", "price_change"]],
    left_on=["T+3", "ticker"],
    right_on=["trading_day_date", "ticker"],
    suffixes=("", "_T+3"),
    how="left"
).drop(columns=["T+2", "T+3"])

# Display Results
import tabulate

# Display article mismatch distribution
print("\nMismatch Analysis Summary")
print(tabulate.tabulate(mismatch_distribution, headers="keys", tablefmt="psql"))

# Display filtered high-risk articles
print("\nRefined High-Risk Articles (Excluding Broad Market News)")
if not high_risk_articles_filtered.empty:
    print(tabulate.tabulate(high_risk_articles_filtered[["trading_day_date", "ticker", "article_title"]].head(10), headers="keys", tablefmt="psql"))
else:
    print("✅ No filtered high-risk articles remain.")

# Display multi-day price impact
print("\nExpanded Multi-Day Price Impact Analysis (T+2, T+3)")
if not multi_day_analysis.empty:
    print(tabulate.tabulate(multi_day_analysis.head(10), headers="keys", tablefmt="psql"))
else:
    print("No extended multi-day trends found.")



Mismatch Analysis Summary
+-------+------------------+-------------------------------+
|       |   total_articles |   recalculated_total_articles |
|-------+------------------+-------------------------------|
| count |     60422        |                   60422       |
| mean  |         0.826934 |                       2.08254 |
| std   |         3.21135  |                       4.32268 |
| min   |         0        |                       1       |
| 25%   |         0        |                       1       |
| 50%   |         0        |                       1       |
| 75%   |         0        |                       1       |
| max   |        59        |                      83       |
+-------+------------------+-------------------------------+

Refined High-Risk Articles (Excluding Broad Market News)
+----+---------------------+----------+---------------------------------------------------------------------------------------------+
|    | trading_day_date    | ticker   | article_t