In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

**For HDBSCAN I only saved the cluster IDs not the vectors**

In [2]:
# Load saved clusters
data_with_clusters = np.load("Word2Vec/retry/Finbert/finbert_hdbscan_cluster_labels.npy")
# article_vectors = data_with_clusters[:, :-1]  # Extract article vectors
cluster_labels = data_with_clusters  # Extract cluster labels

# Count articles in each cluster
cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()

# Print cluster sizes
# print("Cluster Distribution:\n", cluster_counts)

# # Plot cluster sizes
# plt.figure(figsize=(12, 5))
# cluster_counts.plot(kind="bar", color="steelblue")
# plt.xlabel("Cluster ID")
# plt.ylabel("Number of Articles")
# plt.title("HDBSCAN Finbert Cluster Distribution")
# plt.xticks(rotation=45)
# plt.show()


In [3]:
len(data_with_clusters)

363820

In [4]:
from collections import Counter
import ast

# Load cleaned dataset 
df_cleaned = pd.read_csv("Word2Vec/retry/data/articles_cleaned_spacy_with_numbers.csv")
# remove dupes from title 
# df_cleaned = df_cleaned.drop_duplicates(subset=['article_title'])

In [5]:
df_cleaned.shape

(363820, 7)

In [6]:
# Convert token strings back to lists (if stored as strings)
df_cleaned["tokens"] = df_cleaned["tokens"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Add DBSCAN cluster labels to dataset
df_cleaned["cluster"] = cluster_labels.astype(int)  # Ensure int type for easier grouping

# Function to get most common words per cluster
def get_top_words(cluster_id, n=15):
    words = [word for tokens in df_cleaned[df_cleaned["cluster"] == cluster_id]["tokens"] for word in tokens]
    word_counts = Counter(words).most_common(n)
    return word_counts

# Analyze a few sample clusters
sample_clusters = [0, 10, 2, 13, 14]  # Adjust based on distribution
for cluster_id in sample_clusters:
    print(f"\n**Top Words in Cluster {cluster_id}:**")
    print(get_top_words(cluster_id))



**Top Words in Cluster 0:**
[('stock', 159), ('earning', 80), ('company', 70), ('2024', 58), ('$', 56), ('market', 55), ('%', 46), ('buy', 45), ('ai', 44), (' ', 41), ('year', 41), ('investor', 37), ('share', 35), ('report', 35), ('quarter', 35)]

**Top Words in Cluster 10:**
[('stock', 190), ('earning', 85), ('company', 79), ('$', 72), ('market', 69), (' ', 68), ('2024', 68), ('%', 66), ('share', 54), ('inc', 52), ('investor', 44), ('year', 44), ('buy', 43), ('report', 43), ('new', 40)]

**Top Words in Cluster 2:**
[('stock', 193), ('earning', 116), ('2024', 92), ('company', 84), (' ', 69), ('%', 63), ('investor', 60), ('$', 60), ('quarter', 54), ('look', 52), ('market', 52), ('buy', 51), ('report', 50), ('inc', 50), ('new', 45)]

**Top Words in Cluster 13:**
[('stock', 131), ('earning', 68), ('company', 50), ('inc', 46), ('ai', 45), ('$', 45), ('market', 44), ('2024', 44), ('%', 34), ('buy', 32), (' ', 30), ('report', 29), ('investor', 29), ('share', 29), ('year', 27)]

**Top Words 

In [8]:
# Load Loughran-McDonald Lexicon
lm_dict_path = 'Word2Vec/csv/Loughran-McDonald_MasterDictionary_1993-2023.csv'
lm_df = pd.read_csv(lm_dict_path)

# Define sentiment categories
sentiment_categories = ['Negative', 'Positive', 'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal']
lm_sentiment_dict = {category: set(lm_df[lm_df[category] > 0]['Word'].str.lower()) for category in sentiment_categories}

In [10]:
def assign_risk_theme(cluster_id):
    words = [word for tokens in df_cleaned[df_cleaned["cluster"] == cluster_id]["tokens"] for word in tokens]
    word_counts = Counter(words)
    total_words = sum(word_counts.values())

    category_proportions = {}
    for category, word_set in lm_sentiment_dict.items():
        category_count = sum(count for word, count in word_counts.items() if word in word_set)
        category_proportions[category] = category_count / total_words if total_words > 0 else 0

    # Custom thresholds
    low_freq_categories = ['Litigious', 'Strong_Modal', 'Weak_Modal','Uncertainty']
    threshold = {cat: 0.01 if cat in low_freq_categories else 0.05 for cat in category_proportions}

    # Apply threshold check
    valid_categories = {cat: score for cat, score in category_proportions.items() if score > threshold[cat]}

    if valid_categories:
        dominant_category = max(valid_categories, key=valid_categories.get)
        return dominant_category
    else:
        return 'Mixed Sentiment'


In [11]:
# Function to assign a risk theme to a cluster
# def assign_risk_theme(cluster_id):
#     words = [word for tokens in df_cleaned[df_cleaned["cluster"] == cluster_id]["tokens"] for word in tokens]
#     word_counts = Counter(words)
#     total_words = sum(word_counts.values())

#     category_proportions = {}
#     for category, word_set in lm_sentiment_dict.items():
#         category_count = sum(count for word, count in word_counts.items() if word in word_set)
#         category_proportions[category] = category_count / total_words if total_words > 0 else 0

#     # Determine dominant category
#     dominant_category = max(category_proportions, key=category_proportions.get)
#     return dominant_category if category_proportions[dominant_category] > 0.01 else 'Mixed Sentiment'


# Assign risk labels to all clusters
# cluster_risk_labels = {cluster_id: assign_risk_theme(cluster_id) for cluster_id in cluster_counts.index}
filtered_clusters = [c for c in cluster_counts.index if c != 0]
cluster_risk_labels = {cluster_id: assign_risk_theme(cluster_id) for cluster_id in filtered_clusters}

# Print risk labels
print("\n**Cluster Risk Labels:**")
for cluster_id, risk_theme in cluster_risk_labels.items():
    print(f"Cluster {cluster_id}: {risk_theme}")



**Cluster Risk Labels:**
Cluster -1: Mixed Sentiment
Cluster 1: Mixed Sentiment
Cluster 2: Mixed Sentiment
Cluster 3: Mixed Sentiment
Cluster 4: Mixed Sentiment
Cluster 5: Mixed Sentiment
Cluster 6: Mixed Sentiment
Cluster 7: Mixed Sentiment
Cluster 8: Mixed Sentiment
Cluster 9: Mixed Sentiment
Cluster 10: Mixed Sentiment
Cluster 11: Mixed Sentiment
Cluster 12: Mixed Sentiment
Cluster 13: Mixed Sentiment
Cluster 14: Mixed Sentiment
Cluster 15: Mixed Sentiment
Cluster 16: Mixed Sentiment
Cluster 17: Mixed Sentiment
Cluster 18: Mixed Sentiment
Cluster 19: Mixed Sentiment
Cluster 20: Mixed Sentiment
Cluster 21: Mixed Sentiment
Cluster 22: Mixed Sentiment
Cluster 23: Mixed Sentiment
Cluster 24: Mixed Sentiment
Cluster 25: Mixed Sentiment
Cluster 26: Mixed Sentiment
Cluster 27: Mixed Sentiment
Cluster 28: Mixed Sentiment
Cluster 29: Mixed Sentiment
Cluster 30: Mixed Sentiment
Cluster 31: Mixed Sentiment
Cluster 32: Mixed Sentiment
Cluster 33: Mixed Sentiment
Cluster 34: Mixed Sentiment
Clu

In [12]:
# Function to get top words per cluster
def get_top_words(cluster_id, n=20):
    words = [word for tokens in df_cleaned[df_cleaned["cluster"] == cluster_id]["tokens"] for word in tokens]
    word_counts = Counter(words).most_common(n)
    return word_counts

# Check high-risk clusters
high_risk_clusters = [12, 14]  # Uncertainty and Negative clusters
for cluster_id in high_risk_clusters:
    print(f"\n**Top Words in Cluster {cluster_id}:**")
    print(get_top_words(cluster_id))



**Top Words in Cluster 12:**
[('stock', 746), ('company', 305), ('$', 304), ('earning', 296), ('2024', 292), ('market', 268), (' ', 261), ('%', 232), ('inc', 225), ('ai', 199), ('investor', 193), ('buy', 187), ('year', 181), ('report', 173), ('quarter', 166), ('share', 162), ('look', 153), ('growth', 138), ('announce', 135), ('new', 134)]

**Top Words in Cluster 14:**
[('stock', 387), ('$', 159), ('2024', 151), ('earning', 136), ('market', 127), ('company', 125), ('%', 115), ('inc', 108), (' ', 98), ('investor', 91), ('buy', 89), ('report', 86), ('year', 83), ('share', 80), ('ai', 79), ('quarter', 78), ('growth', 76), ('look', 72), ('announce', 71), ('result', 65)]


In [13]:
# Count how many articles per stock belong to each risk cluster
stock_risk_exposure = df_cleaned.groupby(["ticker", "cluster"]).size().unstack(fill_value=0)

# Merge risk labels
stock_risk_exposure.columns = [cluster_risk_labels.get(cluster_id, "Unknown") for cluster_id in stock_risk_exposure.columns]

# Save results
# stock_risk_exposure.to_csv("scratch/Word2Vec/stock_risk_exposure.csv")

# Show a sample
print("\n**Stock-Level Risk Exposure:**")
print(stock_risk_exposure.head(20))



**Stock-Level Risk Exposure:**
        Mixed Sentiment  Unknown  Mixed Sentiment  Mixed Sentiment  \
ticker                                                               
A                   221        0                0                1   
AA                  106        0                0                1   
AAL                 402        0                0                0   
AAON                 73        0                1                0   
AAP                  83        0                0                0   
AAPL               4503        6                1                2   
ABBV                777        2                1                1   
ABCB                 37        0                0                0   
ABG                  76        0                0                0   
ABNB                700        2                1                0   
ABT                 512        2                2                0   
ACA                  73        0                0         

# Multi Tagging

In [14]:
# Define thresholds for each category
low_freq_categories = ['Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal']
category_thresholds = {cat: 0.01 if cat in low_freq_categories else 0.01 for cat in sentiment_categories}

In [15]:
from collections import Counter

def assign_risk_themes_multi(cluster_id):
    words = [word for tokens in df_cleaned[df_cleaned["cluster"] == cluster_id]["tokens"] for word in tokens]
    word_counts = Counter(words)
    total_words = sum(word_counts.values())

    category_proportions = {}
    for category, word_set in lm_sentiment_dict.items():
        category_count = sum(count for word, count in word_counts.items() if word in word_set)
        category_proportions[category] = category_count / total_words if total_words > 0 else 0

    # Select all categories that pass their respective thresholds
    selected_categories = [
        cat for cat, prop in category_proportions.items()
        if prop > category_thresholds.get(cat, 0.05)
    ]

    return selected_categories if selected_categories else ['Mixed Sentiment']


In [16]:
cluster_risk_labels_multi = {
    cluster_id: assign_risk_themes_multi(cluster_id)
    for cluster_id in cluster_counts.index
}

# Print the results
print("\n**Multi-Label Cluster Risk Tags:**")
for cluster_id, risk_tags in cluster_risk_labels_multi.items():
    print(f"Cluster {cluster_id}: {risk_tags}")



**Multi-Label Cluster Risk Tags:**
Cluster -1: ['Negative', 'Positive']
Cluster 0: ['Negative', 'Positive']
Cluster 1: ['Negative', 'Positive']
Cluster 2: ['Negative', 'Positive']
Cluster 3: ['Negative', 'Positive']
Cluster 4: ['Negative', 'Positive']
Cluster 5: ['Negative', 'Positive']
Cluster 6: ['Negative', 'Positive']
Cluster 7: ['Negative', 'Positive']
Cluster 8: ['Negative', 'Positive']
Cluster 9: ['Negative', 'Positive']
Cluster 10: ['Negative', 'Positive']
Cluster 11: ['Negative', 'Positive']
Cluster 12: ['Negative', 'Positive']
Cluster 13: ['Negative', 'Positive']
Cluster 14: ['Negative', 'Positive']
Cluster 15: ['Negative', 'Positive']
Cluster 16: ['Negative', 'Positive']
Cluster 17: ['Negative', 'Positive']
Cluster 18: ['Negative', 'Positive']
Cluster 19: ['Negative', 'Positive']
Cluster 20: ['Negative', 'Positive']
Cluster 21: ['Negative', 'Positive']
Cluster 22: ['Negative', 'Positive']
Cluster 23: ['Negative', 'Positive']
Cluster 24: ['Negative', 'Positive']
Cluster 25: 

In [17]:
df_cleaned["risk_tags"] = df_cleaned["cluster"].map(cluster_risk_labels_multi)
df_exploded = df_cleaned.explode("risk_tags")

print(df_exploded["risk_tags"].value_counts())
df_exploded.head(10)

risk_tags
Negative    363820
Positive    363820
Name: count, dtype: int64


Unnamed: 0,ticker,mapped_trading_date,article_title,description,full_text,clean_text,tokens,cluster,risk_tags
0,DXC,2024-07-11,DXC Technology to Report Fiscal First Quarter ...,"ASHBURN, Va., July 10, 2024--DXC Technology (N...",DXC Technology to Report Fiscal First Quarter ...,dxc technology to report fiscal first quarter ...,"[dxc, technology, report, fiscal, quarter, 202...",-1,Negative
0,DXC,2024-07-11,DXC Technology to Report Fiscal First Quarter ...,"ASHBURN, Va., July 10, 2024--DXC Technology (N...",DXC Technology to Report Fiscal First Quarter ...,dxc technology to report fiscal first quarter ...,"[dxc, technology, report, fiscal, quarter, 202...",-1,Positive
1,DRI,2024-07-17,Darden to buy Chuyâs for more than $600M,The all-cash transaction will add a Tex-Mex ch...,Darden to buy Chuyâs for more than $600M The...,darden to buy chuyâs for more than $600m the a...,"[darden, buy, chuyâs, $, 600, m, allcash, tran...",-1,Negative
1,DRI,2024-07-17,Darden to buy Chuyâs for more than $600M,The all-cash transaction will add a Tex-Mex ch...,Darden to buy Chuyâs for more than $600M The...,darden to buy chuyâs for more than $600m the a...,"[darden, buy, chuyâs, $, 600, m, allcash, tran...",-1,Positive
2,DVA,2024-07-19,Update: DaVita to Pay $34.5 Million for Settle...,Update: DaVita to Pay $34.5 Million for Settle...,Update: DaVita to Pay $34.5 Million for Settle...,update davita to pay $345 million for settleme...,"[update, davita, pay, $, 345, million, settlem...",-1,Negative
2,DVA,2024-07-19,Update: DaVita to Pay $34.5 Million for Settle...,Update: DaVita to Pay $34.5 Million for Settle...,Update: DaVita to Pay $34.5 Million for Settle...,update davita to pay $345 million for settleme...,"[update, davita, pay, $, 345, million, settlem...",-1,Positive
3,DRI,2024-07-22,Analysts Have Made A Financial Statement On Da...,Last week saw the newest yearly earnings relea...,Analysts Have Made A Financial Statement On Da...,analysts have made a financial statement on da...,"[analyst, financial, statement, darden, restau...",-1,Negative
3,DRI,2024-07-22,Analysts Have Made A Financial Statement On Da...,Last week saw the newest yearly earnings relea...,Analysts Have Made A Financial Statement On Da...,analysts have made a financial statement on da...,"[analyst, financial, statement, darden, restau...",-1,Positive
4,DVA,2024-07-23,DaVita HealthCare (DVA) Laps the Stock Market:...,The latest trading day saw DaVita HealthCare (...,DaVita HealthCare (DVA) Laps the Stock Market:...,davita healthcare dva laps the stock market he...,"[davita, healthcare, dva, lap, stock, market, ...",-1,Negative
4,DVA,2024-07-23,DaVita HealthCare (DVA) Laps the Stock Market:...,The latest trading day saw DaVita HealthCare (...,DaVita HealthCare (DVA) Laps the Stock Market:...,davita healthcare dva laps the stock market he...,"[davita, healthcare, dva, lap, stock, market, ...",-1,Positive


In [18]:
stock_risk_exposure = (
    df_exploded
    .groupby(["article_title", "risk_tags"])
    .size()
    .unstack(fill_value=0)
    .sort_index(axis=1)  
)

stock_risk_exposure.head(10)

risk_tags,Negative,Positive
article_title,Unnamed: 1_level_1,Unnamed: 2_level_1
"""Bad News"" is Good News for these 3 Stocks",1,1
"""Barneyâs World"" to Premiere Monday, October 14 on Max",1,1
"""Big 3"" carmakers face ""tough choices"" to hit inventory targets - Wells Fargo",1,1
"""Big Short"" Investor Michael Burry Has 10% of His Portfolio in 2 ""Magnificent Seven"" AI Stocks",2,2
"""Communications for Good"" Provides Maximus Foundation Grantees With Professional Support",1,1
"""Decoded"": Groundbreaking Chinese Psychological Thriller Film Set for Global Release on August 22",1,1
"""EXPERIENCE MORE AT SEAâ¢"" WITH NORWEGIAN CRUISE LINE WHERE THERE IS MORE TO SEE, MORE TO DO AND MORE TO ENJOY",1,1
"""Empowering American Cities"" Initiative Reveals Economic Insights, 2025 Outlook",1,1
"""FAN FIRST"" PROGRAM BEST PLACE FOR KNICKS AND RANGERS FANS TO GET PLAYOFF TICKETS AT FACE VALUE",1,1
"""Finish Your Story"" in WWEÂ® 2K24 Deluxe Edition and Forty Years of WrestleMania Edition Now Available Worldwide",1,1


In [19]:
# # normalize...maybe not needed?
# stock_risk_percent = stock_risk_exposure.div(stock_risk_exposure.sum(axis=1), axis=0)
# stock_risk_percent

In [20]:
df_cleaned.columns

Index(['ticker', 'mapped_trading_date', 'article_title', 'description',
       'full_text', 'clean_text', 'tokens', 'cluster', 'risk_tags'],
      dtype='object')

# Join with Original

In [21]:
df_original = pd.read_csv("Word2Vec/retry/data/articles_cleaned_spacy_with_numbers.csv")


In [22]:
df_risk = df_original.merge(
    stock_risk_exposure,
    how="left",
    left_on="article_title",
    right_index=True
)


In [23]:
print(df_risk.columns)
df_risk.tail(10)#[['article_title', 'ticker', 'Negative', 'Positive', 'Uncertainty']])


Index(['ticker', 'mapped_trading_date', 'article_title', 'description',
       'full_text', 'clean_text', 'tokens', 'Negative', 'Positive'],
      dtype='object')


Unnamed: 0,ticker,mapped_trading_date,article_title,description,full_text,clean_text,tokens,Negative,Positive
363810,MRK,2024-07-15,The Zacks Analyst Blog Highlights Exxon Mobil...,"are included in this Analyst Blog Exxon Mobil,...",The Zacks Analyst Blog Highlights Exxon Mobil...,the zacks analyst blog highlights exxon mobil ...,"['zack', 'analyst', 'blog', 'highlight', 'exxo...",3,3
363811,MOH,2024-07-17,Would Trump Ease Up on Medicare? Wall Street T...,Investors bet that if Donald Trump is elected ...,Would Trump Ease Up on Medicare? Wall Street T...,would trump ease up on medicare wall street th...,"['trump', 'ease', 'medicare', 'wall', 'street'...",5,5
363812,MRK,2024-07-18,"Insiders At Merck Sold US$12m In Stock, Alludi...","Over the past year, many Merck & Co., Inc. ( N...","Insiders At Merck Sold US$12m In Stock, Alludi...",insiders at merck sold us$12m in stock alludin...,"['insider', 'merck', 'sell', 'us$12', 'm', 'st...",3,3
363813,MO,2024-07-24,"Investors Heavily Search Altria Group, Inc. (M...",Zacks.com users have recently been watching Al...,"Investors Heavily Search Altria Group, Inc. (M...",investors heavily search altria group inc mo h...,"['investor', 'heavily', 'search', 'altria', 'g...",5,5
363814,MOD,2024-07-29,Is Modine (MOD) a Solid Growth Stock? 3 Reason...,Modine (MOD) possesses solid growth attributes...,Is Modine (MOD) a Solid Growth Stock? 3 Reason...,is modine mod a solid growth stock 3 reasons t...,"['modine', 'mod', 'solid', 'growth', 'stock', ...",1,1
363815,MORN,2024-07-30,Morningstar Retirement Launches New Morningsta...,"CHICAGO, July 30, 2024--Morningstar Retirement...",Morningstar Retirement Launches New Morningsta...,morningstar retirement launches new morningsta...,"['morningstar', 'retirement', 'launch', 'new',...",1,1
363816,MOD,2024-07-31,Modine Reports First Quarter Fiscal 2025 Results,"Modine (NYSE: MOD), a diversified global leade...",Modine Reports First Quarter Fiscal 2025 Resul...,modine reports first quarter fiscal 2025 resul...,"['modine', 'report', 'quarter', 'fiscal', '202...",1,1
363817,MO,2024-08-01,"Company News for Aug 1, 2024","Companies in The News Are: AMD, ANET, MA, MO","Company News for Aug 1, 2024 Companies in The ...",company news for aug 1 2024 companies in the n...,"['company', 'news', 'aug', '1', '2024', 'compa...",4,4
363818,MPWR,2024-08-02,Monolithic Power (MPWR) Beats Q2 Earnings and ...,Monolithic (MPWR) delivered earnings and reven...,Monolithic Power (MPWR) Beats Q2 Earnings and ...,monolithic power mpwr beats q2 earnings and re...,"['monolithic', 'power', 'mpwr', 'beat', 'q2', ...",1,1
363819,MOH,2024-08-15,Michael Burry's Strategic Emphasis on Shift4 P...,Insight into the Latest 13F Filing and Key Por...,Michael Burry's Strategic Emphasis on Shift4 P...,michael burrys strategic emphasis on shift4 pa...,"['michael', 'burry', 'strategic', 'emphasis', ...",2,2


In [24]:
# group by ticker and sum the risk tags
df_risk_grouped = df_risk.groupby('ticker').sum()
df_risk_grouped = df_risk_grouped[['Negative', 'Positive', 'Uncertainty', 'Mixed Sentiment']]
df_risk_grouped

KeyError: "['Uncertainty', 'Mixed Sentiment'] not in index"

In [None]:
# save df_risk and df_risk_grouped
# df_risk.to_csv("Word2Vec/retry/data/hdbscan_articles_with_risk_tags.csv")
# df_risk_grouped.to_csv("Word2Vec/retry/hdbscan_articles_with_risk_tags_grouped.csv")