In [1]:
!pip install transformers==4.40.1 peft==0.5.0
!pip install sentencepiece
!pip install accelerate
!pip install torch
!pip install datasets
!pip install bitsandbytes



In [2]:
from huggingface_hub import login

# Log in to Hugging Face with your token
login("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")

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd

Mounted at /content/drive


***Model preparation***

In [4]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForCausalLM

# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")


In [5]:
from transformers import LlamaForCausalLM, LlamaTokenizerFast
from peft import PeftModel
import torch

# Base model and PEFT (LoRA) model
base_model = "meta-llama/Meta-Llama-3-8B"
peft_model = "FinGPT/fingpt-mt_llama3-8b_lora"

# Load tokenizer
tokenizer = LlamaTokenizerFast.from_pretrained(
    base_model,
    token="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",
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token

# Load base model with 16-bit precision
model = LlamaForCausalLM.from_pretrained(base_model,
                    trust_remote_code=True,
                    device_map="auto",
                    torch_dtype=torch.float16)  # Enable 16-bit precision

# Apply LoRA-based PEFT model
model = PeftModel.from_pretrained(model, peft_model, torch_dtype=torch.float16)
model = model.eval()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'LlamaTokenizerFast'.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [7]:
import gc
import torch
import pandas as pd
import warnings
from tqdm import tqdm

warnings.filterwarnings('ignore')

# -----------------------
# 1. PREP: MODEL + TOKENIZER
# -----------------------
# Make sure you've already defined or imported:
#   tokenizer, model, device
# from your previous code.
# If you need them in this script, define them here similarly.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Example sentiment function based on your reference
def get_sentiment(text):
    """Return (sentiment_label, sentiment_probability)."""
    if not text:
        # If empty text, treat as neutral
        return "Neutral", 1.0

    # (Optional) Truncate very long text to avoid GPU OOM
    len_text = min(len(text), 5000)
    text = text[:len_text]

    # Create prompt
    prompt = (
        f"Instruction: What is the sentiment of this news? "
        f"Please choose an answer from [Positive, Negative, Neutral].\n"
        f"Input: {text}\nAnswer: "
    )

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, max_length=128).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Extract logits for the last token
    logits = outputs.logits[:, -1, :].to("cpu")
    probs = torch.softmax(logits, dim=-1)

    # Map the probabilities to the sentiment tokens
    class_tokens = tokenizer(["Positive", "Negative", "Neutral"], add_special_tokens=False)["input_ids"]
    class_probs = {tokenizer.decode(token_id): probs[0, token_id].item() for token_id in class_tokens}

    # Pick the highest-probability sentiment
    sentiment_label = max(class_probs, key=class_probs.get)
    sentiment_prob = class_probs[sentiment_label]

    # Clean up
    del inputs, outputs, logits, probs
    torch.cuda.empty_cache()

    return sentiment_label, sentiment_prob


# -----------------------
# 2. AGGREGATION FUNCTION
# -----------------------
def aggregate_daily_sentiment(group: pd.DataFrame) -> pd.Series:
    """
    Given a subset of rows (one day of articles),
    compute the required metrics.
    """
    total_articles = len(group)
    if total_articles == 0:
        return pd.Series({
            "mode_of_sentiment": None,
            "num_articles": 0,
            "ratio_positive": 0.0,
            "ratio_negative": 0.0,
            "ratio_neutral": 0.0,
            "avg_sentiment_positive": 0.0,
            "avg_sentiment_negative": 0.0,
            "avg_sentiment_neutral": 0.0,
            "weighted_avg_sentiment": 0.0,
            "mode_of_sentiment_score": None,
            "mode_of_avg_sentiment_score": None
        })

    # 1) Mode of sentiment types (which label appears most)
    sentiment_counts = group["sentiment"].value_counts()
    mode_of_sentiment = sentiment_counts.idxmax()

    # 2) Number of articles
    num_articles = total_articles

    # 3) Ratio of positive/negative/neutral
    ratio_positive = sentiment_counts.get("Positive", 0) / total_articles
    ratio_negative = sentiment_counts.get("Negative", 0) / total_articles
    ratio_neutral  = sentiment_counts.get("Neutral",  0) / total_articles

    # 4) Average sentiment for positive, negative, neutral
    avg_sentiment_positive = group.loc[group["sentiment"] == "Positive", "sentiment_logit"].mean()
    avg_sentiment_negative = group.loc[group["sentiment"] == "Negative", "sentiment_logit"].mean()
    avg_sentiment_neutral  = group.loc[group["sentiment"] == "Neutral",  "sentiment_logit"].mean()

    # 5) Weighted average sentiment:
    #    (sum of all positive sentiments - sum of all negative sentiments) / total_articles
    sum_pos = group.loc[group["sentiment"] == "Positive", "sentiment_logit"].sum()
    sum_neg = group.loc[group["sentiment"] == "Negative", "sentiment_logit"].sum()
    weighted_avg_sentiment = (sum_pos - sum_neg) / total_articles

    # 6) Mode of sentiment score (which sentiment label has the highest sum of probabilities)
    sum_sentiment_scores = group.groupby("sentiment")["sentiment_logit"].sum()
    mode_of_sentiment_score = sum_sentiment_scores.idxmax() if not sum_sentiment_scores.empty else None

    # 7) Mode of average sentiment score (which label has the highest average probability)
    avg_sentiment_scores = group.groupby("sentiment")["sentiment_logit"].mean()
    mode_of_avg_sentiment_score = avg_sentiment_scores.idxmax() if not avg_sentiment_scores.empty else None

    return pd.Series({
        "mode_of_sentiment": mode_of_sentiment,
        "num_articles": num_articles,
        "ratio_positive": ratio_positive,
        "ratio_negative": ratio_negative,
        "ratio_neutral":  ratio_neutral,
        "avg_sentiment_positive": avg_sentiment_positive,
        "avg_sentiment_negative": avg_sentiment_negative,
        "avg_sentiment_neutral":  avg_sentiment_neutral,
        "weighted_avg_sentiment": weighted_avg_sentiment,
        "mode_of_sentiment_score": mode_of_sentiment_score,
        "mode_of_avg_sentiment_score": mode_of_avg_sentiment_score
    })


# -----------------------
# 3. MAIN PROCESS
# -----------------------
TICKER_UNIVERSE = [
    'AAPL', 'MSFT', 'NVDA', 'AVGO', 'ADBE', 'UNH', 'JNJ', 'PFE', 'MRK', 'ABBV',
    'JPM', 'BAC', 'WFC', 'GS', 'MS', 'AMZN', 'TSLA', 'HD', 'MCD', 'NKE', 'GOOGL',
    'META', 'DIS', 'VZ', 'CMCSA', 'PG', 'KO', 'PEP', 'WMT', 'COST', 'XOM', 'CVX',
    'COP', 'BA', 'UNP', 'HON', 'NEE', 'DUK', 'SO', 'PLD', 'AMT', 'CCI', 'SHW', 'DOW'
]

# Directory containing all TICKER.parquet files
input_dir = "/content/drive/My Drive/LLM_RL_Data/stock_parquet"
output_dir = "/content/drive/My Drive/LLM_RL_Data/llm_sentiments"

# Optional: warm-up pass to stabilize GPU memory
print("Running warm-up pass...")
dummy_input = tokenizer("Warm-up", return_tensors="pt", padding=True, max_length=128).to(device)
with torch.no_grad():
    _ = model(**dummy_input)
torch.cuda.empty_cache()


# Loop over all tickers
for ticker in ['SO']:
    input_path = f"{input_dir}/{ticker}.parquet"
    try:
        df = pd.read_parquet(input_path)
    except FileNotFoundError:
        print(f"File not found for {ticker}, skipping...")
        continue

    # The parquet file has a Date index (trading_day) and one column named exactly the same as the ticker.
    # Each cell in that column is a list of articles (or None).
    if ticker not in df.columns:
        print(f"Ticker column '{ticker}' not found in {input_path}, skipping...")
        continue

    print(f"\nProcessing {ticker} from file: {input_path}")
    print(f"Data shape: {df.shape}")

    # Step 1: Convert the index into a column if needed
    if df.index.name == "trading_day":
        df.reset_index(inplace=True)

    # Step 2: Explode the list of articles so each article is its own row
    df_exploded = df.explode(ticker, ignore_index=False).reset_index(drop=True)
    df_exploded.rename(columns={ticker: "article_text"}, inplace=True)

    # If the parquet file can have None/NaN in the list, fill with empty strings
    df_exploded["article_text"] = df_exploded["article_text"].fillna("")

    # Step 3: Apply get_sentiment to each article
    #         This can be slow for large data; you might want batching if needed.
    sentiments, logits = [], []
    for text in tqdm(df_exploded["article_text"], desc=f"{ticker} articles"):
        s_label, s_logit = get_sentiment(text)
        sentiments.append(s_label)
        logits.append(s_logit)

    df_exploded["sentiment"] = sentiments
    df_exploded["sentiment_logit"] = logits

    # Step 4: Group by original date to compute daily aggregates
    # Make sure you still have a date column (trading_day)
    # If it was originally the index, check how you want to name it in the exploded DataFrame.
    # Suppose the original date column is "trading_day"
    if "trading_day" not in df_exploded.columns:
        # If needed, rename the old index name or re-check how the date is stored
        pass

    grouped_results = (
        df_exploded
        .groupby("trading_day", as_index=True)
        .apply(aggregate_daily_sentiment)
        .reset_index()
    )

    # Step 5: Save the aggregated result for this ticker
    ticker_output_path = f"{output_dir}/{ticker}_sentiment_agg.parquet"
    grouped_results.to_parquet(ticker_output_path, index=False)

    print(f"Saved daily sentiment aggregates to {ticker_output_path}")

    # Optional memory cleanup
    del df, df_exploded, grouped_results, sentiments, logits
    torch.cuda.empty_cache()
    gc.collect()

print("\nAll tickers processed!")


Running warm-up pass...

Processing SO from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/SO.parquet
Data shape: (1848, 1)


SO articles: 100%|██████████| 2828/2828 [02:31<00:00, 18.64it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/SO_sentiment_agg.parquet

All tickers processed!


In [8]:
import gc
import torch
import pandas as pd
import warnings
from tqdm import tqdm

warnings.filterwarnings('ignore')

# -----------------------
# 1. Industry Mapping Dictionary
# -----------------------
TICKER_INDUSTRY_MAP = {
    # Technology
    'AAPL': 'Technology',
    'MSFT': 'Technology',
    'NVDA': 'Technology',
    'AVGO': 'Technology',
    'ADBE': 'Technology',
    'GOOGL': 'Technology',
    'META': 'Technology',

    # Healthcare
    'UNH': 'Healthcare',
    'JNJ': 'Healthcare',
    'PFE': 'Healthcare',
    'MRK': 'Healthcare',
    'ABBV': 'Healthcare',

    # Financials
    'JPM': 'Financials',
    'BAC': 'Financials',
    'WFC': 'Financials',
    'GS': 'Financials',
    'MS': 'Financials',

    # Consumer Discretionary
    'AMZN': 'Consumer Discretionary',
    'TSLA': 'Consumer Discretionary',
    'HD': 'Consumer Discretionary',
    'MCD': 'Consumer Discretionary',
    'NKE': 'Consumer Discretionary',

    # Communication Services
    'DIS': 'Communication Services',
    'VZ': 'Communication Services',
    'CMCSA': 'Communication Services',

    # Consumer Staples
    'PG': 'Consumer Staples',
    'KO': 'Consumer Staples',
    'PEP': 'Consumer Staples',
    'WMT': 'Consumer Staples',
    'COST': 'Consumer Staples',

    # Energy
    'XOM': 'Energy',
    'CVX': 'Energy',
    'COP': 'Energy',

    # Industrials / Materials
    'BA': 'Industrials',
    'UNP': 'Industrials',
    'HON': 'Industrials',
    'SHW': 'Industrials',
    'DOW': 'Materials',

    # Utilities
    'NEE': 'Utilities',
    'DUK': 'Utilities',
    'SO': 'Utilities',

    # Real Estate
    'PLD': 'Real Estate',
    'AMT': 'Real Estate',
    'CCI': 'Real Estate'
}

# -----------------------
# 2. PREP: MODEL + TOKENIZER
# -----------------------
# Ensure you've defined/imported:
#   tokenizer, model, device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Updated get_sentiment function to accept an industry argument
def get_sentiment(text, industry=None):
    """Return (sentiment_label, sentiment_probability) with optional industry context."""
    if not text:
        # If empty text, treat as neutral
        return "Neutral", 1.0

    # (Optional) Truncate very long text to avoid GPU OOM
    len_text = min(len(text), 5000)
    text = text[:len_text]

    # Prepend industry information if provided
    industry_text = f"Industry: {industry}.\n" if industry is not None else ""
    prompt = (
        f"{industry_text}"
        f"Instruction: What is the sentiment of this news? "
        f"Please choose an answer from [Positive, Negative, Neutral].\n"
        f"Input: {text}\nAnswer: "
    )

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, max_length=128).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Extract logits for the last token
    logits = outputs.logits[:, -1, :].to("cpu")
    probs = torch.softmax(logits, dim=-1)

    # Map the probabilities to the sentiment tokens
    class_tokens = tokenizer(["Positive", "Negative", "Neutral"], add_special_tokens=False)["input_ids"]
    class_probs = {tokenizer.decode(token_id): probs[0, token_id].item() for token_id in class_tokens}

    # Pick the highest-probability sentiment
    sentiment_label = max(class_probs, key=class_probs.get)
    sentiment_prob = class_probs[sentiment_label]

    # Clean up
    del inputs, outputs, logits, probs
    torch.cuda.empty_cache()

    return sentiment_label, sentiment_prob


# -----------------------
# 3. AGGREGATION FUNCTION
# -----------------------
def aggregate_daily_sentiment(group: pd.DataFrame) -> pd.Series:
    """
    Given a subset of rows (one day of articles),
    compute the required metrics, including combined sentiment.
    """
    total_articles = len(group)
    if total_articles == 0:
        return pd.Series({
            "mode_of_sentiment": None,
            "num_articles": 0,
            "ratio_positive": 0.0,
            "ratio_negative": 0.0,
            "ratio_neutral": 0.0,
            "avg_sentiment_positive": 0.0,
            "avg_sentiment_negative": 0.0,
            "avg_sentiment_neutral": 0.0,
            "weighted_avg_sentiment": 0.0,
            "mode_of_sentiment_score": None,
            "mode_of_avg_sentiment_score": None,
            "sentiment_combined": None,
            "sentiment_score_combined": 0.0
        })

    # 1) Mode of sentiment types (which label appears most)
    sentiment_counts = group["sentiment"].value_counts()
    mode_of_sentiment = sentiment_counts.idxmax()

    # 2) Number of articles
    num_articles = total_articles

    # 3) Ratio of positive/negative/neutral
    ratio_positive = sentiment_counts.get("Positive", 0) / total_articles
    ratio_negative = sentiment_counts.get("Negative", 0) / total_articles
    ratio_neutral  = sentiment_counts.get("Neutral",  0) / total_articles

    # 4) Average sentiment for positive, negative, neutral
    avg_sentiment_positive = group.loc[group["sentiment"] == "Positive", "sentiment_logit"].mean()
    avg_sentiment_negative = group.loc[group["sentiment"] == "Negative", "sentiment_logit"].mean()
    avg_sentiment_neutral  = group.loc[group["sentiment"] == "Neutral",  "sentiment_logit"].mean()

    # 5) Weighted average sentiment:
    #    (sum of all positive sentiments - sum of all negative sentiments) / total_articles
    sum_pos = group.loc[group["sentiment"] == "Positive", "sentiment_logit"].sum()
    sum_neg = group.loc[group["sentiment"] == "Negative", "sentiment_logit"].sum()
    weighted_avg_sentiment = (sum_pos - sum_neg) / total_articles

    # 6) Mode of sentiment score (which sentiment label has the highest sum of probabilities)
    sum_sentiment_scores = group.groupby("sentiment")["sentiment_logit"].sum()
    mode_of_sentiment_score = sum_sentiment_scores.idxmax() if not sum_sentiment_scores.empty else None

    # 7) Mode of average sentiment score (which label has the highest average probability)
    avg_sentiment_scores = group.groupby("sentiment")["sentiment_logit"].mean()
    mode_of_avg_sentiment_score = avg_sentiment_scores.idxmax() if not avg_sentiment_scores.empty else None

    # 8) Combined sentiment for all articles of the day:
    combined_text = " ".join(group["article_text"].tolist()).strip()
    # Retrieve industry from the group (all rows have the same ticker, hence same industry)
    industry = group["industry"].iloc[0] if "industry" in group.columns else None
    sentiment_combined, sentiment_score_combined = get_sentiment(combined_text, industry=industry)

    return pd.Series({
        "mode_of_sentiment": mode_of_sentiment,
        "num_articles": num_articles,
        "ratio_positive": ratio_positive,
        "ratio_negative": ratio_negative,
        "ratio_neutral":  ratio_neutral,
        "avg_sentiment_positive": avg_sentiment_positive,
        "avg_sentiment_negative": avg_sentiment_negative,
        "avg_sentiment_neutral":  avg_sentiment_neutral,
        "weighted_avg_sentiment": weighted_avg_sentiment,
        "mode_of_sentiment_score": mode_of_sentiment_score,
        "mode_of_avg_sentiment_score": mode_of_avg_sentiment_score,
        "sentiment_combined": sentiment_combined,
        "sentiment_score_combined": sentiment_score_combined
    })


# -----------------------
# 4. MAIN PROCESS
# -----------------------
TICKER_UNIVERSE = [
    'AAPL', 'MSFT', 'NVDA', 'AVGO', 'ADBE', 'UNH', 'JNJ', 'PFE', 'MRK', 'ABBV',
    # 'JPM', 'BAC', 'WFC', 'GS', 'MS', 'AMZN', 'TSLA', 'HD', 'MCD', 'NKE', 'GOOGL',
    # 'META', 'DIS', 'VZ', 'CMCSA', 'PG', 'KO', 'PEP', 'WMT', 'COST', 'XOM', 'CVX',
    # 'COP', 'BA', 'UNP', 'HON', 'NEE', 'DUK', 'SO', 'PLD', 'AMT', 'CCI', 'SHW', 'DOW'
]

# Directory containing all TICKER.parquet files
input_dir = "/content/drive/My Drive/LLM_RL_Data/stock_parquet"
output_dir = "/content/drive/My Drive/LLM_RL_Data/llm_sentiments"

# Optional: warm-up pass to stabilize GPU memory
print("Running warm-up pass...")
dummy_input = tokenizer("Warm-up", return_tensors="pt", padding=True, max_length=128).to(device)
with torch.no_grad():
    _ = model(**dummy_input)
torch.cuda.empty_cache()

# Loop over tickers (example with ticker 'SO')
for ticker in TICKER_UNIVERSE:
    input_path = f"{input_dir}/{ticker}.parquet"
    try:
        df = pd.read_parquet(input_path)
    except FileNotFoundError:
        print(f"File not found for {ticker}, skipping...")
        continue

    if ticker not in df.columns:
        print(f"Ticker column '{ticker}' not found in {input_path}, skipping...")
        continue

    print(f"\nProcessing {ticker} from file: {input_path}")
    print(f"Data shape: {df.shape}")

    # Convert index to column if needed
    if df.index.name == "trading_day":
        df.reset_index(inplace=True)

    # Explode the list of articles so each article is its own row
    df_exploded = df.explode(ticker, ignore_index=False).reset_index(drop=True)
    df_exploded.rename(columns={ticker: "article_text"}, inplace=True)

    # Fill missing article text with empty strings
    df_exploded["article_text"] = df_exploded["article_text"].fillna("")

    # Add a column for industry using the mapping
    industry = TICKER_INDUSTRY_MAP.get(ticker, None)
    df_exploded["industry"] = industry

    # Apply get_sentiment to each article
    sentiments, logits = [], []
    for text in tqdm(df_exploded["article_text"], desc=f"{ticker} articles"):
        s_label, s_logit = get_sentiment(text, industry=industry)
        sentiments.append(s_label)
        logits.append(s_logit)

    df_exploded["sentiment"] = sentiments
    df_exploded["sentiment_logit"] = logits

    # Group by trading_day to compute daily aggregates
    grouped_results = (
        df_exploded
        .groupby("trading_day", as_index=True)
        .apply(aggregate_daily_sentiment)
        .reset_index()
    )

    # Save the aggregated result for this ticker
    ticker_output_path = f"{output_dir}/{ticker}_sentiment_agg.parquet"
    grouped_results.to_parquet(ticker_output_path, index=False)

    print(f"Saved daily sentiment aggregates to {ticker_output_path}")

    # Cleanup
    del df, df_exploded, grouped_results, sentiments, logits
    torch.cuda.empty_cache()
    gc.collect()

print("\nAll tickers processed!")


Running warm-up pass...

Processing AAPL from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/AAPL.parquet
Data shape: (1848, 1)


AAPL articles: 100%|██████████| 27318/27318 [41:21<00:00, 11.01it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/AAPL_sentiment_agg.parquet

Processing MSFT from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/MSFT.parquet
Data shape: (1848, 1)


MSFT articles: 100%|██████████| 16769/16769 [24:14<00:00, 11.53it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/MSFT_sentiment_agg.parquet

Processing NVDA from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/NVDA.parquet
Data shape: (1848, 1)


NVDA articles: 100%|██████████| 10918/10918 [16:18<00:00, 11.16it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/NVDA_sentiment_agg.parquet

Processing AVGO from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/AVGO.parquet
Data shape: (1848, 1)


AVGO articles: 100%|██████████| 4289/4289 [05:27<00:00, 13.11it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/AVGO_sentiment_agg.parquet

Processing ADBE from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/ADBE.parquet
Data shape: (1848, 1)


ADBE articles: 100%|██████████| 3366/3366 [03:39<00:00, 15.34it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/ADBE_sentiment_agg.parquet

Processing UNH from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/UNH.parquet
Data shape: (1848, 1)


UNH articles: 100%|██████████| 4304/4304 [05:12<00:00, 13.79it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/UNH_sentiment_agg.parquet

Processing JNJ from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/JNJ.parquet
Data shape: (1848, 1)


JNJ articles: 100%|██████████| 10630/10630 [14:29<00:00, 12.22it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/JNJ_sentiment_agg.parquet

Processing PFE from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/PFE.parquet
Data shape: (1848, 1)


PFE articles: 100%|██████████| 19229/19229 [26:52<00:00, 11.93it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/PFE_sentiment_agg.parquet

Processing MRK from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/MRK.parquet
Data shape: (1848, 1)


MRK articles: 100%|██████████| 6420/6420 [08:25<00:00, 12.71it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/MRK_sentiment_agg.parquet

Processing ABBV from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/ABBV.parquet
Data shape: (1848, 1)


ABBV articles: 100%|██████████| 4447/4447 [05:50<00:00, 12.70it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/ABBV_sentiment_agg.parquet

All tickers processed!


In [9]:
import gc
import torch
import pandas as pd
import warnings
from tqdm import tqdm

warnings.filterwarnings('ignore')

# -----------------------
# 1. Industry Mapping Dictionary
# -----------------------
TICKER_INDUSTRY_MAP = {
    # Technology
    'AAPL': 'Technology',
    'MSFT': 'Technology',
    'NVDA': 'Technology',
    'AVGO': 'Technology',
    'ADBE': 'Technology',
    'GOOGL': 'Technology',
    'META': 'Technology',

    # Healthcare
    'UNH': 'Healthcare',
    'JNJ': 'Healthcare',
    'PFE': 'Healthcare',
    'MRK': 'Healthcare',
    'ABBV': 'Healthcare',

    # Financials
    'JPM': 'Financials',
    'BAC': 'Financials',
    'WFC': 'Financials',
    'GS': 'Financials',
    'MS': 'Financials',

    # Consumer Discretionary
    'AMZN': 'Consumer Discretionary',
    'TSLA': 'Consumer Discretionary',
    'HD': 'Consumer Discretionary',
    'MCD': 'Consumer Discretionary',
    'NKE': 'Consumer Discretionary',

    # Communication Services
    'DIS': 'Communication Services',
    'VZ': 'Communication Services',
    'CMCSA': 'Communication Services',

    # Consumer Staples
    'PG': 'Consumer Staples',
    'KO': 'Consumer Staples',
    'PEP': 'Consumer Staples',
    'WMT': 'Consumer Staples',
    'COST': 'Consumer Staples',

    # Energy
    'XOM': 'Energy',
    'CVX': 'Energy',
    'COP': 'Energy',

    # Industrials / Materials
    'BA': 'Industrials',
    'UNP': 'Industrials',
    'HON': 'Industrials',
    'SHW': 'Industrials',
    'DOW': 'Materials',

    # Utilities
    'NEE': 'Utilities',
    'DUK': 'Utilities',
    'SO': 'Utilities',

    # Real Estate
    'PLD': 'Real Estate',
    'AMT': 'Real Estate',
    'CCI': 'Real Estate'
}

# -----------------------
# 2. PREP: MODEL + TOKENIZER
# -----------------------
# Ensure you've defined/imported:
#   tokenizer, model, device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Updated get_sentiment function to accept an industry argument
def get_sentiment(text, industry=None):
    """Return (sentiment_label, sentiment_probability) with optional industry context."""
    if not text:
        # If empty text, treat as neutral
        return "Neutral", 1.0

    # (Optional) Truncate very long text to avoid GPU OOM
    len_text = min(len(text), 5000)
    text = text[:len_text]

    # Prepend industry information if provided
    industry_text = f"Industry: {industry}.\n" if industry is not None else ""
    prompt = (
        f"{industry_text}"
        f"Instruction: What is the sentiment of this news? "
        f"Please choose an answer from [Positive, Negative, Neutral].\n"
        f"Input: {text}\nAnswer: "
    )

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, max_length=128).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Extract logits for the last token
    logits = outputs.logits[:, -1, :].to("cpu")
    probs = torch.softmax(logits, dim=-1)

    # Map the probabilities to the sentiment tokens
    class_tokens = tokenizer(["Positive", "Negative", "Neutral"], add_special_tokens=False)["input_ids"]
    class_probs = {tokenizer.decode(token_id): probs[0, token_id].item() for token_id in class_tokens}

    # Pick the highest-probability sentiment
    sentiment_label = max(class_probs, key=class_probs.get)
    sentiment_prob = class_probs[sentiment_label]

    # Clean up
    del inputs, outputs, logits, probs
    torch.cuda.empty_cache()

    return sentiment_label, sentiment_prob


# -----------------------
# 3. AGGREGATION FUNCTION
# -----------------------
def aggregate_daily_sentiment(group: pd.DataFrame) -> pd.Series:
    """
    Given a subset of rows (one day of articles),
    compute the required metrics, including combined sentiment.
    """
    total_articles = len(group)
    if total_articles == 0:
        return pd.Series({
            "mode_of_sentiment": None,
            "num_articles": 0,
            "ratio_positive": 0.0,
            "ratio_negative": 0.0,
            "ratio_neutral": 0.0,
            "avg_sentiment_positive": 0.0,
            "avg_sentiment_negative": 0.0,
            "avg_sentiment_neutral": 0.0,
            "weighted_avg_sentiment": 0.0,
            "mode_of_sentiment_score": None,
            "mode_of_avg_sentiment_score": None,
            "sentiment_combined": None,
            "sentiment_score_combined": 0.0
        })

    # 1) Mode of sentiment types (which label appears most)
    sentiment_counts = group["sentiment"].value_counts()
    mode_of_sentiment = sentiment_counts.idxmax()

    # 2) Number of articles
    num_articles = total_articles

    # 3) Ratio of positive/negative/neutral
    ratio_positive = sentiment_counts.get("Positive", 0) / total_articles
    ratio_negative = sentiment_counts.get("Negative", 0) / total_articles
    ratio_neutral  = sentiment_counts.get("Neutral",  0) / total_articles

    # 4) Average sentiment for positive, negative, neutral
    avg_sentiment_positive = group.loc[group["sentiment"] == "Positive", "sentiment_logit"].mean()
    avg_sentiment_negative = group.loc[group["sentiment"] == "Negative", "sentiment_logit"].mean()
    avg_sentiment_neutral  = group.loc[group["sentiment"] == "Neutral",  "sentiment_logit"].mean()

    # 5) Weighted average sentiment:
    #    (sum of all positive sentiments - sum of all negative sentiments) / total_articles
    sum_pos = group.loc[group["sentiment"] == "Positive", "sentiment_logit"].sum()
    sum_neg = group.loc[group["sentiment"] == "Negative", "sentiment_logit"].sum()
    weighted_avg_sentiment = (sum_pos - sum_neg) / total_articles

    # 6) Mode of sentiment score (which sentiment label has the highest sum of probabilities)
    sum_sentiment_scores = group.groupby("sentiment")["sentiment_logit"].sum()
    mode_of_sentiment_score = sum_sentiment_scores.idxmax() if not sum_sentiment_scores.empty else None

    # 7) Mode of average sentiment score (which label has the highest average probability)
    avg_sentiment_scores = group.groupby("sentiment")["sentiment_logit"].mean()
    mode_of_avg_sentiment_score = avg_sentiment_scores.idxmax() if not avg_sentiment_scores.empty else None

    # 8) Combined sentiment for all articles of the day:
    combined_text = " ".join(group["article_text"].tolist()).strip()
    # Retrieve industry from the group (all rows have the same ticker, hence same industry)
    industry = group["industry"].iloc[0] if "industry" in group.columns else None
    sentiment_combined, sentiment_score_combined = get_sentiment(combined_text, industry=industry)

    return pd.Series({
        "mode_of_sentiment": mode_of_sentiment,
        "num_articles": num_articles,
        "ratio_positive": ratio_positive,
        "ratio_negative": ratio_negative,
        "ratio_neutral":  ratio_neutral,
        "avg_sentiment_positive": avg_sentiment_positive,
        "avg_sentiment_negative": avg_sentiment_negative,
        "avg_sentiment_neutral":  avg_sentiment_neutral,
        "weighted_avg_sentiment": weighted_avg_sentiment,
        "mode_of_sentiment_score": mode_of_sentiment_score,
        "mode_of_avg_sentiment_score": mode_of_avg_sentiment_score,
        "sentiment_combined": sentiment_combined,
        "sentiment_score_combined": sentiment_score_combined
    })


# -----------------------
# 4. MAIN PROCESS
# -----------------------
TICKER_UNIVERSE = [
    # 'AAPL', 'MSFT', 'NVDA', 'AVGO', 'ADBE', 'UNH', 'JNJ', 'PFE', 'MRK', 'ABBV',
    'JPM', 'BAC', 'WFC', 'GS', 'MS', 'AMZN', 'TSLA', 'HD', 'MCD', 'NKE', 'GOOGL',
    'META', 'DIS', 'VZ', 'CMCSA', 'PG', 'KO', 'PEP', 'WMT', 'COST', 'XOM', 'CVX',
    # 'COP', 'BA', 'UNP', 'HON', 'NEE', 'DUK', 'SO', 'PLD', 'AMT', 'CCI', 'SHW', 'DOW'
]

# Directory containing all TICKER.parquet files
input_dir = "/content/drive/My Drive/LLM_RL_Data/stock_parquet"
output_dir = "/content/drive/My Drive/LLM_RL_Data/llm_sentiments"

# Optional: warm-up pass to stabilize GPU memory
print("Running warm-up pass...")
dummy_input = tokenizer("Warm-up", return_tensors="pt", padding=True, max_length=128).to(device)
with torch.no_grad():
    _ = model(**dummy_input)
torch.cuda.empty_cache()

# Loop over tickers (example with ticker 'SO')
for ticker in TICKER_UNIVERSE:
    input_path = f"{input_dir}/{ticker}.parquet"
    try:
        df = pd.read_parquet(input_path)
    except FileNotFoundError:
        print(f"File not found for {ticker}, skipping...")
        continue

    if ticker not in df.columns:
        print(f"Ticker column '{ticker}' not found in {input_path}, skipping...")
        continue

    print(f"\nProcessing {ticker} from file: {input_path}")
    print(f"Data shape: {df.shape}")

    # Convert index to column if needed
    if df.index.name == "trading_day":
        df.reset_index(inplace=True)

    # Explode the list of articles so each article is its own row
    df_exploded = df.explode(ticker, ignore_index=False).reset_index(drop=True)
    df_exploded.rename(columns={ticker: "article_text"}, inplace=True)

    # Fill missing article text with empty strings
    df_exploded["article_text"] = df_exploded["article_text"].fillna("")

    # Add a column for industry using the mapping
    industry = TICKER_INDUSTRY_MAP.get(ticker, None)
    df_exploded["industry"] = industry

    # Apply get_sentiment to each article
    sentiments, logits = [], []
    for text in tqdm(df_exploded["article_text"], desc=f"{ticker} articles"):
        s_label, s_logit = get_sentiment(text, industry=industry)
        sentiments.append(s_label)
        logits.append(s_logit)

    df_exploded["sentiment"] = sentiments
    df_exploded["sentiment_logit"] = logits

    # Group by trading_day to compute daily aggregates
    grouped_results = (
        df_exploded
        .groupby("trading_day", as_index=True)
        .apply(aggregate_daily_sentiment)
        .reset_index()
    )

    # Save the aggregated result for this ticker
    ticker_output_path = f"{output_dir}/{ticker}_sentiment_agg.parquet"
    grouped_results.to_parquet(ticker_output_path, index=False)

    print(f"Saved daily sentiment aggregates to {ticker_output_path}")

    # Cleanup
    del df, df_exploded, grouped_results, sentiments, logits
    torch.cuda.empty_cache()
    gc.collect()

print("\nAll tickers processed!")


Running warm-up pass...

Processing JPM from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/JPM.parquet
Data shape: (1848, 1)


JPM articles: 100%|██████████| 15768/15768 [22:51<00:00, 11.50it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/JPM_sentiment_agg.parquet

Processing BAC from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/BAC.parquet
Data shape: (1848, 1)


BAC articles: 100%|██████████| 9705/9705 [14:30<00:00, 11.15it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/BAC_sentiment_agg.parquet

Processing WFC from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/WFC.parquet
Data shape: (1848, 1)


WFC articles: 100%|██████████| 7548/7548 [10:40<00:00, 11.79it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/WFC_sentiment_agg.parquet

Processing GS from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/GS.parquet
Data shape: (1848, 1)


GS articles: 100%|██████████| 14701/14701 [21:26<00:00, 11.43it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/GS_sentiment_agg.parquet

Processing MS from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/MS.parquet
Data shape: (1848, 1)


MS articles: 100%|██████████| 10698/10698 [15:53<00:00, 11.22it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/MS_sentiment_agg.parquet

Processing AMZN from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/AMZN.parquet
Data shape: (1848, 1)


AMZN articles: 100%|██████████| 27470/27470 [40:08<00:00, 11.40it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/AMZN_sentiment_agg.parquet

Processing TSLA from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/TSLA.parquet
Data shape: (1848, 1)


TSLA articles: 100%|██████████| 27808/27808 [41:46<00:00, 11.10it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/TSLA_sentiment_agg.parquet

Processing HD from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/HD.parquet
Data shape: (1848, 1)


HD articles: 100%|██████████| 3891/3891 [04:40<00:00, 13.89it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/HD_sentiment_agg.parquet

Processing MCD from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/MCD.parquet
Data shape: (1848, 1)


MCD articles: 100%|██████████| 4702/4702 [06:02<00:00, 12.99it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/MCD_sentiment_agg.parquet

Processing NKE from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/NKE.parquet
Data shape: (1848, 1)


NKE articles: 100%|██████████| 5392/5392 [07:15<00:00, 12.39it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/NKE_sentiment_agg.parquet

Processing GOOGL from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/GOOGL.parquet
Data shape: (1848, 1)


GOOGL articles: 100%|██████████| 23108/23108 [33:34<00:00, 11.47it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/GOOGL_sentiment_agg.parquet

Processing META from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/META.parquet
Data shape: (1848, 1)


META articles: 100%|██████████| 8020/8020 [10:00<00:00, 13.36it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/META_sentiment_agg.parquet

Processing DIS from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/DIS.parquet
Data shape: (1848, 1)


DIS articles: 100%|██████████| 9391/9391 [12:52<00:00, 12.16it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/DIS_sentiment_agg.parquet

Processing VZ from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/VZ.parquet
Data shape: (1848, 1)


VZ articles: 100%|██████████| 4679/4679 [05:36<00:00, 13.90it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/VZ_sentiment_agg.parquet

Processing CMCSA from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/CMCSA.parquet
Data shape: (1848, 1)


CMCSA articles: 100%|██████████| 5055/5055 [06:35<00:00, 12.80it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/CMCSA_sentiment_agg.parquet

Processing PG from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/PG.parquet
Data shape: (1848, 1)


PG articles: 100%|██████████| 4071/4071 [04:58<00:00, 13.65it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/PG_sentiment_agg.parquet

Processing KO from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/KO.parquet
Data shape: (1848, 1)


KO articles: 100%|██████████| 4565/4565 [05:57<00:00, 12.77it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/KO_sentiment_agg.parquet

Processing PEP from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/PEP.parquet
Data shape: (1848, 1)


PEP articles: 100%|██████████| 3352/3352 [03:22<00:00, 16.52it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/PEP_sentiment_agg.parquet

Processing WMT from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/WMT.parquet
Data shape: (1848, 1)


WMT articles: 100%|██████████| 10535/10535 [14:53<00:00, 11.79it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/WMT_sentiment_agg.parquet

Processing COST from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/COST.parquet
Data shape: (1848, 1)


COST articles: 100%|██████████| 3404/3404 [03:51<00:00, 14.74it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/COST_sentiment_agg.parquet

Processing XOM from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/XOM.parquet
Data shape: (1848, 1)


XOM articles: 100%|██████████| 15253/15253 [22:36<00:00, 11.25it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/XOM_sentiment_agg.parquet

Processing CVX from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/CVX.parquet
Data shape: (1848, 1)


CVX articles: 100%|██████████| 12602/12602 [19:27<00:00, 10.79it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/CVX_sentiment_agg.parquet

All tickers processed!


In [10]:
import gc
import torch
import pandas as pd
import warnings
from tqdm import tqdm

warnings.filterwarnings('ignore')

# -----------------------
# 1. Industry Mapping Dictionary
# -----------------------
TICKER_INDUSTRY_MAP = {
    # Technology
    'AAPL': 'Technology',
    'MSFT': 'Technology',
    'NVDA': 'Technology',
    'AVGO': 'Technology',
    'ADBE': 'Technology',
    'GOOGL': 'Technology',
    'META': 'Technology',

    # Healthcare
    'UNH': 'Healthcare',
    'JNJ': 'Healthcare',
    'PFE': 'Healthcare',
    'MRK': 'Healthcare',
    'ABBV': 'Healthcare',

    # Financials
    'JPM': 'Financials',
    'BAC': 'Financials',
    'WFC': 'Financials',
    'GS': 'Financials',
    'MS': 'Financials',

    # Consumer Discretionary
    'AMZN': 'Consumer Discretionary',
    'TSLA': 'Consumer Discretionary',
    'HD': 'Consumer Discretionary',
    'MCD': 'Consumer Discretionary',
    'NKE': 'Consumer Discretionary',

    # Communication Services
    'DIS': 'Communication Services',
    'VZ': 'Communication Services',
    'CMCSA': 'Communication Services',

    # Consumer Staples
    'PG': 'Consumer Staples',
    'KO': 'Consumer Staples',
    'PEP': 'Consumer Staples',
    'WMT': 'Consumer Staples',
    'COST': 'Consumer Staples',

    # Energy
    'XOM': 'Energy',
    'CVX': 'Energy',
    'COP': 'Energy',

    # Industrials / Materials
    'BA': 'Industrials',
    'UNP': 'Industrials',
    'HON': 'Industrials',
    'SHW': 'Industrials',
    'DOW': 'Materials',

    # Utilities
    'NEE': 'Utilities',
    'DUK': 'Utilities',
    'SO': 'Utilities',

    # Real Estate
    'PLD': 'Real Estate',
    'AMT': 'Real Estate',
    'CCI': 'Real Estate'
}

# -----------------------
# 2. PREP: MODEL + TOKENIZER
# -----------------------
# Ensure you've defined/imported:
#   tokenizer, model, device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Updated get_sentiment function to accept an industry argument
def get_sentiment(text, industry=None):
    """Return (sentiment_label, sentiment_probability) with optional industry context."""
    if not text:
        # If empty text, treat as neutral
        return "Neutral", 1.0

    # (Optional) Truncate very long text to avoid GPU OOM
    len_text = min(len(text), 5000)
    text = text[:len_text]

    # Prepend industry information if provided
    industry_text = f"Industry: {industry}.\n" if industry is not None else ""
    prompt = (
        f"{industry_text}"
        f"Instruction: What is the sentiment of this news? "
        f"Please choose an answer from [Positive, Negative, Neutral].\n"
        f"Input: {text}\nAnswer: "
    )

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, max_length=128).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Extract logits for the last token
    logits = outputs.logits[:, -1, :].to("cpu")
    probs = torch.softmax(logits, dim=-1)

    # Map the probabilities to the sentiment tokens
    class_tokens = tokenizer(["Positive", "Negative", "Neutral"], add_special_tokens=False)["input_ids"]
    class_probs = {tokenizer.decode(token_id): probs[0, token_id].item() for token_id in class_tokens}

    # Pick the highest-probability sentiment
    sentiment_label = max(class_probs, key=class_probs.get)
    sentiment_prob = class_probs[sentiment_label]

    # Clean up
    del inputs, outputs, logits, probs
    torch.cuda.empty_cache()

    return sentiment_label, sentiment_prob


# -----------------------
# 3. AGGREGATION FUNCTION
# -----------------------
def aggregate_daily_sentiment(group: pd.DataFrame) -> pd.Series:
    """
    Given a subset of rows (one day of articles),
    compute the required metrics, including combined sentiment.
    """
    total_articles = len(group)
    if total_articles == 0:
        return pd.Series({
            "mode_of_sentiment": None,
            "num_articles": 0,
            "ratio_positive": 0.0,
            "ratio_negative": 0.0,
            "ratio_neutral": 0.0,
            "avg_sentiment_positive": 0.0,
            "avg_sentiment_negative": 0.0,
            "avg_sentiment_neutral": 0.0,
            "weighted_avg_sentiment": 0.0,
            "mode_of_sentiment_score": None,
            "mode_of_avg_sentiment_score": None,
            "sentiment_combined": None,
            "sentiment_score_combined": 0.0
        })

    # 1) Mode of sentiment types (which label appears most)
    sentiment_counts = group["sentiment"].value_counts()
    mode_of_sentiment = sentiment_counts.idxmax()

    # 2) Number of articles
    num_articles = total_articles

    # 3) Ratio of positive/negative/neutral
    ratio_positive = sentiment_counts.get("Positive", 0) / total_articles
    ratio_negative = sentiment_counts.get("Negative", 0) / total_articles
    ratio_neutral  = sentiment_counts.get("Neutral",  0) / total_articles

    # 4) Average sentiment for positive, negative, neutral
    avg_sentiment_positive = group.loc[group["sentiment"] == "Positive", "sentiment_logit"].mean()
    avg_sentiment_negative = group.loc[group["sentiment"] == "Negative", "sentiment_logit"].mean()
    avg_sentiment_neutral  = group.loc[group["sentiment"] == "Neutral",  "sentiment_logit"].mean()

    # 5) Weighted average sentiment:
    #    (sum of all positive sentiments - sum of all negative sentiments) / total_articles
    sum_pos = group.loc[group["sentiment"] == "Positive", "sentiment_logit"].sum()
    sum_neg = group.loc[group["sentiment"] == "Negative", "sentiment_logit"].sum()
    weighted_avg_sentiment = (sum_pos - sum_neg) / total_articles

    # 6) Mode of sentiment score (which sentiment label has the highest sum of probabilities)
    sum_sentiment_scores = group.groupby("sentiment")["sentiment_logit"].sum()
    mode_of_sentiment_score = sum_sentiment_scores.idxmax() if not sum_sentiment_scores.empty else None

    # 7) Mode of average sentiment score (which label has the highest average probability)
    avg_sentiment_scores = group.groupby("sentiment")["sentiment_logit"].mean()
    mode_of_avg_sentiment_score = avg_sentiment_scores.idxmax() if not avg_sentiment_scores.empty else None

    # 8) Combined sentiment for all articles of the day:
    combined_text = " ".join(group["article_text"].tolist()).strip()
    # Retrieve industry from the group (all rows have the same ticker, hence same industry)
    industry = group["industry"].iloc[0] if "industry" in group.columns else None
    sentiment_combined, sentiment_score_combined = get_sentiment(combined_text, industry=industry)

    return pd.Series({
        "mode_of_sentiment": mode_of_sentiment,
        "num_articles": num_articles,
        "ratio_positive": ratio_positive,
        "ratio_negative": ratio_negative,
        "ratio_neutral":  ratio_neutral,
        "avg_sentiment_positive": avg_sentiment_positive,
        "avg_sentiment_negative": avg_sentiment_negative,
        "avg_sentiment_neutral":  avg_sentiment_neutral,
        "weighted_avg_sentiment": weighted_avg_sentiment,
        "mode_of_sentiment_score": mode_of_sentiment_score,
        "mode_of_avg_sentiment_score": mode_of_avg_sentiment_score,
        "sentiment_combined": sentiment_combined,
        "sentiment_score_combined": sentiment_score_combined
    })


# -----------------------
# 4. MAIN PROCESS
# -----------------------
TICKER_UNIVERSE = [
    # 'AAPL', 'MSFT', 'NVDA', 'AVGO', 'ADBE', 'UNH', 'JNJ', 'PFE', 'MRK', 'ABBV',
    # 'JPM', 'BAC', 'WFC', 'GS', 'MS', 'AMZN', 'TSLA', 'HD', 'MCD', 'NKE', 'GOOGL',
    # 'META', 'DIS', 'VZ', 'CMCSA', 'PG', 'KO', 'PEP', 'WMT', 'COST', 'XOM', 'CVX',
    'COP', 'BA', 'UNP', 'HON', 'NEE', 'DUK', 'SO', 'PLD', 'AMT', 'CCI', 'SHW', 'DOW'
]

# Directory containing all TICKER.parquet files
input_dir = "/content/drive/My Drive/LLM_RL_Data/stock_parquet"
output_dir = "/content/drive/My Drive/LLM_RL_Data/llm_sentiments"

# Optional: warm-up pass to stabilize GPU memory
print("Running warm-up pass...")
dummy_input = tokenizer("Warm-up", return_tensors="pt", padding=True, max_length=128).to(device)
with torch.no_grad():
    _ = model(**dummy_input)
torch.cuda.empty_cache()

# Loop over tickers (example with ticker 'SO')
for ticker in TICKER_UNIVERSE:
    input_path = f"{input_dir}/{ticker}.parquet"
    try:
        df = pd.read_parquet(input_path)
    except FileNotFoundError:
        print(f"File not found for {ticker}, skipping...")
        continue

    if ticker not in df.columns:
        print(f"Ticker column '{ticker}' not found in {input_path}, skipping...")
        continue

    print(f"\nProcessing {ticker} from file: {input_path}")
    print(f"Data shape: {df.shape}")

    # Convert index to column if needed
    if df.index.name == "trading_day":
        df.reset_index(inplace=True)

    # Explode the list of articles so each article is its own row
    df_exploded = df.explode(ticker, ignore_index=False).reset_index(drop=True)
    df_exploded.rename(columns={ticker: "article_text"}, inplace=True)

    # Fill missing article text with empty strings
    df_exploded["article_text"] = df_exploded["article_text"].fillna("")

    # Add a column for industry using the mapping
    industry = TICKER_INDUSTRY_MAP.get(ticker, None)
    df_exploded["industry"] = industry

    # Apply get_sentiment to each article
    sentiments, logits = [], []
    for text in tqdm(df_exploded["article_text"], desc=f"{ticker} articles"):
        s_label, s_logit = get_sentiment(text, industry=industry)
        sentiments.append(s_label)
        logits.append(s_logit)

    df_exploded["sentiment"] = sentiments
    df_exploded["sentiment_logit"] = logits

    # Group by trading_day to compute daily aggregates
    grouped_results = (
        df_exploded
        .groupby("trading_day", as_index=True)
        .apply(aggregate_daily_sentiment)
        .reset_index()
    )

    # Save the aggregated result for this ticker
    ticker_output_path = f"{output_dir}/{ticker}_sentiment_agg.parquet"
    grouped_results.to_parquet(ticker_output_path, index=False)

    print(f"Saved daily sentiment aggregates to {ticker_output_path}")

    # Cleanup
    del df, df_exploded, grouped_results, sentiments, logits
    torch.cuda.empty_cache()
    gc.collect()

print("\nAll tickers processed!")


Running warm-up pass...

Processing COP from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/COP.parquet
Data shape: (1848, 1)


COP articles: 100%|██████████| 10473/10473 [17:32<00:00,  9.95it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/COP_sentiment_agg.parquet

Processing BA from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/BA.parquet
Data shape: (1848, 1)


BA articles: 100%|██████████| 25586/25586 [36:52<00:00, 11.57it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/BA_sentiment_agg.parquet

Processing UNP from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/UNP.parquet
Data shape: (1848, 1)


UNP articles: 100%|██████████| 2798/2798 [02:34<00:00, 18.10it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/UNP_sentiment_agg.parquet

Processing HON from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/HON.parquet
Data shape: (1848, 1)


HON articles: 100%|██████████| 3173/3173 [03:18<00:00, 16.00it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/HON_sentiment_agg.parquet

Processing NEE from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/NEE.parquet
Data shape: (1848, 1)


NEE articles: 100%|██████████| 2771/2771 [02:30<00:00, 18.42it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/NEE_sentiment_agg.parquet

Processing DUK from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/DUK.parquet
Data shape: (1848, 1)


DUK articles: 100%|██████████| 2995/2995 [03:02<00:00, 16.40it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/DUK_sentiment_agg.parquet

Processing SO from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/SO.parquet
Data shape: (1848, 1)


SO articles: 100%|██████████| 2828/2828 [02:30<00:00, 18.84it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/SO_sentiment_agg.parquet

Processing PLD from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/PLD.parquet
Data shape: (1848, 1)


PLD articles: 100%|██████████| 2232/2232 [01:32<00:00, 24.03it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/PLD_sentiment_agg.parquet

Processing AMT from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/AMT.parquet
Data shape: (1848, 1)


AMT articles: 100%|██████████| 2749/2749 [02:54<00:00, 15.77it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/AMT_sentiment_agg.parquet

Processing CCI from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/CCI.parquet
Data shape: (1848, 1)


CCI articles: 100%|██████████| 2332/2332 [01:45<00:00, 22.21it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/CCI_sentiment_agg.parquet

Processing SHW from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/SHW.parquet
Data shape: (1848, 1)


SHW articles: 100%|██████████| 2100/2100 [00:53<00:00, 39.62it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/SHW_sentiment_agg.parquet

Processing DOW from file: /content/drive/My Drive/LLM_RL_Data/stock_parquet/DOW.parquet
Data shape: (1848, 1)


DOW articles: 100%|██████████| 3302/3302 [03:34<00:00, 15.38it/s]


Saved daily sentiment aggregates to /content/drive/My Drive/LLM_RL_Data/results/DOW_sentiment_agg.parquet

All tickers processed!


In [5]:
df = pd.read_parquet("/content/drive/My Drive/LLM_RL_Data/llm_sentiments/TSLA_sentiment_agg.parquet")
df

Unnamed: 0,trading_day,mode_of_sentiment,num_articles,ratio_positive,ratio_negative,ratio_neutral,avg_sentiment_positive,avg_sentiment_negative,avg_sentiment_neutral,weighted_avg_sentiment,mode_of_sentiment_score,mode_of_avg_sentiment_score,sentiment_combined,sentiment_score_combined
0,2018-01-01,Neutral,1,0.000000,0.000000,1.000000,,,1.000000,0.000000,Neutral,Neutral,Neutral,1.000000
1,2018-01-02,Neutral,1,0.000000,0.000000,1.000000,,,1.000000,0.000000,Neutral,Neutral,Neutral,1.000000
2,2018-01-03,Positive,4,0.750000,0.000000,0.250000,0.000547,,0.000200,0.000410,Positive,Positive,Neutral,0.000301
3,2018-01-04,Neutral,16,0.187500,0.062500,0.750000,0.000663,0.000262,0.000221,0.000108,Neutral,Positive,Negative,0.000268
4,2018-01-05,Neutral,1,0.000000,0.000000,1.000000,,,1.000000,0.000000,Neutral,Neutral,Neutral,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1843,2025-01-28,Neutral,18,0.333333,0.000000,0.666667,0.000386,,0.000296,0.000129,Neutral,Positive,Neutral,0.000342
1844,2025-01-29,Neutral,7,0.285714,0.000000,0.714286,0.000556,,0.000340,0.000159,Neutral,Positive,Positive,0.000275
1845,2025-01-30,Neutral,36,0.416667,0.111111,0.472222,0.000574,0.000222,0.000317,0.000215,Positive,Positive,Neutral,0.000305
1846,2025-01-31,Positive,19,0.631579,0.000000,0.368421,0.000793,,0.000296,0.000501,Positive,Positive,Neutral,0.000281


## OLD CODE

In [6]:
import gc
import torch
import pandas as pd
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Define function for sentiment analysis
def get_sentiment(text):
    if not text:  # If the text is empty, return neutral sentiment
        return "Neutral", 1.0  # Neutral with logit probability 1.0
    len_text = min(len(text),2000)
    text = text[:len_text]
    # Define the prompt for the model
    prompt = f'''Instruction: What is the sentiment of this news? Please choose an answer from [Positive, Negative, Neutral].\nInput: {text}\nAnswer: '''

    # Tokenize directly on the GPU for efficiency
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, max_length=128).to(device)

    # Forward pass on GPU
    with torch.no_grad():
        outputs = model(**inputs)

    # Get logits for the last token and move them back to CPU
    logits = outputs.logits[:, -1, :].to("cpu")
    probs = torch.softmax(logits, dim=-1)

    # Class tokens for Positive, Negative, Neutral
    class_tokens = tokenizer(["Positive", "Negative", "Neutral"], add_special_tokens=False)["input_ids"]
    class_probs = {tokenizer.decode(token_id): probs[0, token_id].item() for token_id in class_tokens}

    # Get the most probable sentiment
    sentiment = max(class_probs, key=class_probs.get)

    # Clear intermediate variables
    del inputs, outputs, logits, probs
    torch.cuda.empty_cache()
    return sentiment, class_probs[sentiment]


In [None]:
# Parameters
batch_size = 500  # Number of rows per batch
output_dir = '/content/drive/My Drive/LLM_RL_Data/results'  # Directory for individual ticker files

# Process the parquet files
file_paths = [
    '/content/drive/My Drive/LLM_RL_Data/TICKER.parquet'
]

# Warm-up pass
print("Running warm-up pass to stabilize GPU memory...")
dummy_input = tokenizer("Warm-up", return_tensors="pt", padding=True, max_length=128).to(device)
with torch.no_grad():
    _ = model(**dummy_input)
torch.cuda.empty_cache()

# Process each file
for input_path in file_paths:
    year = input_path.split('_')[-1].split('.')[0]  # Extract year from filename
    print(f"\nProcessing file: {input_path} (Year: {year})")

    # Load the parquet file
    df = pd.read_parquet(input_path)

    for ticker in ['TSLA']:
        print(f"\nProcessing sentiment for ticker: {ticker}")

        reddit_col = f"{ticker}_reddit"
        sentiment_col = f"{ticker}_sentiment"
        logit_col = f"{ticker}_logit"

        # Initialize lists for sentiments and logits
        sentiments = []
        logits = []

        # Process DataFrame in batches
        num_batches = (len(df) + batch_size - 1) // batch_size  # Calculate total number of batches
        for batch_num in tqdm(range(num_batches), desc=f"Ticker: {ticker} Batches"):
            # Extract batch
            start_idx = batch_num * batch_size
            end_idx = min((batch_num + 1) * batch_size, len(df))
            batch_df = df.iloc[start_idx:end_idx]

            # Process each row in the batch
            batch_sentiments = []
            batch_logits = []
            for text in batch_df[reddit_col].fillna("").tolist():
                sentiment, logit = get_sentiment(text)  # GPU inference
                batch_sentiments.append(sentiment)
                batch_logits.append(logit)

            # Append batch results to the main lists
            sentiments.extend(batch_sentiments)
            logits.extend(batch_logits)

            # Clear GPU memory periodically
            del batch_df, batch_sentiments, batch_logits
            torch.cuda.empty_cache()
            gc.collect()

        # Create and save a DataFrame for this ticker
        ticker_df = pd.DataFrame({
            'timestamp': df['timestamp'],  # Assuming there's a timestamp column
            reddit_col: df[reddit_col],
            sentiment_col: sentiments,
            logit_col: logits
        })

        # Save the ticker DataFrame to a separate parquet file
        ticker_output_path = f"{output_dir}/{ticker}_{year}.parquet"
        ticker_df.to_parquet(ticker_output_path, index=False)
        print(f"Saved results for {ticker} to {ticker_output_path}")

        # Clear memory for the ticker
        del ticker_df, sentiments, logits
        torch.cuda.empty_cache()
        gc.collect()

    # Clear memory for the file
    del df
    torch.cuda.empty_cache()
    gc.collect()

    print(f"Cleared GPU cache and unnecessary variables for: {input_path}")