In [None]:
!pip install transformers sentencepiece torch nltk


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Phrases that mark a sentence as being about silver. They are matched
# case-insensitively on word boundaries by the extraction loop.
ENTITIES = [
    # the metal itself and its price
    'silver',
    'silver prices',
    'silver price',
    'silver futures',
    'spot silver',
    # market fundamentals
    'silver market',
    'silver demand',
    'silver supply',
    'silver production',
    'silver mining',
    # physical holdings
    'silver bullion',
    'silver bars',
    'silver coins',
    'silver reserves',
    # trade flows
    'silver exports',
    'silver imports',
    # instruments and venues
    'silver ETF',
    'precious metals',
    'London Bullion Market',
    'COMEX silver',
]


In [None]:
import nltk
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from nltk.tokenize import sent_tokenize

# Sentence-splitting data required by sent_tokenize.
nltk.download('punkt_tab')

# Hugging Face checkpoint shared by the tokenizer and the classifier.
FINBERT_CHECKPOINT = "yiyanghkust/finbert-tone"

# Load FinBERT-tone and put it in inference mode (disables dropout).
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(FINBERT_CHECKPOINT)
model.eval()

# Class index -> label name used when decoding the model's logits.
id2label = dict(enumerate(["neutral", "positive", "negative"]))


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the classifier's weights onto the selected device.
model = model.to(device)

def predict_sentiment(text):
    """Classify the tone of ``text`` with the loaded sequence-classification model.

    The text is tokenized (truncated to at most 512 tokens), moved to
    ``device``, and scored without tracking gradients. The label whose logit
    is largest is looked up in ``id2label`` and returned.

    Args:
        text: The string to classify.

    Returns:
        The ``id2label`` entry for the highest-scoring class.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        scores = model(**encoded).logits

    best_class = torch.argmax(scores, dim=1).item()
    return id2label[best_class]


In [None]:
import pandas as pd
import re

# Scan every article sentence by sentence and record one result row per entity
# mention, tagged with the sentiment of the sentence that contains it.
df = pd.read_csv('articles_filtered.csv')
results = []

total_rows = len(df)
# Report progress roughly every 5% of the rows.
progress_step = max(1, total_rows // 20)

# Build each entity pattern once up front instead of once per sentence.
entity_patterns = [
    re.compile(rf'\b{re.escape(entity)}\b', flags=re.IGNORECASE)
    for entity in ENTITIES
]

skipped_rows = 0

# enumerate() gives the row position, so the progress report does not depend on
# the DataFrame's index labels being 0..n-1.
for idx, (_, row) in enumerate(df.iterrows()):
    article_text = row['text']
    date = row['Date']

    if isinstance(article_text, str):
        # Sentences come back in document order, so searching forward from the
        # end of the previous sentence finds this sentence's own offset.
        cursor = 0
        for sent in sent_tokenize(article_text):
            sent_start = article_text.find(sent, cursor)
            if sent_start >= 0:
                cursor = sent_start + len(sent)

            # Classify a sentence at most once, and only if it mentions an entity.
            sentiment = None

            for pattern in entity_patterns:
                for match in pattern.finditer(sent):
                    if sentiment is None:
                        sentiment = predict_sentiment(sent)

                    if sent_start >= 0:
                        # Offset of this exact mention within the article.
                        start = sent_start + match.start()
                    else:
                        # Sentence could not be located verbatim; fall back to
                        # the first occurrence of the matched text.
                        start = article_text.find(match.group())
                    end = start + len(match.group())

                    results.append({
                        'date': date,
                        'text': article_text,
                        'entity': match.group(),
                        'start': start,
                        'end': end,
                        'sentiment': sentiment
                    })
    else:
        # Missing or non-text body (e.g. NaN): nothing to tokenize.
        skipped_rows += 1

    # Print % completed
    if idx % progress_step == 0:
        percent = (idx + 1) / total_rows * 100
        print(f"Progress: {percent:.1f}%")

if skipped_rows:
    print(f"Skipped {skipped_rows} rows without text")

print("Completed!")


  df = pd.read_csv('articles_filtered.csv')


Progress: 0.0%
Progress: 5.0%
Progress: 10.0%
Progress: 15.0%
Progress: 20.0%
Progress: 25.0%
Progress: 30.0%
Progress: 34.9%
Progress: 39.9%
Progress: 44.9%
Progress: 49.9%
Progress: 54.9%
Progress: 59.9%
Progress: 64.9%
Progress: 69.9%
Progress: 74.9%
Progress: 79.9%
Progress: 84.8%
Progress: 89.8%
Progress: 94.8%
Progress: 99.8%
Completed!


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Numeric value assigned to each sentiment label.
sentiment_map = {'positive': 1, 'neutral': 0, 'negative': -1}

# One row per entity mention collected by the extraction loop.
df = pd.DataFrame(results)

# Score every mention and reduce its timestamp to a calendar date; dates that
# cannot be parsed are coerced to missing and the row is dropped.
df = df.assign(
    sentiment_score=df['sentiment'].map(sentiment_map),
    date=pd.to_datetime(df['date'], errors='coerce').dt.date,
).dropna(subset=['date'])

# Net sentiment per calendar day (sum of the mention scores).
daily_sentiment = df.groupby('date', as_index=False)['sentiment_score'].sum()

# Min-max rescale the daily totals into a new column.
scaler = MinMaxScaler()
daily_sentiment['normalized_score'] = scaler.fit_transform(
    daily_sentiment[['sentiment_score']]
)


In [None]:
# Write the per-day sentiment table to disk, omitting the row index.
daily_sentiment.to_csv(
    path_or_buf='daily_sequence_sentiment.csv',
    index=False,
)
