In [1]:
import os

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
from torch.nn.functional import softmax

# Load FinBERT


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
MODEL_NAME = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()  # inference mode



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [4]:
def get_finbert_sentiment(headline):
    inputs = tokenizer(headline, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=1).squeeze()

    # Map: 0 = negative, 1 = neutral, 2 = positive
    score = probs[2] - probs[0]  # Confidence of pos - neg
    sentiment = torch.argmax(probs).item()
    return sentiment, score.item()

In [9]:
news_df = pd.read_csv('./AAPL_news_dataset.csv')  # Load your news data

In [10]:
news_df

Unnamed: 0,ticker,date,headline
0,AAPL,2025-06-24,Cloud AI Today - Botpress Secures $25M To Enha...
1,AAPL,2025-06-24,Apple's Quiet AI Gambit Raises Eyebrows
2,AAPL,2025-06-24,Apple's EU Talks Enter Critical Phase
3,AAPL,2025-06-24,5 Dividend Stocks Poised to Profit From the AI...
4,AAPL,2025-06-24,US Finalizing $500 Million for African Critica...
5,AAPL,2025-06-24,Google faces UK push to loosen its grip on search
6,AAPL,2025-06-24,Steve Jobs Believed Teamwork Required 'Bumping...
7,AAPL,2025-06-23,Court filings reveal OpenAI and io’s early wor...
8,AAPL,2025-06-23,Is Apple Looking To Catch Up In AI With Big Deal?
9,AAPL,2025-06-23,Apple’s Liquid Glass interface improves with r...


In [44]:
hnews = pd.read_csv("./sp500-2008-2024.csv")

In [45]:
filtered_df = hnews[hnews['Title'].str.contains('apple', case=False, na=False)]

In [46]:
filtered_df['date'] = pd.to_datetime(filtered_df['Date']).dt.date
filtered_df['ticker'] = 'AAPL'
filtered_df['headline'] = filtered_df['Title']
df1 = filtered_df[['ticker','date','headline']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['date'] = pd.to_datetime(filtered_df['Date']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['ticker'] = 'AAPL'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['headline'] = filtered_df['Title']


In [47]:
df_merged = pd.concat([df1, news_df], ignore_index=True)

In [48]:
news_df = df_merged

In [None]:
news_df['sentiment'], news_df['sentiment_score'] = zip(*news_df['headline'].map(get_finbert_sentiment))

In [12]:
daily_sentiment = news_df.groupby(['ticker', 'date']).agg(
    avg_sentiment_score=('sentiment_score', 'mean'),
    sentiment_articles=('headline', 'count'),
    pos_count=('sentiment', lambda x: (x == 2).sum()),
    neg_count=('sentiment', lambda x: (x == 0).sum()),
    neu_count=('sentiment', lambda x: (x == 1).sum()),
).reset_index()

In [13]:
daily_sentiment

Unnamed: 0,ticker,date,avg_sentiment_score,sentiment_articles,pos_count,neg_count,neu_count
0,AAPL,2025-06-23,-0.325708,13,4,8,1
1,AAPL,2025-06-24,-0.150532,7,2,3,2


In [14]:
news_df

Unnamed: 0,ticker,date,headline,sentiment,sentiment_score
0,AAPL,2025-06-24,Cloud AI Today - Botpress Secures $25M To Enha...,1,-0.000179677
1,AAPL,2025-06-24,Apple's Quiet AI Gambit Raises Eyebrows,2,0.9364706
2,AAPL,2025-06-24,Apple's EU Talks Enter Critical Phase,0,-0.999472
3,AAPL,2025-06-24,5 Dividend Stocks Poised to Profit From the AI...,1,-9.360929e-08
4,AAPL,2025-06-24,US Finalizing $500 Million for African Critica...,0,-0.9999851
5,AAPL,2025-06-24,Google faces UK push to loosen its grip on search,2,0.9998252
6,AAPL,2025-06-24,Steve Jobs Believed Teamwork Required 'Bumping...,0,-0.9903817
7,AAPL,2025-06-23,Court filings reveal OpenAI and io’s early wor...,0,-0.9997898
8,AAPL,2025-06-23,Is Apple Looking To Catch Up In AI With Big Deal?,0,-0.9868358
9,AAPL,2025-06-23,Apple’s Liquid Glass interface improves with r...,1,-3.538756e-06


In [15]:
tech_df = pd.read_csv("./AAPL_ta_dataset.csv")

In [16]:
tech_df

Unnamed: 0,Date,Close,High,Low,Open,Volume,rsi,stoch_k,macd,macd_signal,sma_20,ema_20,bb_upper,bb_lower,atr,adx,target
0,2020-02-20,77.628288,78.682561,77.121754,78.192994,100566000,53.041382,73.090178,0.843613,0.950528,77.421349,77.344102,80.042917,74.799782,1.628811,13.975331,0
1,2020-02-21,75.871170,77.664649,75.253152,77.221124,129554000,46.139544,44.897085,0.625894,0.885601,77.355619,77.203823,80.062065,74.649173,1.684717,14.026662,0
2,2020-02-24,72.267258,73.721425,70.098129,72.044290,222195200,35.838933,23.558792,0.160693,0.740620,77.120815,76.733674,80.621075,73.620554,1.976741,15.870779,0
3,2020-02-25,69.819420,73.321543,69.346820,72.938615,230673600,30.808022,4.745643,-0.400882,0.512319,76.876775,76.075174,81.513996,72.239553,2.119454,17.773211,1
4,2020-02-26,70.927002,72.194554,69.436482,69.443753,198054800,35.237888,15.867500,-0.747939,0.260268,76.582454,75.584871,81.896255,71.268652,2.165069,19.539754,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,2025-06-13,196.449997,200.369995,195.699997,199.729996,51447300,40.802560,7.115744,-1.571946,-1.231530,201.897000,201.585117,209.242207,194.551794,5.037702,16.881351,1
1337,2025-06-16,198.419998,198.690002,196.559998,197.300003,43020700,44.244670,25.806442,-1.632476,-1.311719,201.255000,201.283677,207.353211,195.156789,4.837866,17.666974,0
1338,2025-06-17,195.639999,198.389999,195.210007,197.199997,38856200,40.652387,3.898393,-1.883062,-1.425988,200.598000,200.746184,206.115437,195.080563,4.721589,18.653920,1
1339,2025-06-18,196.580002,197.570007,195.070007,195.940002,45394700,42.356616,13.518306,-1.982945,-1.537379,200.084000,200.349405,205.061114,195.106887,4.562904,19.597532,1


In [17]:
tech_df['date'] = pd.to_datetime(tech_df['Date']).dt.date
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date']).dt.date

In [18]:
tech_df

Unnamed: 0,Date,Close,High,Low,Open,Volume,rsi,stoch_k,macd,macd_signal,sma_20,ema_20,bb_upper,bb_lower,atr,adx,target,date
0,2020-02-20,77.628288,78.682561,77.121754,78.192994,100566000,53.041382,73.090178,0.843613,0.950528,77.421349,77.344102,80.042917,74.799782,1.628811,13.975331,0,2020-02-20
1,2020-02-21,75.871170,77.664649,75.253152,77.221124,129554000,46.139544,44.897085,0.625894,0.885601,77.355619,77.203823,80.062065,74.649173,1.684717,14.026662,0,2020-02-21
2,2020-02-24,72.267258,73.721425,70.098129,72.044290,222195200,35.838933,23.558792,0.160693,0.740620,77.120815,76.733674,80.621075,73.620554,1.976741,15.870779,0,2020-02-24
3,2020-02-25,69.819420,73.321543,69.346820,72.938615,230673600,30.808022,4.745643,-0.400882,0.512319,76.876775,76.075174,81.513996,72.239553,2.119454,17.773211,1,2020-02-25
4,2020-02-26,70.927002,72.194554,69.436482,69.443753,198054800,35.237888,15.867500,-0.747939,0.260268,76.582454,75.584871,81.896255,71.268652,2.165069,19.539754,0,2020-02-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,2025-06-13,196.449997,200.369995,195.699997,199.729996,51447300,40.802560,7.115744,-1.571946,-1.231530,201.897000,201.585117,209.242207,194.551794,5.037702,16.881351,1,2025-06-13
1337,2025-06-16,198.419998,198.690002,196.559998,197.300003,43020700,44.244670,25.806442,-1.632476,-1.311719,201.255000,201.283677,207.353211,195.156789,4.837866,17.666974,0,2025-06-16
1338,2025-06-17,195.639999,198.389999,195.210007,197.199997,38856200,40.652387,3.898393,-1.883062,-1.425988,200.598000,200.746184,206.115437,195.080563,4.721589,18.653920,1,2025-06-17
1339,2025-06-18,196.580002,197.570007,195.070007,195.940002,45394700,42.356616,13.518306,-1.982945,-1.537379,200.084000,200.349405,205.061114,195.106887,4.562904,19.597532,1,2025-06-18


In [19]:
daily_sentiment = daily_sentiment[daily_sentiment['ticker'] == 'AAPL']
tech_df['ticker'] = 'AAPL'


In [21]:
merged_df = pd.merge(tech_df, daily_sentiment, on=['date', 'ticker'], how='left')


In [22]:
merged_df

Unnamed: 0,Date,Close,High,Low,Open,Volume,rsi,stoch_k,macd,macd_signal,...,atr,adx,target,date,ticker,avg_sentiment_score,sentiment_articles,pos_count,neg_count,neu_count
0,2020-02-20,77.628288,78.682561,77.121754,78.192994,100566000,53.041382,73.090178,0.843613,0.950528,...,1.628811,13.975331,0,2020-02-20,AAPL,,,,,
1,2020-02-21,75.871170,77.664649,75.253152,77.221124,129554000,46.139544,44.897085,0.625894,0.885601,...,1.684717,14.026662,0,2020-02-21,AAPL,,,,,
2,2020-02-24,72.267258,73.721425,70.098129,72.044290,222195200,35.838933,23.558792,0.160693,0.740620,...,1.976741,15.870779,0,2020-02-24,AAPL,,,,,
3,2020-02-25,69.819420,73.321543,69.346820,72.938615,230673600,30.808022,4.745643,-0.400882,0.512319,...,2.119454,17.773211,1,2020-02-25,AAPL,,,,,
4,2020-02-26,70.927002,72.194554,69.436482,69.443753,198054800,35.237888,15.867500,-0.747939,0.260268,...,2.165069,19.539754,0,2020-02-26,AAPL,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,2025-06-13,196.449997,200.369995,195.699997,199.729996,51447300,40.802560,7.115744,-1.571946,-1.231530,...,5.037702,16.881351,1,2025-06-13,AAPL,,,,,
1337,2025-06-16,198.419998,198.690002,196.559998,197.300003,43020700,44.244670,25.806442,-1.632476,-1.311719,...,4.837866,17.666974,0,2025-06-16,AAPL,,,,,
1338,2025-06-17,195.639999,198.389999,195.210007,197.199997,38856200,40.652387,3.898393,-1.883062,-1.425988,...,4.721589,18.653920,1,2025-06-17,AAPL,,,,,
1339,2025-06-18,196.580002,197.570007,195.070007,195.940002,45394700,42.356616,13.518306,-1.982945,-1.537379,...,4.562904,19.597532,1,2025-06-18,AAPL,,,,,


In [23]:
merged_df[['avg_sentiment_score', 'pos_count', 'neg_count', 'neu_count', 'sentiment_articles']] = \
    merged_df[['avg_sentiment_score', 'pos_count', 'neg_count', 'neu_count', 'sentiment_articles']].fillna(0)


In [24]:
merged_df

Unnamed: 0,Date,Close,High,Low,Open,Volume,rsi,stoch_k,macd,macd_signal,...,atr,adx,target,date,ticker,avg_sentiment_score,sentiment_articles,pos_count,neg_count,neu_count
0,2020-02-20,77.628288,78.682561,77.121754,78.192994,100566000,53.041382,73.090178,0.843613,0.950528,...,1.628811,13.975331,0,2020-02-20,AAPL,0.0,0.0,0.0,0.0,0.0
1,2020-02-21,75.871170,77.664649,75.253152,77.221124,129554000,46.139544,44.897085,0.625894,0.885601,...,1.684717,14.026662,0,2020-02-21,AAPL,0.0,0.0,0.0,0.0,0.0
2,2020-02-24,72.267258,73.721425,70.098129,72.044290,222195200,35.838933,23.558792,0.160693,0.740620,...,1.976741,15.870779,0,2020-02-24,AAPL,0.0,0.0,0.0,0.0,0.0
3,2020-02-25,69.819420,73.321543,69.346820,72.938615,230673600,30.808022,4.745643,-0.400882,0.512319,...,2.119454,17.773211,1,2020-02-25,AAPL,0.0,0.0,0.0,0.0,0.0
4,2020-02-26,70.927002,72.194554,69.436482,69.443753,198054800,35.237888,15.867500,-0.747939,0.260268,...,2.165069,19.539754,0,2020-02-26,AAPL,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,2025-06-13,196.449997,200.369995,195.699997,199.729996,51447300,40.802560,7.115744,-1.571946,-1.231530,...,5.037702,16.881351,1,2025-06-13,AAPL,0.0,0.0,0.0,0.0,0.0
1337,2025-06-16,198.419998,198.690002,196.559998,197.300003,43020700,44.244670,25.806442,-1.632476,-1.311719,...,4.837866,17.666974,0,2025-06-16,AAPL,0.0,0.0,0.0,0.0,0.0
1338,2025-06-17,195.639999,198.389999,195.210007,197.199997,38856200,40.652387,3.898393,-1.883062,-1.425988,...,4.721589,18.653920,1,2025-06-17,AAPL,0.0,0.0,0.0,0.0,0.0
1339,2025-06-18,196.580002,197.570007,195.070007,195.940002,45394700,42.356616,13.518306,-1.982945,-1.537379,...,4.562904,19.597532,1,2025-06-18,AAPL,0.0,0.0,0.0,0.0,0.0


In [31]:
filtered_df

Unnamed: 0,Title,Date,CP
291,Apple And My Cognitive Dissonance,2010-04-12,1196.48
300,"Oh My God, Apple Just Passed Microsoft In Mark...",2010-04-22,1208.67
317,Apple Passes Microsoft as World's Largest Tech...,2010-05-26,1067.95
449,Apple's stock still looks like a bargain - The...,2010-12-13,1240.46
817,"Apple to kill iPod classic, shuffle?",2011-09-28,1151.06
...,...,...,...
18297,"Markets News, Jan. 2, 2024: Nasdaq Sinks as Ap...",2024-01-02,4742.83
18409,3 Stock Market Predictions for 2024: The S&P 5...,2024-01-09,4756.50
18492,US stock market: Wall Street ends down as Appl...,2024-01-17,4739.21
18863,"Markets News, Feb. 15, 2024: Dow Leads Stocks ...",2024-02-15,5029.73


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['date'] = pd.to_datetime(filtered_df['Date']).dt.date


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['ticker'] = 'AAPL'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['headline'] = filtered_df['Title']


In [38]:
df1

Unnamed: 0,ticker,date,headline
291,AAPL,2010-04-12,Apple And My Cognitive Dissonance
300,AAPL,2010-04-22,"Oh My God, Apple Just Passed Microsoft In Mark..."
317,AAPL,2010-05-26,Apple Passes Microsoft as World's Largest Tech...
449,AAPL,2010-12-13,Apple's stock still looks like a bargain - The...
817,AAPL,2011-09-28,"Apple to kill iPod classic, shuffle?"
...,...,...,...
18297,AAPL,2024-01-02,"Markets News, Jan. 2, 2024: Nasdaq Sinks as Ap..."
18409,AAPL,2024-01-09,3 Stock Market Predictions for 2024: The S&P 5...
18492,AAPL,2024-01-17,US stock market: Wall Street ends down as Appl...
18863,AAPL,2024-02-15,"Markets News, Feb. 15, 2024: Dow Leads Stocks ..."


In [43]:
df_merged

Unnamed: 0,ticker,date,headline,sentiment,sentiment_score
0,AAPL,2010-04-12,Apple And My Cognitive Dissonance,,
1,AAPL,2010-04-22,"Oh My God, Apple Just Passed Microsoft In Mark...",,
2,AAPL,2010-05-26,Apple Passes Microsoft as World's Largest Tech...,,
3,AAPL,2010-12-13,Apple's stock still looks like a bargain - The...,,
4,AAPL,2011-09-28,"Apple to kill iPod classic, shuffle?",,
...,...,...,...,...,...
258,AAPL,2025-06-23,ALTO Real Estate Funds Acquires Prime 24-Acre ...,0.0,-0.999837
259,AAPL,2025-06-23,Tokenized Shares of Solana Treasury Company De...,0.0,-0.999938
260,AAPL,2025-06-23,Market Chatter: Apple Sued by Shareholders Ove...,2.0,0.999049
261,AAPL,2025-06-23,Apple faces shareholder lawsuit over alleged A...,2.0,0.988303
