In [73]:
import os

In [74]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
from torch.nn.functional import softmax

# Load FinBERT


In [75]:
MODEL_NAME = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()  # inference mode



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30873, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [76]:
def get_finbert_sentiment(headline):
    inputs = tokenizer(headline, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = softmax(outputs.logits, dim=1).squeeze()

    # Map: 0 = negative, 1 = neutral, 2 = positive
    score = probs[2] - probs[0]  # Confidence of pos - neg
    sentiment = torch.argmax(probs).item()
    return sentiment, score.item()

In [77]:
news_df = pd.read_csv('./AAPL_news_dataset.csv')  # Load your news data

In [78]:
news_df

Unnamed: 0,ticker,date,headline
0,AAPL,2025-06-24,Cloud AI Today - Botpress Secures $25M To Enha...
1,AAPL,2025-06-24,Apple's Quiet AI Gambit Raises Eyebrows
2,AAPL,2025-06-24,Apple's EU Talks Enter Critical Phase
3,AAPL,2025-06-24,5 Dividend Stocks Poised to Profit From the AI...
4,AAPL,2025-06-24,US Finalizing $500 Million for African Critica...
5,AAPL,2025-06-24,Google faces UK push to loosen its grip on search
6,AAPL,2025-06-24,Steve Jobs Believed Teamwork Required 'Bumping...
7,AAPL,2025-06-23,Court filings reveal OpenAI and io’s early wor...
8,AAPL,2025-06-23,Is Apple Looking To Catch Up In AI With Big Deal?
9,AAPL,2025-06-23,Apple’s Liquid Glass interface improves with r...


In [79]:
hnews = pd.read_csv("./sp500-2008-2024.csv")

In [80]:
filtered_df = hnews[hnews['Title'].str.contains('apple', case=False, na=False)]

In [81]:
filtered_df['date'] = pd.to_datetime(filtered_df['Date']).dt.date
filtered_df['ticker'] = 'AAPL'
filtered_df['headline'] = filtered_df['Title']
df1 = filtered_df[['ticker','date','headline']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['date'] = pd.to_datetime(filtered_df['Date']).dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['ticker'] = 'AAPL'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['headline'] = filtered_df['Title']


In [82]:
df_merged = pd.concat([df1, news_df], ignore_index=True)

In [83]:
news_df = df_merged

In [84]:
news_df['sentiment'], news_df['sentiment_score'] = zip(*news_df['headline'].map(get_finbert_sentiment))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [85]:
daily_sentiment = news_df.groupby(['ticker', 'date']).agg(
    avg_sentiment_score=('sentiment_score', 'mean'),
    sentiment_articles=('headline', 'count'),
    pos_count=('sentiment', lambda x: (x == 2).sum()),
    neg_count=('sentiment', lambda x: (x == 0).sum()),
    neu_count=('sentiment', lambda x: (x == 1).sum()),
).reset_index()

In [86]:
daily_sentiment

Unnamed: 0,ticker,date,avg_sentiment_score,sentiment_articles,pos_count,neg_count,neu_count
0,AAPL,2010-04-12,-0.988344,1,0,1,0
1,AAPL,2010-04-22,-0.999520,1,0,1,0
2,AAPL,2010-05-26,-0.128264,1,0,0,1
3,AAPL,2010-12-13,-0.013537,1,0,0,1
4,AAPL,2011-09-28,-0.282580,1,0,1,0
...,...,...,...,...,...,...,...
193,AAPL,2024-01-17,0.999702,1,1,0,0
194,AAPL,2024-02-15,-0.170600,1,0,0,1
195,AAPL,2024-02-21,-0.999783,1,0,1,0
196,AAPL,2025-06-23,-0.325708,13,4,8,1


In [87]:
news_df

Unnamed: 0,ticker,date,headline,sentiment,sentiment_score
0,AAPL,2010-04-12,Apple And My Cognitive Dissonance,0,-0.988344
1,AAPL,2010-04-22,"Oh My God, Apple Just Passed Microsoft In Mark...",0,-0.999520
2,AAPL,2010-05-26,Apple Passes Microsoft as World's Largest Tech...,1,-0.128264
3,AAPL,2010-12-13,Apple's stock still looks like a bargain - The...,1,-0.013537
4,AAPL,2011-09-28,"Apple to kill iPod classic, shuffle?",0,-0.282580
...,...,...,...,...,...
258,AAPL,2025-06-23,ALTO Real Estate Funds Acquires Prime 24-Acre ...,0,-0.999837
259,AAPL,2025-06-23,Tokenized Shares of Solana Treasury Company De...,0,-0.999938
260,AAPL,2025-06-23,Market Chatter: Apple Sued by Shareholders Ove...,2,0.999049
261,AAPL,2025-06-23,Apple faces shareholder lawsuit over alleged A...,2,0.988303


In [88]:
tech_df = pd.read_csv("./AAPL_ta_dataset.csv")

In [89]:
tech_df

Unnamed: 0,Date,Close,High,Low,Open,Volume,rsi,stoch_k,macd,macd_signal,sma_20,ema_20,bb_upper,bb_lower,atr,adx,target
0,2020-02-20,77.628288,78.682561,77.121754,78.192994,100566000,53.041382,73.090178,0.843613,0.950528,77.421349,77.344102,80.042917,74.799782,1.628811,13.975331,0
1,2020-02-21,75.871170,77.664649,75.253152,77.221124,129554000,46.139544,44.897085,0.625894,0.885601,77.355619,77.203823,80.062065,74.649173,1.684717,14.026662,0
2,2020-02-24,72.267258,73.721425,70.098129,72.044290,222195200,35.838933,23.558792,0.160693,0.740620,77.120815,76.733674,80.621075,73.620554,1.976741,15.870779,0
3,2020-02-25,69.819420,73.321543,69.346820,72.938615,230673600,30.808022,4.745643,-0.400882,0.512319,76.876775,76.075174,81.513996,72.239553,2.119454,17.773211,1
4,2020-02-26,70.927002,72.194554,69.436482,69.443753,198054800,35.237888,15.867500,-0.747939,0.260268,76.582454,75.584871,81.896255,71.268652,2.165069,19.539754,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,2025-06-13,196.449997,200.369995,195.699997,199.729996,51447300,40.802560,7.115744,-1.571946,-1.231530,201.897000,201.585117,209.242207,194.551794,5.037702,16.881351,1
1337,2025-06-16,198.419998,198.690002,196.559998,197.300003,43020700,44.244670,25.806442,-1.632476,-1.311719,201.255000,201.283677,207.353211,195.156789,4.837866,17.666974,0
1338,2025-06-17,195.639999,198.389999,195.210007,197.199997,38856200,40.652387,3.898393,-1.883062,-1.425988,200.598000,200.746184,206.115437,195.080563,4.721589,18.653920,1
1339,2025-06-18,196.580002,197.570007,195.070007,195.940002,45394700,42.356616,13.518306,-1.982945,-1.537379,200.084000,200.349405,205.061114,195.106887,4.562904,19.597532,1


In [90]:
tech_df['date'] = pd.to_datetime(tech_df['Date']).dt.date
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date']).dt.date

In [91]:
tech_df

Unnamed: 0,Date,Close,High,Low,Open,Volume,rsi,stoch_k,macd,macd_signal,sma_20,ema_20,bb_upper,bb_lower,atr,adx,target,date
0,2020-02-20,77.628288,78.682561,77.121754,78.192994,100566000,53.041382,73.090178,0.843613,0.950528,77.421349,77.344102,80.042917,74.799782,1.628811,13.975331,0,2020-02-20
1,2020-02-21,75.871170,77.664649,75.253152,77.221124,129554000,46.139544,44.897085,0.625894,0.885601,77.355619,77.203823,80.062065,74.649173,1.684717,14.026662,0,2020-02-21
2,2020-02-24,72.267258,73.721425,70.098129,72.044290,222195200,35.838933,23.558792,0.160693,0.740620,77.120815,76.733674,80.621075,73.620554,1.976741,15.870779,0,2020-02-24
3,2020-02-25,69.819420,73.321543,69.346820,72.938615,230673600,30.808022,4.745643,-0.400882,0.512319,76.876775,76.075174,81.513996,72.239553,2.119454,17.773211,1,2020-02-25
4,2020-02-26,70.927002,72.194554,69.436482,69.443753,198054800,35.237888,15.867500,-0.747939,0.260268,76.582454,75.584871,81.896255,71.268652,2.165069,19.539754,0,2020-02-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,2025-06-13,196.449997,200.369995,195.699997,199.729996,51447300,40.802560,7.115744,-1.571946,-1.231530,201.897000,201.585117,209.242207,194.551794,5.037702,16.881351,1,2025-06-13
1337,2025-06-16,198.419998,198.690002,196.559998,197.300003,43020700,44.244670,25.806442,-1.632476,-1.311719,201.255000,201.283677,207.353211,195.156789,4.837866,17.666974,0,2025-06-16
1338,2025-06-17,195.639999,198.389999,195.210007,197.199997,38856200,40.652387,3.898393,-1.883062,-1.425988,200.598000,200.746184,206.115437,195.080563,4.721589,18.653920,1,2025-06-17
1339,2025-06-18,196.580002,197.570007,195.070007,195.940002,45394700,42.356616,13.518306,-1.982945,-1.537379,200.084000,200.349405,205.061114,195.106887,4.562904,19.597532,1,2025-06-18


In [92]:
daily_sentiment = daily_sentiment[daily_sentiment['ticker'] == 'AAPL']
tech_df['ticker'] = 'AAPL'


In [93]:
merged_df = pd.merge(tech_df, daily_sentiment, on=['date', 'ticker'], how='left')


In [94]:
merged_df

Unnamed: 0,Date,Close,High,Low,Open,Volume,rsi,stoch_k,macd,macd_signal,...,atr,adx,target,date,ticker,avg_sentiment_score,sentiment_articles,pos_count,neg_count,neu_count
0,2020-02-20,77.628288,78.682561,77.121754,78.192994,100566000,53.041382,73.090178,0.843613,0.950528,...,1.628811,13.975331,0,2020-02-20,AAPL,,,,,
1,2020-02-21,75.871170,77.664649,75.253152,77.221124,129554000,46.139544,44.897085,0.625894,0.885601,...,1.684717,14.026662,0,2020-02-21,AAPL,,,,,
2,2020-02-24,72.267258,73.721425,70.098129,72.044290,222195200,35.838933,23.558792,0.160693,0.740620,...,1.976741,15.870779,0,2020-02-24,AAPL,,,,,
3,2020-02-25,69.819420,73.321543,69.346820,72.938615,230673600,30.808022,4.745643,-0.400882,0.512319,...,2.119454,17.773211,1,2020-02-25,AAPL,,,,,
4,2020-02-26,70.927002,72.194554,69.436482,69.443753,198054800,35.237888,15.867500,-0.747939,0.260268,...,2.165069,19.539754,0,2020-02-26,AAPL,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,2025-06-13,196.449997,200.369995,195.699997,199.729996,51447300,40.802560,7.115744,-1.571946,-1.231530,...,5.037702,16.881351,1,2025-06-13,AAPL,,,,,
1337,2025-06-16,198.419998,198.690002,196.559998,197.300003,43020700,44.244670,25.806442,-1.632476,-1.311719,...,4.837866,17.666974,0,2025-06-16,AAPL,,,,,
1338,2025-06-17,195.639999,198.389999,195.210007,197.199997,38856200,40.652387,3.898393,-1.883062,-1.425988,...,4.721589,18.653920,1,2025-06-17,AAPL,,,,,
1339,2025-06-18,196.580002,197.570007,195.070007,195.940002,45394700,42.356616,13.518306,-1.982945,-1.537379,...,4.562904,19.597532,1,2025-06-18,AAPL,,,,,


In [95]:
merged_df[['avg_sentiment_score', 'pos_count', 'neg_count', 'neu_count', 'sentiment_articles']] = \
    merged_df[['avg_sentiment_score', 'pos_count', 'neg_count', 'neu_count', 'sentiment_articles']].fillna(0)


In [96]:
merged_df

Unnamed: 0,Date,Close,High,Low,Open,Volume,rsi,stoch_k,macd,macd_signal,...,atr,adx,target,date,ticker,avg_sentiment_score,sentiment_articles,pos_count,neg_count,neu_count
0,2020-02-20,77.628288,78.682561,77.121754,78.192994,100566000,53.041382,73.090178,0.843613,0.950528,...,1.628811,13.975331,0,2020-02-20,AAPL,0.0,0.0,0.0,0.0,0.0
1,2020-02-21,75.871170,77.664649,75.253152,77.221124,129554000,46.139544,44.897085,0.625894,0.885601,...,1.684717,14.026662,0,2020-02-21,AAPL,0.0,0.0,0.0,0.0,0.0
2,2020-02-24,72.267258,73.721425,70.098129,72.044290,222195200,35.838933,23.558792,0.160693,0.740620,...,1.976741,15.870779,0,2020-02-24,AAPL,0.0,0.0,0.0,0.0,0.0
3,2020-02-25,69.819420,73.321543,69.346820,72.938615,230673600,30.808022,4.745643,-0.400882,0.512319,...,2.119454,17.773211,1,2020-02-25,AAPL,0.0,0.0,0.0,0.0,0.0
4,2020-02-26,70.927002,72.194554,69.436482,69.443753,198054800,35.237888,15.867500,-0.747939,0.260268,...,2.165069,19.539754,0,2020-02-26,AAPL,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,2025-06-13,196.449997,200.369995,195.699997,199.729996,51447300,40.802560,7.115744,-1.571946,-1.231530,...,5.037702,16.881351,1,2025-06-13,AAPL,0.0,0.0,0.0,0.0,0.0
1337,2025-06-16,198.419998,198.690002,196.559998,197.300003,43020700,44.244670,25.806442,-1.632476,-1.311719,...,4.837866,17.666974,0,2025-06-16,AAPL,0.0,0.0,0.0,0.0,0.0
1338,2025-06-17,195.639999,198.389999,195.210007,197.199997,38856200,40.652387,3.898393,-1.883062,-1.425988,...,4.721589,18.653920,1,2025-06-17,AAPL,0.0,0.0,0.0,0.0,0.0
1339,2025-06-18,196.580002,197.570007,195.070007,195.940002,45394700,42.356616,13.518306,-1.982945,-1.537379,...,4.562904,19.597532,1,2025-06-18,AAPL,0.0,0.0,0.0,0.0,0.0


In [97]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1341 entries, 0 to 1340
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 1341 non-null   object 
 1   Close                1341 non-null   float64
 2   High                 1341 non-null   float64
 3   Low                  1341 non-null   float64
 4   Open                 1341 non-null   float64
 5   Volume               1341 non-null   int64  
 6   rsi                  1341 non-null   float64
 7   stoch_k              1341 non-null   float64
 8   macd                 1341 non-null   float64
 9   macd_signal          1341 non-null   float64
 10  sma_20               1341 non-null   float64
 11  ema_20               1341 non-null   float64
 12  bb_upper             1341 non-null   float64
 13  bb_lower             1341 non-null   float64
 14  atr                  1341 non-null   float64
 15  adx                  1341 non-null   f

In [98]:
merged_df.describe()

Unnamed: 0,Close,High,Low,Open,Volume,rsi,stoch_k,macd,macd_signal,sma_20,...,bb_upper,bb_lower,atr,adx,target,avg_sentiment_score,sentiment_articles,pos_count,neg_count,neu_count
count,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0,...,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0,1341.0
mean,159.47051,161.134345,157.630165,159.312823,86749830.0,53.683328,57.236075,0.658914,0.666503,158.607438,...,168.130452,149.084423,3.857153,25.820573,0.52871,-0.01085,0.085011,0.022371,0.032066,0.030574
std,43.03331,43.249103,42.72858,42.975391,52235030.0,12.215204,31.138494,2.994465,2.791245,43.121026,...,44.942324,41.687174,1.320913,9.648837,0.499361,0.175043,0.386632,0.195717,0.196273,0.203965
min,54.378578,55.379531,51.528412,55.27774,23234700.0,20.057863,0.350656,-10.616152,-8.325011,60.459789,...,66.202475,52.524962,1.628811,9.891763,0.0,-1.0,0.0,0.0,0.0,0.0
25%,131.068878,132.407324,129.976334,131.254519,53020300.0,44.194966,29.228742,-1.554457,-1.242093,129.846589,...,139.479082,121.79269,2.947438,17.815198,0.0,0.0,0.0,0.0,0.0,0.0
50%,158.848511,160.593803,156.51532,158.343244,72433800.0,53.375694,61.922458,0.862992,0.701732,158.848372,...,168.226528,146.667973,3.587937,24.402507,1.0,0.0,0.0,0.0,0.0,0.0
75%,187.716309,188.849464,185.987994,187.288885,101593300.0,63.324256,86.236948,2.807368,2.695696,186.59256,...,194.830926,177.361747,4.542409,32.186018,1.0,0.0,0.0,0.0,0.0,0.0
max,258.396667,259.474086,257.010028,257.568678,426510000.0,82.042764,100.0,8.913714,8.281467,249.383421,...,264.059288,239.987765,11.587528,58.692489,1.0,0.999998,6.0,3.0,3.0,4.0
