
# Predicting Market Reactions to Earnings Calls Using Financial Language Models

*Ahmad Alshikh Menou, Risha Baid*

In [None]:
##if the first code block does not run, make sure to uncomment this and run.
!pip install --no-cache-dir --force-reinstall -U numpy

In [None]:
import subprocess
import sys

# Install the dependencies in three separate pip passes: the general data
# stack, PyTorch, and the transformers stack (transformers is version-pinned).
# check=True makes the cell fail if any pip pass exits with an error.
_PIP_GROUPS = (
    ("requests", "pandas", "numpy", "yfinance", "defeatbeta-api", "nltk"),
    ("torch", "torchvision", "torchaudio"),
    ("transformers==4.36.2", "sentencepiece", "protobuf"),
)

for _group in _PIP_GROUPS:
    subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", *_group],
        check=True,
    )

#Import everything
import requests
import re
import numpy as np
import pandas as pd
import time
from datetime import datetime, timedelta
import yfinance as yf
from defeatbeta_api.data.ticker import Ticker
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# VADER: fetch the lexicon quietly, then build the analyzer shared by later cells.
nltk.download('vader_lexicon', quiet=True)
sia = SentimentIntensityAnalyzer()

# FinBERT: the tokenizer and the classifier are loaded from the same checkpoint.
_FINBERT_CHECKPOINT = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(_FINBERT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(_FINBERT_CHECKPOINT)
# Ending the cell on eval() also displays the returned module.
model.eval()

[nltk_data] Downloading package punkt_tab to /tmp/nltk...
[nltk_data]   Package punkt_tab is already up-to-date!


[38;5;10m______      __           _    ______      _        
|  _  \    / _|         | |   | ___ \    | |       
| | | |___| |_ ___  __ _| |_  | |_/ / ___| |_ __ _ 
| | | / _ \  _/ _ \/ _` | __| | ___ \/ _ \ __/ _` |
| |/ /  __/ ||  __/ (_| | |_  | |_/ /  __/ || (_| |
|___/ \___|_| \___|\__,_|\__| \____/ \___|\__\__,_|[0m
[1;38;5;10mðŸ“ˆ:: Data Update Time ::[0m	2025-12-12 [1;38;5;10m::[0m
[1;38;5;10mðŸ“ˆ:: Software Version ::[0m	0.0.27      [1;38;5;10m::[0m


  _torch_pytree._register_pytree_node(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
import requests
import re
import numpy as np
import pandas as pd
import time
import readline
from datetime import datetime, timedelta
import yfinance as yf

# Objective

To build and evaluate predictive models that use linguistic features from corporate earnings calls, combined with firm characteristics, to predict short-term stock market reactions following earnings announcements.

# Goals

- Transform unstructured earnings call text into predictive numerical features using VADER and FinBERT.

- Evaluate whether language alone contains predictive information about short-term market reactions.

- Compare traditional sentiment metrics (VADER) with financial language models (FinBERT).

- Assess which linguistic dimensions—tone, confidence, disagreement, or complexity—matter most for prediction.

# Sample Size


This study focuses on the top 2 companies (by market cap) in each of 5 main sectors: Technology, Communication Services, Consumer Discretionary, Health Care, and Financials.

# Data Sources

- Constituents List: The list of S&P 500 companies was obtained from  
  https://datahub.io/core/s-and-p-500-companies/r/constituents.csv.
- Transcripts: `defeatbeta_api` → `Ticker(...).earning_call_transcripts()` and `get_transcript(year, quarter)`; `content_full` concatenated per call.  
- Prices & Benchmark: `yfinance` → daily prices and volumes for firm tickers + SPY; used to compute 1-day, 3-day, and 5-day post-earnings returns and CAR5.  
- Firm Metadata: `yfinance.info` → sector, industry, company name, and `market_cap_billion`.  
- Baseline Sentiment: `nltk.sentiment.SentimentIntensityAnalyzer` (VADER) → sentence-level polarity scores aggregated into `vader_mean`, `vader_pos_share`, `vader_neg_share`, and `vader_neu_share`.  
- Financial NLP Model: Hugging Face Transformers → `ProsusAI/finbert`; used to extract financial sentiment scores, confidence, entropy, and dispersion measures from earnings call text.


# Data Collection

**Finding the top 2 companies (by market cap) from 5 main sectors through the S&P500 market data.**

In [None]:
# Source of the S&P 500 constituents list (ticker, company name, GICS sector, ...).
sp500_url = "https://datahub.io/core/s-and-p-500-companies/r/constituents.csv" #getting top marketcap companies from s&p500 csv file

# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as CSV below.
response = requests.get(sp500_url, timeout=30)
response.raise_for_status()
print(response)                  # <Response [200]>
print(response.text[:500])       # preview first 500 chars

#read the CSV text into a DataFrame
from io import StringIO
sp500 = pd.read_csv(StringIO(response.text))
sp500.head()

<Response [200]>
Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
ACN,Accenture,Information Te


Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [None]:
#The five sectors kept for the study: GICS name -> label used in this notebook
target_map = {
    "Information Technology": "Technology",
    "Communication Services": "Communication Services",
    "Consumer Discretionary": "Consumer Discretionary",
    "Health Care": "Health Care",
    "Financials": "Financials"
}

#Rename the key columns, make tickers Yahoo-compatible (BRK.B -> BRK-B),
#keep only the sectors listed above and attach the study label
sp500 = (
    sp500
    .rename(columns={"Symbol": "ticker", "GICS Sector": "gics_sector"})
    .assign(ticker=lambda frame: frame["ticker"].astype(str).str.replace(".", "-", regex=False))
    .loc[lambda frame: frame["gics_sector"].isin(target_map.keys())]
    .assign(chosen_sector=lambda frame: frame["gics_sector"].map(target_map))
)

sp500[["ticker","gics_sector","chosen_sector"]].head()

Unnamed: 0,ticker,gics_sector,chosen_sector
2,ABT,Health Care,Health Care
3,ABBV,Health Care,Health Care
4,ACN,Information Technology,Technology
5,ADBE,Information Technology,Technology
6,AMD,Information Technology,Technology


In [None]:
#Pull market caps from Yahoo finance, one ticker at a time
rows = []
for symbol, sector_label in zip(sp500["ticker"], sp500["chosen_sector"]):
    try:
        cap = yf.Ticker(symbol).info.get("marketCap", None)
    except Exception:
        #a failed lookup is recorded as missing and dropped further down
        cap = None
    print(symbol, cap)
    rows.append({"ticker": symbol, "chosen_sector": sector_label, "market_cap": cap})
    time.sleep(0.05)  #short pause between requests

#Build the table: drop tickers without a market cap, store caps as integers,
#then order by sector with the largest companies first
mktcaps = (
    pd.DataFrame(rows)
    .dropna(subset=["market_cap"])
    .astype({"market_cap": "int64"})
    .sort_values(["chosen_sector","market_cap"], ascending=[True, False])
)
mktcaps.head()


ABT 217766232064
ABBV 393808642048
ACN 178268864512
ADBE 150934601728
AMD 327334035456
AFL 59077484544
A 38811205632
ABNB 82033459200
AKAM 12730634240
ALGN 11702199296
ALL 54000144384
GOOGL 3663432253440
GOOG 3666819416064
AMZN 2424113201152
AXP 261317689344
AIG 47749591040
AMP 45975302144
AMGN 174693892096
APH 158649843712
ADI 134615711744
AON 76120498176
APO 84915830784
AAPL 4039405731840
AMAT 201948856320
APTV 16895982592
ACGL 35997093888
ANET 156932112384
AJG 64879214592
AIZ 12010806272
T 173468352512
ADSK 63738118144
AZO 56830328832
BAC 401900961792
BAX 9803043840
BDX 55806365696
BRK-B 1085980803072
BBY 15076867072
TECH 8896905216
BIIB 24926183424
BLK 165234163712
BX 188635742208
XYZ 39251111936
BK 80714809344
BKNG 173246087168
BSX 142477492224
BMY 108566716416
AVGO 1557813788672
BRO 27678984192
CDNS 85861597184
CZR 4987636736
COF 155274805248
CAH 47305175040
KMX 5903838720
CCL 38425698304
CBOE 26021951488
CDW 18752172032
COR 66070282240
CNC 19277336576
CRL 9540419584
SCHW 1764937

Unnamed: 0,ticker,chosen_sector,market_cap
12,GOOG,Communication Services,3666819416064
11,GOOGL,Communication Services,3663432253440
169,META,Communication Services,1674764615680
186,NFLX,Communication Services,398308376576
235,TMUS,Communication Services,225387675648


In [None]:
#Pick the top 2 companies per sector
#Rank every candidate by sector, largest market cap first
ranked = mktcaps.sort_values(
    ["chosen_sector", "market_cap"], ascending=[True, False], kind="stable"
)

#Some issuers appear in the index under more than one share class (GOOG and
#GOOGL are both Alphabet). Taking the two largest *tickers* would then select
#a single company twice, so keep only the largest listing per issuer, using
#the CIK column of the constituents table as the issuer id.
if "CIK" in sp500.columns:
    cik_by_ticker = sp500.drop_duplicates("ticker").set_index("ticker")["CIK"]
    issuer = ranked["ticker"].map(cik_by_ticker).astype(object)
    #a ticker without a CIK stands for itself, so it is never merged away
    issuer = issuer.where(issuer.notna(), ranked["ticker"])
    ranked = ranked.loc[~issuer.duplicated(keep="first")]

#First two rows of each sector in the ranked order, combined into one DataFrame
top2 = ranked.groupby("chosen_sector", sort=False).head(2).reset_index(drop=True)

top2_display = top2[["chosen_sector","ticker","market_cap"]].copy()
top2_display["market_cap_bn"] = (top2_display["market_cap"]/1e9).round(1)
top2_display


Unnamed: 0,chosen_sector,ticker,market_cap,market_cap_bn
0,Communication Services,GOOG,3666819416064,3666.8
1,Communication Services,GOOGL,3663432253440,3663.4
2,Consumer Discretionary,AMZN,2424113201152,2424.1
3,Consumer Discretionary,TSLA,1607601225728,1607.6
4,Financials,BRK-B,1085980803072,1086.0
5,Financials,JPM,860672950272,860.7
6,Health Care,LLY,947447201792,947.4
7,Health Care,JNJ,501880258560,501.9
8,Technology,NVDA,4239786770432,4239.8
9,Technology,AAPL,4039405731840,4039.4


In [None]:
#Sector -> list of its selected tickers; reused by later cells
sector_tickers = {
    sec: top2.loc[top2["chosen_sector"] == sec, "ticker"].tolist()
    for sec in top2["chosen_sector"].unique()
}
sector_tickers

{'Communication Services': ['GOOG', 'GOOGL'],
 'Consumer Discretionary': ['AMZN', 'TSLA'],
 'Financials': ['BRK-B', 'JPM'],
 'Health Care': ['LLY', 'JNJ'],
 'Technology': ['NVDA', 'AAPL']}

In [None]:
#One row per selected ticker, with sector/industry/name/market cap as reported
#by Yahoo Finance next to the sector label chosen above
rows = []
for sector, tickers in sector_tickers.items():
    for t in tickers:
        info = yf.Ticker(t).info  #metadata call
        market_cap = info.get("marketCap", None)
        #billions, 2 decimals; stays None when the cap is missing (or zero)
        cap_in_billions = round(market_cap / 1e9, 2) if market_cap else None
        rows.append(dict(
            chosen_sector=sector,
            ticker=t,
            yf_sector=info.get("sector"),
            yf_industry=info.get("industry"),
            shortName=info.get("shortName"),
            market_cap=market_cap,
            market_cap_billion=cap_in_billions,
        ))

universe = pd.DataFrame(rows)
universe

Unnamed: 0,chosen_sector,ticker,yf_sector,yf_industry,shortName,market_cap,market_cap_billion
0,Communication Services,GOOG,Communication Services,Internet Content & Information,Alphabet Inc.,3666819416064,3666.82
1,Communication Services,GOOGL,Communication Services,Internet Content & Information,Alphabet Inc.,3663432253440,3663.43
2,Consumer Discretionary,AMZN,Consumer Cyclical,Internet Retail,"Amazon.com, Inc.",2424113201152,2424.11
3,Consumer Discretionary,TSLA,Consumer Cyclical,Auto Manufacturers,"Tesla, Inc.",1607601225728,1607.6
4,Financials,BRK-B,Financial Services,Insurance - Diversified,Berkshire Hathaway Inc. New,1085980803072,1085.98
5,Financials,JPM,Financial Services,Banks - Diversified,JP Morgan Chase & Co.,860672950272,860.67
6,Health Care,LLY,Healthcare,Drug Manufacturers - General,Eli Lilly and Company,947447201792,947.45
7,Health Care,JNJ,Healthcare,Drug Manufacturers - General,Johnson & Johnson,501880258560,501.88
8,Technology,NVDA,Technology,Semiconductors,NVIDIA Corporation,4239786770432,4239.79
9,Technology,AAPL,Technology,Consumer Electronics,Apple Inc.,4039405731840,4039.41


**Extracting earnings-call transcripts from the DefeatBeta API [https://github.com/defeat-beta/defeatbeta-api/].**

In [None]:
#import the Ticker class from Earnings Call transcript API
from defeatbeta_api.data.ticker import Ticker

In [None]:
#Collect the tickers of every sector into one flat list (sector order kept)
tickers = []
for tickers_list in sector_tickers.values():
    tickers.extend(tickers_list)
tickers

['GOOG', 'GOOGL', 'AMZN', 'TSLA', 'BRK-B', 'JPM', 'LLY', 'JNJ', 'NVDA', 'AAPL']

In [None]:
#Pull the transcripts list of every ticker (includes transcripts + transcripts_id)
#and tag each table with the ticker it was requested for
all_meta = [
    Ticker(tk).earning_call_transcripts().get_transcripts_list().assign(symbol=tk)
    for tk in tickers
]
meta_df = pd.concat(all_meta, ignore_index=True)

#Clean report date: unparseable values become missing, the time part is dropped
parsed_dates = pd.to_datetime(meta_df["report_date"], errors="coerce")
meta_df["report_date"] = parsed_dates.dt.date

#Keep last 10 fiscal years (the current year and the nine before it)
current_year = pd.Timestamp.today().year
fiscal_years = meta_df["fiscal_year"].astype(int)
in_window = (fiscal_years >= current_year - 9) & (fiscal_years <= current_year)
meta_10y = meta_df[in_window].copy()

#Newest first within each symbol, then one row per symbol / year / quarter
meta_10y = meta_10y.sort_values(
    ["symbol","fiscal_year","fiscal_quarter"], ascending=[True, False, False]
)
meta_10y = meta_10y.drop_duplicates(subset=["symbol","fiscal_year","fiscal_quarter"])
meta_10y = meta_10y.reset_index(drop=True)

meta_10y.head()

Unnamed: 0,symbol,fiscal_year,fiscal_quarter,report_date,transcripts,transcripts_id
0,AAPL,2025,4,2025-10-30,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",
1,AAPL,2025,3,2025-07-31,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",
2,AAPL,2025,2,2025-05-01,"[{'paragraph_number': 1, 'speaker': 'AI Insigh...",
3,AAPL,2025,1,2025-01-30,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",506955.0
4,AAPL,2024,4,2024-10-31,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",455679.0


In [None]:
#Fetch full transcript text: one row per call, with all paragraph texts joined
#into `content_full`
rows = []
#Calls whose text could not be retrieved: (symbol, fiscal_year, fiscal_quarter, reason)
failed_calls = []

for sym, sub in meta_10y.groupby("symbol"):
    tr = Ticker(sym).earning_call_transcripts()

    for _, r in sub.iterrows():
        content_full = ""
        try:
            tdf = tr.get_transcript(int(r.fiscal_year), int(r.fiscal_quarter))
            if tdf is not None and not tdf.empty:
                content_full = " ".join(tdf["content"].astype(str))
            else:
                failed_calls.append((sym, r.fiscal_year, r.fiscal_quarter, "no transcript rows returned"))
        except Exception as exc:
            #Best effort: keep the row with empty text, but remember why, so
            #missing transcripts do not go unnoticed
            content_full = ""
            failed_calls.append((sym, r.fiscal_year, r.fiscal_quarter, repr(exc)))

        rows.append({
            "symbol": sym,
            "fiscal_year": r.fiscal_year,
            "fiscal_quarter": r.fiscal_quarter,
            "report_date": r.report_date,
            "transcripts": r.get("transcripts"),
            "transcripts_id": r.get("transcripts_id"),
            "content_full": content_full
        })

df_calls = pd.DataFrame(rows)

#Report the calls that ended up with empty text
if failed_calls:
    print(f"{len(failed_calls)} of {len(rows)} transcripts could not be loaded:")
    for failed_sym, failed_year, failed_quarter, reason in failed_calls:
        print(f"  {failed_sym} FY{failed_year} Q{failed_quarter}: {reason}")

df_calls.head()


Unnamed: 0,symbol,fiscal_year,fiscal_quarter,report_date,transcripts,transcripts_id,content_full
0,AAPL,2025,4,2025-10-30,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",,"Good afternoon, and welcome to the Apple Q4 Fi..."
1,AAPL,2025,3,2025-07-31,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",,"Good afternoon, and welcome to the Apple Q3 Fi..."
2,AAPL,2025,2,2025-05-01,"[{'paragraph_number': 1, 'speaker': 'AI Insigh...",,"Good afternoon, and welcome to the Apple Q2 F..."
3,AAPL,2025,1,2025-01-30,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",506955.0,"Good afternoon, and welcome to the Apple Q1 Fi..."
4,AAPL,2024,4,2024-10-31,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",455679.0,"Good afternoon, and welcome to the Apple Q4 Fi..."


In [None]:
#VADER for sentiment analysis
#NOTE(review): the setup cell at the top of the notebook already imports nltk,
#downloads the VADER lexicon and creates `sia`; this cell repeats those steps
#(here the download is not silenced, so its log lines are shown).
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [None]:
def compute_key_features(text):
    """Summarise one transcript with sentence-level VADER statistics.

    The text is split after runs of '.', '!' or '?' followed by whitespace, and
    pieces shorter than three words are discarded. Each remaining sentence is
    scored with the module-level ``sia`` analyzer (compound score).

    Returns a dict with:
      vader_mean           mean compound score
      vader_pos_share      share of sentences scoring above 0.05
      vader_neg_share      share of sentences scoring below -0.05
      vader_neu_share      share of sentences scoring within [-0.05, 0.05]
      avg_sentence_length  mean number of words per sentence

    Every value is NaN when `text` is not a string or is blank. When no
    sentence survives the filter, only `vader_mean` is filled, with the
    compound score of the whole text.
    """
    result = dict.fromkeys(
        ('vader_mean', 'vader_pos_share', 'vader_neg_share',
         'vader_neu_share', 'avg_sentence_length'),
        np.nan,
    )

    #Nothing to score
    if not isinstance(text, str) or not text.strip():
        return result

    #Sentence candidates; very short fragments are ignored
    pieces = re.split(r'[.!?]+\s+', text.strip())
    sentences = [piece for piece in pieces if len(piece.split()) >= 3]

    #No usable sentence: score the text as a whole
    if not sentences:
        result['vader_mean'] = sia.polarity_scores(text)['compound']
        return result

    compound = np.array(
        [sia.polarity_scores(sentence)['compound'] for sentence in sentences],
        dtype=float
    )
    word_counts = [len(sentence.split()) for sentence in sentences]

    result['vader_mean'] = compound.mean()
    result['vader_pos_share'] = (compound > 0.05).mean()
    result['vader_neg_share'] = (compound < -0.05).mean()
    result['vader_neu_share'] = ((compound >= -0.05) & (compound <= 0.05)).mean()
    result['avg_sentence_length'] = np.mean(word_counts)
    return result

In [None]:
#Score every transcript, turn the resulting dicts into a table with one column
#per feature, and attach that table to the calls row by row
feature_records = df_calls['content_full'].map(compute_key_features)
features = pd.DataFrame(feature_records.tolist(), index=df_calls.index)

calls_part = df_calls.reset_index(drop=True)
features_part = features.reset_index(drop=True)
df_final = pd.concat([calls_part, features_part], axis=1)

df_final.head()

Unnamed: 0,symbol,fiscal_year,fiscal_quarter,report_date,transcripts,transcripts_id,content_full,vader_mean,vader_pos_share,vader_neg_share,vader_neu_share,avg_sentence_length
0,AAPL,2025,4,2025-10-30,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",,"Good afternoon, and welcome to the Apple Q4 Fi...",0.245773,0.524027,0.059497,0.416476,17.558352
1,AAPL,2025,3,2025-07-31,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",,"Good afternoon, and welcome to the Apple Q3 Fi...",0.25106,0.543779,0.062212,0.394009,18.069124
2,AAPL,2025,2,2025-05-01,"[{'paragraph_number': 1, 'speaker': 'AI Insigh...",,"Good afternoon, and welcome to the Apple Q2 F...",0.216898,0.529284,0.088937,0.381779,16.869848
3,AAPL,2025,1,2025-01-30,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",506955.0,"Good afternoon, and welcome to the Apple Q1 Fi...",0.276358,0.6097,0.071594,0.318707,17.618938
4,AAPL,2024,4,2024-10-31,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",455679.0,"Good afternoon, and welcome to the Apple Q4 Fi...",0.294967,0.640553,0.076037,0.28341,17.709677


In [None]:
#Events table: one row per (symbol, report_date); dates become timestamps and
#a Yahoo-formatted symbol is added (BRK.B -> BRK-B)
events = (
    df_final[['symbol','report_date']]
    .dropna()
    .assign(
        report_date=lambda frame: pd.to_datetime(frame['report_date']),
        yahoo_symbol=lambda frame: frame['symbol'].astype(str).str.replace('.', '-', regex=False),
    )
)

#Tickers & benchmark: sorted event symbols, with SPY appended when absent
tickers = sorted(events['yahoo_symbol'].unique().tolist())
tickers += [] if 'SPY' in tickers else ['SPY']

#Download window: 20 calendar days of slack on each side of the event range
buffer = pd.Timedelta(days=20)
dmin = events['report_date'].min() - buffer
dmax = events['report_date'].max() + buffer

#Download prices once for all tickers
download_options = dict(start=dmin, end=dmax, progress=False, auto_adjust=True)
data = yf.download(tickers, **download_options)[['Close','Volume']]

close, vol = data['Close'], data['Volume']   #columns = tickers in both tables

#Compute event-window returns for one row
def event_metrics(row):
    """Compute price, benchmark and volume reactions for one earnings event.

    Reads the module-level ``close`` and ``vol`` tables (one column per
    ticker, indexed by trading day).

    Parameters
    ----------
    row : mapping with ``'yahoo_symbol'`` (column name in ``close``/``vol``)
        and ``'report_date'`` (timestamp of the event).

    Returns
    -------
    pd.Series with
        day0               first trading day on or after ``report_date``
        price_return_1day  close(day0)   / close(day-1) - 1
        price_return_3day  close(day0+3) / close(day-1) - 1
        price_return_5day  close(day0+5) / close(day-1) - 1
        bench_return_5day  the same 5-day window measured on SPY
        CAR5               price_return_5day - bench_return_5day
        volume_change      volume(day0) / volume(day-1) - 1

    A metric is NaN whenever its window cannot be observed in full: unknown
    ticker, no trading day on/after the event, no close before day0, or
    prices ending before day0+3 / day0+5. Previously such windows were clipped
    to the available data, so a shorter horizon was reported under the 3/5-day
    label and a missing prior close produced zero returns.

    NOTE(review): returns are anchored on the close of day0. If a call takes
    place after the market closes on ``report_date``, the reaction would only
    show up on the following session -- confirm the intended timing convention.
    """
    sym = row['yahoo_symbol']
    dt  = row['report_date']

    #Start from "everything missing" and fill in what can be computed
    out = {
        'day0': pd.NaT,
        'price_return_1day': np.nan,
        'price_return_3day': np.nan,
        'price_return_5day': np.nan,
        'bench_return_5day': np.nan,
        'CAR5': np.nan,
        'volume_change': np.nan
    }

    #skip if ticker (or the benchmark) is missing
    if sym not in close.columns or 'SPY' not in close.columns:
        return pd.Series(out)

    p = close[sym].dropna()
    b = close['SPY'].dropna()
    v = vol[sym].dropna()

    #Align event date to the first trading day on or after it
    i0 = p.index.searchsorted(dt)
    if i0 >= len(p):
        return pd.Series(out)
    out['day0'] = p.index[i0].date()

    #Every metric is measured against the previous close; without one there
    #is nothing meaningful to report
    i_m1 = i0 - 1
    if i_m1 < 0:
        return pd.Series(out)

    last = len(p) - 1
    base_price = p.iloc[i_m1]

    #Stock returns (relative to t-1)
    out['price_return_1day'] = float((p.iloc[i0] / base_price) - 1)
    if i0 + 3 <= last:
        out['price_return_3day'] = float((p.iloc[i0 + 3] / base_price) - 1)

    if i0 + 5 <= last:
        r5 = (p.iloc[i0 + 5] / base_price) - 1

        #Benchmark return over same window, on the stock's trading days
        b_aligned = b.reindex(p.index).ffill()
        rb5 = (b_aligned.iloc[i0 + 5] / b_aligned.iloc[i_m1]) - 1

        out['price_return_5day'] = float(r5)
        out['bench_return_5day'] = float(rb5)
        out['CAR5'] = float(r5 - rb5)

    #Volume change on day0 vs day-1 (left missing when the base volume is zero)
    v_aligned = v.reindex(p.index).ffill()
    base_volume = v_aligned.iloc[i_m1]
    if base_volume != 0:
        out['volume_change'] = float((v_aligned.iloc[i0] / base_volume) - 1)

    return pd.Series(out)

#Compute metrics & merge into df_final
metrics = events.apply(event_metrics, axis=1)
events_with_returns = pd.concat([events[['symbol','report_date']], metrics], axis=1)

#Both sides of the merge use midnight-normalised timestamps as the date key
df_final['report_date'] = pd.to_datetime(df_final['report_date']).dt.normalize()
events_with_returns['report_date'] = pd.to_datetime(events_with_returns['report_date']).dt.normalize()

#The metrics are computed from (symbol, report_date) alone, so rows sharing a
#key carry the same values. Keep one row per key: otherwise every repeated key
#would multiply rows in the left merge below.
events_with_returns = events_with_returns.drop_duplicates(subset=['symbol','report_date'])

#Merge onto df_final (validate guards the one-row-per-key assumption)
df_big = df_final.merge(
    events_with_returns,
    on=['symbol','report_date'],
    how='left',
    validate='many_to_one'
)

#Adding firm metadata from universe (skipped when that table was not built)
if 'universe' in globals():
    meta_cols = ['ticker','shortName','chosen_sector','yf_industry','market_cap_billion']
    df_big = df_big.merge(
        universe[meta_cols].rename(columns={'ticker':'symbol'}),
        on='symbol',
        how='left'
    )

    #Move key id columns to front
    front = ['symbol','shortName','chosen_sector','yf_industry','market_cap_billion','report_date']
    rest = [c for c in df_big.columns if c not in front]
    df_big = df_big[front + rest]

print(df_big.shape)
df_big.head()

(353, 23)


Unnamed: 0,symbol,shortName,chosen_sector,yf_industry,market_cap_billion,report_date,fiscal_year,fiscal_quarter,transcripts,transcripts_id,...,vader_neg_share,vader_neu_share,avg_sentence_length,day0,price_return_1day,price_return_3day,price_return_5day,bench_return_5day,CAR5,volume_change
0,AAPL,Apple Inc.,Technology,Consumer Electronics,4039.41,2025-10-30,2025,4,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",,...,0.059497,0.416476,17.558352,2025-10-30,0.006303,0.001261,0.000259,-0.024848,0.025107,0.367998
1,AAPL,Apple Inc.,Technology,Consumer Electronics,4039.41,2025-07-31,2025,3,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",,...,0.062212,0.394009,18.069124,2025-07-31,-0.00708,-0.029323,0.052523,-0.003483,0.056007,0.773104
2,AAPL,Apple Inc.,Technology,Consumer Electronics,4039.41,2025-05-01,2025,2,"[{'paragraph_number': 1, 'speaker': 'AI Insigh...",,...,0.088937,0.381779,16.869848,2025-05-01,0.003859,-0.065835,-0.070635,0.018971,-0.089606,0.097142
3,AAPL,Apple Inc.,Technology,Consumer Electronics,4039.41,2025-01-30,2025,1,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",506955.0,...,0.071594,0.318707,17.618938,2025-01-30,-0.007395,-0.027406,-0.025652,0.007494,-0.033146,0.223633
4,AAPL,Apple Inc.,Technology,Consumer Electronics,4039.41,2024-10-31,2024,4,"[{'paragraph_number': 1, 'speaker': 'Suhasini ...",455679.0,...,0.076037,0.28341,17.709677,2024-10-31,-0.018209,-0.028901,-0.011386,0.026896,-0.038282,0.367514


In [None]:
def _safe_chunks(text):
    """Return 3 text chunks: start/middle/end (keeps it simple + avoids 512 token limit bias)."""
    if not isinstance(text, str) or not text.strip():
        return []
    n = len(text)
    if n < 2000:   #Short transcript: just score once
        return [text]
    return [text[:2000], text[n//2 - 1000:n//2 + 1000], text[-2000:]]

def _finbert_probs(text):
    """FinBERT probs: pos, neg, neu.

    Tokenizes `text` (truncated to 512 tokens) with the module-level
    ``tokenizer``, runs the module-level ``model`` without gradient tracking
    and returns the softmax probabilities as three Python floats.
    """
    inputs = tokenizer(str(text), return_tensors="pt", truncation=True, max_length=512)

    #Send the inputs to wherever the model currently lives. Asking the model
    #for its device (instead of reading a separate global `device`) keeps this
    #function usable no matter which cells have been run before it.
    target_device = model.device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        p = torch.softmax(logits, dim=1).detach().cpu().numpy()[0]

    #model order: [positive, negative, neutral]
    #NOTE(review): the index order is taken from the comment above; confirm it
    #against model.config.id2label if the checkpoint is ever changed.
    return float(p[0]), float(p[1]), float(p[2])

def finbert_features(text):
    """Aggregate FinBERT output over the chunks of one transcript.

    The text is cut with `_safe_chunks`, each chunk is scored with
    `_finbert_probs`, and the result is a pd.Series holding:
      finbert_pos_avg3 / finbert_neg_avg3 / finbert_neu_avg3
          class probabilities averaged over the chunks
      finbert_score_avg3   mean of (pos - neg) over the chunks
      finbert_score_std3   standard deviation of (pos - neg); 0.0 for one chunk
      finbert_confidence   largest of the three averaged probabilities
      finbert_entropy      entropy of the averaged probabilities (natural log)
    Every entry is NaN when the text yields no chunk.
    """
    names = [
        "finbert_pos_avg3", "finbert_neg_avg3", "finbert_neu_avg3",
        "finbert_score_avg3", "finbert_score_std3",
        "finbert_confidence", "finbert_entropy",
    ]

    chunks = _safe_chunks(text)
    if not chunks:
        return pd.Series(dict.fromkeys(names, np.nan))

    #One row of [pos, neg, neu] per chunk
    per_chunk = np.array([list(_finbert_probs(chunk)) for chunk in chunks])
    net_scores = per_chunk[:, 0] - per_chunk[:, 1]

    mean_probs = per_chunk.mean(axis=0)
    spread = float(np.std(net_scores)) if len(net_scores) > 1 else 0.0

    #Confidence & entropy are taken from the averaged probabilities; the small
    #constant keeps log() finite for a zero probability
    confidence = float(mean_probs.max())
    entropy = float(-(mean_probs * np.log(mean_probs + 1e-12)).sum())

    values = [
        float(mean_probs[0]), float(mean_probs[1]), float(mean_probs[2]),
        float(np.mean(net_scores)), spread,
        confidence, entropy,
    ]
    return pd.Series(dict(zip(names, values)))

In [None]:
import torch

#Run FinBERT on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#Move the model loaded in the setup cell onto that device
model = model.to(device)
#Switch to evaluation mode; as the last expression of the cell, the returned
#module is also displayed
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
#flatten the existing dict
finbert_symbols = [t for lst in sector_tickers.values() for t in lst]

#keep only these 10 companies for FinBERT
df_finbert = df_big[df_big["symbol"].isin(finbert_symbols)].copy()

#run FinBERT only on this subset
fb = df_finbert["content_full"].apply(finbert_features)

df_finbert = pd.concat(
    [df_finbert.reset_index(drop=True), fb.reset_index(drop=True)],
    axis=1
)

#Disagreement between FinBERT and your VADER sentence-mean
df_finbert["sentiment_disagreement"] = (
    df_finbert["finbert_score_avg3"] - df_finbert["vader_mean"]
).abs()

#Quick check
df_finbert[[
    "symbol","chosen_sector","report_date",
    "vader_mean",
    "finbert_score_avg3","finbert_confidence","finbert_entropy",
    "sentiment_disagreement"
]].head()

Unnamed: 0,symbol,chosen_sector,report_date,vader_mean,finbert_score_avg3,finbert_confidence,finbert_entropy,sentiment_disagreement
0,AAPL,Technology,2025-10-30,0.245773,0.346798,0.62219,0.727678,0.101025
1,AAPL,Technology,2025-07-31,0.25106,0.266943,0.583298,0.875443,0.015883
2,AAPL,Technology,2025-05-01,0.216898,0.331313,0.627497,0.739914,0.114415
3,AAPL,Technology,2025-01-30,0.276358,0.346382,0.621829,0.729095,0.070024
4,AAPL,Technology,2024-10-31,0.294967,0.268895,0.693248,0.687532,0.026073


In [None]:
#Columns brought over from the FinBERT table: the two merge keys plus features
finbert_cols = [
    "symbol","report_date",
    "finbert_score_avg3","finbert_confidence","finbert_entropy",
    "finbert_score_std3","sentiment_disagreement"
]
finbert_keys = ["symbol","report_date"]
finbert_feature_cols = [c for c in finbert_cols if c not in finbert_keys]

#One row per key on the right-hand side, so the left merge cannot multiply
#rows of df_big. If two calls share a key, the first one's features are used.
finbert_unique = df_finbert[finbert_cols].drop_duplicates(subset=finbert_keys)

#Dropping feature columns left by an earlier run of this cell makes the cell
#safe to re-run: otherwise the merge would add suffixed copies (_x/_y) of them.
df_big = (
    df_big
    .drop(columns=finbert_feature_cols, errors="ignore")
    .merge(finbert_unique, on=finbert_keys, how="left", validate="many_to_one")
)

## Key Variables

- **Identifiers & Firm Metadata:**  
  `symbol`, `shortName`, `chosen_sector`, `yf_industry`, `market_cap_billion`,  
  `report_date`, `fiscal_year`, `fiscal_quarter`

- **Transcript Content:**  
  `content_full`, `transcripts`, `transcripts_id`

- **VADER-Based Sentiment & Style:**  
  `vader_mean`, `vader_pos_share`, `vader_neg_share`, `vader_neu_share`,  
  `avg_sentence_length`

- **FinBERT-Based Sentiment (Deep NLP):**  
  `finbert_score_avg3`, `finbert_confidence`, `finbert_entropy`,  
  `finbert_score_std3`

- **Hybrid Linguistic Feature:**  
  `sentiment_disagreement`  
  *(absolute difference between FinBERT sentiment and VADER sentiment)*

- **Market Reaction Variables:**  
  `price_return_1day`, `price_return_3day`, `price_return_5day`,  
  `bench_return_5day`, `CAR5`, `volume_change`


In [None]:
df_big.columns #column index of the merged DataFrame (displayed as the cell output)

Index(['symbol', 'shortName', 'chosen_sector', 'yf_industry',
       'market_cap_billion', 'report_date', 'fiscal_year', 'fiscal_quarter',
       'transcripts', 'transcripts_id', 'content_full', 'vader_mean',
       'vader_pos_share', 'vader_neg_share', 'vader_neu_share',
       'avg_sentence_length', 'day0', 'price_return_1day', 'price_return_3day',
       'price_return_5day', 'bench_return_5day', 'CAR5', 'volume_change',
       'finbert_score_avg3', 'finbert_confidence', 'finbert_entropy',
       'finbert_score_std3', 'sentiment_disagreement'],
      dtype='object')

# Research Questions

1. Can sentiment extracted from earnings call language predict short-term abnormal stock returns?

- Can earnings calls be used to predict whether CAR5 is positive or negative?

- Does sentiment have more predictive power for 1-day vs 5-day returns?

- Does model performance improve when combining VADER + FinBERT features?

2. Are some firms and sectors more sentiment-sensitive than others in terms of market reaction?

- Does sentiment predict returns better in Technology and Consumer Discretionary than in Financials?

- Does firm size (market cap) dampen or amplify sentiment effects?

- Are sentiment-based predictions more accurate for certain sectors?

3. Does communication style and emotional intensity improve predictions beyond sentiment polarity alone?

- Do emotionally intense earnings calls (high confidence / low entropy) lead to stronger market reactions?

- Does disagreement between FinBERT and VADER signal uncertainty that markets react to?

- Are clearer, simpler earnings calls more predictable than complex ones?