In [1]:
import re
import pandas as pd
import numpy as np
from typing import List, Dict
!pip install flashtext vaderSentiment
from flashtext import KeywordProcessor
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



# Load company data and news set and clean names
company_data = pd.read_csv("/Users/matteici/Documents/GitHub/Financial-Article-Scrapper/article_ticker_match_experiment/company_data_rework.csv")

news_benzinga = pd.read_csv("/Users/matteici/Documents/GitHub/Financial-Article-Scrapper/article_ticker_match_experiment/raw_analyst_ratings.csv")

common_suffixes = ["inc", "inc.", "corp", "corp.", "ltd", "ltd.", "plc", "co", "co.", "s.a.", "s.p.a.", "ag"]

def clean_company_name(name: str) -> str:
    """Removes common suffixes from company names."""
    name = re.sub(r'[^a-zA-Z0-9 ]', '', name)  # Remove punctuation
    for suffix in common_suffixes:
        name = re.sub(rf'\b{suffix}\b', '', name, flags=re.IGNORECASE).strip()
    return name

# Build company lookup dictionary
company_data["clean_name"] = company_data["Name"].apply(clean_company_name)
company_lookup = dict(zip(company_data["Ticker"], company_data["clean_name"]))


# Build FlashText KeywordProcessor
keyword_processor = KeywordProcessor()
for ticker, name in company_lookup.items():
    keyword_processor.add_keyword(name, ticker)
    keyword_processor.add_keyword(ticker, ticker)

def find_mentioned_companies(title: str) -> List[str]:
    """Finds all mentioned companies in a news title using FlashText."""
    if pd.isna(title):
        return "N/A"
    matches = list(set(keyword_processor.extract_keywords(title.lower())))
    return matches if matches else "N/A"


# Apply company lookup to news headlines
news_benzinga["mentioned_companies"] = news_benzinga["headline"].apply(find_mentioned_companies)

# Drop rows where mentioned_companies is N/A or empty
news_benzinga = news_benzinga[news_benzinga["mentioned_companies"].notna()]

news_benzinga["mentioned_companies"] = news_benzinga["mentioned_companies"].apply(lambda x: [] if x == "N/A" else x)

news_benzinga = news_benzinga[news_benzinga["mentioned_companies"].apply(lambda x: len(x) > 0)]

# Initialize Sentiment Analyzer
analyzer = SentimentIntensityAnalyzer()

def extract_company_sentiment(title: str, mentioned_companies: List[str]) -> List[float]:
    """
    Computes sentiment scores for each company mentioned in a headline.
    Instead of using the full title's sentiment, we focus on the words around each company.
    """
    company_sentiments = []
    
    for company in mentioned_companies:
        # Create a regex pattern to find the company in the text
        pattern = rf'\b{re.escape(company)}\b'
        match = re.search(pattern, title, re.IGNORECASE)

        if match:
            # Extract words near the company mention
            start, end = match.start(), match.end()
            context_window = 20  # Adjust for more or less context
            context = title[max(0, start - context_window): min(len(title), end + context_window)]

            # Compute sentiment score on the extracted context
            sentiment = analyzer.polarity_scores(context)
            company_sentiments.append(sentiment["pos"] - sentiment["neg"])
        else:
            # If company isn't directly found (rare case), use full title sentiment
            sentiment = analyzer.polarity_scores(title)
            company_sentiments.append(sentiment["pos"] - sentiment["neg"])
    
    return company_sentiments

# Apply the improved function to compute sentiment scores per company
news_benzinga["sentiment_scores"] = news_benzinga.apply(
    lambda row: extract_company_sentiment(row["headline"], row["mentioned_companies"]), axis=1
)

news_benzinga["date_processed"] = pd.to_datetime(news_benzinga["date"], utc=True, errors='coerce').dt.date


# Aggregate into nested dictionary: date -> {ticker -> list of scores}
def build_daily_score_dict(df: pd.DataFrame) -> Dict[str, Dict[str, List[float]]]:
    daily_scores = {}
    for _, row in df.iterrows():
        date = str(row["date_processed"])
        tickers = row["mentioned_companies"]
        scores = row["sentiment_scores"]

        if date not in daily_scores:
            daily_scores[date] = {}

        for ticker, score in zip(tickers, scores):
            if ticker not in daily_scores[date]:
                daily_scores[date][ticker] = []
            daily_scores[date][ticker].append(score)

    return daily_scores

# Aggregate into nested dictionary: ticker -> {date -> list of scores}
def build_ticker_score_dict(df: pd.DataFrame) -> Dict[str, Dict[str, List[float]]]:
    ticker_scores = {}
    for _, row in df.iterrows():
        date = str(row["date_processed"])
        tickers = row["mentioned_companies"]
        scores = row["sentiment_scores"]

        for ticker, score in zip(tickers, scores):
            if ticker not in ticker_scores:
                ticker_scores[ticker] = {}
            if date not in ticker_scores[ticker]:
                ticker_scores[ticker][date] = []
            ticker_scores[ticker][date].append(score)

    return ticker_scores

# Create the nested sentiment dictionaries
sentiment_by_date = build_daily_score_dict(news_benzinga)
sentiment_by_ticker = build_ticker_score_dict(news_benzinga)

# Display a sample entry from each
print("Sample date-level entry:", list(sentiment_by_date.items())[0])
print("Sample ticker-level entry:", list(sentiment_by_ticker.items())[0])

# Display results
news_benzinga.head()

#pd.DataFrame.from_dict(sentiment_by_date, orient="index").to_csv("sentiment_by_date.csv")



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Sample date-level entry: ('2020-03-31', {'L': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'T': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'V': [0.0, 0.0, 0.0, 0.0, 0.0], 'DE': [0.0, 0.0, 0.0, 0.0], 'AAPL': [0.0, 0.0, 0.0, 0.0], 'ADBE': [-0.308], 'ADI': [-0.308], 'DOW': [-0.216, 0.0, 0.0, -0.216, 0.0, 0.0, -0.216, -0.216, -0.216, 0.0, 0.0, -0.216, 0.0], 'OTIS': [0.0, 0.0], 'TECH': [0.348, 0.394, 0.394, 0.394, 0.348, 0.348], 'NOW': [-0.268, -0.268], 'AVGO': [0.016, -0.217], 'BEN': [-0.12], 'ALL': [0.175, 0.0, 0.0, -0.315, 0.0], 'BLK': [-0.13], 'BWA': [0.0, -0.172, 0.0, -0.172, 0.0], 'CAG': [0.231, 0.231, 0.0, -0.241], 'CAT': [0.0, -0.022999999999999993, -0.143], 'C': [-0.022999999999999993, -0.143, -0.13, 0.0, 0.0, -0.143, 0.0, 0.161, 0

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,mentioned_companies,sentiment_scores,date_processed
30,30,Int'l. Air Transport Authority Chief Economist...,https://www.benzinga.com/news/20/03/15705690/i...,Benzinga Newsdesk,2020-03-31 00:00:00,A,[L],[0.0],2020-03-31
37,37,Citigroup Maintains Neutral on Agilent Technol...,https://www.benzinga.com/news/20/03/15615851/c...,Vick Meyer,2020-03-20 00:00:00,A,[C],[-0.13],2020-03-20
68,68,Citigroup Initiates Coverage On Agilent Techno...,https://www.benzinga.com/news/20/01/15082753/c...,Benzinga_Newsdesk,2020-01-07 00:00:00,A,[C],[0.0],2020-01-07
97,97,Shares of several healthcare companies are tra...,https://www.benzinga.com/markets/wiim/19/10/14...,Benzinga Newsdesk,2019-10-02 00:00:00,A,[ADP],[-0.293],2019-10-02
98,98,Shares of several healthcare companies are tra...,https://www.benzinga.com/markets/wiim/19/09/14...,Benzinga Newsdesk,2019-09-05 00:00:00,A,"[D, C]","[0.0, 0.0]",2019-09-05


In [None]:
import yfinance as yf
from datetime import timedelta

def get_news_based_date_ranges(df: pd.DataFrame) -> Dict[str, List[str]]:
    ranges = {}
    for ticker, group in df.explode("mentioned_companies").groupby("mentioned_companies"):
        dates = pd.to_datetime(group["date_processed"], errors='coerce').dropna()
        if not dates.empty:
            start = (dates.min() - timedelta(days=2)).strftime("%Y-%m-%d")
            end = (dates.max() + timedelta(days=2)).strftime("%Y-%m-%d")
            ranges[ticker] = [start, end]
    return ranges

def collect_stock_data(ticker: str, start_date: str, end_date: str) -> pd.DataFrame:
    try:
        return yf.download(
    ticker,
    start=start_date,
    end=end_date,
    interval='1d',
    group_by='ticker',
    auto_adjust=True,
    multi_level_index=False  
)
    except:
        return pd.DataFrame()

# Build smart date ranges for each mentioned stock
target_date_ranges = get_news_based_date_ranges(news_benzinga)

# Collect stock price data
stock_price_data = {
    ticker: collect_stock_data(ticker, date_range[0], date_range[1])
    for ticker, date_range in target_date_ranges.items()}


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed

1 Failed download:
['ABNB']: YFPricesMissingError('possibly delisted; no price data found  (1d 2015-06-22 -> 2020-04-17) (Yahoo error = "Data doesn\'t exist for startDate = 1434945600, endDate = 1587096000")')
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%****

In [107]:
import joblib
joblib.dump(stock_price_data, "exptest2_27_04_stock_price_data.pkl")


['exptest2_27_04_stock_price_data.pkl']

In [108]:
joblib.dump(sentiment_by_ticker, "exptest2_27_04_sentiment_by_ticker.pkl")
joblib.dump(sentiment_by_date, "exptest2_27_04_sentiment_by_date.pkl")

['exptest2_27_04_sentiment_by_date.pkl']

In [4]:
stock_price_data['AAPL']

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2009-08-10,4.985306,5.013593,4.925118,4.957017,300294400
2009-08-11,4.926021,4.946786,4.871552,4.900141,355342400
2009-08-12,4.891715,5.016905,4.889007,4.974774,445071200
2009-08-13,5.015098,5.075887,5.010584,5.068363,439980800
2009-08-14,5.053919,5.062646,4.981393,5.019011,305816000
...,...,...,...,...,...
2020-05-28,77.081518,78.704572,76.804119,77.441658,133560800
2020-05-29,77.684996,78.147332,77.008522,77.366226,153532400
2020-06-01,77.320001,78.439348,77.188598,78.317680,80791200
2020-06-02,78.050005,78.704580,77.607132,78.680244,87642800


In [19]:
testdf = sentiment_by_ticker['AAPL']
testvec = testdf["2011-02-22"]

In [32]:
# Diagnostic block to understand the structure

print("\n--- SENTIMENT STRUCTURE ---")
print(f"Type of sentiment: {type(sentiment_by_ticker)}")
if isinstance(sentiment_by_ticker, dict):
    sample_key = next(iter(sentiment_by_ticker.keys()))
    print(f"Example ticker: {sample_key}")
    print(f"Type of sentiment[{sample_key}]: {type(sentiment_by_ticker[sample_key])}")
    
    if isinstance(sentiment_by_ticker[sample_key], pd.DataFrame):
        print(f"Columns in sentiment[{sample_key}]: {sentiment_by_ticker[sample_key].columns.tolist()}")
        print(f"Index type of sentiment[{sample_key}]: {type(sentiment_by_ticker[sample_key].index)}")
        print("First few entries in sentiment[{sample_key}]:")
        print(sentiment_by_ticker[sample_key].head())
        
print("\n--- RETURNS STRUCTURE ---")
print(f"Type of returns_dict: {type(stock_price_data)}")
if isinstance(stock_price_data, dict):
    sample_key_r = next(iter(stock_price_data.keys()))
    print(f"Example ticker: {sample_key_r}")
    print(f"Type of returns_dict[{sample_key_r}]: {type(stock_price_data[sample_key_r])}")
    
    if isinstance(stock_price_data[sample_key_r], pd.DataFrame):
        print(f"Columns in returns_dict[{sample_key_r}]: {stock_price_data[sample_key_r].columns.tolist()}")
        print(f"Index type of returns_dict[{sample_key_r}]: {type(stock_price_data[sample_key_r].index)}")
        print("First few entries in returns_dict[{sample_key_r}]:")
        print(stock_price_data[sample_key_r].head())

print("\n--- SAMPLE VALUES CHECK ---")
try:
    sample_sentiment = sentiment_by_ticker[sample_key].iloc[0,0]
    print(f"Sample sentiment cell value (type): {type(sample_sentiment)} - {sample_sentiment}")
except Exception as e:
    print(f"Error accessing sample sentiment cell: {e}")

try:
    sample_return = stock_price_data[sample_key_r].iloc[0,0]
    print(f"Sample return cell value (type): {type(sample_return)} - {sample_return}")
except Exception as e:
    print(f"Error accessing sample return cell: {e}")



--- SENTIMENT STRUCTURE ---
Type of sentiment: <class 'dict'>
Example ticker: L
Type of sentiment[L]: <class 'dict'>

--- RETURNS STRUCTURE ---
Type of returns_dict: <class 'dict'>
Example ticker: AAPL
Type of returns_dict[AAPL]: <class 'pandas.core.frame.DataFrame'>
Columns in returns_dict[AAPL]: ['Open', 'High', 'Low', 'Close', 'Volume']
Index type of returns_dict[AAPL]: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
First few entries in returns_dict[{sample_key_r}]:
                Open      High       Low     Close     Volume
Date                                                         
2009-08-10  4.985306  5.013593  4.925118  4.957017  300294400
2009-08-11  4.926021  4.946786  4.871552  4.900141  355342400
2009-08-12  4.891715  5.016905  4.889007  4.974774  445071200
2009-08-13  5.015098  5.075887  5.010584  5.068363  439980800
2009-08-14  5.053919  5.062646  4.981393  5.019011  305816000

--- SAMPLE VALUES CHECK ---
Error accessing sample sentiment cell: 'dict' object ha

In [33]:
print(next(iter(sentiment_by_ticker['L'].items())))

('2020-03-31', [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])


In [30]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

def process_sentiment_and_returns(sentiment_dict, returns_dict, lag_days=0):
    results = {}

    for ticker in sentiment_dict.keys():
        # Check if both sentiment and returns are available
        if ticker not in returns_dict:
            print(f"Ticker {ticker} missing from returns_dict. Skipping.")
            continue
        
        # Process sentiment
        sent_raw = sentiment_dict[ticker]
        
        if len(sent_raw) == 0:
            print(f"Sentiment data empty for {ticker}. Skipping.")
            continue

        # Convert sentiment dict to DataFrame
        try:
            sent_df = pd.DataFrame({
                pd.to_datetime(date): np.mean(scores) if len(scores) > 0 else np.nan
                for date, scores in sent_raw.items()
            }, index=[0]).T
            sent_df.columns = ['avg_sentiment']
            sent_df = sent_df.dropna()
        except Exception as e:
            print(f"Error processing sentiment for {ticker}: {e}")
            continue

        # Process returns
        ret_df = returns_dict[ticker].copy()
        if ret_df.empty:
            print(f"Returns data empty for {ticker}. Skipping.")
            continue
        
        # Compute daily returns
        ret_df['Return'] = ret_df['Close'].pct_change()
        ret_df = ret_df.dropna()

        # Merge sentiment and returns
        merged = pd.merge(
            ret_df[['Return']],
            sent_df,
            left_index=True,
            right_index=True,
            how='inner'
        )

        if merged.empty:
            print(f"No overlapping dates for {ticker} after merge. Skipping.")
            continue

        # Apply lag if needed
        if lag_days > 0:
            merged['avg_sentiment'] = merged['avg_sentiment'].shift(lag_days)
            merged = merged.dropna()

        if merged.empty:
            print(f"No data left after applying lag for {ticker}. Skipping.")
            continue

        # Regress Return ~ avg_sentiment
        X = merged[['avg_sentiment']].values
        y = merged['Return'].values

        model = LinearRegression()
        model.fit(X, y)

        # Store result
        results[ticker] = {
            'coef': model.coef_[0],
            'intercept': model.intercept_,
            'r2': model.score(X, y),
            'n_obs': len(y)
        }

    return pd.DataFrame(results).T


In [31]:
test_results_0lag = process_sentiment_and_returns(sentiment_by_ticker, stock_price_data, lag_days=0)



Returns data empty for CEG. Skipping.




Returns data empty for ABNB. Skipping.




Returns data empty for CARR. Skipping.
Returns data empty for SOLV. Skipping.
Ticker IQV missing from returns_dict. Skipping.




In [34]:
test_results_1lag = process_sentiment_and_returns(sentiment_by_ticker, stock_price_data, lag_days=1)
test_results_2lag = process_sentiment_and_returns(sentiment_by_ticker, stock_price_data, lag_days=2)
test_results_1wklag = process_sentiment_and_returns(sentiment_by_ticker, stock_price_data, lag_days=7)

No data left after applying lag for NWSA. Skipping.
Returns data empty for CEG. Skipping.
No data left after applying lag for LW. Skipping.
Returns data empty for ABNB. Skipping.
No data left after applying lag for FOXA. Skipping.
No data left after applying lag for CTVA. Skipping.
Returns data empty for CARR. Skipping.
No data left after applying lag for FTV. Skipping.
Returns data empty for SOLV. Skipping.
No data left after applying lag for LHX. Skipping.
No data left after applying lag for TFC. Skipping.
No data left after applying lag for VTRS. Skipping.
No data left after applying lag for BKNG. Skipping.
Ticker IQV missing from returns_dict. Skipping.




No data left after applying lag for NWSA. Skipping.
Returns data empty for CEG. Skipping.
No data left after applying lag for LW. Skipping.
Returns data empty for ABNB. Skipping.
No data left after applying lag for FOXA. Skipping.




No data left after applying lag for CTVA. Skipping.




Returns data empty for CARR. Skipping.
No data left after applying lag for ATO. Skipping.
No data left after applying lag for FTV. Skipping.
Returns data empty for SOLV. Skipping.
No data left after applying lag for KHC. Skipping.
No data left after applying lag for LHX. Skipping.
No data left after applying lag for TFC. Skipping.
No data left after applying lag for VTRS. Skipping.
No data left after applying lag for BKNG. Skipping.
Ticker IQV missing from returns_dict. Skipping.
No data left after applying lag for NWSA. Skipping.
No data left after applying lag for FRT. Skipping.
Returns data empty for CEG. Skipping.
No data left after applying lag for ZBH. Skipping.
No data left after applying lag for HUBB. Skipping.
No data left after applying lag for JKHY. Skipping.
No data left after applying lag for LW. Skipping.
No data left after applying lag for SNA. Skipping.
Returns data empty for ABNB. Skipping.
No data left after applying lag for LKQ. Skipping.
No data left after applying 



No data left after applying lag for TDG. Skipping.
No data left after applying lag for CTVA. Skipping.
No data left after applying lag for WST. Skipping.
No data left after applying lag for BKR. Skipping.
No data left after applying lag for VICI. Skipping.
No data left after applying lag for APTV. Skipping.
No data left after applying lag for ODFL. Skipping.
No data left after applying lag for INVH. Skipping.
Returns data empty for CARR. Skipping.
No data left after applying lag for TPL. Skipping.
No data left after applying lag for ATO. Skipping.
No data left after applying lag for FTV. Skipping.
No data left after applying lag for TDY. Skipping.
Returns data empty for SOLV. Skipping.
No data left after applying lag for PARA. Skipping.
No data left after applying lag for KHC. Skipping.
No data left after applying lag for LHX. Skipping.
No data left after applying lag for TFC. Skipping.
No data left after applying lag for VTRS. Skipping.
No data left after applying lag for BKNG. Skippi



In [35]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

def run_regressions(sentiment, returns_dict, lag_days=0, extreme_percent=None, min_obs=30):
    results = {}

    for ticker in returns_dict.keys():
        if ticker not in sentiment:
            print(f"Sentiment data missing for {ticker}. Skipping.")
            continue

        stock_returns = returns_dict[ticker]
        ticker_sentiment = sentiment[ticker]

        if not ticker_sentiment or stock_returns.empty:
            print(f"Empty data for {ticker}. Skipping.")
            continue

        # Step 1: Build sentiment dataframe
        sentiment_df = pd.DataFrame.from_dict(ticker_sentiment, orient='index')
        sentiment_df.index = pd.to_datetime(sentiment_df.index)
        sentiment_df.sort_index(inplace=True)

        # Step 2: Average sentiment per day
        sentiment_df['average_sentiment'] = sentiment_df.apply(lambda row: np.mean(row.dropna().tolist()) if not row.isnull().all() else np.nan, axis=1)
        sentiment_series = sentiment_df['average_sentiment'].dropna()

        # Step 3: Filter stocks with too few observations
        if len(sentiment_series) < min_obs:
            print(f"Not enough sentiment observations for {ticker}. Skipping.")
            continue

        # Step 4: Prepare returns
        stock_returns = stock_returns.copy()
        stock_returns['Return'] = stock_returns['Close'].pct_change()
        stock_returns = stock_returns.dropna()

        # Step 5: Align data
        aligned = pd.DataFrame({
            'Return': stock_returns['Return'],
            'Sentiment': sentiment_series
        }).dropna()

        # Step 6: Apply lagging if needed
        if lag_days > 0:
            aligned['Sentiment'] = aligned['Sentiment'].shift(lag_days)
            aligned = aligned.dropna()

        # Step 7: Select extreme sentiment days
        if extreme_percent is not None:
            lower_thresh = aligned['Sentiment'].quantile(extreme_percent)
            upper_thresh = aligned['Sentiment'].quantile(1 - extreme_percent)
            aligned = aligned[(aligned['Sentiment'] <= lower_thresh) | (aligned['Sentiment'] >= upper_thresh)]

        if len(aligned) < 10:  # Still need enough data
            print(f"Not enough data after extreme filtering for {ticker}. Skipping.")
            continue

        # Step 8: Run regression
        X = sm.add_constant(aligned['Sentiment'])
        y = aligned['Return']
        model = sm.OLS(y, X).fit()

        results[ticker] = model

    return results

# Example usage:
# results = run_regressions(sentiment, returns_dict, lag_days=1, extreme_percent=0.1, min_obs=30)


In [36]:
test_new_file = run_regressions(sentiment=sentiment_by_ticker, returns_dict=stock_price_data, lag_days=0, extreme_percent=0.1, min_obs=30)

Empty data for ABNB. Skipping.
Not enough sentiment observations for AMCR. Skipping.
Not enough sentiment observations for ANSS. Skipping.
Not enough sentiment observations for AOS. Skipping.
Not enough sentiment observations for APD. Skipping.
Not enough sentiment observations for APTV. Skipping.
Not enough sentiment observations for ATO. Skipping.
Not enough sentiment observations for AWK. Skipping.
Not enough sentiment observations for BKNG. Skipping.
Not enough sentiment observations for BKR. Skipping.
Not enough sentiment observations for BRO. Skipping.
Empty data for CARR. Skipping.
Empty data for CEG. Skipping.
Not enough sentiment observations for CHRW. Skipping.
Not enough sentiment observations for CINF. Skipping.
Not enough sentiment observations for CNP. Skipping.
Not enough sentiment observations for COR. Skipping.
Not enough sentiment observations for CTVA. Skipping.
Not enough data after extreme filtering for CZR. Skipping.
Not enough sentiment observations for EG. Skipp

In [41]:
print(test_new_file['AAPL'].summary())

                            OLS Regression Results                            
Dep. Variable:                 Return   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     3.558
Date:                Mon, 28 Apr 2025   Prob (F-statistic):             0.0601
Time:                        16:42:18   Log-Likelihood:                 913.42
No. Observations:                 340   AIC:                            -1823.
Df Residuals:                     338   BIC:                            -1815.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0028      0.001      3.075      0.0

In [42]:
test_new_file_1dlag = run_regressions(sentiment=sentiment_by_ticker, returns_dict=stock_price_data, lag_days=1, extreme_percent=0.1, min_obs=30)

Empty data for ABNB. Skipping.
Not enough data after extreme filtering for AJG. Skipping.
Not enough sentiment observations for AMCR. Skipping.
Not enough sentiment observations for ANSS. Skipping.
Not enough sentiment observations for AOS. Skipping.
Not enough sentiment observations for APD. Skipping.
Not enough sentiment observations for APTV. Skipping.
Not enough sentiment observations for ATO. Skipping.
Not enough sentiment observations for AWK. Skipping.
Not enough sentiment observations for BKNG. Skipping.
Not enough sentiment observations for BKR. Skipping.
Not enough sentiment observations for BRO. Skipping.
Empty data for CARR. Skipping.
Empty data for CEG. Skipping.
Not enough sentiment observations for CHRW. Skipping.
Not enough sentiment observations for CINF. Skipping.
Not enough sentiment observations for CNP. Skipping.
Not enough sentiment observations for COR. Skipping.
Not enough sentiment observations for CTVA. Skipping.
Not enough data after extreme filtering for CZR

In [43]:
print(test_new_file_1dlag['AAPL'].summary())

                            OLS Regression Results                            
Dep. Variable:                 Return   R-squared:                       0.001
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.4968
Date:                Mon, 28 Apr 2025   Prob (F-statistic):              0.481
Time:                        16:44:30   Log-Likelihood:                 899.69
No. Observations:                 340   AIC:                            -1795.
Df Residuals:                     338   BIC:                            -1788.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0011      0.001      1.197      0.2

In [46]:
test_new_file_1dlag0_2 = run_regressions(sentiment=sentiment_by_ticker, returns_dict=stock_price_data, lag_days=1, extreme_percent=0.2, min_obs=30)
print(test_new_file_1dlag0_2['AAPL'].summary())

Empty data for ABNB. Skipping.
Not enough sentiment observations for AMCR. Skipping.
Not enough sentiment observations for ANSS. Skipping.
Not enough sentiment observations for AOS. Skipping.
Not enough sentiment observations for APD. Skipping.
Not enough sentiment observations for APTV. Skipping.
Not enough sentiment observations for ATO. Skipping.
Not enough sentiment observations for AWK. Skipping.
Not enough sentiment observations for BKNG. Skipping.
Not enough sentiment observations for BKR. Skipping.
Not enough sentiment observations for BRO. Skipping.
Empty data for CARR. Skipping.
Empty data for CEG. Skipping.
Not enough sentiment observations for CHRW. Skipping.
Not enough sentiment observations for CINF. Skipping.
Not enough sentiment observations for CNP. Skipping.
Not enough sentiment observations for COR. Skipping.
Not enough sentiment observations for CTVA. Skipping.
Not enough sentiment observations for EG. Skipping.
Not enough sentiment observations for EQR. Skipping.
N

In [48]:
test_new_file_0lag0_0_5 = run_regressions(sentiment=sentiment_by_ticker, returns_dict=stock_price_data, lag_days=0, extreme_percent=0.05, min_obs=30)
print(test_new_file_0lag0_0_5['AAPL'].summary())

Empty data for ABNB. Skipping.
Not enough data after extreme filtering for AEE. Skipping.
Not enough data after extreme filtering for AEP. Skipping.
Not enough data after extreme filtering for AJG. Skipping.
Not enough sentiment observations for AMCR. Skipping.
Not enough sentiment observations for ANSS. Skipping.
Not enough sentiment observations for AOS. Skipping.
Not enough data after extreme filtering for APA. Skipping.
Not enough sentiment observations for APD. Skipping.
Not enough sentiment observations for APTV. Skipping.
Not enough sentiment observations for ATO. Skipping.
Not enough data after extreme filtering for AVY. Skipping.
Not enough sentiment observations for AWK. Skipping.
Not enough data after extreme filtering for AXON. Skipping.
Not enough data after extreme filtering for BDX. Skipping.
Not enough data after extreme filtering for BK. Skipping.
Not enough sentiment observations for BKNG. Skipping.
Not enough sentiment observations for BKR. Skipping.
Not enough data 

In [49]:
test_new_file_1lag0_0_5 = run_regressions(sentiment=sentiment_by_ticker, returns_dict=stock_price_data, lag_days=1, extreme_percent=0.05, min_obs=30)
print(test_new_file_1lag0_0_5['AAPL'].summary())

Empty data for ABNB. Skipping.
Not enough data after extreme filtering for AEE. Skipping.
Not enough data after extreme filtering for AEP. Skipping.
Not enough data after extreme filtering for AJG. Skipping.
Not enough sentiment observations for AMCR. Skipping.
Not enough sentiment observations for ANSS. Skipping.
Not enough sentiment observations for AOS. Skipping.
Not enough data after extreme filtering for APA. Skipping.
Not enough sentiment observations for APD. Skipping.
Not enough sentiment observations for APTV. Skipping.
Not enough sentiment observations for ATO. Skipping.
Not enough data after extreme filtering for AVY. Skipping.
Not enough sentiment observations for AWK. Skipping.
Not enough data after extreme filtering for AXON. Skipping.
Not enough data after extreme filtering for BDX. Skipping.
Not enough sentiment observations for BKNG. Skipping.
Not enough sentiment observations for BKR. Skipping.
Not enough data after extreme filtering for BR. Skipping.
Not enough senti

In [50]:
import pandas as pd
import statsmodels.api as sm

def run_regression_with_compounded_sentiment(sentiment_dict, returns_dict, 
                                              lag_days=0, 
                                              extreme_percentile=0.1, 
                                              min_obs=30, 
                                              compounding_days=3):
    """
    Run OLS regression of stock returns on sentiment with lagging, extreme filtering, 
    and compounded sentiment over past few days.

    Parameters:
    - sentiment_dict: dict of dicts, each ticker's date -> list of sentiment values
    - returns_dict: dict of DataFrames, each ticker's price data
    - lag_days: int, how many days to lag the sentiment
    - extreme_percentile: float between 0 and 0.5, fraction for extreme sentiment filtering
    - min_obs: minimum number of observations to run regression
    - compounding_days: int, how many past days to average sentiment over
    """
    
    results = {}

    for ticker in sentiment_dict:
        if ticker not in returns_dict:
            print(f"Missing returns for {ticker}, skipping...")
            continue

        # Build sentiment dataframe
        sent_df = pd.DataFrame([
            (pd.to_datetime(date), sum(vals)/len(vals) if len(vals) > 0 else None)
            for date, vals in sentiment_dict[ticker].items()
        ], columns=["Date", "Sentiment"]).dropna()

        if sent_df.empty or returns_dict[ticker].empty:
            print(f"Empty data for {ticker}, skipping...")
            continue

        sent_df = sent_df.set_index("Date").sort_index()

        # Smooth sentiment: compound sentiment over past n days
        sent_df['Compounded_Sentiment'] = sent_df['Sentiment'].rolling(window=compounding_days, min_periods=1).mean()

        # Merge with returns
        price_df = returns_dict[ticker].copy()
        price_df['Return'] = price_df['Close'].pct_change()
        
        merged = pd.merge(price_df[['Return']], sent_df[['Compounded_Sentiment']], 
                          left_index=True, right_index=True, how='inner')

        # Apply lag
        if lag_days > 0:
            merged['Compounded_Sentiment'] = merged['Compounded_Sentiment'].shift(lag_days)
            merged = merged.dropna()

        if len(merged) < min_obs:
            print(f"Not enough data for {ticker}, skipping...")
            continue

        # Select extreme values
        lower_thresh = merged['Compounded_Sentiment'].quantile(extreme_percentile)
        upper_thresh = merged['Compounded_Sentiment'].quantile(1 - extreme_percentile)
        extreme_merged = merged[
            (merged['Compounded_Sentiment'] <= lower_thresh) | 
            (merged['Compounded_Sentiment'] >= upper_thresh)
        ]

        if len(extreme_merged) < min_obs:
            print(f"Not enough extreme observations for {ticker}, skipping...")
            continue

        # Regression
        X = sm.add_constant(extreme_merged['Compounded_Sentiment'])
        y = extreme_merged['Return']
        model = sm.OLS(y, X).fit()

        results[ticker] = model
    
    return results


In [52]:
test_comp_sent = run_regression_with_compounded_sentiment(sentiment_dict = sentiment_by_ticker, returns_dict = stock_price_data, 
                                              lag_days=0, 
                                              extreme_percentile=0.1, 
                                              min_obs=30, 
                                              compounding_days=3)

Not enough extreme observations for PSX, skipping...
Not enough extreme observations for KEYS, skipping...
Not enough extreme observations for PH, skipping...
Not enough extreme observations for SLB, skipping...
Not enough extreme observations for WAT, skipping...
Not enough extreme observations for DELL, skipping...
Not enough extreme observations for MAS, skipping...
Not enough extreme observations for ACGL, skipping...
Not enough extreme observations for DIS, skipping...
Not enough extreme observations for URI, skipping...
Not enough extreme observations for SYY, skipping...
Not enough extreme observations for TER, skipping...
Not enough extreme observations for IR, skipping...
Not enough extreme observations for KR, skipping...
Not enough extreme observations for MRNA, skipping...
Not enough data for SMCI, skipping...
Not enough extreme observations for ISRG, skipping...
Not enough extreme observations for STX, skipping...
Not enough extreme observations for CAG, skipping...
Not en

In [54]:
print(test_comp_sent['AAPL'].summary())


                            OLS Regression Results                            
Dep. Variable:                 Return   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.010
Method:                 Least Squares   F-statistic:                     4.506
Date:                Mon, 28 Apr 2025   Prob (F-statistic):             0.0345
Time:                        17:20:21   Log-Likelihood:                 884.88
No. Observations:                 341   AIC:                            -1766.
Df Residuals:                     339   BIC:                            -1758.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.0017 

In [55]:
import pandas as pd
import statsmodels.api as sm

def run_regression_on_sentiments(sentiment_dict, returns_dict, 
                                 lag_days=0, 
                                 extreme_percentile=0.1, 
                                 min_obs=30, 
                                 compounding_days=3):
    """
    Run OLS regression of stock returns on both today's sentiment and compounded past sentiment.

    Parameters:
    - sentiment_dict: dict of dicts, each ticker's date -> list of sentiment values
    - returns_dict: dict of DataFrames, each ticker's price data
    - lag_days: int, how many days to lag the sentiment
    - extreme_percentile: float between 0 and 0.5, fraction for extreme sentiment filtering
    - min_obs: minimum number of observations to run regression
    - compounding_days: int, how many past days to average sentiment over
    """
    
    results = {}

    for ticker in sentiment_dict:
        if ticker not in returns_dict:
            print(f"Missing returns for {ticker}, skipping...")
            continue

        # Build sentiment dataframe
        sent_df = pd.DataFrame([
            (pd.to_datetime(date), sum(vals)/len(vals) if len(vals) > 0 else None)
            for date, vals in sentiment_dict[ticker].items()
        ], columns=["Date", "Sentiment"]).dropna()

        if sent_df.empty or returns_dict[ticker].empty:
            print(f"Empty data for {ticker}, skipping...")
            continue

        sent_df = sent_df.set_index("Date").sort_index()

        # Smooth sentiment: compound sentiment over past n days
        sent_df['Compounded_Sentiment'] = sent_df['Sentiment'].rolling(window=compounding_days, min_periods=1).mean()

        # Merge with returns
        price_df = returns_dict[ticker].copy()
        price_df['Return'] = price_df['Close'].pct_change()
        
        merged = pd.merge(price_df[['Return']], sent_df[['Sentiment', 'Compounded_Sentiment']], 
                          left_index=True, right_index=True, how='inner')

        # Apply lag
        if lag_days > 0:
            merged['Sentiment'] = merged['Sentiment'].shift(lag_days)
            merged['Compounded_Sentiment'] = merged['Compounded_Sentiment'].shift(lag_days)
            merged = merged.dropna()

        if len(merged) < min_obs:
            print(f"Not enough data for {ticker}, skipping...")
            continue

        # Select extreme values
        lower_thresh_sent = merged['Sentiment'].quantile(extreme_percentile)
        upper_thresh_sent = merged['Sentiment'].quantile(1 - extreme_percentile)
        lower_thresh_comp = merged['Compounded_Sentiment'].quantile(extreme_percentile)
        upper_thresh_comp = merged['Compounded_Sentiment'].quantile(1 - extreme_percentile)

        extreme_merged = merged[
            ((merged['Sentiment'] <= lower_thresh_sent) | (merged['Sentiment'] >= upper_thresh_sent)) |
            ((merged['Compounded_Sentiment'] <= lower_thresh_comp) | (merged['Compounded_Sentiment'] >= upper_thresh_comp))
        ]

        if len(extreme_merged) < min_obs:
            print(f"Not enough extreme observations for {ticker}, skipping...")
            continue

        # Regression on both variables
        X = extreme_merged[['Sentiment', 'Compounded_Sentiment']]
        X = sm.add_constant(X)
        y = extreme_merged['Return']
        model = sm.OLS(y, X).fit()

        results[ticker] = model
    
    return results


In [60]:
test_mult_regr_comp = run_regression_on_sentiments(sentiment_dict = sentiment_by_ticker, returns_dict = stock_price_data, 
                                 lag_days=0, 
                                 extreme_percentile=0.05, 
                                 min_obs=30, 
                                 compounding_days=5)

print(test_mult_regr_comp['AAPL'].summary())

Not enough extreme observations for PSX, skipping...
Not enough extreme observations for KEYS, skipping...
Not enough extreme observations for PH, skipping...
Not enough extreme observations for SLB, skipping...
Not enough extreme observations for WAT, skipping...
Not enough extreme observations for DELL, skipping...
Not enough extreme observations for MAS, skipping...
Not enough extreme observations for ACGL, skipping...
Not enough extreme observations for AME, skipping...
Not enough extreme observations for URI, skipping...
Not enough extreme observations for SYY, skipping...
Not enough extreme observations for TER, skipping...
Not enough extreme observations for IR, skipping...
Not enough extreme observations for KR, skipping...
Not enough extreme observations for MRNA, skipping...
Not enough data for SMCI, skipping...
Not enough extreme observations for ISRG, skipping...
Not enough extreme observations for WFC, skipping...
Not enough extreme observations for STX, skipping...
Not en

In [61]:
import pandas as pd
import statsmodels.api as sm

def run_regression_on_sentiments(sentiment_dict, returns_dict, 
                                 lag_days=0, 
                                 extreme_percentile=0.1, 
                                 min_obs=30, 
                                 compounding_days=3,
                                 abs_sentiment_threshold=0.0):
    """
    Run OLS regression of stock returns on both today's sentiment and compounded past sentiment.

    Filters:
    - Only keeps sentiment observations where |sentiment| > abs_sentiment_threshold
    - Then, keeps top and bottom extreme_percentile of sentiments
    """

    results = {}

    for ticker in sentiment_dict:
        if ticker not in returns_dict:
            print(f"Missing returns for {ticker}, skipping...")
            continue

        # Build sentiment dataframe
        sent_df = pd.DataFrame([
            (pd.to_datetime(date), sum(vals)/len(vals) if len(vals) > 0 else None)
            for date, vals in sentiment_dict[ticker].items()
        ], columns=["Date", "Sentiment"]).dropna()

        if sent_df.empty or returns_dict[ticker].empty:
            print(f"Empty data for {ticker}, skipping...")
            continue

        sent_df = sent_df.set_index("Date").sort_index()

        # Filter by absolute sentiment strength
        sent_df = sent_df[sent_df['Sentiment'].abs() > abs_sentiment_threshold]

        if sent_df.empty:
            print(f"No strong sentiment data for {ticker} after abs filtering, skipping...")
            continue

        # Smooth sentiment: compound over past n days
        sent_df['Compounded_Sentiment'] = sent_df['Sentiment'].rolling(window=compounding_days, min_periods=1).mean()

        # Merge with returns
        price_df = returns_dict[ticker].copy()
        price_df['Return'] = price_df['Close'].pct_change()
        
        merged = pd.merge(price_df[['Return']], sent_df[['Sentiment', 'Compounded_Sentiment']], 
                          left_index=True, right_index=True, how='inner')

        # Apply lag
        if lag_days > 0:
            merged['Sentiment'] = merged['Sentiment'].shift(lag_days)
            merged['Compounded_Sentiment'] = merged['Compounded_Sentiment'].shift(lag_days)
            merged = merged.dropna()

        if len(merged) < min_obs:
            print(f"Not enough data for {ticker} after lagging, skipping...")
            continue

        # Now take only extreme values
        lower_thresh_sent = merged['Sentiment'].quantile(extreme_percentile)
        upper_thresh_sent = merged['Sentiment'].quantile(1 - extreme_percentile)
        lower_thresh_comp = merged['Compounded_Sentiment'].quantile(extreme_percentile)
        upper_thresh_comp = merged['Compounded_Sentiment'].quantile(1 - extreme_percentile)

        extreme_merged = merged[
            ((merged['Sentiment'] <= lower_thresh_sent) | (merged['Sentiment'] >= upper_thresh_sent)) |
            ((merged['Compounded_Sentiment'] <= lower_thresh_comp) | (merged['Compounded_Sentiment'] >= upper_thresh_comp))
        ]

        if len(extreme_merged) < min_obs:
            print(f"Not enough extreme observations for {ticker}, skipping...")
            continue

        # Regression on both variables
        X = extreme_merged[['Sentiment', 'Compounded_Sentiment']]
        X = sm.add_constant(X)
        y = extreme_merged['Return']
        model = sm.OLS(y, X).fit()

        results[ticker] = model
    
    return results


In [68]:
test_compund_extreme_extreme = run_regression_on_sentiments(sentiment_dict = sentiment_by_ticker, returns_dict = stock_price_data, 
                                 lag_days=0, 
                                 extreme_percentile=0.1, 
                                 min_obs=30, 
                                 compounding_days=3,
                                 abs_sentiment_threshold=0.15)

print(test_compund_extreme_extreme['AAPL'].summary())

Not enough extreme observations for ADP, skipping...
Not enough extreme observations for DLTR, skipping...
Not enough data for PSX after lagging, skipping...
Not enough extreme observations for DXCM, skipping...
Not enough extreme observations for GEN, skipping...
Not enough extreme observations for NTAP, skipping...
Not enough extreme observations for AMP, skipping...
Not enough extreme observations for TMO, skipping...
Not enough extreme observations for AMGN, skipping...
Not enough data for BXP after lagging, skipping...
Not enough extreme observations for PSA, skipping...
Not enough extreme observations for TJX, skipping...
Not enough data for KEYS after lagging, skipping...
Not enough data for PH after lagging, skipping...
Not enough extreme observations for ED, skipping...
Not enough extreme observations for DFS, skipping...
Not enough data for SBUX after lagging, skipping...
Not enough data for AON after lagging, skipping...
Not enough extreme observations for NVDA, skipping...


In [69]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

def run_regression_on_sentiments(sentiment_dict, returns_dict, 
                                 lag_days=0, 
                                 extreme_percentile=0.1, 
                                 min_obs=30, 
                                 compounding_days=3,
                                 abs_sentiment_threshold=0.0,
                                 use_log_returns=False):
    """
    Run OLS regression of stock returns (regular or log) on both today's sentiment and compounded past sentiment.

    Parameters:
        - sentiment_dict: dict of {ticker: {date: [sentiments]}}
        - returns_dict: dict of {ticker: DataFrame with OHLCV}
        - lag_days: int, how many days to lag sentiment
        - extreme_percentile: float (0 to 0.5), percent of extremes to keep
        - min_obs: int, minimum data points needed to run regression
        - compounding_days: int, window size for sentiment compounding
        - abs_sentiment_threshold: float, minimum absolute sentiment to keep
        - use_log_returns: bool, use log returns instead of simple returns

    Returns:
        - dict of {ticker: OLS regression results}
    """
    
    results = {}

    for ticker in sentiment_dict:
        if ticker not in returns_dict:
            print(f"Missing returns for {ticker}, skipping...")
            continue

        # Build sentiment dataframe
        sent_df = pd.DataFrame([
            (pd.to_datetime(date), sum(vals)/len(vals) if len(vals) > 0 else None)
            for date, vals in sentiment_dict[ticker].items()
        ], columns=["Date", "Sentiment"]).dropna()

        if sent_df.empty or returns_dict[ticker].empty:
            print(f"Empty data for {ticker}, skipping...")
            continue

        sent_df = sent_df.set_index("Date").sort_index()

        # Filter by absolute sentiment strength
        sent_df = sent_df[sent_df['Sentiment'].abs() > abs_sentiment_threshold]

        if sent_df.empty:
            print(f"No strong sentiment data for {ticker} after abs filtering, skipping...")
            continue

        # Compounded sentiment
        sent_df['Compounded_Sentiment'] = sent_df['Sentiment'].rolling(window=compounding_days, min_periods=1).mean()

        # Merge with returns
        price_df = returns_dict[ticker].copy()
        price_df = price_df.sort_index()

        # Create return variable
        if use_log_returns:
            price_df['Return'] = np.log(price_df['Close']) - np.log(price_df['Close'].shift(1))
        else:
            price_df['Return'] = price_df['Close'].pct_change()

        merged = pd.merge(price_df[['Return']], sent_df[['Sentiment', 'Compounded_Sentiment']], 
                          left_index=True, right_index=True, how='inner')

        # Apply lag
        if lag_days > 0:
            merged['Sentiment'] = merged['Sentiment'].shift(lag_days)
            merged['Compounded_Sentiment'] = merged['Compounded_Sentiment'].shift(lag_days)
            merged = merged.dropna()

        if len(merged) < min_obs:
            print(f"Not enough data for {ticker} after lagging, skipping...")
            continue

        # Filter extreme values
        lower_thresh_sent = merged['Sentiment'].quantile(extreme_percentile)
        upper_thresh_sent = merged['Sentiment'].quantile(1 - extreme_percentile)
        lower_thresh_comp = merged['Compounded_Sentiment'].quantile(extreme_percentile)
        upper_thresh_comp = merged['Compounded_Sentiment'].quantile(1 - extreme_percentile)

        extreme_merged = merged[
            ((merged['Sentiment'] <= lower_thresh_sent) | (merged['Sentiment'] >= upper_thresh_sent)) |
            ((merged['Compounded_Sentiment'] <= lower_thresh_comp) | (merged['Compounded_Sentiment'] >= upper_thresh_comp))
        ]

        if len(extreme_merged) < min_obs:
            print(f"Not enough extreme observations for {ticker}, skipping...")
            continue

        # Regression
        X = extreme_merged[['Sentiment', 'Compounded_Sentiment']]
        X = sm.add_constant(X)
        y = extreme_merged['Return']
        model = sm.OLS(y, X).fit()

        results[ticker] = model

    return results


In [71]:
new_test = run_regression_on_sentiments(sentiment_dict = sentiment_by_ticker, returns_dict = stock_price_data, 
                                 lag_days=1, 
                                 extreme_percentile=0.2, 
                                 min_obs=30, 
                                 compounding_days=5,
                                 abs_sentiment_threshold=0.1,
                                 use_log_returns=True)

print(new_test['AAPL'].summary())

Not enough extreme observations for PSX, skipping...
Not enough data for BXP after lagging, skipping...
Not enough extreme observations for KEYS, skipping...
Not enough data for PH after lagging, skipping...
Not enough data for SBUX after lagging, skipping...
Not enough extreme observations for AON, skipping...
Not enough data for SLB after lagging, skipping...
Not enough data for WAT after lagging, skipping...
Not enough data for OMC after lagging, skipping...
Not enough data for DELL after lagging, skipping...
Not enough data for MAS after lagging, skipping...
Not enough extreme observations for ACGL, skipping...
Not enough extreme observations for MGM, skipping...
Not enough extreme observations for AME, skipping...
Not enough data for DIS after lagging, skipping...
Not enough data for SYY after lagging, skipping...
Not enough data for TER after lagging, skipping...
Not enough extreme observations for GS, skipping...
Not enough data for IR after lagging, skipping...
Not enough data 

In [72]:
def run_regression_on_sentiments(
    sentiment_dict, returns_dict, 
    lag_days=0, 
    extreme_percentile=0.1, 
    min_obs=30, 
    compounding_days=3,
    abs_sentiment_threshold=0.0,
    use_log_returns=False,
    compounding_returns_days=None  # NEW ARGUMENT
):
    """
    Run OLS regression of (log) returns on today's sentiment and compounded sentiment.

    If use_log_returns is True and compounding_returns_days is set,
    the dependent variable will be compounded log returns.
    """

    results = {}

    for ticker in sentiment_dict:
        if ticker not in returns_dict:
            print(f"Missing returns for {ticker}, skipping...")
            continue

        # Build sentiment dataframe
        sent_df = pd.DataFrame([
            (pd.to_datetime(date), sum(vals)/len(vals) if len(vals) > 0 else None)
            for date, vals in sentiment_dict[ticker].items()
        ], columns=["Date", "Sentiment"]).dropna()

        if sent_df.empty or returns_dict[ticker].empty:
            print(f"Empty data for {ticker}, skipping...")
            continue

        sent_df = sent_df.set_index("Date").sort_index()
        sent_df = sent_df[sent_df['Sentiment'].abs() > abs_sentiment_threshold]

        if sent_df.empty:
            print(f"No strong sentiment data for {ticker}, skipping...")
            continue

        # Compounded sentiment
        sent_df['Compounded_Sentiment'] = sent_df['Sentiment'].rolling(window=compounding_days, min_periods=1).mean()

        price_df = returns_dict[ticker].copy().sort_index()

        # Create returns
        if use_log_returns:
            price_df['Return'] = np.log(price_df['Close']) - np.log(price_df['Close'].shift(1))

            # Optional compounding of log returns
            if compounding_returns_days is not None and compounding_returns_days > 1:
                price_df['Return'] = price_df['Return'].rolling(window=compounding_returns_days).sum()
        else:
            price_df['Return'] = price_df['Close'].pct_change()

        merged = pd.merge(price_df[['Return']], sent_df[['Sentiment', 'Compounded_Sentiment']],
                          left_index=True, right_index=True, how='inner')

        # Apply lag
        if lag_days > 0:
            merged['Sentiment'] = merged['Sentiment'].shift(lag_days)
            merged['Compounded_Sentiment'] = merged['Compounded_Sentiment'].shift(lag_days)
            merged = merged.dropna()

        if len(merged) < min_obs:
            print(f"Not enough data for {ticker}, skipping...")
            continue

        # Extreme filtering
        lower_s, upper_s = merged['Sentiment'].quantile(extreme_percentile), merged['Sentiment'].quantile(1 - extreme_percentile)
        lower_c, upper_c = merged['Compounded_Sentiment'].quantile(extreme_percentile), merged['Compounded_Sentiment'].quantile(1 - extreme_percentile)

        merged = merged[
            (merged['Sentiment'] <= lower_s) | (merged['Sentiment'] >= upper_s) |
            (merged['Compounded_Sentiment'] <= lower_c) | (merged['Compounded_Sentiment'] >= upper_c)
        ]

        if len(merged) < min_obs:
            print(f"Not enough extreme obs for {ticker}, skipping...")
            continue

        # Regression
        X = sm.add_constant(merged[['Sentiment', 'Compounded_Sentiment']])
        y = merged['Return']
        model = sm.OLS(y, X).fit()

        results[ticker] = model

    return results


In [75]:
newnew_test = run_regression_on_sentiments(
    sentiment_dict = sentiment_by_ticker, returns_dict = stock_price_data, 
    lag_days=0, 
    extreme_percentile=0.1, 
    min_obs=30, 
    compounding_days=3,
    abs_sentiment_threshold=0.15,
    use_log_returns=True,
    compounding_returns_days=2  # NEW ARGUMENT
)

print(newnew_test['AAPL'].summary())

Not enough extreme obs for ADP, skipping...
Not enough extreme obs for DLTR, skipping...
Not enough data for PSX, skipping...
Not enough extreme obs for DXCM, skipping...
Not enough extreme obs for GEN, skipping...
Not enough extreme obs for NTAP, skipping...
Not enough extreme obs for AMP, skipping...
Not enough extreme obs for TMO, skipping...
Not enough extreme obs for AMGN, skipping...
Not enough data for BXP, skipping...
Not enough extreme obs for PSA, skipping...
Not enough extreme obs for TJX, skipping...
Not enough data for KEYS, skipping...
Not enough data for PH, skipping...
Not enough extreme obs for ED, skipping...
Not enough extreme obs for DFS, skipping...
Not enough data for SBUX, skipping...
Not enough data for AON, skipping...
Not enough extreme obs for NVDA, skipping...
Not enough data for SLB, skipping...
Not enough data for WAT, skipping...
Not enough extreme obs for DVA, skipping...
Not enough extreme obs for ADI, skipping...
Not enough data for OMC, skipping...
No

In [81]:
newnewnew_test = results = run_regression_on_sentiments(
    sentiment_dict = sentiment_by_ticker,
    returns_dict = stock_price_data,
    lag_days=2,
    extreme_percentile=0.05,
    min_obs=30,
    compounding_days=7,
    abs_sentiment_threshold=0.15,
    use_log_returns=True,
    compounding_returns_days=5
)


Not enough extreme obs for ADP, skipping...
Not enough extreme obs for DLTR, skipping...
Not enough extreme obs for EA, skipping...
Not enough extreme obs for WMT, skipping...
Not enough data for PSX, skipping...
Not enough extreme obs for DXCM, skipping...
Not enough extreme obs for GEN, skipping...
Not enough extreme obs for IP, skipping...
Not enough extreme obs for NTAP, skipping...
Not enough extreme obs for AMP, skipping...
Not enough extreme obs for TMO, skipping...
Not enough extreme obs for AMGN, skipping...
Not enough extreme obs for GRMN, skipping...
Not enough data for BXP, skipping...
Not enough extreme obs for PSA, skipping...
Not enough extreme obs for TJX, skipping...
Not enough data for KEYS, skipping...
Not enough extreme obs for IBM, skipping...
Not enough data for PH, skipping...
Not enough extreme obs for ED, skipping...
Not enough extreme obs for DFS, skipping...
Not enough data for SBUX, skipping...
Not enough extreme obs for UPS, skipping...
Not enough data for 

In [82]:

print(newnewnew_test['AAPL'].summary())

                            OLS Regression Results                            
Dep. Variable:                 Return   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                 -0.020
Method:                 Least Squares   F-statistic:                    0.2568
Date:                Wed, 30 Apr 2025   Prob (F-statistic):              0.774
Time:                        16:05:07   Log-Likelihood:                 157.84
No. Observations:                  77   AIC:                            -309.7
Df Residuals:                      74   BIC:                            -302.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    0.0083 

In [85]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

def grid_search_loosened(sentiment, returns_dict, lags=[0], 
                         min_obs_list=[10, 30], 
                         percentile_list=[0.05, 0.10], 
                         abs_threshold_list=[0.0, 0.1],
                         comp_days_list=[0, 3],
                         use_log_returns=True):
    
    results = []

    for lag in lags:
        for min_obs in min_obs_list:
            for percentile in percentile_list:
                for abs_thresh in abs_threshold_list:
                    for comp_days in comp_days_list:

                        for ticker in sentiment:
                            if ticker not in returns_dict:
                                continue

                            df_sent_raw = sentiment[ticker]
                            if not isinstance(df_sent_raw, pd.DataFrame):
                                continue

                            df_ret = returns_dict[ticker]
                            if df_ret.empty:
                                continue

                            # Flatten list of sentiment values into average
                            df_sent = df_sent_raw.copy()
                            df_sent["Sentiment"] = df_sent["Sentiment"].apply(lambda x: np.mean(x) if isinstance(x, list) else np.nan)
                            df_sent.dropna(subset=["Sentiment"], inplace=True)

                            # Add compounding sentiment
                            if comp_days > 0:
                                df_sent["Sentiment_Comp"] = df_sent["Sentiment"].rolling(window=comp_days).mean()
                            else:
                                df_sent["Sentiment_Comp"] = df_sent["Sentiment"]

                            df_sent = df_sent.dropna()

                            if len(df_sent) < min_obs:
                                continue

                            # Apply absolute sentiment threshold
                            df_sent = df_sent[df_sent["Sentiment"].abs() >= abs_thresh]
                            if len(df_sent) < min_obs:
                                continue

                            # Apply top/bottom x% filter
                            cutoff = df_sent["Sentiment"].quantile([percentile, 1 - percentile])
                            lower, upper = cutoff.iloc[0], cutoff.iloc[1]
                            df_sent = df_sent[(df_sent["Sentiment"] <= lower) | (df_sent["Sentiment"] >= upper)]
                            if len(df_sent) < min_obs:
                                continue

                            df_ret = df_ret.copy()
                            if use_log_returns:
                                df_ret["Return"] = np.log(df_ret["Close"]).diff()
                            else:
                                df_ret["Return"] = df_ret["Close"].pct_change()

                            # Lagging
                            if lag > 0:
                                df_sent_lagged = df_sent.shift(lag)
                                df_sent_lagged.columns = [col + f"_lag{lag}" for col in df_sent.columns]
                                combined = pd.concat([df_ret, df_sent_lagged], axis=1)
                                combined.dropna(subset=[f"Sentiment_lag{lag}", f"Sentiment_Comp_lag{lag}", "Return"], inplace=True)
                                X = combined[[f"Sentiment_lag{lag}", f"Sentiment_Comp_lag{lag}"]]
                            else:
                                combined = pd.concat([df_ret, df_sent], axis=1)
                                combined.dropna(subset=["Sentiment", "Sentiment_Comp", "Return"], inplace=True)
                                X = combined[["Sentiment", "Sentiment_Comp"]]

                            y = combined["Return"]
                            if len(X) < min_obs:
                                continue

                            X = sm.add_constant(X)
                            model = sm.OLS(y, X).fit()

                            results.append({
                                "Ticker": ticker,
                                "Lag": lag,
                                "Min Obs": min_obs,
                                "Percentile": percentile,
                                "Abs Threshold": abs_thresh,
                                "Comp Days": comp_days,
                                "R2": model.rsquared,
                                "P-Value Sent": model.pvalues.get("Sentiment" if lag == 0 else f"Sentiment_lag{lag}", np.nan),
                                "P-Value Comp": model.pvalues.get("Sentiment_Comp" if lag == 0 else f"Sentiment_Comp_lag{lag}", np.nan),
                                "N Obs": len(X)
                            })

    return pd.DataFrame(results)


In [86]:
df_results = grid_search_loosened(sentiment = sentiment_by_ticker, returns_dict=stock_price_data)


In [90]:
import itertools
import pandas as pd
from collections import defaultdict

# Define your parameter grid
param_grid = {
    'abs_sentiment_threshold': [0.0, 0.1, 0.2],
    'extreme_percentile': [0.2, 0.1],
    'min_obs': [30, 50],
    'compound_days': [0, 3],
    'use_log_return': [True, False],
    'use_compound_return': [True, False],
    'lag': [0, 1],
}

# Create all parameter combinations
param_combinations = list(itertools.product(*param_grid.values()))
param_names = list(param_grid.keys())

# Store results
grid_search_results = []
returns_dict = stock_price_data
# Loop through parameter combinations
for param_values in param_combinations:
    params = dict(zip(param_names, param_values))
    r_squared_list = []


    for ticker in returns_dict:
        result_df = run_regression_on_sentiments(
            ticker=ticker,
            returns_dict=returns_dict,
            sentiment=sentiment_by_ticker,
            lag=params['lag'],
            min_obs=params['min_obs'],
            extreme_percentile=params['extreme_percentile'],
            abs_sentiment_threshold=params['abs_sentiment_threshold'],
            compound_days=params['compound_days'],
            use_log_return=params['use_log_return'],
            use_compound_return=params['use_compound_return']
        )

        if result_df is not None and not result_df.empty and 'R_squared' in result_df.columns:
            r_squared_list.extend(result_df['R_squared'].dropna().tolist())

    # Store mean R² if any results were collected
    if r_squared_list:
        mean_r2 = sum(r_squared_list) / len(r_squared_list)
        result_record = {**params, 'mean_r_squared': mean_r2}
        grid_search_results.append(result_record)

# Convert to DataFrame for inspection
grid_search_df = pd.DataFrame(grid_search_results)
grid_search_df = grid_search_df.sort_values(by='mean_r_squared', ascending=False).reset_index(drop=True)


TypeError: run_regression_on_sentiments() got an unexpected keyword argument 'ticker'

In [101]:
import itertools
import pandas as pd
import numpy as np

# Define grid of parameters to search over
param_grid = {
    'abs_sentiment_threshold': [0.0, 0.1, 0.2],
    'extreme_percentile': [0.1, 0.2],
    'min_obs': [30, 50],
    'compounding_days': [3],
    'lag_days': [0, 1],
    'use_log_returns': [True, False],
    'compounding_returns_days': [None, 3]
}

# Create all combinations of parameters
param_names = list(param_grid.keys())
param_combinations = list(itertools.product(*param_grid.values()))

# Store grid search results
grid_results = []

for param_values in param_combinations:
    params = dict(zip(param_names, param_values))
    
    try:
        models = run_regression_on_sentiments(
            sentiment_dict=sentiment_by_ticker,
            returns_dict=stock_price_data,
            **params
        )
        
        # Extract R² values from the returned models
        r_squareds = [model.rsquared for model in models.values() if model is not None]
        
        if r_squareds:
            mean_r2 = np.mean(r_squareds)
            result_entry = {**params, 'mean_r_squared': mean_r2, 'n_models': len(r_squareds)}
            grid_results.append(result_entry)
    
    except Exception as e:
        print(f"Error with params {params}: {e}")

# Convert results to DataFrame
grid_search_df = pd.DataFrame(grid_results)
grid_search_df = grid_search_df.sort_values(by='mean_r_squared', ascending=False).reset_index(drop=True)




Not enough extreme obs for PSX, skipping...
Not enough extreme obs for GEN, skipping...
Not enough extreme obs for TMO, skipping...
Not enough data for BXP, skipping...
Not enough extreme obs for KEYS, skipping...
Not enough data for PH, skipping...
Not enough extreme obs for DFS, skipping...
Not enough data for SBUX, skipping...
Not enough extreme obs for AON, skipping...
Not enough data for SLB, skipping...
Not enough data for WAT, skipping...
Not enough extreme obs for OMC, skipping...
Not enough data for DELL, skipping...
Not enough extreme obs for MAS, skipping...
Not enough extreme obs for ACGL, skipping...
Not enough extreme obs for MGM, skipping...
Not enough extreme obs for AME, skipping...
Not enough extreme obs for DHI, skipping...
Not enough data for DIS, skipping...
Not enough data for SYY, skipping...
Not enough extreme obs for JPM, skipping...
Not enough data for TER, skipping...
Not enough extreme obs for CSCO, skipping...
Not enough extreme obs for MA, skipping...
Not 

In [106]:
import joblib
joblib.dump(grid_search_df, "exptest2_27_04_grid_search_df.pkl")

['exptest2_27_04_grid_search_df.pkl']