In [None]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import statsmodels.api as sm


Mounted at /content/drive


In [None]:
# Base folder for your panel data
base_panel = "/content/drive/MyDrive/Fintech/Dataset/Panel_data"

# Load clean ETF-level panel (if you want to use it later)
etf_df = pd.read_csv(f"{base_panel}/etf_df_clean.csv", parse_dates=["Date"])

# Load ARKK & ARKF regression-ready datasets
arkk_sense = pd.read_csv(
    f"{base_panel}/arkk_sense_clean.csv",
    parse_dates=["Date", "date"]
)

arkf_sense = pd.read_csv(
    f"{base_panel}/arkf_sense_clean.csv",
    parse_dates=["Date", "date"]
)

print("=== etf_df_clean columns ===")
print(etf_df.columns.tolist())
print("\n=== arkk_sense_clean columns ===")
print(arkk_sense.columns.tolist())
print("\n=== arkf_sense_clean columns ===")
print(arkf_sense.columns.tolist())

display(arkk_sense.head())
display(arkf_sense.head())


=== etf_df_clean columns ===
['Date', 'Close', 'return_arkk', 'sentiment_arkk', 'rotation_arkk', 'trade_event', 'fund', 'return_arkf', 'sentiment_arkf', 'rotation_arkf']

=== arkk_sense_clean columns ===
['Date', 'Close', 'return_arkk', 'sentiment_arkk', 'rotation_arkk', 'trade_event', 'fund', 'date', 'rotation_nonfin', 'rotation_fintech']

=== arkf_sense_clean columns ===
['Date', 'Close', 'return_arkf', 'sentiment_arkf', 'rotation_arkf', 'trade_event', 'fund', 'date', 'rotation_nonfin', 'rotation_fintech']


Unnamed: 0,Date,Close,return_arkk,sentiment_arkk,rotation_arkk,trade_event,fund,date,rotation_nonfin,rotation_fintech
0,2025-01-03,59.82,0.051503,-0.584983,-0.01,0.0,ARKK,2025-01-03,-0.49,0.48
1,2025-01-06,61.150002,0.022233,-0.557722,0.04,1.0,ARKK,2025-01-06,0.04,0.0
2,2025-01-06,61.150002,0.022233,-0.557722,0.04,1.0,ARKK,2025-01-06,0.04,0.0
3,2025-01-06,61.150002,0.022233,-0.557722,0.04,1.0,ARKK,2025-01-06,0.04,0.0
4,2025-01-06,61.150002,0.022233,-0.557722,0.04,1.0,ARKK,2025-01-06,0.04,0.0


Unnamed: 0,Date,Close,return_arkf,sentiment_arkf,rotation_arkf,trade_event,fund,date,rotation_nonfin,rotation_fintech
0,2025-01-03,38.66,0.03231,-0.603882,-0.29,1.0,ARKF,2025-01-03,-0.26,-0.03
1,2025-01-06,39.529999,0.022504,-0.566932,0.1,1.0,ARKF,2025-01-06,-0.46,0.56
2,2025-01-06,39.529999,0.022504,-0.566932,0.1,1.0,ARKF,2025-01-06,-0.46,0.56
3,2025-01-06,39.529999,0.022504,-0.566932,0.1,1.0,ARKF,2025-01-06,-0.46,0.56
4,2025-01-07,38.040001,-0.037693,-0.5721,-0.16,1.0,ARKF,2025-01-07,-0.5,0.34


In [None]:
def run_return_reg(df, fund_name, ret_col, sent_col, date_col="Date"):
    """
    Performance regression:
    Return_t ~ Sentiment_t + Sentiment_{t-1}
    """
    temp = df.copy().sort_values(date_col)

    # Same-day and lagged sentiment
    temp["sentiment_t"]    = temp[sent_col]
    temp["sentiment_lag1"] = temp[sent_col].shift(1)

    # Drop first row with lag NaN and any other missing values
    clean = temp.dropna(subset=[ret_col, "sentiment_t", "sentiment_lag1"])

    X = clean[["sentiment_t", "sentiment_lag1"]]
    X = sm.add_constant(X)
    y = clean[ret_col]

    model = sm.OLS(y, X).fit(cov_type="HC1")  # robust SE

    print(f"\n=== {fund_name}: Return_t on Sentiment_t & Sentiment_lag1 ===")
    print(model.summary())
    return model

# ARKK performance
model_ret_arkk = run_return_reg(
    arkk_sense,
    fund_name="ARKK",
    ret_col="return_arkk",
    sent_col="sentiment_arkk"
)

# ARKF performance
model_ret_arkf = run_return_reg(
    arkf_sense,
    fund_name="ARKF",
    ret_col="return_arkf",
    sent_col="sentiment_arkf"
)



=== ARKK: Return_t on Sentiment_t & Sentiment_lag1 ===
                            OLS Regression Results                            
Dep. Variable:            return_arkk   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.660
Date:                Sat, 29 Nov 2025   Prob (F-statistic):              0.191
Time:                        23:40:53   Log-Likelihood:                 1658.9
No. Observations:                 774   AIC:                            -3312.
Df Residuals:                     771   BIC:                            -3298.
Df Model:                           2                                         
Covariance Type:                  HC1                                         
                     coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------

In [None]:
def extract_perf_stats(model, name):
    return {
        "model": name,
        "beta_sent_t": model.params.get("sentiment_t", np.nan),
        "p_sent_t": model.pvalues.get("sentiment_t", np.nan),
        "beta_sent_lag1": model.params.get("sentiment_lag1", np.nan),
        "p_sent_lag1": model.pvalues.get("sentiment_lag1", np.nan),
        "R_squared": model.rsquared,
        "N_obs": int(model.nobs),
    }

perf_rows = [
    extract_perf_stats(model_ret_arkk, "ARKK – return"),
    extract_perf_stats(model_ret_arkf, "ARKF – return"),
]

perf_summary = pd.DataFrame(perf_rows)
display(perf_summary)


Unnamed: 0,model,beta_sent_t,p_sent_t,beta_sent_lag1,p_sent_lag1,R_squared,N_obs
0,ARKK – return,0.023345,0.535159,0.011793,0.740745,0.004058,774
1,ARKF – return,0.013667,0.780383,0.037844,0.398495,0.007053,350


- ETF-level news sentiment does not meaningfully predict daily returns for ARKK or ARKF; sentiment coefficients are insignificant and explain less than 1% of return variation.