In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# --- CAR(5) persistence baseline: score Y(t-1) as the forecast of Y(t) ---

# Read the CAR(5) training set, order it by ticker and then quarter label, and
# attach the one-row lag of `car5` computed separately inside each ticker.
# NOTE(review): shift(1) returns the previous *row* of the same ticker; that is
# the previous quarter only if no firm has missing or duplicated quarters, and
# only if `datacqtr` sorts chronologically as stored — confirm against the data.
df_car = (
    pd.read_csv("data/train_data_CAR5_with_text_with_oof_car5.csv")
    .sort_values(by=["tic", "datacqtr"])
    .assign(car5_lag1=lambda frame: frame.groupby("tic")["car5"].shift(1))
)

# Keep only rows where both the target and its lag are present. The first row
# of every ticker has no lag and is therefore excluded here.
car_rows_ok = df_car[["car5", "car5_lag1"]].notna().all(axis=1)
df_car_valid = df_car.loc[car_rows_ok]

# Actual Y(t) versus the naive "same as last observation" prediction Y(t-1)
y_true, y_pred = df_car_valid["car5"], df_car_valid["car5_lag1"]

# Score the naive forecast: R² first, then RMSE as the square root of the MSE
r2_car = r2_score(y_true, y_pred)
rmse_car = np.sqrt(mean_squared_error(y_true, y_pred))

print("CAR(5) Persistence Baseline RMSE:", rmse_car)
print("CAR(5) Persistence Baseline R²:", r2_car)

CAR(5) Persistence Baseline RMSE: 0.079850766566373
CAR(5) Persistence Baseline R²: -0.9540312737295016


In [6]:
# --- Revenue persistence baseline: score Y(t-1) as the forecast of Y(t) ---

# Name of the column being forecast
target = "Total Current Operating Revenue"

# Read the revenue training set and order it by ticker, then quarter label
df_rev = pd.read_csv(
    "data/train_data_REV_with_text_with_oof_Total Current Operating Revenue.csv"
).sort_values(by=["tic", "datacqtr"])

# Previous row's target value within the same ticker, used as the naive forecast.
# NOTE(review): this is the previous *row*, which equals the previous quarter
# only when a firm's quarters are contiguous — confirm against the data.
df_rev = df_rev.assign(rev_lag1=df_rev.groupby("tic")[target].shift(1))

# Discard rows lacking either the target or its lag (e.g. each ticker's first row)
rev_rows_ok = df_rev[[target, "rev_lag1"]].notna().all(axis=1)
df_rev_valid = df_rev.loc[rev_rows_ok]

# RMSE is the square root of the MSE; R² compares against a constant-mean predictor
rev_mse = mean_squared_error(df_rev_valid[target], df_rev_valid["rev_lag1"])
rmse_rev = np.sqrt(rev_mse)
r2_rev = r2_score(df_rev_valid[target], df_rev_valid["rev_lag1"])

print("Revenue Persistence Baseline RMSE:", rmse_rev)
print("Revenue Persistence Baseline R²:", r2_rev)


Revenue Persistence Baseline RMSE: 0.013266927594668111
Revenue Persistence Baseline R²: 0.9936958627261635
