In [None]:
import pandas as pd
df = pd.read_feather('../data/merged_factors.ft')
ocfa4q = pd.read_feather('../data/ocfa4q.ft')
ocfa4q = ocfa4q[['rk','code','return']]
df = df.merge(ocfa4q, on=['code','rk'], how = 'left')
df

Unnamed: 0,rk,code,cfo2cl,dta2ev,NNP_SD,ocfa4q,r_nta,roic4q,yoy_s,return
0,1945.0,000004,-0.539357,-0.100045,-0.001195,-0.545248,-0.373964,-1.711607,0.285189,0.114401
1,1945.0,000005,-1.196494,-0.350442,-0.242075,-1.903597,-1.304091,-1.842590,0.416450,0.000000
2,1945.0,000006,-1.569546,1.343961,2.317304,-1.987871,1.056031,0.439823,-2.245958,-0.077917
3,1945.0,000007,1.276299,-0.499103,-0.501018,2.723414,-1.013969,-1.625001,0.792931,-0.009745
4,1945.0,000009,-0.348548,-0.270275,0.043598,-0.184773,-0.233879,-0.161768,-0.313591,0.009129
...,...,...,...,...,...,...,...,...,...,...
5108265,4477.0,688793,-0.855737,-2.272339,-0.464821,-1.076195,-0.982083,-1.067425,-1.258614,-0.138720
5108266,4477.0,688798,0.685225,0.113780,0.200084,0.350019,-0.022114,0.117699,-0.778150,0.026701
5108267,4477.0,688799,0.678994,-0.233143,-1.587911,0.085114,0.300693,0.106962,-0.537688,-0.052172
5108268,4477.0,688800,-0.261622,0.404579,1.854591,-0.065149,0.876517,0.327391,1.797810,-0.047992


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers



2025-12-10 20:44:16.908281: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
print(tf.__version__)
print(tf.config.list_physical_devices("GPU"))


2.12.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
def correlation_loss(y_true, y_pred):
    """Loss = 1 – Pearson correlation"""
    x = y_pred
    y = y_true

    # Normalize
    x -= tf.reduce_mean(x)
    y -= tf.reduce_mean(y)

    # Compute correlation
    corr = tf.reduce_sum(x * y) / (tf.sqrt(tf.reduce_sum(tf.square(x))) *
                                   tf.sqrt(tf.reduce_sum(tf.square(y))) + 1e-8)

    # We want to MINIMIZE the loss, so return 1 - corr
    return 1 - corr

def correlation_metric(y_true, y_pred):
    x = y_pred - tf.reduce_mean(y_pred)
    y = y_true - tf.reduce_mean(y_true)
    corr = tf.reduce_sum(x * y) / (
        tf.sqrt(tf.reduce_sum(tf.square(x))) *
        tf.sqrt(tf.reduce_sum(tf.square(y))) + 1e-8
    )
    return corr

In [18]:

# Sample training

#Testing with sample set
df_sample = df.sample(frac=0.2, random_state=42)
feature_cols = ['cfo2cl', 'dta2ev', 'NNP_SD', 'ocfa4q', 'r_nta', 'roic4q', 'yoy_s']
X = df_sample[feature_cols].values.astype('float32')
y = df_sample['return'].values.astype('float32')

# Train / test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)   # regression output: predicted return
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=correlation_loss,
    metrics=[correlation_metric]  
)

model.summary()

# ===========================
# 5. Train the model
# ===========================

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=256,
    verbose=1
)

# ===========================
# 6. Evaluate on test set
# ===========================
test_loss, test_mae = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Test MSE: {test_loss:.6f}, Test MAE: {test_mae:.6f}")

# ===========================
# 7. Make predictions
# ===========================
y_pred = model.predict(X_test_scaled)

from scipy.stats import spearmanr
spearman_corr, pval = spearmanr(y_test, y_pred)
print(f"Test Spearman Corr: {spearman_corr:.4f}")

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 32)                256       
                                                                 
 dense_6 (Dense)             (None, 32)                1056      
                                                                 
 dense_7 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1,345
Trainable params: 1,345
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50


: 

: 

In [6]:
# Full Model with Time-Series Split

# Sample training


feature_cols = ['cfo2cl', 'dta2ev', 'NNP_SD', 'ocfa4q', 'r_nta', 'roic4q', 'yoy_s','rk','return']
df = df[feature_cols]
# Train / test split

# Sort by time variable
df_sorted = df.sort_values("rk").reset_index(drop=True)

# Define split index (80% train, 20% test)
split_idx = int(len(df_sorted) * 0.8)

# Train and Test
train = df_sorted.iloc[:split_idx]
test  = df_sorted.iloc[split_idx:]

X_train = train.drop(columns=["return"])   # or your Y column name
y_train = train["return"]

X_test = test.drop(columns=["return"])
y_test = test["return"]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


model = keras.Sequential([
    layers.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1)   # regression output: predicted return
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=correlation_loss,
    metrics=[correlation_metric]  
)

model.summary()

# ===========================
# 5. Train the model
# ===========================

history = model.fit(
    X_train_scaled, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=256,
    verbose=1
)

# ===========================
# 6. Evaluate on test set
# ===========================
test_loss, test_mae = model.evaluate(X_test_scaled, y_test, verbose=0)
print(f"Test MSE: {test_loss:.6f}, Test MAE: {test_mae:.6f}")

# ===========================
# 7. Make predictions
# ===========================
y_pred = model.predict(X_test_scaled)

from scipy.stats import spearmanr
spearman_corr, pval = spearmanr(y_test, y_pred)
print(f"Test Spearman Corr: {spearman_corr:.4f}")

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 16)                144       
                                                                 
 dense_6 (Dense)             (None, 16)                272       
                                                                 
 dense_7 (Dense)             (None, 16)                272       
                                                                 
 dense_8 (Dense)             (None, 16)                272       
                                                                 
 dense_9 (Dense)             (None, 1)                 17        
                                                                 
Total params: 977
Trainable params: 977
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10


2025-12-09 12:08:36.667259: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2025-12-09 12:08:36.748932: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:954] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp_10.




2025-12-09 12:10:57.129723: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test MSE: 0.988487, Test MAE: 0.011517
   65/31927 [..............................] - ETA: 1:16

2025-12-09 12:36:35.215067: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Test Spearman Corr: -0.1522


In [7]:
y_pred = -model.predict(X_test_scaled)

from scipy.stats import spearmanr
spearman_corr, pval = spearmanr(y_test, y_pred)
print(f"Test Spearman Corr: {spearman_corr:.4f}")

Test Spearman Corr: 0.1522


In [31]:


cols = ['cfo2cl','dta2ev','NNP_SD','ocfa4q','r_nta','roic4q','yoy_s']
df_sample['mean'] = df_sample[cols].mean(axis=1)
spearman_corr = df_sample.corr(method='spearman')['return'].sort_values(ascending=False)

print(spearman_corr)



return    1.000000
cfo2cl    0.015017
ocfa4q    0.014627
NNP_SD    0.010474
mean      0.010446
dta2ev    0.005456
r_nta     0.004232
code      0.003795
roic4q   -0.001999
yoy_s    -0.003611
rk       -0.009992
Name: return, dtype: float64


# XGBoost

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor
import numpy as np
from scipy.stats import spearmanr

# -----------------------
# 1. Select features
# -----------------------
feature_cols = ['cfo2cl', 'dta2ev', 'NNP_SD', 'ocfa4q', 'r_nta', 'roic4q', 'yoy_s','rk','return']
df_sort = df[feature_cols]

# -----------------------
# 2. Time-series split
# -----------------------
df_sorted = df_sort.sort_values("rk").reset_index(drop=True)

split_idx = int(len(df_sorted) * 0.8)

train = df_sorted.iloc[:split_idx]
test  = df_sorted.iloc[split_idx:]

X_train = train.drop(columns=["return", "rk"])
y_train = train["return"]

X_test  = test.drop(columns=["return", "rk"])
y_test  = test["return"]


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# -----------------------
# 4. XGBoost model
# -----------------------
xgb_model = XGBRegressor(
    n_estimators=2000,           # number of trees
    learning_rate=0.01,         # shrinkage
    max_depth=10,                # tree depth (controls complexity)
    subsample=0.8,              # row subsample
    colsample_bytree=0.8,       # feature subsample per tree
    objective="reg:squarederror",
    random_state=42
)

xgb_model.fit(X_train_scaled, y_train)

# -----------------------
# 5. Predictions
# -----------------------
y_pred_train = xgb_model.predict(X_train_scaled)
y_pred_test  = xgb_model.predict(X_test_scaled)

# -----------------------
# 6. Evaluation
# -----------------------
print("Train R²:", r2_score(y_train, y_pred_train))
print("Test  R²:", r2_score(y_test, y_pred_test))
print("Test MSE:", mean_squared_error(y_test, y_pred_test))

# Correlation with return (Pearson & Spearman)
pearson_corr = np.corrcoef(y_test, y_pred_test)[0, 1]
spearman_corr, _ = spearmanr(y_test, y_pred_test)

print("Test Pearson corr(y, ŷ): ", pearson_corr)
print("Test Spearman corr(y, ŷ):", spearman_corr)


Train R²: 0.631778661756655
Test  R²: -0.06530951033804189
Test MSE: 0.02806300293695913
Test Pearson corr(y, ŷ):  0.021340899220661166
Test Spearman corr(y, ŷ): 0.026694496731058042


# Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from scipy.stats import spearmanr

# ==========================
# 1. Select Features & Target
# ==========================
feature_cols = ['cfo2cl', 'dta2ev', 'NNP_SD', 'ocfa4q', 'r_nta', 'roic4q', 'yoy_s','rk','return']
df_sort = df[feature_cols]

# -----------------------
# 2. Time-series split
# -----------------------
df_sorted = df_sort.sort_values("rk").reset_index(drop=True)

split_idx = int(len(df_sorted) * 0.8)

train = df_sorted.iloc[:split_idx]
test  = df_sorted.iloc[split_idx:]

X_train = train.drop(columns=["return", "rk"])
y_train = train["return"]

X_test  = test.drop(columns=["return", "rk"])
y_test  = test["return"]

# ==================================
# 3. Fit Random Forest Regressor
# ==================================
rf = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='auto',
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)

rf.fit(X_train, y_train)

# ================================
# 4. Predictions
# ================================
y_pred = rf.predict(X_test)

# ================================
# 5. Evaluation Metrics
# ================================

# R-squared
r2 = r2_score(y_test, y_pred)

# Pearson correlation
corr = np.corrcoef(y_test, y_pred)[0, 1]

# Spearman Rank Correlation (Rank-IC)
rank_ic, _ = spearmanr(y_test, y_pred)

print("Random Forest Performance:")
print(f"R²:          {r2:.4f}")
print(f"Correlation: {corr:.4f}")
print(f"Rank-IC:     {rank_ic:.4f}")

# ================================
# 6. Factor Importance
# ================================
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
print("\nFeature Importances:")
print(importances.sort_values(ascending=False))
