In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

## Revenue

In [3]:
df_train_rev = pd.read_csv("data/train_data_REV_with_text.csv")
df_test_rev = pd.read_csv("data/test_data_REV_with_text.csv")
df_train_rev = df_train_rev.sort_values(by=['datacqtr', 'tic']).reset_index(drop=True)
df_test_rev = df_test_rev.sort_values(by=['datacqtr', 'tic']).reset_index(drop=True)

In [44]:
# Create the lagged column
df_train_rev['Total Current Operating Revenue_lag1'] = df_train_rev.groupby('tic')['Total Current Operating Revenue'].shift(1)
df_train_rev['Total Current Operating Revenue_lag2'] = df_train_rev.groupby('tic')['Total Current Operating Revenue'].shift(2)
df_train_rev['Total Current Operating Revenue_lag3'] = df_train_rev.groupby('tic')['Total Current Operating Revenue'].shift(3)
df_train_rev['Total Current Operating Revenue_lag4'] = df_train_rev.groupby('tic')['Total Current Operating Revenue'].shift(4)

df_train_rev['Net Charge-Offs_lag1'] = df_train_rev.groupby('tic')['Net Charge-Offs'].shift(1)
df_train_rev['Net Charge-Offs_lag2'] = df_train_rev.groupby('tic')['Net Charge-Offs'].shift(2)
df_train_rev['Net Charge-Offs_lag3'] = df_train_rev.groupby('tic')['Net Charge-Offs'].shift(3)
df_train_rev['Net Charge-Offs_lag4'] = df_train_rev.groupby('tic')['Net Charge-Offs'].shift(4)

df_train_rev['Invested Capital - Total_lag1'] = df_train_rev.groupby('tic')['Invested Capital - Total'].shift(1)
df_train_rev['Invested Capital - Total_lag2'] = df_train_rev.groupby('tic')['Invested Capital - Total'].shift(2)
df_train_rev['Invested Capital - Total_lag3'] = df_train_rev.groupby('tic')['Invested Capital - Total'].shift(3)
df_train_rev['Invested Capital - Total_lag4'] = df_train_rev.groupby('tic')['Invested Capital - Total'].shift(4)



df_test_rev['Total Current Operating Revenue_lag1'] = df_test_rev.groupby('tic')['Total Current Operating Revenue'].shift(1)
df_test_rev['Total Current Operating Revenue_lag2'] = df_test_rev.groupby('tic')['Total Current Operating Revenue'].shift(2)
df_test_rev['Total Current Operating Revenue_lag3'] = df_test_rev.groupby('tic')['Total Current Operating Revenue'].shift(3)
df_test_rev['Total Current Operating Revenue_lag4'] = df_test_rev.groupby('tic')['Total Current Operating Revenue'].shift(4)

df_test_rev['Net Charge-Offs_lag1'] = df_test_rev.groupby('tic')['Net Charge-Offs'].shift(1)
df_test_rev['Net Charge-Offs_lag2'] = df_test_rev.groupby('tic')['Net Charge-Offs'].shift(2)
df_test_rev['Net Charge-Offs_lag3'] = df_test_rev.groupby('tic')['Net Charge-Offs'].shift(3)
df_test_rev['Net Charge-Offs_lag4'] = df_test_rev.groupby('tic')['Net Charge-Offs'].shift(4)

df_test_rev['Invested Capital - Total_lag1'] = df_test_rev.groupby('tic')['Invested Capital - Total'].shift(1)
df_test_rev['Invested Capital - Total_lag2'] = df_test_rev.groupby('tic')['Invested Capital - Total'].shift(2)
df_test_rev['Invested Capital - Total_lag3'] = df_test_rev.groupby('tic')['Invested Capital - Total'].shift(3)
df_test_rev['Invested Capital - Total_lag4'] = df_test_rev.groupby('tic')['Invested Capital - Total'].shift(4)

In [45]:
# Drop NA

df_train_rev = df_train_rev.dropna()
df_test_rev = df_test_rev.dropna()

In [49]:
def getting_training_dataset(df: pd.DataFrame, subset: list, y_value="Total Current Operating Revenue"):
    X_train = df[subset].copy().to_numpy()
    y_train = df[y_value].copy().to_numpy()

    X_test = df[subset].copy().to_numpy()
    y_test = df[y_value].copy().to_numpy()
    return X_train, y_train, X_test, y_test


def train_decision_tree(X_train, y_train, X_test, y_test, random_state=42):
    # Define the model
    tree = DecisionTreeRegressor(random_state=random_state)

    # Define the hyperparameter grid
    param_grid = {
        'max_depth': [3, 5, 10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Setup GridSearchCV
    grid_search = GridSearchCV(
        estimator=tree,
        param_grid=param_grid,
        cv=5,                # 5-fold cross-validation
        scoring='neg_mean_squared_error',
        n_jobs=-1            # Use all cores
    )

    # Fit the grid search
    grid_search.fit(X_train, y_train)

    # Best model from grid search
    best_tree = grid_search.best_estimator_

    # Predict on the test set
    y_pred = best_tree.predict(X_test)

    # Evaluate the model
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Test RMSE: {rmse:.4f}")
    print(f"Test MAE: {mae:.4f}")
    print(f"Test R²: {r2:.4f}")
    # print(f"Test R²: {r2}")
    # print(y_test)
    # print(y_pred)

In [50]:
# Features subset

# 1. Just lagged revenue

experiment_1_rev = [
    "Total Current Operating Revenue_lag1",
    "Total Current Operating Revenue_lag2",
    "Total Current Operating Revenue_lag3",
    "Total Current Operating Revenue_lag4",
]

# 2. Just fundamentals and lagged fundametals

experiment_2_rev = [
    "Net Interest Income",
    "Net Interest Margin",
    "Net Charge-Offs",
    "Cash and Short-Term Investments",
    "Net Income",
    "Invested Capital - Total",
    "Total Current Operating Revenue_lag1",
    "Total Current Operating Revenue_lag2",
    "Total Current Operating Revenue_lag3",
    "Total Current Operating Revenue_lag4",
    "Net Charge-Offs_lag1",
    "Net Charge-Offs_lag2",
    "Net Charge-Offs_lag3",
    "Net Charge-Offs_lag4",
    "Invested Capital - Total_lag1",
    "Invested Capital - Total_lag2",
    "Invested Capital - Total_lag3",
    "Invested Capital - Total_lag4",
]

# 3. Just lagged revenue and economics stuff

experiment_3_rev = [
    "Total Current Operating Revenue_lag1",
    "Total Current Operating Revenue_lag2",
    "Total Current Operating Revenue_lag3",
    "Total Current Operating Revenue_lag4",
    'GDP CHANGE (-1 to 1)', 'UNEMPLOYMENT RATE (0 to 1)',
    'PRIME LOAN RATE (0 to 1)', 'DEPOSITS CHANGE (-1 to 1)',
    'CONSUMER PRICE INDEX (0 to 1)', 'SAVINGS PER GROSS INCOME (-1 to 1)'
]

# 4. Just lagged revenue and market stuff

experiment_4_rev = [
    "Total Current Operating Revenue_lag1",
    "Total Current Operating Revenue_lag2",
    "Total Current Operating Revenue_lag3",
    "Total Current Operating Revenue_lag4",
    'S&P_SMA20', 'S&P_SMA40', 'S&P_SMA60',
    'S&P_RSI', 'TNX_SMA20', 'TNX_SMA40', 'TNX_SMA60', 'IRX_SMA20',
    'IRX_SMA40', 'IRX_SMA60', 'FVX_SMA20', 'FVX_SMA40', 'FVX_SMA60',
    'TYX_SMA20', 'TYX_SMA40', 'TYX_SMA60', 'DXY_SMA20', 'DXY_SMA40',
    'DXY_SMA60', 'XLF_SMA20', 'XLF_SMA40', 'XLF_SMA60', 'XLF_RSI', 'SMA20',
    'SMA40', 'SMA60', 'RSI', 'Volatility20', 'Volatility40', 'Volatility60'
]

# 5. Just lagged revenue and text sentiments

experiment_5_rev = [
    "Total Current Operating Revenue_lag1",
    "Total Current Operating Revenue_lag2",
    "Total Current Operating Revenue_lag3",
    "Total Current Operating Revenue_lag4",
    'earning_calls_sentiment', 'earning_calls_confidence',
    'earning_calls_complexity', 'news_sentiment', 'news_confidence',
    'news_complexity_score', 'reviews_rating',
    'text_blob_reviews_sentiment', 'vader_reviews_sentiment_neg',
    'vader_reviews_sentiment_pos', 'bert_reviews_label',
    'bert_reviews_score',
]

In [51]:
print("Just lagged revenue")
X_rev_train, y_rev_train, X_rev_test, y_rev_test = getting_training_dataset(df_train_rev, experiment_1_rev)
train_decision_tree(X_rev_train, y_rev_train, X_rev_test, y_rev_test)

Just lagged revenue
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Test RMSE: 0.0087
Test MAE: 0.0044
Test R²: 0.9973


In [52]:
print("Just fundamentals and lagged fundametals")
X_rev_train, y_rev_train, X_rev_test, y_rev_test = getting_training_dataset(df_train_rev, experiment_2_rev)
train_decision_tree(X_rev_train, y_rev_train, X_rev_test, y_rev_test)

Just fundamentals and lagged fundametals
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Test RMSE: 0.0083
Test MAE: 0.0042
Test R²: 0.9975


In [53]:
print("Just lagged revenue and economics stuff")
X_rev_train, y_rev_train, X_rev_test, y_rev_test = getting_training_dataset(df_train_rev, experiment_3_rev)
train_decision_tree(X_rev_train, y_rev_train, X_rev_test, y_rev_test)

Just lagged revenue and economics stuff
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Test RMSE: 0.0087
Test MAE: 0.0043
Test R²: 0.9973


In [54]:
print("Just lagged revenue and market stuff")
X_rev_train, y_rev_train, X_rev_test, y_rev_test = getting_training_dataset(df_train_rev, experiment_4_rev)
train_decision_tree(X_rev_train, y_rev_train, X_rev_test, y_rev_test)

Just lagged revenue and market stuff
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Test RMSE: 0.0085
Test MAE: 0.0043
Test R²: 0.9975


In [55]:
print("Just lagged revenue and text sentiments")
X_rev_train, y_rev_train, X_rev_test, y_rev_test = getting_training_dataset(df_train_rev, experiment_5_rev)
train_decision_tree(X_rev_train, y_rev_train, X_rev_test, y_rev_test)

Just lagged revenue and text sentiments
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Test RMSE: 0.0087
Test MAE: 0.0045
Test R²: 0.9973


## CAR

In [56]:
df_train_car = pd.read_csv("data/train_data_CAR5_with_text.csv")
df_test_car = pd.read_csv("data/test_data_CAR5_with_text.csv")
df_train_car = df_train_car.sort_values(by=['tic', 'datacqtr']).reset_index(drop=True)
df_test_car = df_test_car.sort_values(by=['tic', 'datacqtr']).reset_index(drop=True)

In [57]:
# Create the lagged column
df_train_car['Total Current Operating Revenue_lag1'] = df_train_car.groupby('tic')['Total Current Operating Revenue'].shift(1)
df_train_car['Total Current Operating Revenue_lag2'] = df_train_car.groupby('tic')['Total Current Operating Revenue'].shift(2)
df_train_car['Total Current Operating Revenue_lag3'] = df_train_car.groupby('tic')['Total Current Operating Revenue'].shift(3)
df_train_car['Total Current Operating Revenue_lag4'] = df_train_car.groupby('tic')['Total Current Operating Revenue'].shift(4)

df_train_car['Net Charge-Offs_lag1'] = df_train_car.groupby('tic')['Net Charge-Offs'].shift(1)
df_train_car['Net Charge-Offs_lag2'] = df_train_car.groupby('tic')['Net Charge-Offs'].shift(2)
df_train_car['Net Charge-Offs_lag3'] = df_train_car.groupby('tic')['Net Charge-Offs'].shift(3)
df_train_car['Net Charge-Offs_lag4'] = df_train_car.groupby('tic')['Net Charge-Offs'].shift(4)

df_train_car['Invested Capital - Total_lag1'] = df_train_car.groupby('tic')['Invested Capital - Total'].shift(1)
df_train_car['Invested Capital - Total_lag2'] = df_train_car.groupby('tic')['Invested Capital - Total'].shift(2)
df_train_car['Invested Capital - Total_lag3'] = df_train_car.groupby('tic')['Invested Capital - Total'].shift(3)
df_train_car['Invested Capital - Total_lag4'] = df_train_car.groupby('tic')['Invested Capital - Total'].shift(4)

df_train_car['car5_lag1'] = df_train_car.groupby('tic')['car5'].shift(1)
df_train_car['car5_lag2'] = df_train_car.groupby('tic')['car5'].shift(2)
df_train_car['car5_lag3'] = df_train_car.groupby('tic')['car5'].shift(3)
df_train_car['car5_lag4'] = df_train_car.groupby('tic')['car5'].shift(4)



df_test_car['Total Current Operating Revenue_lag1'] = df_test_car.groupby('tic')['Total Current Operating Revenue'].shift(1)
df_test_car['Total Current Operating Revenue_lag2'] = df_test_car.groupby('tic')['Total Current Operating Revenue'].shift(2)
df_test_car['Total Current Operating Revenue_lag3'] = df_test_car.groupby('tic')['Total Current Operating Revenue'].shift(3)
df_test_car['Total Current Operating Revenue_lag4'] = df_test_car.groupby('tic')['Total Current Operating Revenue'].shift(4)

df_test_car['Net Charge-Offs_lag1'] = df_test_car.groupby('tic')['Net Charge-Offs'].shift(1)
df_test_car['Net Charge-Offs_lag2'] = df_test_car.groupby('tic')['Net Charge-Offs'].shift(2)
df_test_car['Net Charge-Offs_lag3'] = df_test_car.groupby('tic')['Net Charge-Offs'].shift(3)
df_test_car['Net Charge-Offs_lag4'] = df_test_car.groupby('tic')['Net Charge-Offs'].shift(4)

df_test_car['Invested Capital - Total_lag1'] = df_test_car.groupby('tic')['Invested Capital - Total'].shift(1)
df_test_car['Invested Capital - Total_lag2'] = df_test_car.groupby('tic')['Invested Capital - Total'].shift(2)
df_test_car['Invested Capital - Total_lag3'] = df_test_car.groupby('tic')['Invested Capital - Total'].shift(3)
df_test_car['Invested Capital - Total_lag4'] = df_test_car.groupby('tic')['Invested Capital - Total'].shift(4)

df_test_car['car5_lag1'] = df_test_car.groupby('tic')['car5'].shift(1)
df_test_car['car5_lag2'] = df_test_car.groupby('tic')['car5'].shift(2)
df_test_car['car5_lag3'] = df_test_car.groupby('tic')['car5'].shift(3)
df_test_car['car5_lag4'] = df_test_car.groupby('tic')['car5'].shift(4)

In [58]:
df_train_car = df_train_car.dropna()
df_test_car = df_test_car.dropna()

In [None]:
# Features subset

# 1. Just lagged car

experiment_1_car = [
    'car5_lag1', 'car5_lag2', 'car5_lag3', 'car5_lag4'
]

# 2. Just fundamentals and lagged fundametals

experiment_2_car = [
    'car5_lag1', 'car5_lag2', 'car5_lag3', 'car5_lag4',
    'Net Interest Income',
    'Net Interest Margin', 'Net Charge-Offs',
    'Cash and Short-Term Investments', 'Net Income',
    'Invested Capital - Total', 'Total Current Operating Revenue',
    'Total Current Operating Revenue_lag1',
    'Total Current Operating Revenue_lag2',
    'Total Current Operating Revenue_lag3',
    'Total Current Operating Revenue_lag4', 'Net Charge-Offs_lag1',
    'Net Charge-Offs_lag2', 'Net Charge-Offs_lag3', 'Net Charge-Offs_lag4',
    'Invested Capital - Total_lag1', 'Invested Capital - Total_lag2',
    'Invested Capital - Total_lag3', 'Invested Capital - Total_lag4',
    
]

# 3. Just lagged car and economics stuff

experiment_3_car = [
    'car5_lag1', 'car5_lag2', 'car5_lag3', 'car5_lag4',
    'GDP CHANGE (-1 to 1)', 'UNEMPLOYMENT RATE (0 to 1)',
    'PRIME LOAN RATE (0 to 1)', 'DEPOSITS CHANGE (-1 to 1)',
    'CONSUMER PRICE INDEX (0 to 1)', 'SAVINGS PER GROSS INCOME (-1 to 1)'
]

# 4. Just lagged car and market stuff

experiment_4_car = [
    'car5_lag1', 'car5_lag2', 'car5_lag3', 'car5_lag4',
    'S&P_SMA5', 'S&P_SMA20', 'S&P_SMA50', 'S&P_RSI', 'VIX', 'SMA5', 'SMA20',
    'SMA50', 'RSI', 'Volatility5', 'Volatility20', 'Volatility50',
]

# 5. Just lagged revenue and text sentiments

experiment_5_car = [
    'car5_lag1', 'car5_lag2', 'car5_lag3', 'car5_lag4',
    'earning_calls_sentiment', 'earning_calls_confidence',
    'earning_calls_complexity', 'news_sentiment', 'news_confidence',
    'news_complexity_score', 'reviews_rating',
    'text_blob_reviews_sentiment', 'vader_reviews_sentiment_neg',
    'vader_reviews_sentiment_pos', 'bert_reviews_label',
    'bert_reviews_score',
]

In [60]:
print("Just Lagged CAR")
X_car_train, y_car_train, X_car_test, y_car_test = getting_training_dataset(df_train_car, experiment_1_car, "car5")
train_decision_tree(X_car_train, y_car_train, X_car_test, y_car_test)

Just Lagged CAR
Best parameters: {'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 10}
Test RMSE: 0.0568
Test MAE: 0.0411
Test R²: 0.0131


In [None]:
print("Just fundamentals and lagged fundametals + Lagged CAR")
X_car_train, y_car_train, X_car_test, y_car_test = getting_training_dataset(df_train_car, experiment_2_car, "car5")
train_decision_tree(X_car_train, y_car_train, X_car_test, y_car_test)

Just fundamentals and lagged fundametals + Lagged CAR
Best parameters: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 10}
Test RMSE: 0.0564
Test MAE: 0.0409
Test R²: 0.0269


In [62]:
print("Just lagged car and economics stuff")
X_car_train, y_car_train, X_car_test, y_car_test = getting_training_dataset(df_train_car, experiment_3_car, "car5")
train_decision_tree(X_car_train, y_car_train, X_car_test, y_car_test)

Just lagged car and economics stuff
Best parameters: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 10}
Test RMSE: 0.0563
Test MAE: 0.0408
Test R²: 0.0305


In [63]:
print("Just lagged car and market stuff")
X_car_train, y_car_train, X_car_test, y_car_test = getting_training_dataset(df_train_car, experiment_4_car, "car5")
train_decision_tree(X_car_train, y_car_train, X_car_test, y_car_test)

Just lagged car and market stuff
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10}
Test RMSE: 0.0530
Test MAE: 0.0387
Test R²: 0.1414


In [64]:
print("Just lagged car and sentiment stuff")
X_car_train, y_car_train, X_car_test, y_car_test = getting_training_dataset(df_train_car, experiment_5_car, "car5")
train_decision_tree(X_car_train, y_car_train, X_car_test, y_car_test)

Just lagged car and sentiment stuff
Best parameters: {'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 10}
Test RMSE: 0.0568
Test MAE: 0.0411
Test R²: 0.0131


In [66]:
print("All columns")
all_car_cols = list(set(experiment_1_car + experiment_2_car + experiment_3_car + experiment_4_car + experiment_5_car))
X_car_train, y_car_train, X_car_test, y_car_test = getting_training_dataset(df_train_car, all_car_cols, "car5")
train_decision_tree(X_car_train, y_car_train, X_car_test, y_car_test)

All columns
Best parameters: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Test RMSE: 0.0555
Test MAE: 0.0403
Test R²: 0.0607
