In [10]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings

from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [11]:
stocks = pd.read_csv("./data/stocks_ti.csv")

stocks

Unnamed: 0,DATE,SPY_3M IMPLIED VOL,SPY_HIGH,SPY_LOW,SPY_OPEN,SPY_SHORT INTEREST RATIO,SPY_ewm_log_ret_13d,SPY_ewm_log_ret_1d,SPY_ewm_log_ret_21d,SPY_ewm_log_ret_3d,...,XRT_macd_sig,XRT_rsi,XRT_rsi_13d_slope,XRT_rsi_1d_slope,XRT_rsi_21d_slope,XRT_rsi_3d_slope,XRT_rsi_5d_slope,XRT_rsi_8d_slope,tweet_sentiment,news_sentiment
0,2015-01-02,14.0384,0.006498,-0.006639,0.004078,1.507,0.005375,-0.000535,0.006782,-0.010765,...,0.598815,67.988107,19.826342,-7.838144,5.204016,-1.944891,7.252436,10.295799,-0.131934,0.075776
1,2015-01-05,14.1676,-0.005173,-0.020061,-0.006152,1.507,0.006181,-0.018225,0.003427,-0.019749,...,0.593092,59.339623,16.598506,-8.648484,-0.103102,-8.596080,-3.993711,0.899108,-0.102601,0.116118
2,2015-01-06,14.3399,0.004945,-0.014305,0.001833,1.507,0.006670,-0.009464,-0.000383,-0.023986,...,0.567309,53.461876,14.660032,-5.877747,-4.602641,-22.364376,-16.471123,-4.402835,0.025503,0.050569
3,2015-01-07,14.4772,0.014409,0.005291,0.007975,1.507,0.006085,0.012384,-0.002870,-0.019645,...,0.548476,67.244094,17.009353,13.782219,11.666405,-0.744013,-0.691608,6.508423,-0.051485,0.125229
4,2015-01-08,14.5777,0.018851,0.008270,0.008368,1.507,0.004606,0.017589,-0.002922,0.000432,...,0.544027,64.685908,12.113201,-2.558186,7.219379,5.346286,-11.140343,1.352575,0.100507,0.138343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245,2019-12-24,12.9785,0.000934,-0.000997,0.000778,2.463,0.021663,0.000031,0.028050,0.003051,...,0.359009,67.779961,23.646861,-0.862487,24.291589,8.342875,14.908339,-0.857149,-0.048913,0.238482
1246,2019-12-26,12.9736,0.005340,0.001276,0.001307,2.463,0.022183,0.005309,0.028235,0.004959,...,0.380095,71.863118,22.659296,4.083157,26.104189,8.547421,17.008749,15.327685,0.195297,0.277311
1247,2019-12-27,12.9749,0.002659,-0.002046,0.002474,2.463,0.023044,-0.000248,0.028175,0.005026,...,0.403777,66.666667,15.808632,-5.196451,15.789474,-1.975781,7.229581,16.047198,0.027661,0.249240
1248,2019-12-30,12.9999,0.000743,-0.007181,0.000279,2.463,0.023151,-0.005528,0.027213,0.002279,...,0.421979,60.917031,2.283937,-5.749636,7.498227,-6.862930,-2.398666,8.045409,-0.075382,0.255385


In [12]:
def prep_regression_data(df, days=13, random_state=257):
    """Prepare data for regressor"""
    # Create target variable
    df["y_true"] = df["SPY_ewm_log_ret_1d"].rolling(window=days).sum().shift(-days)
    df = df.dropna()
    # Define features
    y_true = df["y_true"].values
    if "DATE" in df.columns:
        X = df.drop(columns=["DATE", "y_true"]).values
    else:
        X = df.drop(columns=["y_true"]).values
    assert len(X) == len(y_true), "X and y_true must have the same length"

    # train test split
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y_true, test_size=0.2, random_state=random_state, shuffle=True
    # )

    test_size = 0.2
    # Calculate split index
    split_idx = int(len(X) * (1 - test_size))

    # Split into training and testing sets based on the calculated index
    X_train = X[:split_idx]
    X_test = X[split_idx:]
    y_train = y_true[:split_idx]
    y_test = y_true[split_idx:]

    # Standardization
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test

In [13]:
# from utils import prep_regression_data, feature_type_map
from utils import feature_type_map


fm = feature_type_map(stocks)
columns1 = list(set(fm["spy_returns"]))
columns2 = list(set(fm["returns"]))
columns3 = list(set(fm["returns"] + fm["sentiment"]))
columns4 = list(set(fm["returns"] + fm["technical"]))
columns5 = stocks.columns

# dictionary to store tuple of (pred, confusion_matrix, classifier) for each day, for each feature type, for each model
models_key = {
    "linear_regression": {
        "spy_returns": {1: None, 5: None, 13: None, 21: None},
        "sector_returns": {1: None, 5: None, 13: None, 21: None},
        "sector_returns_sentiment": {1: None, 5: None, 13: None, 21: None},
        "sector_returns_technical": {1: None, 5: None, 13: None, 21: None},
        "all_features": {1: None, 5: None, 13: None, 21: None},
    },
    "random_forest": {
        "spy_returns": {1: None, 5: None, 13: None, 21: None},
        "sector_returns": {1: None, 5: None, 13: None, 21: None},
        "sector_returns_sentiment": {1: None, 5: None, 13: None, 21: None},
        "sector_returns_technical": {1: None, 5: None, 13: None, 21: None},
        "all_features": {1: None, 5: None, 13: None, 21: None},
    },
    "xgboost": {
        "spy_returns": {1: None, 5: None, 13: None, 21: None},
        "sector_returns": {1: None, 5: None, 13: None, 21: None},
        "sector_returns_sentiment": {1: None, 5: None, 13: None, 21: None},
        "sector_returns_technical": {1: None, 5: None, 13: None, 21: None},
        "all_features": {1: None, 5: None, 13: None, 21: None},
    },
}

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, r2_score

In [15]:
def linear_regressor(X_train, X_test, y_train, y_test):
    clf = LinearRegression()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    test_mse = mean_squared_error(y_test, y_pred)
    test_r2 = r2_score(y_test, y_pred)

    print(f"MSE:{test_mse}, R2:{test_r2}")

    return clf, y_pred


def random_forest_regressor(X_train, X_test, y_train, y_test):
    """Random Forest Regressor with Random Search CV"""
    from sklearn.ensemble import RandomForestRegressor

    # Define the model
    clf = RandomForestRegressor(random_state=257)

    # Set up the parameter grid to sample from during fitting
    param_distributions = {
        "n_estimators": np.arange(100, 1100, 100),
        "max_depth": [None] + list(np.arange(10, 110, 10)),
        "min_samples_split": np.arange(2, 21),
        "min_samples_leaf": np.arange(1, 21),
    }

    # Create the random search with CV object
    random_search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_distributions,
        n_iter=100,  # Number of parameter settings sampled
        scoring="neg_mean_squared_error",  # Minimize MSE
        cv=3,  # 3-fold cross-validation
        verbose=1,  # Higher the number, more the verbosity
        random_state=257,
        n_jobs=-1,  # Use all available cores
    )

    # Fit the random search model
    random_search.fit(X_train, y_train)

    # Best model found by random search
    best_clf = random_search.best_estimator_

    # Predict on test data using the best model
    y_pred = best_clf.predict(X_test)

    # Evaluate the best model
    test_mse = mean_squared_error(y_test, y_pred)
    test_r2 = r2_score(y_test, y_pred)
    print(f"MSE:{test_mse}, R2:{test_r2}")

    return best_clf, y_pred


def xgboost_regressor(X_train, X_test, y_train, y_test):
    """XGBoost Regressor with Random Search CV"""
    from xgboost import XGBRegressor

    # Define the model
    clf = XGBRegressor(random_state=257)

    # Set up the parameter grid to sample from during fitting
    param_distributions = {
        "n_estimators": np.arange(100, 1100, 100),
        "max_depth": np.arange(3, 15),
        "learning_rate": np.linspace(0.01, 0.3, num=30),
        "subsample": np.linspace(0.5, 1.0, num=6),
        "colsample_bytree": np.linspace(0.5, 1.0, num=6),
    }

    # Create the random search with CV object
    random_search = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_distributions,
        n_iter=100,  # Number of parameter settings sampled
        scoring="neg_mean_squared_error",  # Minimize MSE
        cv=3,  # 3-fold cross-validation
        verbose=1,  # Higher the number, more the verbosity
        random_state=257,
        n_jobs=-1,  # Use all available cores
    )

    # Fit the random search model
    random_search.fit(X_train, y_train)

    # Best model found by random search
    best_clf = random_search.best_estimator_

    # Predict on test data using the best model
    y_pred = best_clf.predict(X_test)

    # Evaluate the best model
    test_mse = mean_squared_error(y_test, y_pred)
    test_r2 = r2_score(y_test, y_pred)
    print(f"MSE:{test_mse}, R2:{test_r2}")

    return best_clf, y_pred

In [17]:
# for i, day_no in enumerate([1, 5, 13, 21]):
for i, day_no in enumerate([21]):
    for j, features in enumerate([columns1, columns2, columns3, columns4, columns5]):
        features_type = [
            "spy_returns",
            "sector_returns",
            "sector_returns_sentiment",
            "sector_returns_technical",
            "all_features",
        ][j]

        # prep data
        X_train, X_test, y_train, y_test = prep_regression_data(
            stocks[features].copy(), day_no
        )

        ## models
        # from utils import (
        #     Linear_regression,
        #     random_forest_classifier,
        #     xgboost_classifier,
        # )

        # logistic regression
        print(f"Linear Regression on {day_no} day(s) with {features_type}...")
        models_key["linear_regression"][features_type][day_no] = linear_regressor(
            X_train, X_test, y_train, y_test
        )

        # random forest
        print(f"Random Forest on {day_no} day(s) with {features_type}...")
        models_key["random_forest"][features_type][day_no] = random_forest_regressor(
            X_train, X_test, y_train, y_test
        )

        # xgboost
        print(f"XGBoost on {day_no} day(s) with {features_type}...")
        models_key["xgboost"][features_type][day_no] = xgboost_regressor(
            X_train, X_test, y_train, y_test
        )

Linear Regression on 21 day(s) with spy_returns...
MSE:0.0012419145650559996, R2:-0.2007732881263622
Random Forest on 21 day(s) with spy_returns...
Fitting 3 folds for each of 100 candidates, totalling 300 fits
MSE:0.0012868329205509123, R2:-0.24420361976308258
XGBoost on 21 day(s) with spy_returns...
Fitting 3 folds for each of 100 candidates, totalling 300 fits
MSE:0.0012306832237762432, R2:-0.18991401086363524
Linear Regression on 21 day(s) with sector_returns...
MSE:0.0019929863593322827, R2:-0.9269641014142578
Random Forest on 21 day(s) with sector_returns...
Fitting 3 folds for each of 100 candidates, totalling 300 fits
MSE:0.0021081060388732815, R2:-1.038270176743413
XGBoost on 21 day(s) with sector_returns...
Fitting 3 folds for each of 100 candidates, totalling 300 fits
MSE:0.002377022561862841, R2:-1.298278221279972
Linear Regression on 21 day(s) with sector_returns_sentiment...
MSE:0.002100387248599921, R2:-1.0308070891545476
Random Forest on 21 day(s) with sector_returns_se

KeyboardInterrupt: 

In [None]:
plt.plot(y_test)