In [15]:
import numpy as np
#pd.options.mode.chained_assignment = None  # default='warn'
import pandas as pd
pd.options.plotting.backend = "matplotlib"
import matplotlib.pyplot as plt

import cufflinks as cf
cf.go_offline()

from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [16]:
def drop_columns_by_row(df, row_name, valid_strings):
    """
    Drops columns from a DataFrame if the specified row (by name) does not contain any of the given valid strings.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to process.
    row_name (str): The name of the row to check.
    valid_strings (list): List of strings to check for.
    
    Returns:
    pd.DataFrame: The filtered DataFrame with columns removed.
    """
    mask = df.loc[df.index[df.index.get_loc(row_name)]].astype(str).apply(lambda x: any(s in x for s in valid_strings))
    return df.loc[:, mask]


In [17]:
filename = 'Trading_Project_Data.csv'

df = pd.read_csv(filename,
                  index_col = 0,
                    )


df = df.drop('Date')
df.index.name = 'Date'

tickers = df.iloc[0]
df.columns = [f"{tickers[col]}_{col.split('.')[0]}" for col in df.columns]
df = df.drop('Ticker')

df.index = pd.to_datetime(df.index, dayfirst=True)
df = df.dropna()
df.head()

df = df.astype(float)

data = df.loc[:, df.columns.str.contains('Close', case=False)].copy()
data.columns = [col.split('_')[0] if ('Close' in col and '_' in col) else col for col in data.columns]
data = np.log(data/data.shift(1)).dropna()
data

Unnamed: 0_level_0,AAPL,AMZN,BA,CAT,CVX,GOOGL,GS,JNJ,JPM,KO,MCD,MSFT,NKE,NVDA,PFE,SOFI,TSLA,UNH,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-01-23,0.006631,0.007979,-0.016087,0.000554,-0.002811,0.007167,-0.013538,-0.016508,-0.006606,0.004689,0.005481,0.006009,0.013237,0.003664,0.004230,-0.021698,0.001627,0.004920,0.002522,0.011195
2024-01-24,-0.003490,0.005433,0.012358,0.006385,0.019721,0.011226,-0.003604,-0.005333,0.008896,-0.015830,0.001299,0.009133,-0.011250,0.024565,-0.003524,-0.024822,-0.006283,-0.004452,-0.014229,0.017113
2024-01-25,-0.001698,0.005594,-0.058910,0.034123,0.024874,0.021094,0.008660,0.003767,0.014210,0.004235,-0.010809,0.005722,0.000099,0.004147,-0.015890,0.011834,-0.129258,-0.039404,0.014474,0.025084
2024-01-26,-0.009054,0.008647,0.017627,-0.004465,0.003829,0.002105,-0.012913,-0.000376,-0.003824,0.003543,-0.016795,-0.002325,0.019458,-0.009556,0.000000,-0.003929,0.003389,0.019667,0.008743,0.008483
2024-01-29,-0.003592,0.013359,-0.001364,0.012677,-0.000402,0.008636,0.007305,-0.000878,0.002609,0.006045,0.000171,0.014233,0.010937,0.023224,0.000364,0.184070,0.041055,0.002659,0.004676,0.001261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-10,-0.024399,-0.014465,0.001396,-0.028286,0.018719,-0.009897,-0.035126,-0.001477,-0.013499,-0.010425,-0.016128,-0.013302,-0.001263,-0.030435,-0.005226,-0.027876,-0.000507,-0.007329,0.012987,-0.003654
2025-01-13,-0.010398,-0.002195,-0.008349,0.032238,0.014328,-0.005378,0.005254,0.016822,0.017931,0.009453,0.004665,-0.004210,0.012284,-0.019916,0.002990,-0.005670,0.021478,0.038523,-0.015933,0.025484
2025-01-14,-0.004790,-0.003209,-0.021032,0.024713,0.009928,-0.007093,0.015126,0.001936,0.013261,0.006306,-0.007254,-0.003650,-0.012705,-0.011095,-0.014659,0.030104,-0.017383,0.004793,-0.008118,0.003927
2025-01-15,0.019485,0.025347,-0.004922,0.008895,0.009073,0.030583,0.058431,0.001519,0.019528,-0.004362,0.002554,0.025275,-0.001406,0.033436,-0.007220,0.067336,0.077314,-0.000589,0.006040,0.016183


In [18]:
def rolling_regression_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    # Convert start_date to datetime and find the index where it occurs
    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]

    # Determine the rolling window size as the length of rows before the start date
    window = start_index  # This will be the number of rows before start_date

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        # Combine features and target
        full_data = pd.concat([series, lagged_data], axis=1).dropna()

        for i in range(window, len(full_data)):
            window_data = full_data.iloc[i - window:i]
            X = window_data[[f'{col}_lag{lag}' for lag in lags]]
            y = window_data[col]

            model = LinearRegression()
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            # Convert prediction to trading signal: 1 or -1
            signal = 1 if pred > 0 else -1

            # Multiply signal with actual log return to get predicted returns
            predicted_returns.loc[pred_index, col] = signal * df.loc[pred_index, col]

    # Filter by start_date
    predicted_returns = predicted_returns[predicted_returns.index >= pd.to_datetime(start_date)]

    return predicted_returns.astype(float)


def rolling_DTclassification_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]
    window = start_index  # Window size is the number of rows before start_date

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        # Combine features and target
        full_data = pd.concat([series, lagged_data], axis=1).dropna()

        # Create the target: 1 if return is positive, -1 if return is negative
        target = np.where(full_data[col] > 0, 1, -1)  # Binary target for classification

        for i in range(window, len(full_data)):
            window_data = full_data.iloc[i - window:i]
            X = window_data[[f'{col}_lag{lag}' for lag in lags]]
            y = target[i - window:i]  # Use binary target for classification

            # Using DecisionTreeClassifier
            model = DecisionTreeClassifier(max_depth = 5, random_state = 100)
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            # Multiply signal with actual log return to get predicted returns
            predicted_returns.loc[pred_index, col] = pred * df.loc[pred_index, col]

    # Filter by start_date
    predicted_returns = predicted_returns[predicted_returns.index >= pd.to_datetime(start_date)]

    return predicted_returns.astype(float)


def rolling_MLPclassification_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]
    window = start_index  # Window size is the number of rows before start_date

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        # Combine features and target
        full_data = pd.concat([series, lagged_data], axis=1).dropna()

        # Create the target: 1 if return is positive, -1 if return is negative
        target = np.where(full_data[col] > 0, 1, -1)  # Binary target for classification

        for i in range(window, len(full_data)):
            window_data = full_data.iloc[i - window:i]
            X = window_data[[f'{col}_lag{lag}' for lag in lags]]
            y = target[i - window:i]  # Use binary target for classification

            # Using MLPClassifier
            model = MLPClassifier(hidden_layer_sizes = 4 * [15], max_iter = 2000, random_state=100)
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            # Multiply signal with actual log return to get predicted returns
            predicted_returns.loc[pred_index, col] = pred * df.loc[pred_index, col]

    # Filter by start_date
    predicted_returns = predicted_returns[predicted_returns.index >= pd.to_datetime(start_date)]

    return predicted_returns.astype(float)


def rolling_svc_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]
    window = start_index  # Use all data before the start_date as training window

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        # Combine features and target
        full_data = pd.concat([series, lagged_data], axis=1).dropna()

        # Create binary classification target
        target = np.where(full_data[col] > 0, 1, -1)

        for i in range(window, len(full_data)):
            window_data = full_data.iloc[i - window:i]
            X = window_data[[f'{col}_lag{lag}' for lag in lags]]
            y = target[i - window:i]

            model = SVC(kernel='linear')
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            # Predicted return = signal * actual return
            predicted_returns.loc[pred_index, col] = pred * df.loc[pred_index, col]

    # Filter rows after the start_date
    predicted_returns = predicted_returns[predicted_returns.index >= pd.to_datetime(start_date)]

    return predicted_returns.astype(float)


def rolling_logistic_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]
    window = start_index  # Use all data before the start_date

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        # Combine features and target
        full_data = pd.concat([series, lagged_data], axis=1).dropna()

        # Binary target: 1 for up, -1 for down
        target = np.where(full_data[col] > 0, 1, -1)

        for i in range(window, len(full_data)):
            window_data = full_data.iloc[i - window:i]
            X = window_data[[f'{col}_lag{lag}' for lag in lags]]
            y = target[i - window:i]

            # Logistic Regression model
            model = LogisticRegression(random_state=100)
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            # Multiply predicted signal (1/-1) with actual log return
            predicted_returns.loc[pred_index, col] = pred * df.loc[pred_index, col]

    # Keep only rows after start_date
    predicted_returns = predicted_returns[predicted_returns.index >= pd.to_datetime(start_date)]

    return predicted_returns.astype(float)


def benchmark_returns(df, start_date='2024-03-01'):
    import numpy as np
    import pandas as pd

    # Convert start_date to datetime
    start_date = pd.to_datetime(start_date)

    # Filter the data from the start_date to the end of the dataset
    filtered_data = df[df.index >= start_date]

    # Sum the returns for each column from start_date to the end
    column_sums = filtered_data.sum(axis=0)

    # Apply np.exp to the sum of each column's returns
    exp_column_sums = np.exp(column_sums)

    return exp_column_sums


def sum_and_exp_predicted_returns(predicted_returns):
    # Sum each column of the predicted returns DataFrame
    column_sums = predicted_returns.sum(axis=0)
    
    # Apply np.exp to the sum of each column
    exp_column_sums = np.exp(column_sums)
    
    return exp_column_sums


def highlight_max_row(s):
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]



2 LAGS

In [19]:
# Create dummy time series data
sd = '2024-03-01'
lg = [1, 2]


LR_preds = rolling_regression_predicted_returns(data,lg,sd)
Log_preds = rolling_logistic_predicted_returns(data,lg,sd)
DT_preds = rolling_DTclassification_predicted_returns(data,lg,sd)
MLP_preds = rolling_MLPclassification_predicted_returns(data,lg,sd)
SVC_preds = rolling_svc_predicted_returns(data,lg,sd)


In [20]:
benchmarks = benchmark_returns(data)
LR_preds_Sum = sum_and_exp_predicted_returns(LR_preds)
Log_preds_Sum = sum_and_exp_predicted_returns(Log_preds)
DT_preds_Sum = sum_and_exp_predicted_returns(DT_preds)
MLP_preds_Sum = sum_and_exp_predicted_returns(MLP_preds)
SVC_preds_Sum = sum_and_exp_predicted_returns(SVC_preds)


# Merge both series into a DataFrame
merged_df = pd.concat([benchmarks, LR_preds_Sum,Log_preds_Sum, DT_preds_Sum, MLP_preds_Sum, SVC_preds_Sum], axis=1)
merged_df.columns = ['Benchmark', 'LinReg','Logistic', 'DecisionTree', 'DNN', 'SVC' ]

# Apply styling
styled_df_roll2 = merged_df.add_suffix('_Roll_2')
# styled_df_roll2 = styled_df_roll2.style.apply(highlight_max_row, axis=1)
# styled_df_roll2

3 LAGS

In [21]:
# Create dummy time series data
sd = '2024-03-01'
lg = [1, 2, 3]


LR_preds = rolling_regression_predicted_returns(data,lg,sd)
Log_preds = rolling_logistic_predicted_returns(data,lg,sd)
DT_preds = rolling_DTclassification_predicted_returns(data,lg,sd)
MLP_preds = rolling_MLPclassification_predicted_returns(data,lg,sd)
SVC_preds = rolling_svc_predicted_returns(data,lg,sd)


In [22]:
benchmarks = benchmark_returns(data)
LR_preds_Sum = sum_and_exp_predicted_returns(LR_preds)
Log_preds_Sum = sum_and_exp_predicted_returns(Log_preds)
DT_preds_Sum = sum_and_exp_predicted_returns(DT_preds)
MLP_preds_Sum = sum_and_exp_predicted_returns(MLP_preds)
SVC_preds_Sum = sum_and_exp_predicted_returns(SVC_preds)


# Merge both series into a DataFrame
merged_df = pd.concat([benchmarks, LR_preds_Sum,Log_preds_Sum, DT_preds_Sum, MLP_preds_Sum, SVC_preds_Sum], axis=1)
merged_df.columns = ['Benchmark', 'LinReg','Logistic', 'DecisionTree', 'DNN', 'SVC' ]

# Apply styling
styled_df_roll3 = merged_df.add_suffix('_Roll_3')
# styled_df_roll3 = styled_df_roll3.style.apply(highlight_max_row, axis=1)
# styled_df_roll3

Expanding Window

In [23]:
def expanding_regression_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        full_data = pd.concat([series, lagged_data], axis=1).dropna()

        for i in range(start_index, len(full_data)):
            X = full_data.iloc[:i][[f'{col}_lag{lag}' for lag in lags]]
            y = full_data.iloc[:i][col]

            model = LinearRegression()
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]
            signal = 1 if pred > 0 else -1

            predicted_returns.loc[pred_index, col] = signal * df.loc[pred_index, col]

    return predicted_returns[predicted_returns.index >= start_date].astype(float)


def expanding_DTclassification_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        full_data = pd.concat([series, lagged_data], axis=1).dropna()
        target = np.where(full_data[col] > 0, 1, -1)

        for i in range(start_index, len(full_data)):
            X = full_data.iloc[:i][[f'{col}_lag{lag}' for lag in lags]]
            y = target[:i]

            model = DecisionTreeClassifier(max_depth=5, random_state=100)
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            predicted_returns.loc[pred_index, col] = pred * df.loc[pred_index, col]

    return predicted_returns[predicted_returns.index >= start_date].astype(float)


def expanding_MLPclassification_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        full_data = pd.concat([series, lagged_data], axis=1).dropna()
        target = np.where(full_data[col] > 0, 1, -1)

        for i in range(start_index, len(full_data)):
            X = full_data.iloc[:i][[f'{col}_lag{lag}' for lag in lags]]
            y = target[:i]

            model = MLPClassifier(hidden_layer_sizes = 4 * [15], max_iter = 2000, random_state=100)
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            predicted_returns.loc[pred_index, col] = pred * df.loc[pred_index, col]

    return predicted_returns[predicted_returns.index >= start_date].astype(float)


def expanding_svc_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        full_data = pd.concat([series, lagged_data], axis=1).dropna()
        target = np.where(full_data[col] > 0, 1, -1)

        for i in range(start_index, len(full_data)):
            X = full_data.iloc[:i][[f'{col}_lag{lag}' for lag in lags]]
            y = target[:i]

            model = SVC(kernel='linear')
            model.fit(X, y)
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            predicted_returns.loc[pred_index, col] = pred * df.loc[pred_index, col]

    return predicted_returns[predicted_returns.index >= start_date].astype(float)


def expanding_Logistic_predicted_returns(df, lags=[1, 2], start_date='2024-03-01'):
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        full_data = pd.concat([series, lagged_data], axis=1).dropna()
        target = np.where(full_data[col] > 0, 1, -1)

        for i in range(start_index, len(full_data)):
            X = full_data.iloc[:i][[f'{col}_lag{lag}' for lag in lags]]
            y = target[:i]

            model = LogisticRegression(random_state=100)
            model.fit(X, y)
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            predicted_returns.loc[pred_index, col] = pred * df.loc[pred_index, col]

    return predicted_returns[predicted_returns.index >= start_date].astype(float)


def benchmark_returns(df, start_date='2024-03-01'):
    import numpy as np
    import pandas as pd

    # Convert start_date to datetime
    start_date = pd.to_datetime(start_date)

    # Filter the data from the start_date to the end of the dataset
    filtered_data = df[df.index >= start_date]

    # Sum the returns for each column from start_date to the end
    column_sums = filtered_data.sum(axis=0)

    # Apply np.exp to the sum of each column's returns
    exp_column_sums = np.exp(column_sums)

    return exp_column_sums


def sum_and_exp_predicted_returns(predicted_returns):
    # Sum each column of the predicted returns DataFrame
    column_sums = predicted_returns.sum(axis=0)
    
    # Apply np.exp to the sum of each column
    exp_column_sums = np.exp(column_sums)
    
    return exp_column_sums


def highlight_max_row(s):
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]



# 3 Lags

In [24]:
# Create dummy time series data
sd = '2024-03-01'
lg = [1, 2, 3]


LR_preds = expanding_regression_predicted_returns(data,lg,sd)
Log_preds = expanding_Logistic_predicted_returns(data,lg,sd)
DT_preds = expanding_DTclassification_predicted_returns(data,lg,sd)
MLP_preds = expanding_MLPclassification_predicted_returns(data,lg,sd)
SVC_preds = expanding_svc_predicted_returns(data,lg,sd)


In [25]:
benchmarks = benchmark_returns(data)
LR_preds_Sum = sum_and_exp_predicted_returns(LR_preds)
Log_preds_Sum = sum_and_exp_predicted_returns(Log_preds)
DT_preds_Sum = sum_and_exp_predicted_returns(DT_preds)
MLP_preds_Sum = sum_and_exp_predicted_returns(MLP_preds)
SVC_preds_Sum = sum_and_exp_predicted_returns(SVC_preds)


# Merge both series into a DataFrame
merged_df = pd.concat([benchmarks, LR_preds_Sum,Log_preds_Sum, DT_preds_Sum, MLP_preds_Sum, SVC_preds_Sum], axis=1)
merged_df.columns = ['Benchmark', 'LinReg','Logistic', 'DecisionTree', 'DNN', 'SVC' ]

# Apply styling
styled_df_exp3 = merged_df.add_suffix('_Exp_3')
# styled_df_exp3 = styled_df_exp3.style.apply(highlight_max_row, axis=1)
# styled_df_exp3

# 2 Lags

In [26]:
# Create dummy time series data
sd = '2024-03-01'
lg = [1, 2]


LR_preds = expanding_regression_predicted_returns(data,lg,sd)
Log_preds = expanding_Logistic_predicted_returns(data,lg,sd)
DT_preds = expanding_DTclassification_predicted_returns(data,lg,sd)
MLP_preds = expanding_MLPclassification_predicted_returns(data,lg,sd)
SVC_preds = expanding_svc_predicted_returns(data,lg,sd)

In [27]:
benchmarks = benchmark_returns(data)
LR_preds_Sum = sum_and_exp_predicted_returns(LR_preds)
Log_preds_Sum = sum_and_exp_predicted_returns(Log_preds)
DT_preds_Sum = sum_and_exp_predicted_returns(DT_preds)
MLP_preds_Sum = sum_and_exp_predicted_returns(MLP_preds)
SVC_preds_Sum = sum_and_exp_predicted_returns(SVC_preds)


# Merge both series into a DataFrame
merged_df = pd.concat([benchmarks, LR_preds_Sum,Log_preds_Sum, DT_preds_Sum, MLP_preds_Sum, SVC_preds_Sum], axis=1)
merged_df.columns = ['Benchmark', 'LinReg','Logistic', 'DecisionTree', 'DNN', 'SVC' ]

# Apply styling
styled_df_exp2 = merged_df.add_suffix('_Exp_2')
# styled_df_exp2 = styled_df_exp2.style.apply(highlight_max_row, axis=1)
# styled_df_exp2

In [None]:
# Concatenate them side by side
merged_df = pd.concat([styled_df_roll2, styled_df_roll3, styled_df_exp2, styled_df_exp3], axis=1)

cols_to_drop = ['Benchmark_Roll_3', 'Benchmark_Exp_2', 'Benchmark_Exp_3']  # replace with your actual column names
merged_df = merged_df.drop(columns=cols_to_drop)

mask = merged_df.eq(merged_df.max(axis=1), axis=0)

# Drop columns that are never the max in any row
cols_to_keep = mask.any(axis=0)
merged_df = merged_df.loc[:, cols_to_keep]


merged_df = merged_df.style.apply(highlight_max_row, axis=1)
merged_df

Unnamed: 0,Benchmark_Roll_2,LinReg_Roll_2,DecisionTree_Roll_2,DecisionTree_Roll_3,LinReg_Exp_2,Logistic_Exp_2,DecisionTree_Exp_2,SVC_Exp_2,LinReg_Exp_3,Logistic_Exp_3,DNN_Exp_3,SVC_Exp_3
AAPL,1.267419,0.746606,1.006234,0.755891,0.749136,1.062599,0.828012,1.062599,0.703074,1.051584,1.043285,1.051584
AMZN,1.248359,1.182004,1.279027,1.3032,1.293421,1.229644,1.415348,1.22571,1.190438,1.253798,1.174724,1.261071
BA,0.829226,1.122572,1.69031,0.958306,1.006632,1.187119,1.052222,1.187119,1.423674,1.190671,1.190671,1.190671
CAT,1.152445,0.811741,0.960309,0.784252,0.879237,1.136485,1.279052,1.136485,1.752617,1.153931,1.174753,1.153931
CVX,1.081849,1.117579,1.136309,1.413412,1.023771,1.104593,0.855557,1.104593,0.868023,1.09935,1.09935,1.09935
GOOGL,1.398305,1.115356,1.093049,0.956674,1.108548,1.451888,1.123754,1.451888,1.012432,1.45933,1.45933,1.45933
GS,1.602419,0.690114,1.12797,1.234108,1.273678,1.223917,1.193853,1.233043,1.092228,1.225797,1.254821,1.225797
JNJ,0.937535,1.053884,0.914937,1.028413,0.902292,0.94793,1.111578,0.943521,0.813709,0.966056,0.97355,0.967154
JPM,1.397843,0.936844,0.940067,0.629883,1.361815,1.3932,0.977519,1.3932,0.916987,1.379383,1.376275,1.379383
KO,1.068831,0.941263,0.963284,1.15611,1.02251,1.021033,0.762521,1.018225,1.06777,1.007863,1.0192,1.007863


NKE PFE BA UNH AMZN JNJ XOM CVX KO MCD