In [44]:
import numpy as np
import pandas as pd

pd.options.plotting.backend = "matplotlib"
import matplotlib.pyplot as plt

import cufflinks as cf
cf.go_offline()

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression


In [56]:
def process_data(file):
    df = pd.read_csv(file, header=None)
    df.columns = pd.MultiIndex.from_arrays([df.iloc[0], df.iloc[1]])
    df = df.drop([0,1]) 
    df = df.reset_index(drop=True)
    df.dropna(inplace=True)
    df = df.set_index(df.columns[0])
    df.index.name = 'Date'
    for col in df.columns:
        df[col] = df[col].astype(float)
    close_df = df['Close']
    close_df.index = pd.to_datetime(close_df.index, dayfirst=True)

    return close_df

def SMA(stocks, close_df, period_short=30, period_long=50, start_date = '2024-03-01'):
    close_df = close_df.copy()
    close_df = close_df[stocks]
    sma_short = close_df.rolling(window=period_short).mean()
    sma_long  = close_df.rolling(window=period_long).mean()

    # Generate signals:
    #   1  => SMA short < SMA long (bullish)
    #  -0.5  => SMA short > SMA long (bearish)
    signal = (sma_short < sma_long).astype(int).replace(0, -0.5)
    trade_start = start_date
    signal.loc[signal.index < trade_start] = 0

    # Daily returns from close prices
    daily_ret = close_df.pct_change()

    # Strategy return = signal * daily return
    strategy_ret = signal.shift(1) * daily_ret

    # Cumulative returns    
    cumulative_ret = (1 + strategy_ret.fillna(0)).cumprod()
    cumulative_ret = cumulative_ret.loc[cumulative_ret.index >= trade_start]
    signal = signal.loc[signal.index >= trade_start]
    return cumulative_ret, signal


In [77]:
def rolling_regression_predicted_returns(df,stocks, lags=[1, 2], start_date='2024-03-01'):
    
    df = df.loc[:, stocks]
    df = np.log(df/df.shift(1)).dropna()
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)
    signal_df = pd.DataFrame(index=df.index, columns=df.columns)

    # Convert start_date to datetime and find the index where it occurs
    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]

    # Determine the rolling window size as the length of rows before the start date
    window = start_index  # This will be the number of rows before start_date



    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        # Combine features and target
        full_data = pd.concat([series, lagged_data], axis=1).dropna()
        start_index = full_data.index.get_indexer([start_date], method='ffill')[0]

        # Determine the rolling window size as the length of rows before the start date
        window = start_index  # This will be the number of rows before start_date

        for i in range(window, len(full_data)):
            window_data = full_data.iloc[i - window:i]
            X = window_data[[f'{col}_lag{lag}' for lag in lags]]
            y = window_data[col]

            model = LinearRegression()
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]

            # Convert prediction to trading signal: 1 or -1
            signal = 1 if pred > 0 else -0.5
            signal_df.loc[pred_index, col] = signal

            # Multiply signal with actual log return to get predicted returns
            predicted_returns.loc[pred_index, col] = signal * df.loc[pred_index, col]

    # Filter by start_date
    predicted_returns = predicted_returns[predicted_returns.index >= pd.to_datetime(start_date)]
    signal_df = signal_df[signal_df.index >= start_date]

    predicted_returns.iloc[0] = 0.0
    predicted_returns += 1.0
    predicted_returns = predicted_returns.cumprod()
    
    return predicted_returns.astype(float), signal_df.astype(float)

def rolling_DTclassification_predicted_returns(df,stocks, lags=[1, 2], start_date='2024-03-01'):
    
    df = df.loc[:, stocks]
    df = np.log(df/df.shift(1)).dropna()
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)
    signal_df = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]
    window = start_index  # Window size is the number of rows before start_date

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        # Combine features and target
        full_data = pd.concat([series, lagged_data], axis=1).dropna()
        start_index = full_data.index.get_indexer([start_date], method='ffill')[0]

        # Determine the rolling window size as the length of rows before the start date
        window = start_index  # This will be the number of rows before start_date
        
        # Create the target: 1 if return is positive, -1 if return is negative
        target = np.where(full_data[col] > 0, 1, -1)  # Binary target for classification

        for i in range(window, len(full_data)):
            window_data = full_data.iloc[i - window:i]
            X = window_data[[f'{col}_lag{lag}' for lag in lags]]
            y = target[i - window:i]  # Use binary target for classification

            # Using DecisionTreeClassifier
            model = DecisionTreeClassifier(max_depth = 5, random_state = 100)
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]
            signal = {1: 1, -1: -0.5}.get(pred, 0)
            signal_df.loc[pred_index, col] = signal

            # Multiply signal with actual log return to get predicted returns
            predicted_returns.loc[pred_index, col] = signal  * df.loc[pred_index, col]

    # Filter by start_date
    predicted_returns = predicted_returns[predicted_returns.index >= pd.to_datetime(start_date)]
    signal_df = signal_df[signal_df.index >= start_date]
    
    predicted_returns.iloc[0] = 0.0
    predicted_returns += 1.0
    predicted_returns = predicted_returns.cumprod()
    
    return predicted_returns.astype(float), signal_df.astype(float)

def rolling_logistic_predicted_returns(df,stocks, lags=[1, 2], start_date='2024-03-01'):
    
    df = df.loc[:, stocks]
    df = np.log(df/df.shift(1)).dropna()
    predicted_returns = pd.DataFrame(index=df.index, columns=df.columns)
    signal_df = pd.DataFrame(index=df.index, columns=df.columns)

    start_date = pd.to_datetime(start_date)
    start_index = df.index.get_indexer([start_date], method='ffill')[0]
    window = start_index  # Use all data before the start_date

    for col in df.columns:
        series = df[col]

        # Create lagged features
        lagged_data = pd.concat([series.shift(lag) for lag in lags], axis=1)
        lagged_data.columns = [f'{col}_lag{lag}' for lag in lags]

        # Combine features and target
        full_data = pd.concat([series, lagged_data], axis=1).dropna()
        start_index = full_data.index.get_indexer([start_date], method='ffill')[0]

        # Determine the rolling window size as the length of rows before the start date
        window = start_index  # This will be the number of rows before start_date
        
        # Binary target: 1 for up, -1 for down
        target = np.where(full_data[col] > 0, 1, -1)

        for i in range(window, len(full_data)):
            window_data = full_data.iloc[i - window:i]
            X = window_data[[f'{col}_lag{lag}' for lag in lags]]
            y = target[i - window:i]

            # Logistic Regression model
            model = LogisticRegression(random_state=100)
            model.fit(X, y)

            current_features = full_data.iloc[i][[f'{col}_lag{lag}' for lag in lags]].to_frame().T
            pred_index = full_data.index[i]
            pred = model.predict(current_features)[0]
            signal = {1: 1, -1: -0.5}.get(pred, 0)
            signal_df.loc[pred_index, col] = signal

            # Multiply signal with actual log return to get predicted returns
            predicted_returns.loc[pred_index, col] = signal  * df.loc[pred_index, col]

    # Keep only rows after start_date
    predicted_returns = predicted_returns[predicted_returns.index >= pd.to_datetime(start_date)]
    signal_df = signal_df[signal_df.index >= start_date]

    predicted_returns.iloc[0] = 0.0
    predicted_returns += 1.0
    predicted_returns = predicted_returns.cumprod()

    return predicted_returns.astype(float), signal_df.astype(float)


# Benchmark

In [106]:
close_df = process_data('Trading_Project_Data.csv')

stocks = ['BA', 'NKE', 'PFE', 'UNH', 'MCD', 'AMZN','XOM', 'CVX','JNJ','KO']
start_date = '2024-03-01'

benchmark_df = close_df.loc[:,['BA', 'NKE', 'PFE', 'UNH', 'MCD', 'AMZN','XOM', 'CVX','JNJ','KO']]
benchmark_df = benchmark_df[benchmark_df.index >= start_date]
benchmark_df

def total_return(df):
    return (df.iloc[-1,:] / df.iloc[0,:]) - 1

returns = total_return(benchmark_df) + 1
returns = returns.to_frame()
returns.T


1,BA,NKE,PFE,UNH,MCD,AMZN,XOM,CVX,JNJ,KO
0,0.84465,0.707424,1.041464,1.059351,0.97994,1.238133,1.077639,1.076185,0.933256,1.077628


# Returns

In [107]:
close_df = process_data('Trading_Project_Data.csv')
ret , Sret = SMA(['BA', 'NKE', 'PFE', 'UNH'],close_df, start_date = '2024-03-01')

ret1, Sret1 = rolling_regression_predicted_returns(close_df,['MCD'], lags=[1, 2,3], start_date='2024-03-01')
ret2, Sret2 = rolling_DTclassification_predicted_returns(close_df,['AMZN','XOM'], lags=[1, 2], start_date='2024-03-01')
ret3, Sret3 = rolling_DTclassification_predicted_returns(close_df,['CVX','JNJ'], lags=[1, 2, 3], start_date='2024-03-01')
ret4, Sret4 = rolling_logistic_predicted_returns(close_df,['KO'], lags=[1, 2, 3], start_date='2024-03-01')

combined_df = pd.concat([ret,ret1, ret2, ret3, ret4], axis=1)
combined_df

1,BA,NKE,PFE,UNH,MCD,AMZN,XOM,CVX,JNJ,KO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-03-01,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2024-03-04,0.998650,1.011484,1.013163,1.007824,1.000722,1.001799,1.007041,1.013027,1.007082,0.997654
2024-03-05,0.997156,1.017733,1.009445,1.016943,1.006210,0.982087,1.000903,1.017847,1.007901,1.000078
2024-03-06,0.997503,1.021253,0.987964,1.017534,1.007551,0.978640,0.995578,1.009237,1.009889,0.999826
2024-03-07,0.992466,1.018324,0.995231,1.010881,1.010690,0.997134,1.001157,1.016289,1.011381,1.000751
...,...,...,...,...,...,...,...,...,...,...
2025-01-10,1.233728,1.300957,1.209460,1.562817,1.189935,1.508907,1.202440,1.093246,1.009885,1.123679
2025-01-13,1.238857,1.292918,1.213081,1.624196,1.187160,1.505595,1.187119,1.108910,1.026874,1.118368
2025-01-14,1.251749,1.301079,1.195428,1.632000,1.178548,1.508011,1.184788,1.119919,1.025879,1.114841
2025-01-15,1.254821,1.299251,1.186828,1.631039,1.177043,1.546234,1.203961,1.114839,1.025100,1.117273


# Strategy

In [108]:
combined_df_S = pd.concat([Sret,Sret1, Sret2, Sret3, Sret4], axis=1)
combined_df_S

1,BA,NKE,PFE,UNH,MCD,AMZN,XOM,CVX,JNJ,KO
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-03-01,-0.5,-0.5,-0.5,-0.5,-0.5,-0.5,1.0,1.0,-0.5,-0.5
2024-03-04,-0.5,-0.5,-0.5,-0.5,1.0,-0.5,-0.5,-0.5,-0.5,-0.5
2024-03-05,-0.5,-0.5,-0.5,-0.5,1.0,1.0,-0.5,1.0,1.0,-0.5
2024-03-06,-0.5,-0.5,-0.5,-0.5,1.0,1.0,-0.5,1.0,-0.5,-0.5
2024-03-07,-0.5,-0.5,-0.5,-0.5,-0.5,1.0,1.0,1.0,-0.5,-0.5
...,...,...,...,...,...,...,...,...,...,...
2025-01-10,-0.5,-0.5,1.0,1.0,1.0,1.0,-0.5,1.0,1.0,-0.5
2025-01-13,-0.5,-0.5,1.0,1.0,-0.5,1.0,-0.5,1.0,1.0,-0.5
2025-01-14,-0.5,1.0,1.0,1.0,1.0,-0.5,-0.5,1.0,-0.5,-0.5
2025-01-15,-0.5,1.0,-0.5,1.0,-0.5,1.0,1.0,-0.5,-0.5,-0.5
