### Import dependencies

In [None]:
import pandas as pd
import numpy as np

# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error

from sklearn import linear_model as lm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import SplineTransformer

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.stattools import adfuller



import os
import matplotlib.pyplot as plt
import cProfile


### Setup

Helpers

In [None]:
def path_to_symbols(base_dir="data"):
    """Return a list of symbols from a directory by removing the file extension `csv`."""
    file_list = os.listdir(base_dir)
    res =[]

    for file in file_list:
        file = file.split('.')
        if file[1] =="csv":
            res.append(file[0])
    return res
        
def symbol_to_path(symbol, base_dir= "data"):
    """Return CSV file path given ticker symbol."""
    return os.path.join(base_dir, "{}.csv".format(str(symbol)))

def get_data(symbols, dates):
    """Read stock data (adjusted close) for given symbols from CSV files."""
    df_container = pd.DataFrame(index=dates)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')
    
    


    
    if len(symbols) > 1:
        for symbol in symbols:
            if symbol == "SPY":
                df_temp = pd.read_csv("data/" + "SPY" + ".csv", index_col= "Date", parse_dates= True, usecols=['Date', 'Adj Close'], na_values="nan")
                df_temp = df_temp.rename(columns={'Adj Close':"SPY"})
                df_container = df_container.join(df_temp)
                df_container = df_container.dropna();
            else:
            
                df_temp = pd.read_csv("data/" + symbol + ".csv", index_col= "Date", parse_dates= True, usecols=['Date', 'Adj Close'], na_values="nan")
                df_temp = df_temp.rename(columns={'Adj Close':symbol})
                df_container = df_container.join(df_temp)
    else:
        df_temp = pd.read_csv("data/" + symbols[0] + ".csv", index_col= "Date", parse_dates= True, usecols=['Date', 'Adj Close'], na_values="nan")
        df_temp = df_temp.rename(columns={'Adj Close':symbols[0]})
        df_container = df_container.join(df_temp)

    return df_container

def normalise_data(df:pd.DataFrame, frame_of_reference= 0):
    """Normalises the data based on the frame of reference"""
    return df/df.iloc[frame_of_reference]

def get_rolling_mean(df:pd.DataFrame, window = 20):
    return df.rolling(window).mean()

def get_rolling_std(df:pd.DataFrame, window = 20):
    return df.rolling(window).std()

def get_bollinger_bands(df:pd.DataFrame, window = 20, num_std = 2):
    """ Returns a tuple of Bollinger Bands® `(upper band, rolling mean , lower band)`"""

    rolling_mean = get_rolling_mean(df, window)
    std = get_rolling_std(df, window)
    upper_band = rolling_mean + num_std * std
    lower_band = rolling_mean - num_std * std

    return (upper_band, rolling_mean, lower_band)

def get_daily_returns(df:pd.DataFrame):
    df_lag = df.shift(1)
    df_res = ((df/df_lag) - 1) * 100

    df_res = df_res.fillna(0)

    return df_res

def fill_missing_values(df):
    df.ffill(axis=0, inplace=True)
    df.bfill(axis=0, inplace=True)


Regressors/Predictors

In [169]:
# linear
def create_regression(df:pd.DataFrame, window=100):
    """
    `df`dataframe to create regression from \n
    `window` the number of days from the dataframe to consider in the regression
    
    """
    # TODO: make this seperate every col in df and create its regression individually
    res = df.tail(window)
    res = res.reset_index(drop=True, inplace=False)
    res.insert(0, 'index',res.index)
    temp_array = res.to_numpy()
    temp_array = np.transpose(temp_array) 
    
    reg = lm.LinearRegression()
    
    reg.fit(temp_array[0].reshape(-1, 1) ,temp_array[1])

    x = np.linspace(int(temp_array[0][0])+1, int(temp_array[0][-1])+1 + window, window+window+1)
    b = reg.intercept_
    m = reg.coef_
    y = m * x + b
    # print(temp_array[0][0])

    # print(y)
    # print(m)
    # print(x)
    # print(b)

    last_date = df.index[-1]
    new_end_date = last_date + pd.DateOffset(days=window)
    new_date_range = pd.date_range(start=last_date, end=new_end_date, freq='D')



    fin = df.tail(window)
    fin = pd.concat([fin, pd.DataFrame(index=new_date_range)])

    fin.insert(0, "regression", y)
    # fin.pop("SPY")


    return fin

# poly
def create_regression_2(df:pd.DataFrame, window=100, degree=3):
    # TODO: make this seperate every col in df and create its regression individually
    
    # convert data to scikit-learn friendly format
    res = df.tail(window)
    res = res.reset_index(drop=True, inplace=False)
    res.insert(0, 'index',res.index)
    temp_array = res.to_numpy()
    temp_array = np.transpose(temp_array) 

    x_train = temp_array[0]
    y_train = temp_array[1]

    # create poly features matrix
    poly_features = PolynomialFeatures(degree=degree)
    x_poly = poly_features.fit_transform(x_train.reshape(-1, 1))

    # fit poly regression model
    model = lm.LinearRegression()
    model.fit(x_poly, y_train)


    # use results / create predictions
    x_test = np.linspace(int(temp_array[0][0])+1, int(temp_array[0][-1])+1 + window, window+window+1)
    x_test_poly = poly_features.transform(x_test.reshape(-1, 1))
    y_pred = model.predict(x_test_poly)




    # move results to dataframe
    last_date = df.index[-1]
    new_end_date = last_date + pd.DateOffset(days=window)
    new_date_range = pd.date_range(start=last_date, end=new_end_date, freq='D')

    fin = df.tail(window)
    fin = pd.concat([fin, pd.DataFrame(index=new_date_range)])


    fin.insert(0, "regression", y_pred)
    # fin.pop("SPY")


    return fin

# spline (interpolator)
def create_regression_3(df:pd.DataFrame, window=100, knots=10):
    """
    Spline regression, bad 
    """

    # convert data to scikit-learn friendly format
    res = df.tail(window)
    res = res.reset_index(drop=True, inplace=False)
    res.insert(0, 'index',res.index)
    temp_array = res.to_numpy()
    temp_array = np.transpose(temp_array) 

    x_train = temp_array[0]
    y_train = temp_array[1]

    # Step 3: Create a spline features matrix
    # knots = 3  # Set knot points, which divide the data into segments
    spline_features = SplineTransformer(degree=3, n_knots=knots)
    x_spline = spline_features.fit_transform(x_train.reshape(-1, 1))

    # Step 4: Fit a linear regression model
    model = lm.LinearRegression()
    model.fit(x_spline, y_train)

    # Step 5: Make predictions and visualize the results
    x_test = np.linspace(int(temp_array[0][0])+1, int(temp_array[0][-1])+1 + window, window+window+1)  # Generate test data for prediction
    x_test_spline = spline_features.transform(x_test.reshape(-1, 1))
    y_pred = model.predict(x_test_spline)

    # move results to dataframe
    last_date = df.index[-1]
    new_end_date = last_date + pd.DateOffset(days=window)
    new_date_range = pd.date_range(start=last_date, end=new_end_date, freq='D')

    fin = df.tail(window)
    fin = pd.concat([fin, pd.DataFrame(index=new_date_range)])


    fin.insert(0, "regression", y_pred)
    # fin.pop("SPY")


    return fin

# ARIMA    
def create_prediction(df:pd.DataFrame, p=10,d=2,q=5):

    df.plot()
    plot_acf(df['SPY'].diff())
    
    # Step 3: Decompose the Time Series (optional)
    decomposition = seasonal_decompose(df['SPY'], model='additive')
    trend = decomposition.trend
    seasonal = decomposition.seasonal
    residual = decomposition.resid

    # Step 4: Stationarize the Data (if needed)

    # Step 5: Choose a Forecasting Model (e.g., ARIMA)
    model = ARIMA(df['SPY'], order=(p, d, q))  # Replace p, d, q with appropriate values

    # Step 6: Train the Model
    model_fit = model.fit()

    # Step 7: Make Forecasts
    forecast_periods = 30  # Number of periods to forecast
    forecast = model_fit.forecast(steps=forecast_periods)

    
    # Step 8: Visualize Forecasts
    plt.figure(figsize=(12, 6))
    plt.plot(df.index, df['SPY'], label='Original Data')
    plt.plot(pd.date_range(start=df.index[-1], periods=forecast_periods+1), [df['SPY'].iloc[-1]] + list(forecast), label='Forecast', color='red')
    plt.title('Time Series Forecast')
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.show()

    return


# Running Code

In [173]:
def test_run():
    # Define a date range
    dates = pd.date_range('2010-01-01', '2010-02-29')
    # dates = pd.date_range('2010-01-01', '2010-12-30')
    # dates = pd.date_range('2006-01-01', '2013-12-30')

    # Choose stock symbols to read
    symbols = path_to_symbols()
    print(symbols)
    symbols = [symbols[4]]
    
    # data collection / clean-up
    df = get_data(symbols, dates)
    fill_missing_values(df)
    
    df = normalise_data(df)
    window = 20
    
    # generating regression(s)
    # reg_df = create_regression(df)      # linear reg
    # reg_df_2 = create_regression_2(df)  # poly reg
    # reg_df_3 = create_regression_3(df)  # spline reg
    # ens_df = (reg_df_3 + reg_df_2 + reg_df)/3      # average of regs
    pred = create_prediction(df)

    # df=df-1
    # PLOTTING DATA
    # ax = df.plot(title="normalised price")
    # ax.set_xlabel("Date")
    # ax.set_ylabel("Price")

    # bx = get_daily_returns(df).plot(title="daily returns")
    # bx.set_xlabel("Date")
    # bx.set_ylabel("Price")

    # cx = reg_df.plot(title="Predicted")
    # cx = reg_df_2.plot(label="Predicted",ax=cx)
    # cx = ens_df.plot(title="Predicted")
    # cx = ens_df.plot(label="Predicted", ax=ax)

    # dx = reg_df_3.plot(title="epic")

    bb = get_bollinger_bands(df, window)
    # ub_df = bb[0]
    rm_df = bb[1].shift(-10)
    # lb_df = bb[2]
    # ub_df.plot(label="upper mean", ax=ax)
    # rm_df.plot(ax=ax)
    # lb_df.plot(label="lower mean", ax=ax)



if __name__ == '__main__':
    test_run()

    # profile the performance of the code
    # cProfile.run("test_run()")


DateParseError: day is out of range for month: 2010-02-29

## Experimental Setup

In [168]:
def analyse(df:pd.DataFrame, p=3,d=2,q=1):
    result = adfuller(df.dropna())
    result = adfuller(df.diff().dropna())

    diff_df = df.diff().dropna()
    # plot_acf(df)
    # plot_acf(diff_df)


    arima_model = ARIMA(df, order=(p,d,q))
    model = arima_model.fit()
    print(model.summary())
    # arima_model.plot_predict(dynamic=False)

    
    return

def experiment():
    # Define a date range
    dates = pd.date_range('2010-01-01', '2010-03-30')
    # dates = pd.date_range('2010-01-01', '2010-12-30')
    # dates = pd.date_range('2006-01-01', '2013-12-30')

    # Choose stock symbols to read
    symbols = path_to_symbols()
    symbols = [symbols[4]]
    
    # data collection / clean-up
    df = get_data(symbols, dates)
    fill_missing_values(df)
    
    # df = normalise_data(df)
    window = 20

    # for i in range(0,10):
    analyse(df,p=10,d=1,q=10)

    




    


if __name__ == '__main__':
    experiment()

    # profile the performance of the code
    # cProfile.run("test_run()")

  warn('Non-stationary starting autoregressive parameters'


                               SARIMAX Results                                
Dep. Variable:                    SPY   No. Observations:                   89
Model:               ARIMA(10, 1, 10)   Log Likelihood                 -71.423
Date:                Tue, 05 Sep 2023   AIC                            184.847
Time:                        19:24:13   BIC                            236.871
Sample:                    01-01-2010   HQIC                           205.806
                         - 03-30-2010                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1          0.0725      1.639      0.044      0.965      -3.139       3.284
ar.L2          0.6804      1.249      0.545      0.586      -1.769       3.129
ar.L3         -0.2019      0.603     -0.335      0.7

