In [2]:
import os 
# Make the author's home folder the working directory for the rest of the notebook.
# NOTE(review): hard-coded absolute path; presumably required so the local
# `pickle_loader` module imported in the next cell can be found — confirm, and
# consider a configurable base directory instead.
os.chdir('/Users/marcs')

In [3]:
import matplotlib.pyplot as plt
from pickle_loader import pickle_loader
import datetime as dt
import pandas as pd
import numpy as np
import pickle

In [4]:
# Load the raw technical (price/volume) data with the project-local `pickle_loader` helper.
# The result is used below as a dict keyed by ticker whose values are DataFrames
# indexed by date.
# NOTE(review): hard-coded absolute path — only runs on the author's machine.
technical_data = pickle_loader('/Users/marcs/OneDrive/Documents/stock_analysis2/technical_us.pickle')

### Remove Tickers that don't contain the full dataset 

In [5]:
def tech_clean(dataset, expected_start='30-09-2021', expected_end='29-09-2023'):
    """
    Drop tickers whose price history is empty or does not span the full period.

    Parameters:
    dataset: A dictionary where the key is the ticker and the value is a dataframe
    of that ticker's technical data, indexed by a pandas DatetimeIndex.

    expected_start: string in the format 'DD-MM-YYYY'; the date the first row of a
    complete dataframe must have.

    expected_end: string in the format 'DD-MM-YYYY'; the date the last row of a
    complete dataframe must have.

    Returns:
    clean_dataset: A shallow copy of `dataset` containing only the complete tickers
    (the input dictionary itself is not modified).
    removed_tickers: A list of the tickers that were dropped, in iteration order.
    """
    clean_dataset = dataset.copy()
    removed_tickers = []

    # Iterate over the argument itself (not a notebook-level global) so the
    # function works for any dictionary it is given.
    for ticker, frame in dataset.items():
        if len(frame) == 0:
            # Remove tickers that are empty
            is_complete = False
        else:
            # Remove tickers that don't contain the full dataset
            first_time = frame.index[0].strftime('%d-%m-%Y')
            last_time = frame.index[-1].strftime('%d-%m-%Y')
            is_complete = first_time == expected_start and last_time == expected_end

        if not is_complete:
            clean_dataset.pop(ticker, None)
            removed_tickers.append(ticker)

    return clean_dataset, removed_tickers

In [6]:
clean_tech_data, removed_tickers = tech_clean(technical_data)

In [7]:
clean_tech_data['MMM'].head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-09-30 00:00:00-04:00,165.629147,165.939037,159.841418,159.886993,3235600,0.0,0.0
2021-10-01 00:00:00-04:00,160.643498,161.700786,158.79325,161.035416,2419300,0.0,0.0
2021-10-04 00:00:00-04:00,158.729428,161.418223,158.392195,160.452072,3010100,0.0,0.0
2021-10-05 00:00:00-04:00,160.579685,162.794522,160.023713,162.129166,1888300,0.0,0.0
2021-10-06 00:00:00-04:00,160.989843,162.73983,159.97813,162.603104,2057600,0.0,0.0


### Get the train/testset for the LSTM model

The train and test sets for the LSTM model will cover the following range of dates:
2021-09-30 to 2023-06-30.

In [8]:
# Bounds of the train/test window as timezone-aware timestamps, so they can be
# compared directly against the tz-aware DatetimeIndex of the price dataframes.
# The last line displays `start_date` to check how it was parsed.
start_date = pd.Timestamp('2021-09-30 00:00:00-0400', tz='America/New_York')
end_date = pd.Timestamp('2023-06-30 00:00:00-0400', tz='America/New_York')
start_date

Timestamp('2021-09-30 00:00:00-0400', tz='America/New_York')

In [9]:
# Try the date filter on a single ticker before wrapping it in a function.
test_df = clean_tech_data['MMM']

# Keep only the rows whose date lies inside [start_date, end_date] (both inclusive).
test_df[(test_df.index >= start_date) & (test_df.index <= end_date)]

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-09-30 00:00:00-04:00,165.629147,165.939037,159.841418,159.886993,3235600,0.0,0.0
2021-10-01 00:00:00-04:00,160.643498,161.700786,158.793250,161.035416,2419300,0.0,0.0
2021-10-04 00:00:00-04:00,158.729428,161.418223,158.392195,160.452072,3010100,0.0,0.0
2021-10-05 00:00:00-04:00,160.579685,162.794522,160.023713,162.129166,1888300,0.0,0.0
2021-10-06 00:00:00-04:00,160.989843,162.739830,159.978130,162.603104,2057600,0.0,0.0
...,...,...,...,...,...,...,...
2023-06-26 00:00:00-04:00,98.526526,99.403412,98.004337,98.930489,3655400,0.0,0.0
2023-06-27 00:00:00-04:00,98.447701,98.743283,96.624962,96.812164,5429400,0.0,0.0
2023-06-28 00:00:00-04:00,96.812170,97.807282,96.329384,97.117599,3795600,0.0,0.0
2023-06-29 00:00:00-04:00,96.654523,97.994485,96.477175,97.777725,3496900,0.0,0.0


In [10]:
def get_train_test_subset(dict_technical_data, start_date, end_date, tz='America/New_York'):
    """
    This function takes a dictionary of technical data as input and returns a dictionary 
    containing only the subset of technical data used in the train/test set.
    
    Parameters:
    dict_technical_data : A dictionary where the key is the stock and the value is a 
    dataframe of the given stock's technical data, indexed by a timezone-aware
    DatetimeIndex.
    
    start_date: string in the format 'YYYY-MM-DD' and is start date of the train/test set
    
    end_date: string in the format 'YYYY-MM-DD' and is end date of the train/test set
    
    tz: timezone name the two dates are interpreted in (midnight local time)
    
    Returns:
    dict_train_test: A dict where the key is the stock and the value is a df containing 
    only the dates between start_date and end_date (both inclusive)
    """
    # Localise the dates instead of appending a fixed '-0400' offset: a fixed offset
    # is only right during daylight-saving time and moves the bound one hour back
    # in winter, which would exclude the end date itself.
    window_start = pd.Timestamp(start_date, tz=tz)
    window_end = pd.Timestamp(end_date, tz=tz)
    
    dict_train_test = {}
    
    for stock, data in dict_technical_data.items():
        in_window = (data.index >= window_start) & (data.index <= window_end)
        dict_train_test[stock] = data[in_window]
    
    return dict_train_test

In [11]:
dict_train_test = get_train_test_subset(clean_tech_data, '2021-09-30', '2023-06-30')

In [12]:
for stock, data in dict_train_test.items():
    print(stock,len(data))

MMM 440
AOS 440
ABT 440
ABBV 440
ACN 440
ATVI 440
ADM 440
ADBE 440
ADP 440
AAP 440
AES 440
AFL 440
A 440
APD 440
AKAM 440
ALK 440
ALB 440
ARE 440
ALGN 440
ALLE 440
LNT 440
ALL 440
GOOGL 440
GOOG 440
MO 440
AMZN 440
AMCR 440
AMD 440
AEE 440
AAL 440
AEP 440
AXP 440
AIG 440
AMT 440
AWK 440
AMP 440
ABC 440
AME 440
AMGN 440
APH 440
ADI 440
ANSS 440
AON 440
APA 440
AAPL 440
AMAT 440
APTV 440
ACGL 440
ANET 440
AJG 440
AIZ 440
T 440
ATO 440
ADSK 440
AZO 440
AVB 440
AVY 440
AXON 440
BKR 440
BALL 440
BAC 440
BBWI 440
BAX 440
BDX 440
WRB 440
BBY 440
BIO 440
TECH 440
BIIB 440
BLK 440
BK 440
BA 440
BKNG 440
BWA 440
BXP 440
BSX 440
BMY 440
AVGO 440
BR 440
BRO 440
BG 440
CHRW 440
CDNS 440
CZR 440
CPT 440
CPB 440
COF 440
CAH 440
KMX 440
CCL 440
CARR 440
CTLT 440
CAT 440
CBOE 440
CBRE 440
CDW 440
CE 440
CNC 440
CNP 440
CDAY 440
CF 440
CRL 440
SCHW 440
CHTR 440
CVX 440
CMG 440
CB 440
CHD 440
CI 440
CINF 440
CTAS 440
CSCO 440
C 440
CFG 440
CLX 440
CME 440
CMS 440
KO 440
CTSH 440
CL 440
CMCSA 440
CMA 440


### Get the LSTM model dataset

#### Helper Functions

In [13]:
import pandas as pd

def get_date_3_months_ahead(date_index):
    """
    Shift a pandas timestamp (or datetime index) forward by three calendar months.
    
    Parameters:
    date_index: pandas Timestamp or DatetimeIndex to shift
    
    Returns:
    The same kind of object, moved 3 calendar months ahead
    """
    three_months = pd.DateOffset(months=3)
    return date_index + three_months

# Example usage:
# Assuming 'date_indexes' is a pandas DatetimeIndex (a single Timestamp works too)
# date_indexes = pd.DatetimeIndex([...])  # Your pandas datetime index
# result_dates = get_date_3_months_ahead(date_indexes)
# print(result_dates)


In [14]:
def find_nearest_date(date_indexes, single_date):
    """
    Return the entry of `date_indexes` that lies closest in time to `single_date`.
    
    Parameters:
    date_indexes (array-like): Array of pandas datetime indexes to search in
    single_date (pandas.Timestamp): The date to look for
    
    Returns:
    pandas.Timestamp: The element of `date_indexes` with the smallest absolute
    distance to `single_date` (the first such element when there is a tie)
    """
    # Signed time gaps between every candidate and the target date
    gaps = pd.Index(date_indexes) - single_date
    
    # Position of the candidate with the smallest absolute gap
    closest_position = abs(gaps).argmin()
    
    return date_indexes[closest_position]

# Example usage / sanity check:
# look up the first date of the MMM index inside that same index. The date is
# present, so the result should be that date itself.
nearest_date = find_nearest_date(clean_tech_data['MMM'].index, clean_tech_data['MMM'].index[0])
print(nearest_date)

2021-09-30 00:00:00-04:00


#### Split each stock's df into 6-month chunks

In [16]:
import pandas as pd

def six_month_chunks(df):
    """
    Compute the start and end dates of consecutive 6-month chunks of `df`.

    Parameters:
    df (pandas dataframe): Dataframe indexed by a pandas DatetimeIndex

    Returns:
    chunk_starts: List of chunk start dates. These are the bin labels produced by
    `df.resample('6M')`, not necessarily dates present in `df.index`.
    chunk_ends: List of chunk end dates. Each is the date in `df.index` nearest to
    (start + 6 months - 1 day).

    A chunk is only emitted if (start + 6 months) is not later than the last date
    in `df`; iteration stops at the first bin that fails this check.
    """
    available_dates = df.index
    half_year = pd.DateOffset(months=6)
    one_day_back = pd.DateOffset(days=-1)

    chunk_starts = []
    chunk_ends = []

    # NOTE(review): the '6M' alias is deprecated in recent pandas releases in
    # favour of '6ME' — confirm against the installed pandas version.
    for bin_label, _ in df.resample('6M'):
        window_close = bin_label + half_year
        if window_close > available_dates[-1]:
            # Not enough data left for another full 6-month chunk
            break

        chunk_starts.append(bin_label)
        # Snap the nominal end (one day before the 6-month mark) to a date that
        # actually exists in the index
        chunk_ends.append(find_nearest_date(available_dates, window_close + one_day_back))

    return chunk_starts, chunk_ends

# Call the function to get start and end dates of 6-month chunks.
# The boundaries are derived from the MMM dataframe only; the same
# `start_dates` / `end_dates` lists are reused for every stock further down.
start_dates, end_dates = six_month_chunks(dict_train_test['MMM'])

# Print the results (chunks numbered from 1)
for i, (start, end) in enumerate(zip(start_dates, end_dates), 1):
    print(f"Chunk {i}: Start Date: {start}, End Date: {end}")


Chunk 1: Start Date: 2021-09-30 00:00:00-04:00, End Date: 2022-03-29 00:00:00-04:00
Chunk 2: Start Date: 2022-03-31 00:00:00-04:00, End Date: 2022-09-29 00:00:00-04:00
Chunk 3: Start Date: 2022-09-30 00:00:00-04:00, End Date: 2023-03-29 00:00:00-04:00


#### Create non-standardised dataset 

In [17]:
def create_LSTM_data_single_stock(stock_df, start_dates, end_dates):
    """
    This function creates the LSTM dataset (features and target) for each chunk 
    
    Parameters:
    stock_df (pandas dataframe): A pandas dataframe of the given stock's technical data,
    indexed by a pandas DatetimeIndex and containing the columns 'Open', 'High', 'Low',
    'Close' and 'Volume'
    start_dates: A list of the start dates of every chunk 
    end_dates: A list of end dates for every chunk 
    
    Returns:
    single_stock_lst_X: List of Numpy arrays, one per chunk, of shape
    (number of rows in the chunk, 5) with the columns 'Open','High','Low','Close'
    and 'Volume'.
    single_stock_array_y: Array containing the following value for every chunk:
    (The close price 3 months ahead of the last X sample in chunk /
    The close price on last X sample in chunk)
    
    Raises:
    KeyError: If the date 3 months after a chunk's end lies beyond the last date
    available in stock_df.
    """
    feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    datetime_index = stock_df.index
    last_available = datetime_index.max()
    single_stock_array_y = np.zeros(len(start_dates))
    single_stock_lst_X = []
    
    for chunk in range(len(start_dates)):
        # Snap the nominal chunk boundaries to dates that exist in this stock's index
        chunk_start = find_nearest_date(datetime_index, start_dates[chunk])
        chunk_end = find_nearest_date(datetime_index, end_dates[chunk])
        
        in_chunk = (datetime_index >= chunk_start) & (datetime_index <= chunk_end)
        single_stock_lst_X.append(stock_df.loc[in_chunk, feature_columns].to_numpy())
        
        # Denominator: close on the last day of THIS chunk (not a notebook-level
        # `end_date` variable, which would be the same day for every chunk).
        price_final = stock_df.loc[chunk_end, 'Close']
        
        # Numerator: close 3 months later. The exact calendar day may be a weekend
        # or holiday, so use the nearest date present in the index, but never
        # silently fall back when the data simply ends too early.
        target_date = get_date_3_months_ahead(chunk_end)
        if target_date > last_available:
            raise KeyError(
                f"No price data 3 months after chunk end {chunk_end} "
                f"(data ends at {last_available})"
            )
        target_date = find_nearest_date(datetime_index, target_date)
        price_3mon = stock_df.loc[target_date, 'Close']
        
        single_stock_array_y[chunk] = price_3mon / price_final
    
    return single_stock_lst_X, single_stock_array_y
                       
single_stock_lst_X, single_stock_array_y = create_LSTM_data_single_stock(dict_train_test['MMM'], start_dates, end_dates)

In [18]:
# Build the non-standardised LSTM dataset for every stock and record, per stock,
# the shortest and longest chunk (in rows) so that a common sequence length can
# be chosen afterwards.
LSTM_non_standardised = {}
min_seq_lens = []
max_seq_lens = []
for k, v in dict_train_test.items():
    # stock_data is the (list of X arrays, y array) pair for this stock; the same
    # chunk boundaries (start_dates / end_dates) are applied to every stock
    stock_data = create_LSTM_data_single_stock(v, start_dates, end_dates)
    LSTM_non_standardised[k] = stock_data
    
    # Number of rows (time steps) in each chunk of this stock
    seq_lens = []
    for chunk in stock_data[0]:
        seq_lens.append(chunk.shape[0])
    
    min_seq_lens.append(min(seq_lens))
    max_seq_lens.append(max(seq_lens))

# Extremes over all stocks and chunks
print("The min sequence length is:", min(min_seq_lens))
print("The max sequence length is:", max(max_seq_lens))

The min sequence length is: 124
The max sequence length is: 126


To make all sequence lengths equal, I will truncate every sequence to length 124. Any sequence longer than 124 will have its first 1 or 2 elements removed. 

#### Create Standardised Dataset

In [62]:
from sklearn.preprocessing import StandardScaler

In [59]:
def standardise_LSTM_dataset(non_standardised_dataset, sequence_len):
    """
    Apply `standardise_seq` to every X sequence of every stock.

    Parameters:
    non_standardised_dataset: A dictionary where the key is the stock and the value
    is a pair whose first element is the list of X sequences and whose second
    element is the y data
    sequence_len: The sequence length passed on to `standardise_seq`

    Returns:
    A new dictionary with the same keys, where each value is a tuple
    (list of standardised X sequences, the original y data unchanged)
    """
    return {
        ticker: (
            [standardise_seq(window, sequence_len) for window in bundle[0]],
            bundle[1],
        )
        for ticker, bundle in non_standardised_dataset.items()
    }
    
        
def standardise_seq(seq, sequence_len):
    """
    Trim a sequence to its last `sequence_len` rows and standardise it.

    Parameters:
    seq: 2-D array for one chunk (rows are time steps, columns are features)
    sequence_len: Number of trailing rows to keep; any extra rows at the start of
    the sequence are dropped

    Returns:
    The trimmed sequence transformed by a StandardScaler fitted on that same
    trimmed sequence, i.e. each sequence is scaled independently of the others
    """
    trailing_rows = seq[-sequence_len:]
    return StandardScaler().fit_transform(trailing_rows)

In [60]:
standardised_LSTM_data = standardise_LSTM_dataset(LSTM_non_standardised , min(min_seq_lens))

In [61]:
# Persist the standardised dataset (dict: stock -> (list of X arrays, y array)) to disk.
# NOTE(review): hard-coded absolute path — only runs on the author's machine.
with open("/Users/marcs/OneDrive/Documents/stock_analysis2/standardised_LSTM_data.pickle", 'wb') as f:
        pickle.dump(standardised_LSTM_data, f)