In [6]:
import os 
# NOTE(review): hard-coded, machine-specific working directory. This is a
# POSIX-style path, while the pickle loaded further down is addressed with an
# absolute Windows path.
os.chdir('/Users/mszwagrzak')

In [8]:
import matplotlib.pyplot as plt
from pickle_loader import pickle_loader
import datetime as dt
import pandas as pd
import numpy as np
import pickle

In [10]:
# Load the technical data through the project-local pickle_loader helper. The
# result is used below as a mapping keyed by ticker (e.g. 'MMM') whose values
# are DataFrames.
# NOTE(review): absolute, user-specific path; presumably this unpickles the
# file, so only load files from a trusted source.
technical_data = pickle_loader(r"C:\Users\mszwagrzak\OneDrive - Genomics England Ltd\Documents\stock_analysis2\batch_1\technical_us.pickle")

In [16]:
# Sanity check: show index range, columns and dtypes for a single ticker.
technical_data['MMM'].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 503 entries, 2021-09-30 00:00:00-04:00 to 2023-09-29 00:00:00-04:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          503 non-null    float64
 1   High          503 non-null    float64
 2   Low           503 non-null    float64
 3   Close         503 non-null    float64
 4   Volume        503 non-null    int64  
 5   Dividends     503 non-null    float64
 6   Stock Splits  503 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 31.4 KB


### Remove Tickers that don't contain the full dataset 

In [18]:
def tech_clean(dataset, expected_start='30-09-2021', expected_end='29-09-2023'):
    """
    Drop tickers whose data is empty or does not span the expected date window.

    Parameters:
    dataset : A dictionary where the key is the ticker and the value is a
    dataframe with a datetime index.

    expected_start: string in the format 'DD-MM-YYYY'; the date the first row
    of a complete ticker must have

    expected_end: string in the format 'DD-MM-YYYY'; the date the last row
    of a complete ticker must have

    Returns:
    clean_dataset: A shallow copy of dataset without the removed tickers. The
    dataset passed in is not modified.

    removed_tickers: A list of the tickers that were dropped, in the order
    they were encountered.
    """
    date_format = '%d-%m-%Y'
    clean_dataset = dataset.copy()
    removed_tickers = []

    # Walk the keys of the dataset that was passed in (not a notebook global),
    # so the function works for any mapping handed to it.
    for ticker in list(dataset.keys()):
        frame = dataset[ticker]

        if len(frame) == 0:
            # Empty frames can never cover the window.
            is_complete = False
        else:
            # Only the first and last index entries matter; no need to
            # convert the whole index.
            first_day = frame.index[0].strftime(date_format)
            last_day = frame.index[-1].strftime(date_format)
            is_complete = first_day == expected_start and last_day == expected_end

        if not is_complete:
            clean_dataset.pop(ticker, None)
            removed_tickers.append(ticker)

    return clean_dataset, removed_tickers

In [20]:
# Drop empty/incomplete tickers and keep the list of removed tickers for reference.
clean_tech_data, removed_tickers = tech_clean(technical_data)

In [21]:
# Display the index entries of one cleaned ticker.
# NOTE(review): this dumps the whole index; a slice would keep the output short.
list(clean_tech_data['MMM'].index)

[Timestamp('2021-09-30 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-01 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-04 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-05 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-06 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-07 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-08 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-11 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-12 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-13 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-14 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-15 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-18 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-19 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-20 00:00:00-0400', tz='America/New_York'),
 Timestamp('2021-10-21 00:00:00-0400', tz='America/New_

In [8]:
def get_train_test_subset(dict_technical_data, start_date, end_date, tz='America/New_York'):
    """
    This function takes a dictionary of technical data as input and returns a dictionary
    containing only the subset of technical data used in the train/test set.

    Parameters:
    dict_technical_data : A dictionary where the key is the stock and the value is a
    dataframe of the given stocks technical data, indexed by timezone-aware timestamps.

    start_date: string in the format 'YYYY-MM-DD' and is start date of the train/test set

    end_date: string in the format 'YYYY-MM-DD' and is end date of the train/test set

    tz: name of the timezone in which start_date and end_date are interpreted
    (midnight local time); defaults to 'America/New_York'

    Returns:
    dict_train_test: A dict where the key is the stock and the value is a df containing
    only the dates between start_date and end_date (both inclusive)
    """
    # Localise the naive dates to midnight in the target timezone. Appending a
    # fixed '-0400' offset is only right during daylight-saving time: in winter
    # the bound would land at 23:00 on the previous day and the end-date row
    # would be dropped.
    start_ts = pd.Timestamp(start_date, tz=tz)
    end_ts = pd.Timestamp(end_date, tz=tz)

    return {
        stock: data[(data.index >= start_ts) & (data.index <= end_ts)]
        for stock, data in dict_technical_data.items()
    }

In [15]:
# Restrict every cleaned ticker to the train/test window 2021-09-30 .. 2023-06-30.
dict_train_test = get_train_test_subset(clean_tech_data, '2021-09-30', '2023-06-30')

### Helper functions for splitting the training chunks

In [17]:
def find_nearest_date(date_indexes, single_date):
    """
    Return the entry of date_indexes that lies closest in time to single_date.

    Parameters:
    date_indexes (array-like): Array of pandas datetime indexes
    single_date (pandas.Timestamp): The date to match against

    Returns:
    pandas.Timestamp: The element of date_indexes with the smallest absolute
    time difference to single_date
    """
    # Absolute gap between every candidate and the target date
    gaps = abs(pd.Index(date_indexes) - single_date)

    # Position of the smallest gap selects the matching entry
    return date_indexes[gaps.argmin()]

# Example usage: look up the first index entry of 'MMM' within its own index,
# so the nearest date is that entry itself.
nearest_date = find_nearest_date(clean_tech_data['MMM'].index, clean_tech_data['MMM'].index[0])
print(nearest_date)

2021-09-30 00:00:00-04:00


#### Find 1-, 3-, 6- and 12-month chunks before the prediction date

In [38]:
import pandas as pd

def n_chunks(chunk_sizes, start_date, end_date, step_months=3):
    """
    Build the (start, end) date windows for several chunk sizes.

    Parameters:
    chunk_sizes: iterable of ints, each a chunk length in months
    start_date (pandas.Timestamp): start of the first window
    end_date (pandas.Timestamp): no window may end after this date
    step_months (int): number of months the window start advances between
    consecutive windows; defaults to 3

    Returns:
    dict: maps each chunk size to its list of (window_start, window_end) tuples
    """
    # Pass the arguments through explicitly so the result depends only on the
    # inputs, not on notebook-level variables that happen to share their names.
    return {
        chunk: _chunk_windows(chunk, start_date, end_date, step_months)
        for chunk in chunk_sizes
    }


def split_into_chunks(n_months_chunk_size):
    """
    Build the windows for one chunk size using the module-level date range.

    Kept for existing callers: reads the module-level variables start_date and
    end_date and advances the window start by 3 months at a time.

    Parameters:
    n_months_chunk_size (int): chunk length in months

    Returns:
    list: (window_start, window_end) tuples
    """
    return _chunk_windows(n_months_chunk_size, start_date, end_date)


def _chunk_windows(n_months_chunk_size, start_date, end_date, step_months=3):
    """Return every (start, start + chunk length) window that ends on or before end_date."""
    if step_months <= 0:
        # A non-positive step would never advance the window start.
        raise ValueError("step_months must be a positive number of months")

    chunk_length = pd.DateOffset(months=n_months_chunk_size)
    step = pd.DateOffset(months=step_months)
    # Latest start for which a full chunk still fits before end_date
    last_start = end_date - chunk_length

    chunks = []
    current_start = start_date
    while current_start <= last_start:
        chunks.append((current_start, current_start + chunk_length))
        # Move the start date forward by the step, independent of the chunk size
        current_start = current_start + step

    return chunks

# Use the first and last index entries of one ticker as the overall date range.
# These module-level names are also read directly by split_into_chunks.
dates = list(clean_tech_data['MMM'].index)
start_date = dates[0]
end_date = dates[-1]

# Build the (start, end) windows for the 1-, 3-, 6- and 12-month chunk sizes.
chunk_dates = n_chunks((1,3,6,12), start_date, end_date)


In [34]:
# Windows for the 1-month chunk size.
print(chunk_dates[1])

[(Timestamp('2021-09-30 00:00:00-0400', tz='America/New_York'), Timestamp('2021-10-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2021-12-30 00:00:00-0500', tz='America/New_York'), Timestamp('2022-01-30 00:00:00-0500', tz='America/New_York')), (Timestamp('2022-03-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-04-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-06-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-07-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-09-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-10-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-12-30 00:00:00-0500', tz='America/New_York'), Timestamp('2023-01-30 00:00:00-0500', tz='America/New_York')), (Timestamp('2023-03-30 00:00:00-0400', tz='America/New_York'), Timestamp('2023-04-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2023-06-30 00:00:00-0400', tz='America/New_York'), Timestamp('2023-07-30 00:00:00-0400', tz='America/New_

In [40]:
# Windows for the 3-month chunk size.
print(chunk_dates[3])

[(Timestamp('2021-09-30 00:00:00-0400', tz='America/New_York'), Timestamp('2021-12-30 00:00:00-0500', tz='America/New_York')), (Timestamp('2021-12-30 00:00:00-0500', tz='America/New_York'), Timestamp('2022-03-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-03-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-06-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-06-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-09-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-09-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-12-30 00:00:00-0500', tz='America/New_York')), (Timestamp('2022-12-30 00:00:00-0500', tz='America/New_York'), Timestamp('2023-03-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2023-03-30 00:00:00-0400', tz='America/New_York'), Timestamp('2023-06-30 00:00:00-0400', tz='America/New_York'))]


In [42]:
# Windows for the 6-month chunk size.
print(chunk_dates[6])

[(Timestamp('2021-09-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-03-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2021-12-30 00:00:00-0500', tz='America/New_York'), Timestamp('2022-06-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-03-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-09-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-06-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-12-30 00:00:00-0500', tz='America/New_York')), (Timestamp('2022-09-30 00:00:00-0400', tz='America/New_York'), Timestamp('2023-03-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-12-30 00:00:00-0500', tz='America/New_York'), Timestamp('2023-06-30 00:00:00-0400', tz='America/New_York'))]


In [44]:
# Windows for the 12-month chunk size.
print(chunk_dates[12])

[(Timestamp('2021-09-30 00:00:00-0400', tz='America/New_York'), Timestamp('2022-09-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2021-12-30 00:00:00-0500', tz='America/New_York'), Timestamp('2022-12-30 00:00:00-0500', tz='America/New_York')), (Timestamp('2022-03-30 00:00:00-0400', tz='America/New_York'), Timestamp('2023-03-30 00:00:00-0400', tz='America/New_York')), (Timestamp('2022-06-30 00:00:00-0400', tz='America/New_York'), Timestamp('2023-06-30 00:00:00-0400', tz='America/New_York'))]


## Get gradients

In [None]:
class get_gradients:
    """
    Compute per-window trend slopes ("gradients") for one dataframe.

    For every end date and every look-back size (in months) the data is sliced
    to the window [end_date - size, end_date], and a least-squares slope is
    fitted to each numeric column of that slice.

    Note that the class and its method get_gradients share a name; on an
    instance, get_gradients refers to the method.
    """

    def __init__(self, chunk_end_dates, data):
        """
        Parameters:
        chunk_end_dates: iterable of pandas.Timestamp, each the end of a window
        data: dataframe whose index can be compared with those timestamps
        """
        self.chunk_end_dates = chunk_end_dates
        self.data = data

    def get_gradient(self, time_periods=(1, 3, 6, 12)):
        """
        Run the full pipeline: build windows, slice the data, fit the slopes.

        Parameters:
        time_periods: iterable of ints, look-back sizes in months

        Returns:
        dict: the value also stored on self.gradients (see get_gradients)
        """
        self.chunk_dates = self.n_chunks(time_periods)
        self.chunked_data = self.split_data()
        self.gradients = self.get_gradients()
        return self.gradients

    def get_gradients(self):
        """
        Fit a slope to every chunk stored in self.chunked_data.

        Returns:
        dict: maps each time period to a list with one pandas.Series per chunk;
        each Series holds the slope of every numeric column in that chunk
        """
        return {
            time_period: [self._slope_per_column(chunk) for chunk in chunks]
            for time_period, chunks in self.chunked_data.items()
        }

    @staticmethod
    def _slope_per_column(chunk):
        """Least-squares slope of each numeric column against the row position."""
        # NOTE(review): the original left the gradient computation unwritten.
        # The slope here is per observation (row), not per calendar day, and
        # covers all numeric columns; confirm this matches the intended feature.
        numeric = chunk.select_dtypes(include='number')
        n_rows = len(numeric)

        if n_rows < 2:
            # A slope is undefined for fewer than two observations.
            return pd.Series(np.nan, index=numeric.columns, dtype=float)

        positions = np.arange(n_rows, dtype=float)
        centred = positions - positions.mean()
        values = numeric.to_numpy(dtype=float)
        # Closed-form simple linear regression: cov(x, y) / var(x), per column
        slopes = centred @ values / (centred @ centred)
        return pd.Series(slopes, index=numeric.columns)

    def split_data(self):
        """
        Slice self.data into one dataframe per window in self.chunk_dates.

        Returns:
        dict: maps each look-back size to a list of dataframes, one per
        (start, end) window, with both bounds inclusive
        """
        data = self.data
        return {
            n_months: [
                data[(data.index >= window_start) & (data.index <= window_end)]
                for window_start, window_end in windows
            ]
            for n_months, windows in self.chunk_dates.items()
        }

    def n_chunks(self, chunk_sizes):
        """
        Build the look-back windows ending at each of self.chunk_end_dates.

        Parameters:
        chunk_sizes: iterable of ints, look-back sizes in months

        Returns:
        dict: maps each chunk size to a list of (end - size months, end)
        tuples, one per end date
        """
        # Materialise once so a one-shot iterable can be reused for every size.
        end_dates = list(self.chunk_end_dates)
        return {
            chunk_size: [
                (end - pd.DateOffset(months=chunk_size), end) for end in end_dates
            ]
            for chunk_size in chunk_sizes
        }
    
    
  
        
        




In [6]:
class test:
    """Scratch class used to check that one method can call another via self."""

    def __init__(self, number):
        """Store the number the other methods operate on."""
        self.number = number

    def square_root(self):
        """Delegate to square(); despite its name this returns the square of the number."""
        squared = self.square()
        return squared

    def square(self):
        """Return the stored number multiplied by itself."""
        value = self.number
        return value * value

# Quick check that a method can call another method on the same instance.
test_obj = test(4)
test_obj.square_root()

    

16

In [22]:
import numpy as np
from sklearn.linear_model import LinearRegression
# Toy design matrix: four samples with two features each.
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# y = 1 * x_0 + 2 * x_1 + 3
y = np.dot(X, np.array([1, 2])) + 3
# Fit a linear regression to the exact linear relationship above.
reg = LinearRegression().fit(X, y)


# NOTE: the score is computed but not displayed, because it is not the last
# expression of the cell.
reg.score(X, y)
# Fitted coefficients and intercept.
print(reg.coef_)
print(reg.intercept_)
# Prediction for a new sample; as the last expression it is shown as the cell output.
reg.predict(np.array([[3, 5]]))

[1. 2.]
3.0


array([16.])