In [1]:
import os 
os.chdir('/Users/mszwagrzak')

In [3]:
import matplotlib.pyplot as plt
from pickle_loader import pickle_loader
import datetime as dt
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import LinearRegression


In [4]:
technical_data = pickle_loader(r"C:\Users\mszwagrzak\OneDrive - Genomics England Ltd\Documents\stock_analysis2\batch_1\technical_us.pickle")

In [5]:
technical_data['MMM'].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 503 entries, 2021-09-30 00:00:00-04:00 to 2023-09-29 00:00:00-04:00
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Open          503 non-null    float64
 1   High          503 non-null    float64
 2   Low           503 non-null    float64
 3   Close         503 non-null    float64
 4   Volume        503 non-null    int64  
 5   Dividends     503 non-null    float64
 6   Stock Splits  503 non-null    float64
dtypes: float64(6), int64(1)
memory usage: 31.4 KB


In [9]:
list(technical_data['MMM']['Close'])

[159.88699340820312,
 161.03541564941406,
 160.4520721435547,
 162.12916564941406,
 162.6031036376953,
 162.05624389648438,
 161.28150939941406,
 160.67080688476562,
 159.9872589111328,
 161.4820098876953,
 164.3531036376953,
 165.82965087890625,
 165.629150390625,
 166.16688537597656,
 166.26715087890625,
 165.9937286376953,
 164.79061889648438,
 166.24891662597656,
 166.0301971435547,
 162.45729064941406,
 163.94293212890625,
 162.85829162597656,
 163.6786346435547,
 165.8570098876953,
 166.0939483642578,
 164.61741638183594,
 165.70205688476562,
 165.46505737304688,
 165.27365112304688,
 166.26715087890625,
 164.25283813476562,
 167.39735412597656,
 167.07833862304688,
 167.1330108642578,
 167.10569763183594,
 164.26637268066406,
 164.68905639648438,
 165.0841827392578,
 164.90042114257812,
 163.21888732910156,
 161.28004455566406,
 161.93243408203125,
 156.24462890625,
 156.878662109375,
 156.44677734375,
 158.58773803710938,
 162.82374572753906,
 162.22647094726562,
 161.757843017

### Remove Tickers that don't contain the full dataset 

In [11]:
def tech_clean(dataset):
    clean_dataset = dataset.copy()
    tickers = list(technical_data.keys())
    removed_tickers = []
    
    for ticker in tickers:
        if len(dataset[ticker]) == 0: # Remove tickers that are empty 
            clean_dataset.pop(ticker, None)
            removed_tickers.append(ticker)
        else: # Remove tickers that don't contain the full dataset 
            first_time = dataset[ticker].index.to_pydatetime()[0].strftime('%d-%m-%Y')
            last_time = dataset[ticker].index.to_pydatetime()[-1].strftime('%d-%m-%Y')
            if first_time != '30-09-2021' or last_time != '29-09-2023':
                clean_dataset.pop(ticker, None)
                removed_tickers.append(ticker)
    
    return clean_dataset, removed_tickers 

In [13]:
clean_tech_data, removed_tickers = tech_clean(technical_data)

In [15]:
def get_train_test_subset(dict_technical_data, start_date, end_date):
    """
    This function takes a dictionary of technical data as input and returns a dictionary 
    containing only the subset of technical data used in the train/test set.
    
    Parameters:
    dict_fundamental_data : A dictionary where the key is the stock and the value is a 
    dataframe of the given stocks technical data.
    
    start_date: string in the format 'YYYY-MM-DD' and is start date of the train/test set
    
    end_date: string in the format 'YYYY-MM-DD' and is end date of the train/test set
    
    Returns:
    dict_train_test: A dict where the key is the stock and the value is a df containing 
    only the dates between start_date and end_date
    """
    # Convert the start and end date in pd.Timestamp format
    start_date = pd.Timestamp(start_date + ' 00:00:00-0400', tz='America/New_York')
    end_date = pd.Timestamp(end_date + ' 00:00:00-0400', tz='America/New_York')
    
    dict_train_test = {}
    
    for stock, data in dict_technical_data.items():
        stock_df = data[(data.index >= start_date) & (data.index <= end_date)]
        dict_train_test[stock] = stock_df
    
    return dict_train_test

In [17]:
dict_train_test = get_train_test_subset(clean_tech_data, '2021-09-30', '2023-06-30')

## Get Grad

In [65]:
class get_gradients:


    
    def __init__(self,  chunk_end_dates, data): 
        self.chunk_end_dates = chunk_end_dates
        self.data = data

    def get_gradient(self, time_periods = (1,3,6,12)):
        self.time_periods = time_periods
        self.chunk_dates = self.n_chunks(time_periods)
        print(self.chunk_dates)
        self.chunked_data = self.split_data()
        self.gradients = self.get_gradients()

        return self.gradients  

    def get_gradients(self):
        # get the gradients for a given chunk of data
        chunked_data = self.chunked_data
        gradients = {}

        for time_period, chunks in chunked_data.items():
            time_period_grad = []
            
            for chunk in chunks:
                close_prices = list(chunk['Close'])
                X = close_prices[:-1]
                y = close_prices[1:]
                reg = LinearRegression().fit(X, y)
                time_period_grad.append(reg.coef_)
                del reg

            gradients[time_period] = time_period_grad

        return gradients
    
    def split_data(self):
        # splits data_set based of chunk dates
        chunk_dates = self.chunk_dates
        data = self.data
        chunked_data = {}

        for n_months, dates in chunk_dates.items():
            print(dates)
            chunked_data[n_months] = [data[(data.index >= start_date) & (data.index <= end_date)] for start_date, end_date in dates]

        return chunked_data
 
    def n_chunks(self, chunk_sizes):
        # get the start and end date foe each chunk 
        chunk_end_dates = self.chunk_end_dates 
        chunk_sizes = self.time_periods
        n_chunk_dates = {}

        for date in chunk_end_dates:
            current_chunks = []
            
            for chunk in chunk_sizes:
                current_chunks.append((date - pd.DateOffset(months=chunk), date))

            n_chunk_dates[chunk] = current_chunks

        return n_chunk_dates

In [67]:
fundamental_data = pickle_loader(r"C:\Users\mszwagrzak\OneDrive - Genomics England Ltd\Documents\stock_analysis2\final_fundamental_dict.pickle")
chunk_end_dates = fundamental_data['MMM'].columns
grad_obj = get_gradients(chunk_end_dates=chunk_end_dates, data=dict_train_test['MMM'])
grads = grad_obj.get_gradient()

{12: [(Timestamp('2022-08-30 00:00:00'), Timestamp('2022-09-30 00:00:00')), (Timestamp('2022-06-30 00:00:00'), Timestamp('2022-09-30 00:00:00')), (Timestamp('2022-03-30 00:00:00'), Timestamp('2022-09-30 00:00:00')), (Timestamp('2021-09-30 00:00:00'), Timestamp('2022-09-30 00:00:00'))]}
[(Timestamp('2022-08-30 00:00:00'), Timestamp('2022-09-30 00:00:00')), (Timestamp('2022-06-30 00:00:00'), Timestamp('2022-09-30 00:00:00')), (Timestamp('2022-03-30 00:00:00'), Timestamp('2022-09-30 00:00:00')), (Timestamp('2021-09-30 00:00:00'), Timestamp('2022-09-30 00:00:00'))]


TypeError: Invalid comparison between dtype=datetime64[ns, America/New_York] and Timestamp

In [6]:
class test():

    def __init__(self, number):
        self.number = number

    def square_root(self):
        return self.square() 

    def square(self):
        return self.number * self.number 

test_obj = test(4)
test_obj.square_root()

    

16

In [13]:
lst = [1,2,3,4]
print(lst[:-1], lst[1:])

[1, 2, 3] [2, 3, 4]


In [22]:
import numpy as np
from sklearn.linear_model import LinearRegression
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# y = 1 * x_0 + 2 * x_1 + 3
y = np.dot(X, np.array([1, 2])) + 3
reg = LinearRegression().fit(X, y)


reg.score(X, y)
print(reg.coef_)
print(reg.intercept_)
reg.predict(np.array([[3, 5]]))

[1. 2.]
3.0


array([16.])