In [2]:
import os 
# NOTE(review): hard-coded, machine-specific working directory. Anything resolved
# relative to the current directory in later cells depends on this call having run.
os.chdir('/Users/marcs')

In [3]:
pwd

'/Users/marcs'

In [None]:
import matplotlib.pyplot as plt
from pickle_loader import pickle_loader
import datetime as dt
import pandas as pd


In [5]:
# Load the per-ticker technical datasets; later cells index the result by ticker
# symbol and call .keys()/.pop() on it.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
# NOTE(review): pickle_loader presumably unpickles this file; only load pickles
# from a trusted source — confirm.
technical_data = pickle_loader('/Users/marcs/OneDrive/Documents/stock_analysis2/technical_us.pickle')
                        


## Remove empty entries 

A few of the stock symbols I used to source the data were not valid, so their entries are empty. In this section I remove those invalid symbols.

In [8]:
# Materialise the ticker symbols as a list so the following cells can scan them.
tickers = list(technical_data.keys())

In [9]:
# Symbols whose dataset has zero rows (nothing was returned for them).
empty_tickers = [symbol for symbol in tickers if not len(technical_data[symbol])]

In [10]:
empty_tickers

['BRK.B', 'BF.B', 'RE', 'FISV', 'PKI']

In [11]:
# Drop the empty entries. This mutates `technical_data` in place, so every later
# cell sees the reduced mapping; the `None` default keeps re-running this cell
# from raising once a symbol is already gone.
for empty in empty_tickers:
    technical_data.pop(empty, None)

## Check for full data 

Validate that each symbol has a technical dataset starting on '2021-09-30' and ending on '2023-09-29'.

In [12]:
technical_data['MMM'].index.to_pydatetime()[0].strftime('%d-%m-%Y')

'30-09-2021'

In [13]:
technical_data['MMM'].index.to_pydatetime()[-1].strftime('%d-%m-%Y')

'29-09-2023'

In [14]:
tickers = list(technical_data.keys())

# First and last dates a complete dataset must have, written the same way
# strftime('%d-%m-%Y') renders them.
expected_span = ('30-09-2021', '29-09-2023')

# Map each ticker to True when its index covers exactly the expected span.
full_data = {}
for ticker in tickers:
    trading_days = technical_data[ticker].index.to_pydatetime()
    observed_span = (
        trading_days[0].strftime('%d-%m-%Y'),
        trading_days[-1].strftime('%d-%m-%Y'),
    )
    full_data[ticker] = observed_span == expected_span

In [16]:
# Tickers flagged as not covering the full date range. The flags in `full_data`
# are plain booleans, so test them directly instead of comparing with `== False`.
not_full = [symbol for symbol, is_full in full_data.items() if not is_full]
not_full

['CEG', 'GEHC']

In [17]:
technical_data['CEG'].index

DatetimeIndex(['2022-01-19 00:00:00-05:00', '2022-01-20 00:00:00-05:00',
               '2022-01-21 00:00:00-05:00', '2022-01-24 00:00:00-05:00',
               '2022-01-25 00:00:00-05:00', '2022-01-26 00:00:00-05:00',
               '2022-01-27 00:00:00-05:00', '2022-01-28 00:00:00-05:00',
               '2022-01-31 00:00:00-05:00', '2022-02-01 00:00:00-05:00',
               ...
               '2023-09-18 00:00:00-04:00', '2023-09-19 00:00:00-04:00',
               '2023-09-20 00:00:00-04:00', '2023-09-21 00:00:00-04:00',
               '2023-09-22 00:00:00-04:00', '2023-09-25 00:00:00-04:00',
               '2023-09-26 00:00:00-04:00', '2023-09-27 00:00:00-04:00',
               '2023-09-28 00:00:00-04:00', '2023-09-29 00:00:00-04:00'],
              dtype='datetime64[ns, America/New_York]', name='Date', length=427, freq=None)

In [18]:
technical_data['GEHC'].index

DatetimeIndex(['2022-12-15 00:00:00-05:00', '2022-12-16 00:00:00-05:00',
               '2022-12-19 00:00:00-05:00', '2022-12-20 00:00:00-05:00',
               '2022-12-21 00:00:00-05:00', '2022-12-22 00:00:00-05:00',
               '2022-12-23 00:00:00-05:00', '2022-12-27 00:00:00-05:00',
               '2022-12-28 00:00:00-05:00', '2022-12-29 00:00:00-05:00',
               ...
               '2023-09-18 00:00:00-04:00', '2023-09-19 00:00:00-04:00',
               '2023-09-20 00:00:00-04:00', '2023-09-21 00:00:00-04:00',
               '2023-09-22 00:00:00-04:00', '2023-09-25 00:00:00-04:00',
               '2023-09-26 00:00:00-04:00', '2023-09-27 00:00:00-04:00',
               '2023-09-28 00:00:00-04:00', '2023-09-29 00:00:00-04:00'],
              dtype='datetime64[ns, America/New_York]', name='Date', length=198, freq=None)

In [19]:
# Drop the tickers whose history does not span the full window. This mutates
# `technical_data` in place; the `None` default makes the cell safe to re-run.
for ticker in not_full:
    technical_data.pop(ticker, None)

## Technical Data Cleaning Function

In [9]:
def tech_clean(dataset, start_date='30-09-2021', end_date='29-09-2023'):
    """Drop tickers whose dataset is empty or does not span the expected dates.

    Parameters
    ----------
    dataset : dict
        Maps a ticker symbol to its dataset. Each dataset must support
        ``len()`` and expose a date-like ``index`` (in this notebook, a
        pandas DataFrame with a DatetimeIndex).
    start_date : str, optional
        Required date of the first row, formatted as ``'%d-%m-%Y'``.
    end_date : str, optional
        Required date of the last row, formatted as ``'%d-%m-%Y'``.

    Returns
    -------
    clean_dataset : dict
        Shallow copy of ``dataset`` without the rejected tickers; ``dataset``
        itself is left untouched.
    removed_tickers : list
        Rejected ticker symbols, in the order they appear in ``dataset``.
    """
    date_format = '%d-%m-%Y'
    clean_dataset = dataset.copy()
    removed_tickers = []

    # Walk the keys of the argument itself, not the notebook-level
    # `technical_data`, so the function works on whatever mapping it is given.
    for ticker in list(dataset.keys()):
        frame = dataset[ticker]
        if len(frame) == 0:
            # Empty entry: there is no first/last row to inspect.
            is_complete = False
        else:
            # Only the two end points are needed, so read them straight from
            # the index rather than converting the whole index first.
            first_time = frame.index[0].strftime(date_format)
            last_time = frame.index[-1].strftime(date_format)
            is_complete = first_time == start_date and last_time == end_date

        if not is_complete:
            clean_dataset.pop(ticker, None)
            removed_tickers.append(ticker)

    return clean_dataset, removed_tickers

In [10]:
# Run the consolidated cleaning step; the input mapping is copied inside
# tech_clean, so `technical_data` itself is not modified by this call.
# NOTE(review): the recorded output lists symbols that the manual cells above
# already popped, so this cell was evidently run against freshly loaded data —
# confirm the intended execution order.
clean_tech_data, removed_tickers = tech_clean(technical_data)

In [11]:
removed_tickers

['BRK.B', 'BF.B', 'CEG', 'RE', 'FISV', 'GEHC', 'PKI']