In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
from collections import defaultdict
import re
import os
from topquartile.modules.datamodule.dataloader import DataLoader
from topquartile.modules.datamodule.transforms import TechnicalCovariateTransform, FundamentalCovariateTransform

In [None]:
class DataLoader:
    """
    Loads Bloomberg-formatted data.

    The covariates CSV is read twice: once with ``skiprows=3`` to obtain the
    ticker header row, and once with ``skiprows=5`` to obtain the field header
    row plus the data, indexed by the first column (parsed as dates).

    pandas de-duplicates repeated header names by appending ``.1``, ``.2``, ...
    This class uses that numeric suffix to group field columns into one block
    per ticker, then stacks the blocks into a single long DataFrame with a
    ``ticker`` column.

    NOTE(review): this notebook-local class shadows the ``DataLoader`` imported
    from ``topquartile.modules.datamodule.dataloader`` in the imports cell.
    NOTE(review): block ``k`` is assumed to belong to the ``k``-th ticker in the
    ticker header, i.e. every ticker exports the same field names -- confirm
    against the actual export layout.
    """

    # Matches the trailing ".<digits>" suffix pandas adds to duplicated headers.
    _SUFFIX_RE = re.compile(r'\.(\d+)$')

    def __init__(
        self,
        covariates_id: str,
        labels_id: str,
        label_duration: int,
        pred_length: int = 20,
        n_train: int = 252,
        n_test: int = 30,
        n_embargo: int = 20,
        save: bool = True,
        save_directory: str = '',
    ):
        """
        Store configuration and derive the input file paths.

        Args:
            covariates_id: Stem of the covariates CSV; the file is looked up at
                ``<cwd>/../data/<covariates_id>.csv``.
            labels_id: Name of the labels entry under ``<cwd>/../data``. It is
                used verbatim (no extension is appended).
            label_duration: Also stored as ``remove_last_n``.
            pred_length, n_train, n_test, n_embargo, save, save_directory:
                Stored as-is; none of them is used by the code in this class.
        """
        self.covariates_id = covariates_id
        self.labels_id = labels_id
        self.label_duration = label_duration
        self.pred_length = pred_length
        self.n_train = n_train
        self.n_test = n_test
        self.n_embargo = n_embargo
        self.save = save
        self.save_directory = save_directory
        self.remove_last_n = self.label_duration

        self.covariates = None
        self.labels = None
        self.pred = None
        self.data = None
        # Filled by _load_data(); initialised here so the attribute always exists.
        self.tickernames = None

        # Paths are relative to the parent of the current working directory.
        cwd = Path.cwd()
        self.covariates_path = cwd.parent / 'data' / f"{self.covariates_id}.csv"
        self.labels_path = cwd.parent / 'data' / self.labels_id

    def _get_number(self, col_name: str) -> int:
        """
        Return the numeric suffix pandas appended to a duplicated column name.

        ``'PX_LAST.3'`` -> ``3``; a name without a trailing ``.<digits>``
        suffix -> ``0``.
        """
        match = self._SUFFIX_RE.search(col_name)
        return int(match.group(1)) if match else 0

    def _load_data(self) -> pd.DataFrame:
        """
        Read the covariates CSV and return it in long format.

        Steps:
            1. Read the ticker header row and keep the first four characters of
               every non-``Unnamed`` header as the ticker code.
            2. Read the data, drop all-NaN rows and columns, and parse the
               index as datetimes.
            3. Group columns into per-ticker blocks by their numeric suffix.
            4. Keep only the first block for each ticker code, strip the
               numeric suffix from the column names, tag the block with a
               ``ticker`` column and concatenate all blocks.

        Side effects:
            Sets ``self.tickernames`` (unique ticker codes, in file order) and
            ``self.data`` (the returned frame).

        Returns:
            The concatenated DataFrame, also stored in ``self.data``.

        Raises:
            ValueError: If no covariate columns remain after dropping all-NaN
                columns, so there is nothing to concatenate.
        """
        ticker_header = pd.read_csv(self.covariates_path,
                                    skiprows=3, low_memory=False)
        # Duplicated tickers are mangled by pandas (e.g. "IMJS IJ EQUITY.1"), so
        # only the first four characters are kept as the ticker code.
        tickernames = [
            ticker[:4]
            for ticker in ticker_header.columns
            if not ticker.startswith('Unnamed')
        ]

        covariates = pd.read_csv(self.covariates_path, skiprows=5, index_col=0, low_memory=False)
        covariates = covariates.dropna(axis=0, how='all').dropna(axis=1, how='all')
        covariates.index = pd.to_datetime(covariates.index, format='mixed')

        # Block number -> columns carrying that numeric suffix (0 = no suffix).
        columns_by_block = defaultdict(list)
        for col in covariates.columns:
            columns_by_block[self._get_number(col)].append(col)

        if not columns_by_block:
            raise ValueError(
                f"No covariate columns found in {self.covariates_path}"
            )

        seen = set()
        unique_tickernames = []
        frames = []
        for position, ticker in enumerate(tickernames):
            # Only the first occurrence of a ticker code is kept.
            if ticker in seen:
                continue
            seen.add(ticker)
            unique_tickernames.append(ticker)

            # A ticker whose columns were all NaN has no block left; looking the
            # block up by position (instead of indexing a list) avoids an
            # IndexError when such tickers sit at the end of the header.
            cols = columns_by_block.get(position, [])
            if not cols:
                continue

            block = covariates[cols].copy()
            if position != 0:
                # Strip only the trailing ".N" so field names that themselves
                # contain a dot are left intact.
                block.columns = [self._SUFFIX_RE.sub('', col) for col in block.columns]
            # Label with the ticker this block was read for. Indexing the
            # un-deduplicated ticker list by the de-duplicated position would
            # mislabel every block that follows a dropped duplicate.
            block['ticker'] = ticker
            frames.append(block)

        self.tickernames = unique_tickernames

        if not frames:
            raise ValueError(
                f"No covariate data matched any ticker in {self.covariates_path}"
            )

        self.data = pd.concat(frames)

        return self.data

In [None]:
# Instantiate the notebook-local DataLoader for the 'dec2024' covariates file.
# NOTE(review): labels_id='hello' looks like a placeholder -- confirm.
dataloader = DataLoader(
    covariates_id='dec2024',
    labels_id='hello',
    label_duration=20,
)

In [None]:
# Despite its name, `covlist` is a single DataFrame: `_load_data` returns the
# concatenation of the per-ticker frames, tagged by a 'ticker' column.
covlist = dataloader._load_data()

In [None]:
# Display the loaded covariates frame as the cell output.
covlist

In [None]:
# Tickers of the rows whose PX_LAST value is missing (one entry per such row).
px_last_missing = covlist['PX_LAST'].isna()
missing_tickers = covlist.loc[px_last_missing, 'ticker']
print(missing_tickers)