In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

def count_csv_rows(file_path, has_header=True):
    """
    Counts the number of rows in a CSV file.
    
    Parameters:
    - file_path (Path): The path to the CSV file.
    - has_header (bool): Whether the CSV file has a header row.
    
    Returns:
    - int: The number of data rows in the CSV file.
    """
    with file_path.open('r', encoding='utf-8') as file:
        row_count = sum(1 for _ in file)
    return row_count - 1 if has_header else row_count

def enumerate_and_sort_csv(datadir, has_header=True):
    """
    Enumerates CSV files in the given directory, counts their rows, and sorts them.
    
    Parameters:
    - datadir (str or Path): The directory containing CSV files.
    - has_header (bool): Whether the CSV files have header rows.
    
    Returns:
    - List of tuples: Each tuple contains (file_name, row_count), sorted by row_count ascending.
    """
    datadir = Path(datadir)
    if not datadir.is_dir():
        raise ValueError(f"The path {datadir} is not a valid directory.")
    
    # Find all CSV files in the directory
    csv_files = list(datadir.glob('*.csv'))
    
    if not csv_files:
        print("No CSV files found in the specified directory.")
        return []
    
    file_row_counts = []
    
    for file in csv_files:
        try:
            rows = count_csv_rows(file, has_header=has_header)
            file_row_counts.append((file.name, rows))
        except Exception as e:
            print(f"Error processing {file.name}: {e}")
    
    # Sort the list of tuples based on row_count (ascending order)
    sorted_files = sorted(file_row_counts, key=lambda x: x[1])
    
    return sorted_files




In [3]:
# Specify the directory containing CSV files
data_directory = 'data/prices'  # Replace with your directory path

# Enumerate and sort CSV files
sorted_csv_files = enumerate_and_sort_csv(data_directory, has_header=True)

if sorted_csv_files:
    print("\nCSV files sorted by number of rows (ascending):")
    for filename, count in sorted_csv_files:
        print(f"{filename}: {count} rows")


CSV files sorted by number of rows (ascending):
INVH.csv: 2196 rows
COWZ.csv: 2222 rows
LW.csv: 2251 rows
HWM.csv: 2258 rows
CTEC.L.csv: 2272 rows
TTD.csv: 2287 rows
FTV.csv: 2342 rows
TEAM.csv: 2484 rows
HPE.csv: 2520 rows
SINCH.ST.csv: 2528 rows
1COV.DE.csv: 2558 rows
VTEB.csv: 2558 rows
PYPL.csv: 2594 rows
KHC.csv: 2594 rows
SHOP.csv: 2625 rows
ETSY.csv: 2649 rows
EVO.ST.csv: 2667 rows
AUTO.L.csv: 2679 rows
QRVO.csv: 2720 rows
KEYS.csv: 2771 rows
CFG.csv: 2789 rows
CZR.csv: 2791 rows
ZAL.DE.csv: 2812 rows
SYF.csv: 2827 rows
DGRO.csv: 2861 rows
IUSB.csv: 2861 rows
ANET.csv: 2865 rows
BME.L.csv: 2874 rows
PAYC.csv: 2901 rows
WLN.PA.csv: 2902 rows
HLT.csv: 2985 rows
ALLE.csv: 3002 rows
QUAL.csv: 3088 rows
CDW.csv: 3102 rows
NWS.csv: 3108 rows
NWSA.csv: 3108 rows
BNDX.csv: 3119 rows
VNA.DE.csv: 3122 rows
IQV.csv: 3136 rows
CCH.L.csv: 3158 rows
ZTS.csv: 3203 rows
NCLH.csv: 3212 rows
ABBV.csv: 3224 rows
IEFA.csv: 3269 rows
IXUS.csv: 3269 rows
IEMG.csv: 3269 rows
FANG.csv: 3277 rows
WDAY.

In [4]:
from collections import defaultdict
import numpy as np

def cluster_csv_pairs_hdbscan_sorted(sorted_csv_files,
                                     min_cluster_size=5,
                                     min_samples=None,
                                     alpha=1.0):
    """
    HDBSCAN clustering with cluster IDs reordered by descending row counts.
    Cluster 0 has the largest rows.
    """

    import hdbscan
    from sklearn.preprocessing import RobustScaler

    if not sorted_csv_files:
        return {}

    # Feature: row count only
    row_counts = np.array([rc for _, rc in sorted_csv_files], dtype=np.float64).reshape(-1, 1)

    # Robust scaling
    X = RobustScaler().fit_transform(row_counts)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        alpha=alpha,
        metric="euclidean"
    )

    raw_labels = clusterer.fit_predict(X)

    # Collect raw clusters
    raw_clusters = defaultdict(list)
    for pair, lab in zip(sorted_csv_files, raw_labels):
        raw_clusters[int(lab)].append(pair)

    # Separate noise
    noise = raw_clusters.pop(-1, [])

    # Sort clusters by mean row count (descending)
    sorted_clusters = sorted(
        raw_clusters.values(),
        key=lambda items: np.mean([rc for _, rc in items]),
        reverse=True
    )

    # Reassign cluster IDs: 0 = largest rows
    final_clusters = {}
    for new_id, items in enumerate(sorted_clusters):
        final_clusters[new_id] = sorted(items, key=lambda x: x[1], reverse=True)

    if noise:
        final_clusters[-1] = sorted(noise, key=lambda x: x[1], reverse=True)

    # Pretty print
    print(f"Clusters found (excluding noise): {len(sorted_clusters)}")
    print(f"Noise points (-1): {len(noise)}\n")

    for cid in sorted(final_clusters.keys(), key=lambda x: (x == -1, x)):
        title = "Noise (-1)" if cid == -1 else f"Cluster {cid}"
        print(f"{title} | size={len(final_clusters[cid])}")
        for fname, rc in final_clusters[cid]:
            print(f"  ({fname!r}, {rc})")
        print()

    return final_clusters


# ---- Example usage ----
# clusters = cluster_csv_pairs_hdbscan_sorted(
#     sorted_csv_files,
#     min_cluster_size=5
# )


In [5]:
clusters = cluster_csv_pairs_hdbscan_sorted(sorted_csv_files,
                                              min_cluster_size=5,
                                              min_samples=5)

Clusters found (excluding noise): 42
Noise points (-1): 79

Cluster 0 | size=28
  ('HPQ.csv', 16062)
  ('PG.csv', 16062)
  ('IP.csv', 16062)
  ('DIS.csv', 16062)
  ('MRK.csv', 16062)
  ('HON.csv', 16062)
  ('CAT.csv', 16062)
  ('GE.csv', 16062)
  ('IBM.csv', 16062)
  ('GD.csv', 16062)
  ('JNJ.csv', 16062)
  ('ED.csv', 16062)
  ('DTE.csv', 16062)
  ('XOM.csv', 16062)
  ('MMM.csv', 16062)
  ('CVX.csv', 16062)
  ('BA.csv', 16062)
  ('KO.csv', 16062)
  ('MSI.csv', 16062)
  ('CNP.csv', 16062)
  ('MO.csv', 16062)
  ('KR.csv', 16062)
  ('AEP.csv', 16062)
  ('LMT.csv', 16061)
  ('RTX.csv', 15999)
  ('MCD.csv', 14927)
  ('FCIT.L.csv', 14660)
  ('SMT.L.csv', 14660)

Cluster 1 | size=17
  ('F.csv', 13463)
  ('LLY.csv', 13463)
  ('HAL.csv', 13463)
  ('PCG.csv', 13463)
  ('AXP.csv', 13463)
  ('EMR.csv', 13463)
  ('PFE.csv', 13463)
  ('DE.csv', 13463)
  ('TXN.csv', 13463)
  ('BMY.csv', 13463)
  ('ETN.csv', 13463)
  ('DD.csv', 13463)
  ('ETR.csv', 13463)
  ('WFC.csv', 13463)
  ('PEP.csv', 13463)
  ('

In [6]:
clusters

{0: [('HPQ.csv', 16062),
  ('PG.csv', 16062),
  ('IP.csv', 16062),
  ('DIS.csv', 16062),
  ('MRK.csv', 16062),
  ('HON.csv', 16062),
  ('CAT.csv', 16062),
  ('GE.csv', 16062),
  ('IBM.csv', 16062),
  ('GD.csv', 16062),
  ('JNJ.csv', 16062),
  ('ED.csv', 16062),
  ('DTE.csv', 16062),
  ('XOM.csv', 16062),
  ('MMM.csv', 16062),
  ('CVX.csv', 16062),
  ('BA.csv', 16062),
  ('KO.csv', 16062),
  ('MSI.csv', 16062),
  ('CNP.csv', 16062),
  ('MO.csv', 16062),
  ('KR.csv', 16062),
  ('AEP.csv', 16062),
  ('LMT.csv', 16061),
  ('RTX.csv', 15999),
  ('MCD.csv', 14927),
  ('FCIT.L.csv', 14660),
  ('SMT.L.csv', 14660)],
 1: [('F.csv', 13463),
  ('LLY.csv', 13463),
  ('HAL.csv', 13463),
  ('PCG.csv', 13463),
  ('AXP.csv', 13463),
  ('EMR.csv', 13463),
  ('PFE.csv', 13463),
  ('DE.csv', 13463),
  ('TXN.csv', 13463),
  ('BMY.csv', 13463),
  ('ETN.csv', 13463),
  ('DD.csv', 13463),
  ('ETR.csv', 13463),
  ('WFC.csv', 13463),
  ('PEP.csv', 13463),
  ('WMT.csv', 13403),
  ('WBA.csv', 13345)],
 2: [('AIG

In [None]:
# to get a big table for all stocks
dataframes_vol = []
for cluster_id in range(len(clusters)-1):
   
    for filename, rows in clusters[cluster_id]:
        print(f"File: {filename}, Rows: {rows}")
        # make one single pandas dataframe contains the data from above stock IDs, the index is the data, the columns are the stock IDs
        stock_id = filename[:-4]  

        file_path = Path(data_directory) / filename
        df_vol = pd.read_csv(file_path, usecols=['Date', 'Adj Close', 'Volume'], parse_dates=['Date'])
        df_vol.set_index('Date', inplace=True)
        df_vol.rename(columns={'Adj Close': f"{stock_id}_price", 'Volume': f"{stock_id}_vol"}, inplace=True)
        dataframes_vol.append(df_vol)

combined_df_vol = pd.concat(dataframes_vol, axis=1)

price_columns = [col for col in combined_df_vol.columns if col.endswith('_price')]
volume_columns = [col for col in combined_df_vol.columns if col.endswith('_vol')]

combined_df_vol[price_columns] = (
    combined_df_vol[price_columns]
        .ffill()
        .bfill()
)

combined_df_vol[volume_columns] = (
    combined_df_vol[volume_columns].fillna(0)
)

log_returns_df_vol = np.log(combined_df_vol / (combined_df_vol.shift(1) + 1e-9 ) + 1e-9)
# drop the first row
#log_returns_df_vol = log_returns_df_vol.iloc[1:]
log_returns_df_vol = log_returns_df_vol.dropna()  # Drop the first row with

price_columns = [col for col in log_returns_df_vol.columns if col.endswith('_price')]
volume_columns = [col for col in log_returns_df_vol.columns if col.endswith('_vol')]
log_returns_df_vol = log_returns_df_vol[price_columns + volume_columns]
# find out the earliest date in the dataframe
start_date = log_returns_df_vol.index.min().strftime('%Y-%m-%d')
log_returns_df_vol.to_csv(f"stock_data_vol-all-in-one-startdate-{start_date}.csv")


File: HPQ.csv, Rows: 16062
File: PG.csv, Rows: 16062
File: IP.csv, Rows: 16062
File: DIS.csv, Rows: 16062
File: MRK.csv, Rows: 16062
File: HON.csv, Rows: 16062
File: CAT.csv, Rows: 16062
File: GE.csv, Rows: 16062
File: IBM.csv, Rows: 16062
File: GD.csv, Rows: 16062
File: JNJ.csv, Rows: 16062
File: ED.csv, Rows: 16062
File: DTE.csv, Rows: 16062
File: XOM.csv, Rows: 16062
File: MMM.csv, Rows: 16062
File: CVX.csv, Rows: 16062
File: BA.csv, Rows: 16062
File: KO.csv, Rows: 16062
File: MSI.csv, Rows: 16062
File: CNP.csv, Rows: 16062
File: MO.csv, Rows: 16062
File: KR.csv, Rows: 16062
File: AEP.csv, Rows: 16062
File: LMT.csv, Rows: 16061
File: RTX.csv, Rows: 15999
File: MCD.csv, Rows: 14927
File: FCIT.L.csv, Rows: 14660
File: SMT.L.csv, Rows: 14660
File: F.csv, Rows: 13463
File: LLY.csv, Rows: 13463
File: HAL.csv, Rows: 13463
File: PCG.csv, Rows: 13463
File: AXP.csv, Rows: 13463
File: EMR.csv, Rows: 13463
File: PFE.csv, Rows: 13463
File: DE.csv, Rows: 13463
File: TXN.csv, Rows: 13463
File: BM

In [56]:
for cluster_id in range(len(clusters)-1):
    print(f"Processing cluster {cluster_id}")
    dataframes_vol = []
    for filename, rows in clusters[cluster_id]:
        print(f"File: {filename}, Rows: {rows}")
        # make one single pandas dataframe contains the data from above stock IDs, the index is the data, the columns are the stock IDs
        stock_id = filename[:-4]  

        file_path = Path(data_directory) / filename
        df_vol = pd.read_csv(file_path, usecols=['Date', 'Adj Close', 'Volume'], parse_dates=['Date'])
        df_vol.set_index('Date', inplace=True)
        df_vol.rename(columns={'Adj Close': f"{stock_id}_price", 'Volume': f"{stock_id}_vol"}, inplace=True)
        dataframes_vol.append(df_vol)

    combined_df_vol = pd.concat(dataframes_vol, axis=1)
    combined_df_vol.fillna(method='ffill', inplace=True)
    combined_df_vol.fillna(method='bfill', inplace=True)
    log_returns_df_vol = np.log(combined_df_vol / (combined_df_vol.shift(1) + 1e-9 ) + 1e-9)
    # drop the first row
    #log_returns_df_vol = log_returns_df_vol.iloc[1:]
    log_returns_df_vol = log_returns_df_vol.dropna()  # Drop the first row with

    price_columns = [col for col in log_returns_df_vol.columns if col.endswith('_price')]
    volume_columns = [col for col in log_returns_df_vol.columns if col.endswith('_vol')]
    log_returns_df_vol = log_returns_df_vol[price_columns + volume_columns]
    # find out the earliest date in the dataframe
    start_date = log_returns_df_vol.index.min().strftime('%Y-%m-%d')
    log_returns_df_vol.to_csv(f"stock_data_vol-cluster-{cluster_id}-startdate-{start_date}.csv")

Processing cluster 0
File: HPQ.csv, Rows: 16062
File: PG.csv, Rows: 16062
File: IP.csv, Rows: 16062
File: DIS.csv, Rows: 16062
File: MRK.csv, Rows: 16062
File: HON.csv, Rows: 16062
File: CAT.csv, Rows: 16062
File: GE.csv, Rows: 16062
File: IBM.csv, Rows: 16062
File: GD.csv, Rows: 16062
File: JNJ.csv, Rows: 16062
File: ED.csv, Rows: 16062
File: DTE.csv, Rows: 16062
File: XOM.csv, Rows: 16062
File: MMM.csv, Rows: 16062
File: CVX.csv, Rows: 16062
File: BA.csv, Rows: 16062
File: KO.csv, Rows: 16062
File: MSI.csv, Rows: 16062
File: CNP.csv, Rows: 16062
File: MO.csv, Rows: 16062
File: KR.csv, Rows: 16062
File: AEP.csv, Rows: 16062
File: LMT.csv, Rows: 16061
File: RTX.csv, Rows: 15999
File: MCD.csv, Rows: 14927
File: FCIT.L.csv, Rows: 14660
File: SMT.L.csv, Rows: 14660


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 1
File: F.csv, Rows: 13463
File: LLY.csv, Rows: 13463
File: HAL.csv, Rows: 13463
File: PCG.csv, Rows: 13463
File: AXP.csv, Rows: 13463
File: EMR.csv, Rows: 13463
File: PFE.csv, Rows: 13463
File: DE.csv, Rows: 13463
File: TXN.csv, Rows: 13463
File: BMY.csv, Rows: 13463
File: ETN.csv, Rows: 13463
File: DD.csv, Rows: 13463
File: ETR.csv, Rows: 13463
File: WFC.csv, Rows: 13463
File: PEP.csv, Rows: 13463
File: WMT.csv, Rows: 13403
File: WBA.csv, Rows: 13345


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 2
File: AIG.csv, Rows: 13317
File: ES.csv, Rows: 13283
File: CLX.csv, Rows: 13283
File: CVS.csv, Rows: 13283
File: VMC.csv, Rows: 13283
File: BDX.csv, Rows: 13283
File: EVRG.csv, Rows: 13283
File: MKC.csv, Rows: 13283
File: BAC.csv, Rows: 13283
File: TGT.csv, Rows: 13283
File: K.csv, Rows: 13283
File: RVTY.csv, Rows: 13283
File: CMS.csv, Rows: 13283
File: BALL.csv, Rows: 13283
File: TER.csv, Rows: 13283
File: MMC.csv, Rows: 13283
File: XEL.csv, Rows: 13283
File: NEE.csv, Rows: 13283
File: NI.csv, Rows: 13283
File: ECL.csv, Rows: 13283
File: CMI.csv, Rows: 13283
File: PNW.csv, Rows: 13283
File: CPB.csv, Rows: 13283
File: AVY.csv, Rows: 13283
File: SPGI.csv, Rows: 13283
File: TXT.csv, Rows: 13283
File: LNT.csv, Rows: 13283
File: SNA.csv, Rows: 13283
File: GWW.csv, Rows: 13283
File: ITW.csv, Rows: 13269


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 3
File: CL.csv, Rows: 13234
File: MDT.csv, Rows: 13234
File: EXC.csv, Rows: 13234
File: BK.csv, Rows: 13233
File: USB.csv, Rows: 13233
File: FRT.csv, Rows: 13233
File: EIX.csv, Rows: 13233
File: PNR.csv, Rows: 13233
File: WY.csv, Rows: 13233
File: SYY.csv, Rows: 13230


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 4
File: PEG.csv, Rows: 11549
File: UNP.csv, Rows: 11549
File: LUV.csv, Rows: 11549
File: BIO.csv, Rows: 11510
File: TYL.csv, Rows: 11497
File: CAG.csv, Rows: 11497
File: HAS.csv, Rows: 11497
File: RHI.csv, Rows: 11497
File: MAS.csv, Rows: 11497
File: TT.csv, Rows: 11497
File: NUE.csv, Rows: 11497
File: ABT.csv, Rows: 11497
File: IPG.csv, Rows: 11497
File: VTRS.csv, Rows: 11497
File: J.csv, Rows: 11497
File: JPM.csv, Rows: 11497
File: NTRS.csv, Rows: 11497
File: PCAR.csv, Rows: 11497
File: WST.csv, Rows: 11497
File: WEC.csv, Rows: 11497
File: PH.csv, Rows: 11497
File: ADM.csv, Rows: 11497
File: OMC.csv, Rows: 11497
File: CINF.csv, Rows: 11497
File: AMD.csv, Rows: 11497
File: GIS.csv, Rows: 11497
File: FITB.csv, Rows: 11497
File: PPL.csv, Rows: 11497
File: EQT.csv, Rows: 11497
File: DOV.csv, Rows: 11497
File: SWK.csv, Rows: 11497
File: TFX.csv, Rows: 11497
File: INTC.csv, Rows: 11497
File: ADI.csv, Rows: 11497
File: AFL.csv, Rows: 11497
File: STT.csv, Rows: 11497
File:

  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 5
File: GL.csv, Rows: 11359
File: OKE.csv, Rows: 11359
File: KLAC.csv, Rows: 11354
File: CSX.csv, Rows: 11336
File: PSA.csv, Rows: 11326
File: NKE.csv, Rows: 11317
File: AAPL.csv, Rows: 11309


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 6
File: UHS.csv, Rows: 11166
File: HD.csv, Rows: 11114
File: BAX.csv, Rows: 11089
File: GLW.csv, Rows: 11044
File: OXY.csv, Rows: 11044
File: SO.csv, Rows: 11044
File: SLB.csv, Rows: 11044
File: LHX.csv, Rows: 11044
File: COP.csv, Rows: 11044
File: HUM.csv, Rows: 11044
File: NOC.csv, Rows: 11044
File: ROK.csv, Rows: 11044
File: WMB.csv, Rows: 11044
File: VLO.csv, Rows: 11043
File: CI.csv, Rows: 10982
File: BBWI.csv, Rows: 10981
File: NSC.csv, Rows: 10939


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 7
File: AMGN.csv, Rows: 10674
File: RJF.csv, Rows: 10664
File: CAH.csv, Rows: 10641
File: CTAS.csv, Rows: 10630
File: PAYX.csv, Rows: 10625
File: BEN.csv, Rows: 10606
File: AOS.csv, Rows: 10601
File: VZ.csv, Rows: 10565
File: T.csv, Rows: 10565
File: JBHT.csv, Rows: 10564
File: ATO.csv, Rows: 10540
File: LRCX.csv, Rows: 10451


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 8
File: MU.csv, Rows: 10432
File: AJG.csv, Rows: 10419
File: AME.csv, Rows: 10399
File: SWKS.csv, Rows: 10364
File: EXPD.csv, Rows: 10351
Processing cluster 9
File: BBY.csv, Rows: 10210
File: DOC.csv, Rows: 10185
File: ADSK.csv, Rows: 10160
File: NVR.csv, Rows: 10145


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


File: DVN.csv, Rows: 10145
File: ROST.csv, Rows: 10132
File: JKHY.csv, Rows: 10060
File: MNST.csv, Rows: 10048
File: ORCL.csv, Rows: 9984
File: MSFT.csv, Rows: 9983
File: TROW.csv, Rows: 9970
File: COST.csv, Rows: 9902
File: ADBE.csv, Rows: 9877


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 10
File: FI.csv, Rows: 9847
File: BKR.csv, Rows: 9714
File: CDNS.csv, Rows: 9669
File: TJX.csv, Rows: 9657
File: FICO.csv, Rows: 9640
File: CCL.csv, Rows: 9638
File: FAST.csv, Rows: 9619
File: DGE.L.csv, Rows: 9615
File: SCHW.csv, Rows: 9597
File: JCI.csv, Rows: 9593
File: RS1.L.csv, Rows: 9573
File: ABF.L.csv, Rows: 9573
File: TSCO.L.csv, Rows: 9573
File: TW.L.csv, Rows: 9573
File: RR.L.csv, Rows: 9573
File: DPLM.L.csv, Rows: 9573
File: SMIN.L.csv, Rows: 9573
File: RKT.L.csv, Rows: 9573
File: AHT.L.csv, Rows: 9573
File: MKS.L.csv, Rows: 9573
File: BA.L.csv, Rows: 9573
File: WTB.L.csv, Rows: 9573
File: RIO.L.csv, Rows: 9573
File: BT-A.L.csv, Rows: 9573
File: ANTO.L.csv, Rows: 9573
File: PSON.L.csv, Rows: 9573
File: LAND.L.csv, Rows: 9573
File: STAN.L.csv, Rows: 9573
File: KGF.L.csv, Rows: 9573
File: REL.L.csv, Rows: 9573
File: LGEN.L.csv, Rows: 9573
File: BNZL.L.csv, Rows: 9573
File: IMI.L.csv, Rows: 9573
File: SDR.L.csv, Rows: 9573
File: AV.L.csv, Rows: 9573
File: W

  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 11
File: TECH.csv, Rows: 9246
File: SVT.L.csv, Rows: 9197
File: UU.L.csv, Rows: 9197
File: SGE.L.csv, Rows: 9195
File: IEX.csv, Rows: 9168
File: GEN.csv, Rows: 9153
File: EA.csv, Rows: 9092
File: EOG.csv, Rows: 9082
File: CS.PA.csv, Rows: 9076
File: PTC.csv, Rows: 9036
File: STJ.L.csv, Rows: 9011
File: CTRA.csv, Rows: 8994
File: CSCO.csv, Rows: 8988
File: EN.PA.csv, Rows: 8980
File: HOLX.csv, Rows: 8980
File: LH.csv, Rows: 8960
File: TRMB.csv, Rows: 8882


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 12
File: AZO.csv, Rows: 8706
File: REGN.csv, Rows: 8706
File: IDXX.csv, Rows: 8649
File: AES.csv, Rows: 8646
File: VRTX.csv, Rows: 8627
File: TEP.PA.csv, Rows: 8615
File: ZBRA.csv, Rows: 8611
File: BIIB.csv, Rows: 8589
File: ODFL.csv, Rows: 8562
File: APH.csv, Rows: 8551
File: KIM.csv, Rows: 8541
File: QCOM.csv, Rows: 8527
File: HWDN.L.csv, Rows: 8519
File: GILD.csv, Rows: 8501
File: ROP.csv, Rows: 8485
File: SNPS.csv, Rows: 8477
File: STZ.csv, Rows: 8463
File: MHK.csv, Rows: 8452
File: BSX.csv, Rows: 8419
File: STE.csv, Rows: 8411
File: DHI.csv, Rows: 8407
File: LIN.csv, Rows: 8399
File: SBUX.csv, Rows: 8392


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 13
File: BNP.PA.csv, Rows: 8255
File: MTCH.csv, Rows: 8250
File: SPY.csv, Rows: 8242
File: MS.csv, Rows: 8226
File: INTU.csv, Rows: 8213
File: MCHP.csv, Rows: 8208
File: CB.csv, Rows: 8204
File: ORLY.csv, Rows: 8184
File: RCL.csv, Rows: 8181
File: JBL.csv, Rows: 8178
File: AZN.csv, Rows: 8171
File: ALL.csv, Rows: 8156
File: EQR.csv, Rows: 8107


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 14
File: IT.csv, Rows: 8070
File: DECK.csv, Rows: 8062
File: REG.csv, Rows: 8052
File: INCY.csv, Rows: 8048
File: EMN.csv, Rows: 8021
File: SPG.csv, Rows: 8021
File: III.L.csv, Rows: 7996
File: MAA.csv, Rows: 7989
File: TSCO.csv, Rows: 7975
File: MLM.csv, Rows: 7975
File: ALB.csv, Rows: 7973
File: AVB.csv, Rows: 7960
File: CPRT.csv, Rows: 7956


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 15
File: WPP.L.csv, Rows: 7816
File: O.csv, Rows: 7808
File: SJM.csv, Rows: 7799
File: MCO.csv, Rows: 7799
File: MCK.csv, Rows: 7791
File: PARA.csv, Rows: 7787
File: COF.csv, Rows: 7787
Processing cluster 16
File: COR.csv, Rows: 7692
File: MDY.csv, Rows: 7671
File: DRI.csv, Rows: 7668


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


File: RMD.csv, Rows: 7651
File: WAB.csv, Rows: 7641
File: NG.L.csv, Rows: 7632
File: FCX.csv, Rows: 7626
File: LLOY.L.csv, Rows: 7619
File: IVZ.csv, Rows: 7592
File: ACGL.csv, Rows: 7579
File: EG.csv, Rows: 7566
File: POOL.csv, Rows: 7558
File: DVA.csv, Rows: 7546
File: HSIC.csv, Rows: 7543
File: EL.csv, Rows: 7533
File: WAT.csv, Rows: 7533
File: NTAP.csv, Rows: 7531
File: HIG.csv, Rows: 7514


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 17
File: BMW.DE.csv, Rows: 7421
File: SIE.DE.csv, Rows: 7421
File: IMB.L.csv, Rows: 7420
File: DBK.DE.csv, Rows: 7415
File: DTE.DE.csv, Rows: 7415
Processing cluster 18
File: JD.L.csv, Rows: 7406
File: CBK.DE.csv, Rows: 7395
File: ALV.DE.csv, Rows: 7395
File: BAYN.DE.csv, Rows: 7395
File: RWE.DE.csv, Rows: 7395
File: BAS.DE.csv, Rows: 7395
File: CON.DE.csv, Rows: 7395


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)
  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 19
File: KMX.csv, Rows: 7227
File: ERF.PA.csv, Rows: 7205
File: TTWO.csv, Rows: 7179
File: VTR.csv, Rows: 7165
File: AMZN.csv, Rows: 7157
File: ARE.csv, Rows: 7149
File: RL.csv, Rows: 7138
File: BXP.csv, Rows: 7134
Processing cluster 20
File: YUM.csv, Rows: 7071


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


File: TSM.csv, Rows: 7055
File: SAP.DE.csv, Rows: 7052
File: CHRW.csv, Rows: 7050
File: FE.csv, Rows: 7033
File: MTD.csv, Rows: 7029
File: PLD.csv, Rows: 7024
File: URI.csv, Rows: 7006
File: MUV2.DE.csv, Rows: 6998
File: ADS.DE.csv, Rows: 6998
File: HEN3.DE.csv, Rows: 6997
File: AEE.csv, Rows: 6997
File: MRK.DE.csv, Rows: 6996
File: DIA.csv, Rows: 6986
File: BEI.DE.csv, Rows: 6978
File: QIA.DE.csv, Rows: 6978
File: VRSN.csv, Rows: 6978
File: VOW3.DE.csv, Rows: 6978
File: HEI.DE.csv, Rows: 6975
File: PWR.csv, Rows: 6969
File: RHM.DE.csv, Rows: 6968
File: AMT.csv, Rows: 6959
File: FLTR.L.csv, Rows: 6945
File: MAR.csv, Rows: 6943


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 21
File: FRE.DE.csv, Rows: 6900
File: AIR.DE.csv, Rows: 6899
File: MSTR.csv, Rows: 6887
File: CTSH.csv, Rows: 6881
File: SRE.csv, Rows: 6875
File: CSGP.csv, Rows: 6873
File: RSG.csv, Rows: 6873
File: INF.L.csv, Rows: 6841
File: CCI.csv, Rows: 6840


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 22
File: SRT3.DE.csv, Rows: 6778
File: XLV.csv, Rows: 6752
File: XLY.csv, Rows: 6752
File: XLE.csv, Rows: 6752
File: XLI.csv, Rows: 6752
File: XLF.csv, Rows: 6752
File: XLK.csv, Rows: 6752
File: NVDA.csv, Rows: 6732
File: AAL.L.csv, Rows: 6732
File: UTG.L.csv, Rows: 6726
File: QQQ.csv, Rows: 6700


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 23
File: GS.csv, Rows: 6662
File: FFIV.csv, Rows: 6640
File: RMS.PA.csv, Rows: 6635
File: RI.PA.csv, Rows: 6635
File: SGO.PA.csv, Rows: 6635
File: SU.PA.csv, Rows: 6635
File: SAN.PA.csv, Rows: 6635
File: DG.PA.csv, Rows: 6635
File: DSY.PA.csv, Rows: 6635
File: CAP.PA.csv, Rows: 6635
File: KER.PA.csv, Rows: 6635
File: RNO.PA.csv, Rows: 6635
File: GLE.PA.csv, Rows: 6635
File: MC.PA.csv, Rows: 6635
File: ENGI.PA.csv, Rows: 6635
File: ORA.PA.csv, Rows: 6635
File: EL.PA.csv, Rows: 6635
File: SAF.PA.csv, Rows: 6635
File: OR.PA.csv, Rows: 6635
File: HO.PA.csv, Rows: 6635
File: TTE.PA.csv, Rows: 6635
File: AI.PA.csv, Rows: 6635
File: ML.PA.csv, Rows: 6635
File: CA.PA.csv, Rows: 6635
File: SBAC.csv, Rows: 6632


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 24
File: EOAN.DE.csv, Rows: 6600
File: ATCO-B.ST.csv, Rows: 6577
File: CPG.L.csv, Rows: 6571
File: DCC.L.csv, Rows: 6571
File: BLK.csv, Rows: 6557
File: ERIC-B.ST.csv, Rows: 6554
File: AZN.ST.csv, Rows: 6554
File: SKA-B.ST.csv, Rows: 6554
File: ASSA-B.ST.csv, Rows: 6554
File: SHB-A.ST.csv, Rows: 6554
File: HEXA-B.ST.csv, Rows: 6554
File: SWED-A.ST.csv, Rows: 6554
File: GETI-B.ST.csv, Rows: 6554
File: ELUX-B.ST.csv, Rows: 6554
File: NDA-SE.ST.csv, Rows: 6554
File: SKF-B.ST.csv, Rows: 6554
File: ALIV-SDB.ST.csv, Rows: 6554
File: TEL2-B.ST.csv, Rows: 6554
File: ATCO-A.ST.csv, Rows: 6554
File: SAND.ST.csv, Rows: 6554
File: INVE-B.ST.csv, Rows: 6554
File: VOLV-B.ST.csv, Rows: 6554
File: HM-B.ST.csv, Rows: 6554
File: SEB-A.ST.csv, Rows: 6554
File: SCA-B.ST.csv, Rows: 6554
File: KINV-B.ST.csv, Rows: 6554
File: IFX.DE.csv, Rows: 6550
File: AKAM.csv, Rows: 6537
File: UPS.csv, Rows: 6529
File: A.csv, Rows: 6523
File: TDY.csv, Rows: 6520


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 25
File: PUB.PA.csv, Rows: 6449
File: TELIA.ST.csv, Rows: 6438
File: EW.csv, Rows: 6435
File: MET.csv, Rows: 6428
File: ON.csv, Rows: 6410
File: IWB.csv, Rows: 6397
File: IYW.csv, Rows: 6397
File: IVV.csv, Rows: 6397
File: IVW.csv, Rows: 6392
File: IVE.csv, Rows: 6392
File: IJH.csv, Rows: 6392
File: IJR.csv, Rows: 6392
File: IWM.csv, Rows: 6392
File: IWF.csv, Rows: 6392
File: IWD.csv, Rows: 6392
File: HNR1.DE.csv, Rows: 6390
File: SMH.csv, Rows: 6387
File: ISRG.csv, Rows: 6378
File: CRL.csv, Rows: 6373
File: MRVL.csv, Rows: 6368
File: IUSG.csv, Rows: 6349
File: ILMN.csv, Rows: 6349
File: EZJ.L.csv, Rows: 6345
File: IUSV.csv, Rows: 6344
File: EQIX.csv, Rows: 6339
File: DHL.DE.csv, Rows: 6320
File: DB1.DE.csv, Rows: 6315
File: SPYG.csv, Rows: 6304
File: SPYV.csv, Rows: 6304
File: TPR.csv, Rows: 6300


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 26
File: GPN.csv, Rows: 6232
File: ALGN.csv, Rows: 6222
File: ABB.ST.csv, Rows: 6203
File: STLAP.PA.csv, Rows: 6200
File: STMPA.PA.csv, Rows: 6200
File: AIR.PA.csv, Rows: 6200
Processing cluster 27
File: LSEG.L.csv, Rows: 6168
File: WTW.csv, Rows: 6130
File: MDLZ.csv, Rows: 6129
File: VTI.csv, Rows: 6127
File: ACA.PA.csv, Rows: 6126
File: AXON.csv, Rows: 6125
File: FIS.csv, Rows: 6124
File: ACN.csv, Rows: 6104
File: ZBH.csv, Rows: 6100
File: BG.csv, Rows: 6094
File: IWR.csv, Rows: 6077
File: EFA.csv, Rows: 6077
File: PFG.csv, Rows: 6041


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)
  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)
  result = func(self.values, **kwargs)


Processing cluster 28
File: ITRK.L.csv, Rows: 5948
File: BRBY.L.csv, Rows: 5913
File: FRAS.L.csv, Rows: 5900
File: NFLX.csv, Rows: 5895
File: NDAQ.csv, Rows: 5869
File: TLT.csv, Rows: 5849
File: SHY.csv, Rows: 5849
File: IEF.csv, Rows: 5849
File: LQD.csv, Rows: 5849
File: BEZ.L.csv, Rows: 5826


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 29
File: MRO.L.csv, Rows: 5576
File: AGG.csv, Rows: 5555
File: LKQ.csv, Rows: 5550
File: DVY.csv, Rows: 5526
File: NRG.csv, Rows: 5510
File: ITOT.csv, Rows: 5475
File: VUG.csv, Rows: 5470
File: VTV.csv, Rows: 5470
File: VO.csv, Rows: 5470
File: VHT.csv, Rows: 5470
File: VGT.csv, Rows: 5470
File: VV.csv, Rows: 5470
File: VB.csv, Rows: 5470
File: VBR.csv, Rows: 5470
File: AIZ.csv, Rows: 5466


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 30
File: CBRE.csv, Rows: 5379
File: CRM.csv, Rows: 5371
File: DPZ.csv, Rows: 5358
File: ADM.L.csv, Rows: 5339
File: EXR.csv, Rows: 5334
File: GOOGL.csv, Rows: 5331
File: GOOG.csv, Rows: 5331
File: VNQ.csv, Rows: 5303
File: DLR.csv, Rows: 5281
File: MKTX.csv, Rows: 5276
File: ENT.L.csv, Rows: 5276
File: GLD.csv, Rows: 5267
File: MPWR.csv, Rows: 5266
File: LVS.csv, Rows: 5249
File: CE.csv, Rows: 5224
File: IAU.csv, Rows: 5219
File: VWO.csv, Rows: 5191
File: VGK.csv, Rows: 5191
File: MT.AS.csv, Rows: 5190
File: MTX.DE.csv, Rows: 5185
File: ALO.PA.csv, Rows: 5178
File: DXCM.csv, Rows: 5167
File: BLDR.csv, Rows: 5115


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 31
File: WBD.csv, Rows: 5108
File: EXPE.csv, Rows: 5099
File: EFV.csv, Rows: 5088
File: CF.csv, Rows: 5084
File: AMP.csv, Rows: 5060
File: AAL.csv, Rows: 5052
File: HIK.L.csv, Rows: 5051
File: SDY.csv, Rows: 5017
File: SPLG.csv, Rows: 5017
File: ICE.csv, Rows: 5016
File: LR.PA.csv, Rows: 5001
File: LYV.csv, Rows: 4992
File: CMG.csv, Rows: 4969
File: UAL.csv, Rows: 4962
File: RMV.L.csv, Rows: 4959
File: TDG.csv, Rows: 4936


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 32
File: LDOS.csv, Rows: 4786
File: VYM.csv, Rows: 4764
File: FSLR.csv, Rows: 4763
File: IGSB.csv, Rows: 4728
File: SHV.csv, Rows: 4728
File: SKG.L.csv, Rows: 4703
File: VEU.csv, Rows: 4690
File: MBB.csv, Rows: 4684
File: BR.csv, Rows: 4680
File: SMCI.csv, Rows: 4675
File: BND.csv, Rows: 4668
File: BIV.csv, Rows: 4668
File: BSV.csv, Rows: 4668
File: TMUS.csv, Rows: 4661
File: SPDW.csv, Rows: 4656
File: DAL.csv, Rows: 4651
File: PODD.csv, Rows: 4643
File: BIL.csv, Rows: 4633
File: MNDI.L.csv, Rows: 4631
File: TEL.csv, Rows: 4622
File: BX.csv, Rows: 4616
File: VEA.csv, Rows: 4593
File: LULU.csv, Rows: 4592
File: MELI.csv, Rows: 4582
File: MUB.csv, Rows: 4562


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 33
File: ULTA.csv, Rows: 4529
File: MGK.csv, Rows: 4486
File: PM.csv, Rows: 4432
File: V.csv, Rows: 4430
File: ACWI.csv, Rows: 4424
File: FRES.L.csv, Rows: 4413
File: AWK.csv, Rows: 4406
File: KDP.csv, Rows: 4396
Processing cluster 34
File: VRSK.csv, Rows: 4038
File: PHNX.L.csv, Rows: 4026
File: SCHX.csv, Rows: 4019
File: SCHF.csv, Rows: 4019
File: SCHB.csv, Rows: 4019
File: DG.csv, Rows: 4011
File: FTNT.csv, Rows: 4008
File: VCIT.csv, Rows: 4005
File: VGSH.csv, Rows: 4005
File: VCSH.csv, Rows: 4005
File: VGIT.csv, Rows: 4005
File: SCHG.csv, Rows: 3978
File: CHTR.csv, Rows: 3977
File: BNR.DE.csv, Rows: 3960
File: TQQQ.csv, Rows: 3951
File: GNRC.csv, Rows: 3951


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)
  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


Processing cluster 35
File: LYB.csv, Rows: 3899
File: OCDO.L.csv, Rows: 3857
File: TSLA.csv, Rows: 3856
File: NXPI.csv, Rows: 3829
File: VOO.csv, Rows: 3806
File: GM.csv, Rows: 3756
File: VXUS.csv, Rows: 3708
File: KMI.csv, Rows: 3698
Processing cluster 36
File: GOVT.csv, Rows: 3438
File: ENPH.csv, Rows: 3413
File: PSX.csv, Rows: 3405
File: AMCR.csv, Rows: 3382
File: META.csv, Rows: 3379
File: PANW.csv, Rows: 3336
Processing cluster 37
File: FANG.csv, Rows: 3277
File: WDAY.csv, Rows: 3277


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)
  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)
  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


File: IEFA.csv, Rows: 3269
File: IXUS.csv, Rows: 3269
File: IEMG.csv, Rows: 3269
File: ABBV.csv, Rows: 3224
Processing cluster 38
File: NCLH.csv, Rows: 3212
File: CCH.L.csv, Rows: 3158
File: IQV.csv, Rows: 3136
File: VNA.DE.csv, Rows: 3122
File: BNDX.csv, Rows: 3119
File: NWS.csv, Rows: 3108
File: NWSA.csv, Rows: 3108
File: CDW.csv, Rows: 3102
File: QUAL.csv, Rows: 3088
Processing cluster 39
File: WLN.PA.csv, Rows: 2902
File: PAYC.csv, Rows: 2901
File: BME.L.csv, Rows: 2874
File: ANET.csv, Rows: 2865
File: DGRO.csv, Rows: 2861
File: IUSB.csv, Rows: 2861


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)
  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)


File: SYF.csv, Rows: 2827
File: ZAL.DE.csv, Rows: 2812
File: CZR.csv, Rows: 2791
File: CFG.csv, Rows: 2789
File: KEYS.csv, Rows: 2771
File: QRVO.csv, Rows: 2720
Processing cluster 40
File: ETSY.csv, Rows: 2649
File: SHOP.csv, Rows: 2625
File: PYPL.csv, Rows: 2594
File: KHC.csv, Rows: 2594
File: 1COV.DE.csv, Rows: 2558
File: VTEB.csv, Rows: 2558
File: SINCH.ST.csv, Rows: 2528
Processing cluster 41
File: FTV.csv, Rows: 2342
File: TTD.csv, Rows: 2287
File: CTEC.L.csv, Rows: 2272
File: HWM.csv, Rows: 2258
File: LW.csv, Rows: 2251
File: COWZ.csv, Rows: 2222
File: INVH.csv, Rows: 2196


  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)
  combined_df_vol.fillna(method='ffill', inplace=True)
  combined_df_vol.fillna(method='bfill', inplace=True)
