In [3]:
import pandas as pd
import math
import time
import numpy as np
import pandas as pd
import yfinance as yf
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [4]:
# Raw ETF screener snapshot (file dated 2025-10-17); percent columns arrive as text.
feats = pd.read_csv(filepath_or_buffer="data/screener-etf-2025-10-17.csv")

In [5]:
# Show the loaded screener table as the cell's rich output.
feats

Unnamed: 0,Symbol,Fund Name,Return YTD,RSI,Change YTD,CAGR 1Y,Exchange,Exp. Ratio,Volume,Beta (5Y)
0,SPLG,SPDR Portfolio S&P 500 ETF,13.81%,49.35,12.75%,15.46%,NYSEARCA,0.02%,19278105,1.01
1,VOO,Vanguard S&P 500 ETF,13.79%,49.27,12.73%,15.41%,NYSEARCA,0.03%,7590814,1.01
2,IVV,iShares Core S&P 500 ETF,13.84%,49.69,12.79%,15.46%,NYSEARCA,0.03%,5125010,1.01
3,VTI,Vanguard Total Stock Market ETF,13.46%,49.35,12.41%,15.20%,NYSEARCA,0.03%,5323971,1.02
4,VEA,Vanguard FTSE Developed Markets ETF,29.08%,57.26,26.85%,22.00%,NYSEARCA,0.03%,13741044,0.88
...,...,...,...,...,...,...,...,...,...,...
1626,AFLG,First Trust Active Factor Large Cap ETF,12.06%,53.45,11.58%,11.92%,NYSEARCA,3.67%,199642,0.96
1627,HDGE,AdvisorShares Ranger Equity Bear ETF,2.63%,57.80,2.63%,-0.69%,NYSEARCA,3.80%,92991,-1.14
1628,VPC,Virtus Private Credit Strategy ETF,-9.03%,27.67,-15.82%,-7.65%,NYSEARCA,9.86%,24361,0.74
1629,BIZD,VanEck BDC Income ETF,-9.41%,33.82,-16.42%,-5.10%,NYSEARCA,12.86%,3804815,0.82


In [6]:
# Convert percent-formatted text columns (e.g. "13.81%") into fractions (0.1381).
PERCENT_COLS = ["Return YTD", "Change YTD", "CAGR 1Y", "Exp. Ratio"]

for col in PERCENT_COLS:
    # After the first run these columns are already float; the `.str` accessor
    # would then raise AttributeError. Skipping numeric columns makes the cell
    # safe to re-run without dividing by 100 a second time.
    if pd.api.types.is_numeric_dtype(feats[col]):
        continue
    feats[col] = feats[col].str.rstrip('%').astype(float) / 100

In [7]:
def cluster_and_select(df, k=5, random_state=42, top_n=5):
    """Cluster rows on standardized features and return the best cluster's top rows.

    NOTE: a later cell in this notebook defines `cluster_and_select` again;
    once that cell has run, this version is shadowed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Return YTD", "RSI", "Change YTD", "CAGR 1Y",
        "Exp. Ratio", "Volume" and "Beta (5Y)", all convertible to float.
    k : int, default 5
        Number of KMeans clusters.
    random_state : int, default 42
        Seed passed to KMeans so the cluster assignment is reproducible.
    top_n : int, default 5
        Maximum number of rows returned from the best cluster.

    Returns
    -------
    winners : pandas.DataFrame
        Up to `top_n` rows of the cluster with the highest mean composite
        score, ordered by composite descending, with a fresh 0..n-1 index.
    work : pandas.DataFrame
        Copy of `df` with added "composite" and "cluster" columns.

    Raises
    ------
    ValueError
        If `top_n` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be >= 1")

    cols = [
        "Return YTD",
        "RSI",
        "Change YTD",
        "CAGR 1Y",
        "Exp. Ratio",
        "Volume",
        "Beta (5Y)"
    ]

    # Z-score every feature so the weights and the clustering distances are
    # not dominated by raw scale (Volume is orders of magnitude larger).
    X = df[cols].astype(float).values
    Z = StandardScaler().fit_transform(X)
    Zdf = pd.DataFrame(Z, index=df.index, columns=[f"Z_{c}" for c in cols])

    # Composite score: RSI carries the largest weight, followed by YTD return,
    # 1Y CAGR and volume; expense ratio and beta are penalized.
    # "Change YTD" is used for clustering only and has no weight here.
    comp = (
        0.35 * Zdf["Z_Return YTD"]
        + 0.70 * Zdf["Z_RSI"]
        + 0.15 * Zdf["Z_CAGR 1Y"]
        - 0.10 * Zdf["Z_Exp. Ratio"]
        + 0.10 * Zdf["Z_Volume"]
        - 0.10 * Zdf["Z_Beta (5Y)"]
    )

    work = df.copy()
    work["composite"] = comp

    # Cluster on all standardized features (not on the composite itself).
    km = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    work["cluster"] = km.fit_predict(Z)

    # The "best" cluster is the one whose members have the highest mean composite.
    cluster_means = work.groupby("cluster")["composite"].mean()
    best_cluster = int(cluster_means.idxmax())

    winners = (
        work[work["cluster"] == best_cluster]
        .sort_values("composite", ascending=False)
        .head(top_n)
        .reset_index(drop=True)
    )
    return winners, work

In [8]:
def cluster_and_select(df, k=5, random_state=42, top_n=1):
    """Cluster rows on standardized features and return the top rows of every cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Return YTD", "RSI", "Change YTD", "CAGR 1Y",
        "Exp. Ratio", "Volume" and "Beta (5Y)", all convertible to float.
    k : int, default 5
        Number of KMeans clusters.
    random_state : int, default 42
        Seed passed to KMeans so the cluster assignment is reproducible.
    top_n : int, default 1
        Maximum number of rows returned per cluster. A cluster with fewer than
        `top_n` members contributes all of its rows.

    Returns
    -------
    winners : pandas.DataFrame
        Up to `top_n` rows per cluster, ordered by cluster ascending and then
        composite descending, with a fresh 0..n-1 index.
    work : pandas.DataFrame
        Copy of `df` with added "composite" and "cluster" columns.

    Raises
    ------
    ValueError
        If `top_n` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be >= 1")

    cols = [
        "Return YTD",
        "RSI",
        "Change YTD",
        "CAGR 1Y",
        "Exp. Ratio",
        "Volume",
        "Beta (5Y)"
    ]

    # Standardize numeric inputs so weights and distances are scale-free.
    X = df[cols].astype(float).values
    Z = StandardScaler().fit_transform(X)
    Zdf = pd.DataFrame(Z, index=df.index, columns=[f"Z_{c}" for c in cols])

    # Composite scoring (weighted sum). Volume and RSI carry the largest
    # weights; expense ratio and beta are penalized. "Change YTD" is used for
    # clustering only and has no weight here.
    comp = (
        0.10 * Zdf["Z_Return YTD"]
        + 0.70 * Zdf["Z_RSI"]
        + 0.15 * Zdf["Z_CAGR 1Y"]
        - 0.10 * Zdf["Z_Exp. Ratio"]
        + 0.90 * Zdf["Z_Volume"]
        - 0.10 * Zdf["Z_Beta (5Y)"]
    )

    work = df.copy()
    work["composite"] = comp

    # Cluster on all standardized features (not on the composite itself).
    km = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    work["cluster"] = km.fit_predict(Z)

    # Sort so each cluster's best rows come first, then keep the first
    # `top_n` rows of every cluster.
    winners = (
        work.sort_values(["cluster", "composite"], ascending=[True, False])
            .groupby("cluster")
            .head(top_n)
            .reset_index(drop=True)
    )

    return winners, work

In [12]:
# Two-cluster run: `winners` holds the selected rows, `work` the full frame
# with the added "composite" and "cluster" columns.
winners, work = cluster_and_select(df=feats, k=2)

winners

Unnamed: 0,Symbol,Fund Name,Return YTD,RSI,Change YTD,CAGR 1Y,Exchange,Exp. Ratio,Volume,Beta (5Y),composite,cluster
0,SOXS,Direxion Daily Semiconductor Bear 3x Shares,-0.8165,34.81,-0.8211,-0.7981,NYSEARCA,0.0097,391823870,-4.28,27.935509,0
1,GDXU,MicroSectors Gold Miners 3X Leveraged ETN,8.9643,73.7,8.9643,4.4744,NYSEARCA,0.0095,1057175,2.02,6.113571,1


In [7]:
import pandas_market_calendars as mcal

nyse = mcal.get_calendar('NYSE')

# (start_date, end_date) windows to inspect; both end dates are included in
# the returned schedule when they are trading days.
date_windows = [
    ('2025-01-03', '2025-01-24'),
    ('2025-01-25', '2025-01-27'),
    ('2025-01-28', '2025-01-30'),
]

for start_date, end_date in date_windows:
    # One row per trading session, so the row count is the number of trading days.
    schedule = nyse.schedule(start_date=start_date, end_date=end_date)
    print("Schedule:", schedule)
    print("Number of NYSE trading days:", len(schedule))

Schedule:                          market_open              market_close
2025-01-03 2025-01-03 14:30:00+00:00 2025-01-03 21:00:00+00:00
2025-01-06 2025-01-06 14:30:00+00:00 2025-01-06 21:00:00+00:00
2025-01-07 2025-01-07 14:30:00+00:00 2025-01-07 21:00:00+00:00
2025-01-08 2025-01-08 14:30:00+00:00 2025-01-08 21:00:00+00:00
2025-01-10 2025-01-10 14:30:00+00:00 2025-01-10 21:00:00+00:00
2025-01-13 2025-01-13 14:30:00+00:00 2025-01-13 21:00:00+00:00
2025-01-14 2025-01-14 14:30:00+00:00 2025-01-14 21:00:00+00:00
2025-01-15 2025-01-15 14:30:00+00:00 2025-01-15 21:00:00+00:00
2025-01-16 2025-01-16 14:30:00+00:00 2025-01-16 21:00:00+00:00
2025-01-17 2025-01-17 14:30:00+00:00 2025-01-17 21:00:00+00:00
2025-01-21 2025-01-21 14:30:00+00:00 2025-01-21 21:00:00+00:00
2025-01-22 2025-01-22 14:30:00+00:00 2025-01-22 21:00:00+00:00
2025-01-23 2025-01-23 14:30:00+00:00 2025-01-23 21:00:00+00:00
2025-01-24 2025-01-24 14:30:00+00:00 2025-01-24 21:00:00+00:00
Number of NYSE trading days: 14
Schedule:    