In [1]:
import sys
from datetime import datetime
from pathlib import Path

import hdbscan
import joblib
import numpy as np
import pandas as pd
from sklearn import preprocessing

# Put the directory above the current working directory on the import path.
# The entry is stored as an absolute path so it keeps pointing at the same
# directory if the working directory changes later, and it is only added once
# so that re-running this cell does not pile up duplicate sys.path entries.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [None]:
from loguru import logger

# Send log records to <parent of the working directory>/logs/log.log,
# in addition to whatever sinks the logger already has.
log_dir = Path.cwd().parent.joinpath("logs")
logger.add(log_dir.joinpath("log.log"))

In [3]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

In [None]:
# --- Run configuration ---
# Threshold compared against the `match_number` column when building the splits.
MIN_MATCHES = 250
# Rows with start_date on or before this date are training rows; later rows are test rows.
LAST_TRAIN_MATCH_DATE = datetime(year=2023, month=7, day=31)

# The data directory is a sibling of the directory the notebook runs from.
this_dir = Path().resolve()
DATA_DIR = this_dir.parent.joinpath("data")
print(DATA_DIR)

# Load the full row-level table, parsing start_date into datetimes on read.
ml_rows_path = DATA_DIR.joinpath("ml_rows.csv")
df = pd.read_csv(ml_rows_path, parse_dates=["start_date"])

In [5]:
# Row masks shared by both splits, computed once instead of once per innings.
_enough_matches = df.match_number > MIN_MATCHES
_in_train_window = df.start_date <= LAST_TRAIN_MATCH_DATE
_in_test_window = df.start_date > LAST_TRAIN_MATCH_DATE

# One frame per innings key (0 and 1) for each side of the date cut-off.
df_train = {inns: df[_in_train_window & _enough_matches & (df.innings == inns)] for inns in range(2)}
df_test = {inns: df[_in_test_window & _enough_matches & (df.innings == inns)] for inns in range(2)}


def _split_by_over(frame):
    """Return {over: rows of `frame` whose "over" column equals over} for over keys 0..19."""
    return {over: frame.loc[(frame["over"] == over)] for over in range(20)}


# train_dfs[inns][over] -> training rows for that innings key and over key.
train_dfs = {inns: _split_by_over(innings_frame) for inns, innings_frame in df_train.items()}

In [None]:
# Spot-check the column dtypes of one training slice (innings key 0, over key 1).
train_dfs[0][1].dtypes

In [7]:
def extract_values(d, values: list[str]):
    """Return ``d[values]``: the subset of ``d`` selected by the list of names.

    Args:
        d: Any object that supports indexing with a list of names
            (the notebook passes pandas DataFrames).
        values: Names to select, in the order they should appear in the result.

    Returns:
        Whatever ``d[values]`` evaluates to; for a DataFrame, a frame holding
        only the requested columns.
    """
    selection = d[values]
    return selection


# Column names handed to extract_values() to build the clustering input
# (see the driver loop, which calls extract_values(train_dfs[inns][over], X_values)).
X_values = [
    "wickets_down",
    "run_rate",
    "req_rate",
    "batter_in_first_10",
    "batter_strike_rate",
    "batter_dismissal_prob",
    "batter_dismissal_vs_style",
    "bowler_economy",
    "bowler_wicket_prob",
    # NOTE(review): this name contains a space ("vs _style"), unlike
    # "batter_dismissal_vs_style" above. It looks like a typo, but it may match
    # the actual header in ml_rows.csv -- confirm before changing.
    "bowler_wicket_vs _style",
    "bowler_wide_noball_rate",
]

# Target column name(s). Not referenced by any other cell visible in this notebook.
y_values = [
    "outcome",
]

In [8]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

In [9]:
# see https://towardsdatascience.com/tuning-with-hdbscan-149865ac2970

joblib_memory = joblib.Memory()


def _dbcv_scorer(estimator, X, y=None):
    """Score a fitted HDBSCAN model by its ``relative_validity_`` (higher is better).

    scikit-learn calls a callable scorer as ``scorer(estimator, X, y)``. The
    estimator it receives has already been fitted on the training part of the
    split, and the value returned here describes that fitted clustering; ``X``
    and ``y`` are accepted only to satisfy the scorer protocol and are ignored.
    The estimator must have been built with ``gen_min_span_tree=True``, which
    ``relative_validity_`` requires.
    """
    return estimator.relative_validity_


def optimize_hdb(X, param_dist: dict, inns: int, over: int):
    """Randomized hyper-parameter search for HDBSCAN on ``X``, with a logged summary.

    The search samples up to 20 parameter settings from ``param_dist`` and ranks
    them by the relative validity of the fitted clustering. The refitted best
    model is then summarised on stdout, and a single markdown-table row is
    written to the logger in the form
    ``| inns | over | min_samples | min_cluster_size | cluster_selection_epsilon
    | alpha | dbcv | coverage | clusters |``.

    Args:
        X: 2-D feature matrix to cluster (the notebook passes a standardised array).
        param_dist: Mapping of HDBSCAN parameter name to candidate values. It must
            contain "min_samples", "min_cluster_size",
            "cluster_selection_epsilon" and "alpha", since those are reported.
        inns: Innings key, used only to label the logged row.
        over: Over key, used only to label the logged row.

    Returns:
        None. Results are printed and logged.
    """
    # The search clones this estimator for every candidate, so it is left
    # unfitted here; fitting it up front would only repeat work.
    hdb = hdbscan.HDBSCAN(
        gen_min_span_tree=True,
        memory=joblib_memory,
        cluster_selection_method="leaf",
        metric="euclidean",
    )

    # A plain callable is used as the scorer. A scorer built with
    # make_scorer(validity_index) calls estimator.predict(X) and compares the
    # result with y; hdbscan.HDBSCAN exposes no `predict` and this search has
    # no y, so every candidate's score fails and the selected "best" setting
    # is not chosen by cluster quality.
    n_iter_search = 20
    random_search = RandomizedSearchCV(
        hdb,
        param_distributions=param_dist,
        n_iter=n_iter_search,
        scoring=_dbcv_scorer,
        random_state=1,
    )
    random_search.fit(X)

    best_model = random_search.best_estimator_
    chosen = random_search.best_params_

    print(f"Best Parameters {chosen}")
    dbcv_score = best_model.relative_validity_
    print(f"DBCV score :{dbcv_score}")

    # Evaluate the clusters: label -1 marks noise, so labels >= 0 are the
    # points that were actually assigned to a cluster.
    labels = best_model.labels_
    clustered = labels >= 0

    coverage = np.sum(clustered) / X.shape[0]
    total_clusters = np.max(labels) + 1
    cluster_sizes = np.bincount(labels[clustered]).tolist()

    print(f"Percent of data retained: {coverage}")
    print(f"Total Clusters found: {total_clusters}")
    print(f"Cluster splits: {cluster_sizes}")

    reported_params = (
        "min_samples",
        "min_cluster_size",
        "cluster_selection_epsilon",
        "alpha",
    )

    str_vals = [
        str(inns),
        str(over),
        *(str(chosen[param]) for param in reported_params),
        f"{dbcv_score:.5f}",
        f"{coverage:.1%}",
        str(total_clusters),
    ]

    # One markdown table row per (innings, over), ready to paste into a results table.
    msg = "| " + " | ".join(str_vals) + " |"

    logger.info(msg)

In [None]:
# Candidate values for the randomized search. They are the same for every
# (innings, over) slice, so the mapping is built once outside the loops.
hdb_param_dist = {
    "min_samples": [5, 7, 10, 14, 20, 28, 36, 50, 70, 100],
    "min_cluster_size": [50, 100, 140, 200, 280, 400],
    "cluster_selection_epsilon": [0.1, 0.14, 0.2, 0.28, 0.4, 0.56, 0.7, 0.8, 0.9, 1.0, 1.1],
    "alpha": [0.02, 0.05, 0.1, 0.3, 0.5, 0.7, 1.0],
}

logger.info("starting...")
for inns in range(1):
    for over in range(2):
        X = extract_values(train_dfs[inns][over], X_values)
        # For a quick smoke run, subsample here, e.g. X = X.sample(1_000)
        # Standardise each feature column before clustering.
        scaler = preprocessing.StandardScaler()
        X_scaled = scaler.fit_transform(X)
        optimize_hdb(X_scaled, hdb_param_dist, inns, over)
logger.info("all done!")

| over | min_samples | min_cluster_size | cluster_selection_method | metric | dbcv score | % data retained | clusters |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 60 | 50 | leaf | euclidean | 0.01718671973211158 | 0.3267860971484384 | 30 |
| 1 | 60 | 50 | leaf | euclidean | 0.0004236590487102877 | 0.3814011337271879 | 21 |
| 2 | 60 | 50 | leaf | euclidean | 0.0001322767377816463 | 0.49497887748943875 | 25 |
| 3 | 60 | 50 | leaf | euclidean | 0.010659961352848637 | 0.5143344668594871 | 21 |
| 4 | 60 | 50 | leaf | euclidean | 0.00010486661772658043 | 0.46883310867426864 | 18 |
| 5 | 60 | 50 | leaf | euclidean | 0.00014236732093149724 | 0.33991540814958376 | 16 |
| 6 | 60 | 50 | leaf | euclidean | 5.389679115965339e-05 | 0.2649577908493515 | 16 |
| 7 | 60 | 50 | leaf | euclidean | 0.00024103239727212846 | 0.17515262364368867 | 16 |
| 8 | 60 | 50 | leaf | euclidean | 0.14586455105206522 | 0.2875743170280636 | 13 |
| 9 | 60 | 50 | leaf | euclidean | 0.16179595204736644 | 0.5197726026031018 | 11 |
| 10 | 60 | 50 | leaf | euclidean | 0.0027864273693727357 | 0.6821199518989879 | 10 |
| 11 | 60 | 50 | leaf | euclidean | 0.2569892575528506 | 0.6621440324009619 | 9 |
| 12 | 60 | 50 | leaf | euclidean | 0.2213270535430393 | 0.6541379266380223 | 9 |
| 13 | 60 | 50 | leaf | euclidean | 0.00023074855116647224 | 0.9396190402496744 | 8 |
| 14 | 60 | 50 | leaf | euclidean | 0.00010857887839013185 | 0.9383274896645559 | 8 |
| 15 | 60 | 50 | leaf | euclidean | 0.00016876378769864825 | 0.9377187219264961 | 7 |
| 16 | 60 | 50 | leaf | euclidean | 0.005322336377586585 | 0.942445151427509 | 5 |
| 17 | 60 | 50 | leaf | euclidean | 0.004093128615905713 | 0.9420499818906194 | 5 |
| 18 | 60 | 50 | leaf | euclidean | 0.002438867408535001 | 0.9408895265423243 | 5 |
| 19 | 60 | 50 | leaf | euclidean | 0.002230152927614161 | 0.9363866176877197 | 5 |