In [1]:
import sys
from datetime import datetime
from pathlib import Path

import hdbscan
import joblib
import numpy as np
import pandas as pd
from sklearn import preprocessing

sys.path.append("../")

In [2]:
from loguru import logger

log_dir = Path.cwd().parent / "logs"
logger.add(log_dir / "log.log")

1

In [3]:
import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

In [4]:
MIN_MATCHES = 250
LAST_TRAIN_MATCH_DATE = datetime(2023, 7, 31)
this_dir = Path().resolve()
DATA_DIR = this_dir.parent / "data"
print(DATA_DIR)
df = pd.read_csv(DATA_DIR / "ml_rows.csv", parse_dates=["start_date"])

/home/mikew/extracover/data


In [5]:
df_train = {
    inns: df[(df.start_date <= LAST_TRAIN_MATCH_DATE) & (df.match_number > MIN_MATCHES) & (df.innings == inns)]
    for inns in range(2)
}
df_test = {
    inns: df[(df.start_date > LAST_TRAIN_MATCH_DATE) & (df.match_number > MIN_MATCHES) & (df.innings == inns)]
    for inns in range(2)
}

train_dfs = {
    inns: {over: df_train[inns].loc[(df_train[inns]["over"] == over)] for over in range(20)} for inns in range(2)
}

In [6]:
train_dfs[0][1].dtypes

match_number                        int64
start_date                 datetime64[ns]
innings                             int64
ball_of_innings                     int64
over                                int64
wickets_down                        int64
run_rate                          float64
req_rate                          float64
batter_in_first_10                  int64
batter_strike_rate                float64
bowler_economy                    float64
bowler_wicket_prob                float64
bowler_wide_noball_rate           float64
outcome                             int64
dtype: object

In [7]:
def extract_values(d, values: list[str]):
    return d[values]


X_values = [
    "wickets_down",
    "run_rate",
    "req_rate",
    "batter_in_first_10",
    "batter_strike_rate",
    "bowler_economy",
    "bowler_wicket_prob",
    "bowler_wide_noball_rate",
]

y_values = [
    "outcome",
]

In [8]:
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

In [18]:
# see https://towardsdatascience.com/tuning-with-hdbscan-149865ac2970

joblib_memory = joblib.Memory()


def optimize_hdb(X, param_dist: dict, inns: int, over: int):
    hdb = hdbscan.HDBSCAN(
        gen_min_span_tree=True,
        memory=joblib_memory,
        cluster_selection_method="leaf",
        metric="euclidean",
    ).fit(X)

    scorer = make_scorer(hdbscan.validity.validity_index, greater_is_better=True)
    # scorer = make_scorer(hdbscan.validity.validity_index, greater_is_better=True)

    n_iter_search = 20
    random_search = RandomizedSearchCV(
        hdb, param_distributions=param_dist, n_iter=n_iter_search, scoring=scorer, random_state=1
    )

    random_search.fit(X)

    print(f"Best Parameters {random_search.best_params_}")
    dbcv_score = random_search.best_estimator_.relative_validity_
    print(f"DBCV score :{dbcv_score}")

    # evaluate the clusters
    labels = random_search.best_estimator_.labels_
    clustered = labels >= 0

    coverage = np.sum(clustered) / X.shape[0]
    total_clusters = np.max(labels) + 1
    cluster_sizes = np.bincount(labels[clustered]).tolist()

    print(f"Percent of data retained: {coverage}")
    print(f"Total Clusters found: {total_clusters}")
    print(f"Cluster splits: {cluster_sizes}")

    best_params = [
        random_search.best_params_[param]
        for param in [
            "min_samples",
            "min_cluster_size",
            "cluster_selection_epsilon",
            "alpha",
        ]
    ]

    str_vals = [
        str(inns),
        str(over),
        str(best_params[0]),
        str(best_params[1]),
        str(best_params[2]),
        str(best_params[3]),
        f"{dbcv_score:.5f}",
        f"{coverage:.1%}",
        str(total_clusters),
    ]

    msg = "| " + " | ".join(str_vals) + " |"

    logger.info(msg.strip())

In [20]:
logger.info("starting...")
for inns in range(1):
    for over in range(20):
        X = extract_values(train_dfs[inns][over], X_values)
        # X = X.sample(1_000)
        scaler = preprocessing.StandardScaler().fit(X)
        X_scaled = scaler.transform(X)
        optimize_hdb(
            X_scaled,
            {
                "min_samples": [5, 7, 10, 14, 20, 28, 40, 56, 80],
                "min_cluster_size": [20, 30, 50, 70, 100, 140, 200, 280, 400],
                "cluster_selection_epsilon": [0.1, 0.14, 0.2, 0.28, 0.4, 0.56, 0.8, 1.12, 1.6, 2.24],
                "alpha": [0.1, 0.3, 0.5, 0.7, 1.0, 1.3],
            },
            inns,
            over,
        )
logger.info("all done!")

[32m2025-01-15 16:55:07.118[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mstarting...[0m


Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 17:01:19.793[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 0 | 7 | 400 | 0.2 | 0.1 | 0.04266 | 99.7% | 3 |[0m


DBCV score :0.04266456326041908
Percent of data retained: 0.9968290778295748
Total Clusters found: 3
Cluster splits: [1559, 2980, 37586]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 17:10:03.883[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 1 | 7 | 400 | 0.2 | 0.1 | 0.01505 | 98.8% | 5 |[0m


DBCV score :0.015046827222063864
Percent of data retained: 0.9878633859015771
Total Clusters found: 5
Cluster splits: [504, 1311, 1201, 9254, 29323]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 17:17:51.548[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 2 | 7 | 400 | 0.2 | 0.1 | 0.00422 | 97.8% | 7 |[0m


DBCV score :0.004218813594149502
Percent of data retained: 0.9775566478839468
Total Clusters found: 7
Cluster splits: [582, 610, 604, 3683, 18381, 3721, 13188]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 17:24:52.120[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 3 | 7 | 400 | 0.2 | 0.1 | 0.00723 | 96.3% | 9 |[0m


DBCV score :0.007229042915650004
Percent of data retained: 0.9628366204626805
Total Clusters found: 9
Cluster splits: [610, 1275, 617, 8184, 3764, 1121, 5128, 7525, 11856]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 17:32:23.997[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 4 | 7 | 400 | 0.2 | 0.1 | 0.00496 | 94.3% | 10 |[0m


DBCV score :0.004959129753483794
Percent of data retained: 0.9434039381626717
Total Clusters found: 10
Cluster splits: [676, 709, 2086, 7158, 9182, 450, 2065, 8455, 6402, 2056]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 17:38:56.619[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 5 | 7 | 400 | 0.2 | 0.1 | 0.00348 | 91.6% | 10 |[0m


DBCV score :0.0034839107172065877
Percent of data retained: 0.9163572358586102
Total Clusters found: 10
Cluster splits: [562, 801, 743, 3754, 8706, 7695, 760, 3139, 5479, 6366]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 17:44:55.099[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 6 | 7 | 400 | 0.2 | 0.1 | 0.00352 | 90.1% | 10 |[0m


DBCV score :0.0035155989799579147
Percent of data retained: 0.9007486994992464
Total Clusters found: 10
Cluster splits: [1085, 711, 3637, 5616, 3861, 1159, 1469, 5789, 5080, 8648]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 17:51:05.070[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 7 | 7 | 400 | 0.2 | 0.1 | 0.00548 | 88.1% | 11 |[0m


DBCV score :0.0054789591409582375
Percent of data retained: 0.8808961402654328
Total Clusters found: 11
Cluster splits: [1153, 723, 2526, 1402, 3740, 4947, 483, 2504, 4173, 6151, 8372]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 17:57:14.030[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 8 | 7 | 400 | 0.2 | 0.1 | 0.00314 | 87.5% | 12 |[0m


DBCV score :0.003136611784931465
Percent of data retained: 0.8745511834102733
Total Clusters found: 12
Cluster splits: [581, 1071, 485, 1842, 4295, 3792, 1841, 992, 3081, 3669, 6778, 7378]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 18:02:55.815[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 9 | 7 | 400 | 0.2 | 0.1 | 0.00245 | 72.5% | 13 |[0m


DBCV score :0.002453085731905016
Percent of data retained: 0.7250782625709254
Total Clusters found: 13
Cluster splits: [1090, 650, 638, 2143, 1169, 3593, 3859, 1714, 4563, 2038, 7132, 451, 607]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 18:09:18.525[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 10 | 7 | 400 | 0.2 | 0.1 | 0.01715 | 68.3% | 13 |[0m


DBCV score :0.017145069925247065
Percent of data retained: 0.6834736635989818
Total Clusters found: 13
Cluster splits: [1138, 895, 3083, 908, 2692, 3909, 1390, 492, 2100, 5080, 5234, 498, 505]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 18:16:28.352[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 11 | 7 | 400 | 0.2 | 0.1 | 0.00272 | 80.9% | 13 |[0m


DBCV score :0.002716290384891811
Percent of data retained: 0.8085200689485349
Total Clusters found: 13
Cluster splits: [828, 459, 731, 628, 2797, 801, 4015, 2545, 6173, 5474, 3966, 1275, 3142]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 18:24:48.538[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 12 | 7 | 400 | 0.2 | 0.1 | 0.01302 | 80.0% | 13 |[0m


DBCV score :0.013018422489304122
Percent of data retained: 0.8003054036746958
Total Clusters found: 13
Cluster splits: [880, 490, 1033, 3549, 5761, 3028, 5471, 402, 748, 1864, 2024, 3456, 3788]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 18:31:54.952[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 13 | 7 | 400 | 0.2 | 0.1 | 0.01188 | 76.9% | 12 |[0m


DBCV score :0.01188485518374608
Percent of data retained: 0.7686140598322322
Total Clusters found: 12
Cluster splits: [813, 476, 1565, 2124, 4141, 4456, 5751, 1607, 3395, 3504, 928, 2302]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 18:39:50.334[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 14 | 7 | 400 | 0.2 | 0.1 | 0.01292 | 75.2% | 12 |[0m


DBCV score :0.012918127462438554
Percent of data retained: 0.7519907018151244
Total Clusters found: 12
Cluster splits: [771, 694, 2119, 1385, 4519, 5373, 3742, 1325, 2708, 1210, 3810, 2753]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 18:47:55.733[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 15 | 7 | 400 | 0.2 | 0.1 | 0.00821 | 74.0% | 13 |[0m


DBCV score :0.008214462956997834
Percent of data retained: 0.7399667006287121
Total Clusters found: 13
Cluster splits: [501, 908, 990, 2711, 4685, 2919, 4657, 3575, 3274, 791, 2457, 621, 1688]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 18:55:59.138[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 16 | 7 | 400 | 0.2 | 0.1 | 0.00646 | 71.6% | 14 |[0m


DBCV score :0.006455136674137434
Percent of data retained: 0.7163774673097002
Total Clusters found: 14
Cluster splits: [400, 541, 412, 3212, 1277, 4603, 2035, 3984, 478, 1761, 3366, 3445, 1068, 2235]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 19:06:35.705[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 17 | 7 | 400 | 0.2 | 0.1 | 0.00036 | 68.0% | 12 |[0m


DBCV score :0.00036382895595821733
Percent of data retained: 0.6802637516923231
Total Clusters found: 12
Cluster splits: [1284, 3031, 592, 1842, 3390, 3900, 1259, 2802, 3620, 2898, 1588, 927]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 19:16:31.677[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 18 | 7 | 400 | 0.2 | 0.1 | 0.00042 | 66.6% | 13 |[0m


DBCV score :0.00041756506621666285
Percent of data retained: 0.665539853599024
Total Clusters found: 13
Cluster splits: [880, 2153, 3248, 3107, 2149, 1023, 3400, 3351, 405, 2083, 812, 2232, 1342]
Best Parameters {'min_samples': 7, 'min_cluster_size': 400, 'cluster_selection_epsilon': 0.2, 'alpha': 0.1}


[32m2025-01-15 19:26:08.935[0m | [1mINFO    [0m | [36m__main__[0m:[36moptimize_hdb[0m:[36m64[0m - [1m| 0 | 19 | 7 | 400 | 0.2 | 0.1 | 0.00056 | 65.6% | 14 |[0m
[32m2025-01-15 19:26:08.937[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mall done![0m


DBCV score :0.0005554729846316405
Percent of data retained: 0.6562939381880886
Total Clusters found: 14
Cluster splits: [2292, 526, 1342, 2601, 701, 2203, 1331, 512, 3215, 2880, 1393, 2549, 2053, 1141]


| over | min_samples | min_cluster_size | metric | cluster_selection_method | dbcv score | % data retained | clusters |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 60 | 50 | leaf | euclidean | 0.01718671973211158 | 0.3267860971484384 | 30 |
| 1 | 60 | 50 | leaf | euclidean | 0.0004236590487102877 | 0.3814011337271879 | 21 |
| 2 | 60 | 50 | leaf | euclidean | 0.0001322767377816463 | 0.49497887748943875 | 25 |
| 3 | 60 | 50 | leaf | euclidean | 0.010659961352848637 | 0.5143344668594871 | 21 |
| 4 | 60 | 50 | leaf | euclidean | 0.00010486661772658043 | 0.46883310867426864 | 18 |
| 5 | 60 | 50 | leaf | euclidean | 0.00014236732093149724 | 0.33991540814958376 | 16 |
| 6 | 60 | 50 | leaf | euclidean | 5.389679115965339e-05 | 0.2649577908493515 | 16 |
| 7 | 60 | 50 | leaf | euclidean | 0.00024103239727212846 | 0.17515262364368867 | 16 |
| 8 | 60 | 50 | leaf | euclidean | 0.14586455105206522 | 0.2875743170280636 | 13 |
| 9 | 60 | 50 | leaf | euclidean | 0.16179595204736644 | 0.5197726026031018 | 11 |
| 10 | 60 | 50 | leaf | euclidean | 0.0027864273693727357 | 0.6821199518989879 | 10 |
| 11 | 60 | 50 | leaf | euclidean | 0.2569892575528506 | 0.6621440324009619 | 9 |
| 12 | 60 | 50 | leaf | euclidean | 0.2213270535430393 | 0.6541379266380223 | 9 |
| 13 | 60 | 50 | leaf | euclidean | 0.00023074855116647224 | 0.9396190402496744 | 8 |
| 14 | 60 | 50 | leaf | euclidean | 0.00010857887839013185 | 0.9383274896645559 | 8 |
| 15 | 60 | 50 | leaf | euclidean | 0.00016876378769864825 | 0.9377187219264961 | 7 |
| 16 | 60 | 50 | leaf | euclidean | 0.005322336377586585 | 0.942445151427509 | 5 |
| 17 | 60 | 50 | leaf | euclidean | 0.004093128615905713 | 0.9420499818906194 | 5 |
| 18 | 60 | 50 | leaf | euclidean | 0.002438867408535001 | 0.9408895265423243 | 5 |
| 19 | 60 | 50 | leaf | euclidean | 0.002230152927614161 | 0.9363866176877197 | 5 |