In [1]:
import sys
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import preprocessing

sys.path.append("../")

from loguru import logger

from app.utils import StopWatch

In [None]:
log_dir = Path.cwd().parent / "logs"

# logger.add() registers an additional sink on every call, so re-running this cell would
# write each message to log.log once per previous run. Remove the sink left behind by the
# last execution of this cell before adding a fresh one.
try:
    logger.remove(_log_sink_id)
except (NameError, ValueError):
    # NameError: first execution of the cell; ValueError: the sink was already removed.
    pass
_log_sink_id = logger.add(log_dir / "log.log")

# Parameters

In [3]:
# Rows are only used when their match_number is strictly greater than this threshold.
MIN_MATCHES = 250

# Cut-off for the split: start_date on or before this is training data, anything later is test.
LAST_TRAIN_MATCH_DATE = datetime(year=2023, month=7, day=31)

In [None]:
# The data folder lives next to (not inside) the directory the notebook is run from.
this_dir = Path().resolve()
DATA_DIR = this_dir.parent.joinpath("data")
print(DATA_DIR)

In [5]:
# start_date is parsed as a datetime so it can be compared with LAST_TRAIN_MATCH_DATE.
ml_rows_path = DATA_DIR / "ml_rows.csv"
df = pd.read_csv(ml_rows_path, parse_dates=["start_date"])

In [None]:
# Both splits share the match_number filter; only the date condition differs.
# The two date masks are written out separately (rather than negating one) so that rows
# with a missing start_date fall into neither split, exactly as a plain comparison does.
enough_matches = df["match_number"] > MIN_MATCHES
on_or_before_cutoff = df["start_date"] <= LAST_TRAIN_MATCH_DATE
after_cutoff = df["start_date"] > LAST_TRAIN_MATCH_DATE

df_train = df[on_or_before_cutoff & enough_matches]
df_test = df[after_cutoff & enough_matches]
print(f"{df_train.shape=}, {df_test.shape=}")

In [7]:
# Training rows split by the value of the "over" column, one frame for each of 0..19.
over_dfs = {
    over_number: df_train[df_train["over"] == over_number]
    for over_number in range(20)
}

In [9]:
def extract_values(d, over: int, values: list[str]):
    """Select the columns listed in ``values`` from ``d``.

    Args:
        d: Object indexable with a list of column names (a DataFrame at the call sites here).
        over: Accepted for call-site symmetry; the function body does not use it.
        values: Names of the columns to keep, in the order they should appear.

    Returns:
        Whatever ``d[values]`` yields.
    """
    selected = d[values]
    return selected


# Feature columns fed to the scaler / clusterer, in this order.
X_values = [
    "innings",
    "ball_of_innings",
    "wickets_down",
    "run_rate",
    "req_rate",
    "batter_in_first_10",
    "batter_strike_rate",
    "bowler_economy",
    "bowler_wicket_prob",
    "bowler_wide_noball_rate",
]

# Target column(s).
y_values = ["outcome"]

# Per-over feature and target frames, keyed by over number 0..19.
X_by_over = {
    over_number: extract_values(over_dfs[over_number], over_number, X_values)
    for over_number in range(20)
}
y_by_over = {
    over_number: extract_values(over_dfs[over_number], over_number, y_values)
    for over_number in range(20)
}

In [10]:
def dump_filename(name: str, over: int) -> str:
    """Return the file name ``<name>_fitted_over_<over>.model`` for a fitted model."""
    stem = f"{name}_fitted_over_{over}"
    return f"{stem}.model"

# Do Some Fitting

#### HDBSCAN

In [1]:
from hdbscan import HDBSCAN


def fit_dbscan(
    X_scaled, over: int, min_cluster_size: int, cluster_selection_method: str
) -> tuple[int, int, int, int, float, float, int]:
    """Fit an ``HDBSCAN`` model on ``X_scaled`` and summarise the clustering it produces.

    HDBSCAN marks noise points with the label ``-1``. Noise is reported on its own
    (last element of the result) and is excluded from the cluster count and the
    cluster-size statistics, so those describe real clusters only.

    Args:
        X_scaled: Feature matrix passed straight to ``HDBSCAN.fit``.
        over: Over number; only referenced by the (disabled) model-dump line.
        min_cluster_size: Forwarded to ``HDBSCAN``.
        cluster_selection_method: Forwarded to ``HDBSCAN``.

    Returns:
        ``(labelled, n_clusters, smallest, largest, mean_size, median_size, noisy)`` where
        ``labelled`` / ``noisy`` are row counts and the size statistics are over clusters.
        When no cluster is found, the cluster count and all size statistics are ``0``.
    """
    clf = HDBSCAN(
        min_cluster_size=min_cluster_size,
        cluster_selection_method=cluster_selection_method,
    )
    clf.fit(X_scaled)
    # joblib.dump(clf, DATA_DIR / dump_filename(f"DBSCAN_{min_cluster_size}_{cluster_selection_method}", over))
    labels = np.asarray(clf.labels_)
    labelled = int(np.count_nonzero(labels > -1))
    noisy = int(np.count_nonzero(labels == -1))

    # Sizes of the real clusters only; the -1 bucket would otherwise dominate max/mean.
    cluster_sizes = np.unique(labels[labels > -1], return_counts=True)[1]
    if cluster_sizes.size == 0:
        # Everything was classed as noise: np.min/np.max would raise on an empty array.
        return labelled, 0, 0, 0, 0.0, 0.0, noisy

    return (
        labelled,
        int(cluster_sizes.size),
        int(np.min(cluster_sizes)),
        int(np.max(cluster_sizes)),
        float(np.mean(cluster_sizes)),
        float(np.median(cluster_sizes)),
        noisy,
    )


In [None]:
# Rows drawn per over before fitting, and the seed that makes the draw repeatable so a
# Restart & Run All reproduces the same clustering input.
SAMPLE_SIZE = 10_000
SAMPLE_SEED = 42

scalers = {}
X_scaled = {}
with StopWatch(decimals=2) as stopwatch:
    for over in range(0, 2):
        X = X_by_over[over]
        # DataFrame.sample raises when asked for more rows than exist, so cap the request.
        X_sample = X.sample(n=min(SAMPLE_SIZE, len(X)), random_state=SAMPLE_SEED)
        scaler = preprocessing.StandardScaler().fit(X_sample)
        X_scaled[over] = scaler.transform(X_sample)
        scalers[over] = scaler
        # for cluster_selection_method in ["eom", "leaf"]:
        # NOTE(review): with a step of 100 this range yields only min_cluster_size=10 —
        # confirm that a single value (rather than 10, 20, ..., 100) is intended.
        for min_cluster_size in range(10, 101, 100):
            labelled, labels, small, large, mean, median, noisy = fit_dbscan(
                X_scaled[over], over, min_cluster_size, "eom"
            )
            row_count = labelled + noisy
            msg = (
                f"""over {over}/{min_cluster_size}: {labelled=} ({labelled / row_count:.1%}) {labels=} """
                f"""{small=} {large=} {mean=:.2f} {median=:.2f} {noisy=} ({noisy / row_count:.1%})"""
            )
            # stopwatch.report_split(msg)
            logger.info(msg)

In [1]:
def optimize_hdb(X, param_dist: dict):
    """Placeholder for an HDBSCAN parameter search.

    At present this only constructs an ``HDBSCAN`` estimator with
    ``gen_min_span_tree=True``. Neither ``X`` nor ``param_dist`` is used, nothing is
    fitted, and the function returns ``None``.

    NOTE(review): looks unfinished — confirm whether a search over ``param_dist`` is
    still planned or whether this stub can be dropped.
    """
    estimator = HDBSCAN(gen_min_span_tree=True)


In [14]:
# Integer lists keyed by over number (0..19). Overs 0 and 1 have hand-picked even-number
# ranges; every over from 2 onwards gets its own full 0..99 list.
# NOTE(review): what these integers select is not visible in this notebook — confirm.
hot_ranges = {
    0: [*range(22, 39, 2), *range(62, 79, 2)],
    1: [*range(2, 19, 2)],
    **{over_number: list(range(100)) for over_number in range(2, 20)},
}

def load_clf(over: int, clusters: int):
    """Load a previously saved model for the given ``over`` and ``clusters`` from ``DATA_DIR``.

    The file name comes from ``clf_filename(over, clusters)`` and the file is read with
    ``joblib.load``.

    NOTE(review): neither ``joblib`` nor ``clf_filename`` is imported or defined in the
    visible part of this notebook — confirm both are in scope on a fresh kernel.
    """
    model_path = DATA_DIR / clf_filename(over, clusters)
    return joblib.load(model_path)

over = 0
clusters = 25

clf = load_clf(over, clusters)

# X_scaled[over] only contains the random sample drawn in the fitting loop, in sampled
# order, so its rows do not correspond to the rows of y_by_over[over]. Scale the full
# per-over training frame instead (the same way the test set is handled further down) so
# that prediction i and outcome i refer to the same row.
X_train_scaled = scalers[over].transform(X_by_over[over])
trained_predictions = clf.predict(X_train_scaled)
actual_outcomes = y_by_over[over]["outcome"]
assert len(trained_predictions) == len(actual_outcomes), "predictions and outcomes must align row-for-row"
print(trained_predictions.shape, actual_outcomes.shape)

from collections import defaultdict

def _tally_outcomes(predictions, outcomes):
    """Count, for each predicted cluster, how many times each outcome value occurs.

    Returns a nested ``defaultdict``: ``tallies[cluster][outcome] -> count``.
    """
    tallies = defaultdict(lambda: defaultdict(int))
    for cluster, outcome in zip(predictions, outcomes):
        tallies[cluster][outcome] += 1
    return tallies


def _print_outcome_shares(tallies, n_clusters: int, n_outcomes: int):
    """Print the outcome mix of every non-empty cluster, then the overall mix.

    Each row shows the cluster id, its row count, and the share of outcomes
    ``0 .. n_outcomes - 1`` within that cluster. Clusters without rows are skipped
    because their shares are undefined (division by zero).

    Returns:
        ``(sums, grand_tot)``: per-outcome counts summed over clusters, and total rows.
    """
    sums: dict[int, float] = {i: 0.0 for i in range(n_outcomes)}
    grand_tot = 0
    for cluster in range(n_clusters):
        counts = tallies.get(cluster, {})
        tot = sum(counts.values())
        if tot == 0:
            continue
        grand_tot += tot
        pcts = [counts.get(i, 0) / tot for i in range(n_outcomes)]
        for i in range(n_outcomes):
            sums[i] += counts.get(i, 0)
        print(f"{cluster:3d}", f"{tot:5d}", ", ".join([f"{v:6.2%}" for v in pcts]))

    if grand_tot > 0:
        print("sums     ", ", ".join([f"{v / grand_tot:6.2%}" for v in sums.values()]))
    else:
        print("sums     ", "n/a (no rows)")
    return sums, grand_tot


# --- Training data: outcome mix per predicted cluster ---
cluster_preds = _tally_outcomes(trained_predictions, actual_outcomes)

# NOTE(review): the number of outcome columns is taken from the number of feature columns.
# This presumably relies on outcomes being coded 0..len(X_values)-1 — confirm, or derive
# the outcome classes from the data instead.
VALS = len(X_values)

sums, grand_tot = _print_outcome_shares(cluster_preds, clusters, VALS)

# --- Test data: scale with the scaler fitted for this over, predict, and tabulate ---
df_test_over = df_test.loc[(df_test["over"] == over)]
X_test = extract_values(df_test_over, over, X_values)
X_test_scaled = scalers[over].transform(X_test)
test_predictions = clf.predict(X_test_scaled)
y_actuals = extract_values(df_test_over, over, y_values)["outcome"]

test_preds = _tally_outcomes(test_predictions, y_actuals)
sums, grand_tot = _print_outcome_shares(test_preds, clusters, VALS)