In [None]:
import pandas as pd
from sklearn import preprocessing
from pathlib import Path
from datetime import datetime
import joblib

import sys

sys.path.append("../")

from app.utils import StopWatch

# Parameters

In [None]:
# Rows are used (for training or testing) only when their match_number exceeds this value.
MIN_MATCHES = 1_000
# Rows starting on or before this date go to the training set; later rows go to the test set.
LAST_TRAIN_MATCH_DATE = datetime(year=2023, month=7, day=31)

In [None]:
# The data folder is a sibling of the directory the notebook is run from:
# resolve the current directory, step up one level, then into "data".
this_dir = Path().resolve()
DATA_DIR = this_dir.parent.joinpath("data")
print(DATA_DIR)

In [None]:
# Load the row-level dataset; start_date is parsed as a datetime so it can be
# compared with LAST_TRAIN_MATCH_DATE when splitting into train and test.
ml_rows_path = DATA_DIR / "ml_rows.csv"
df = pd.read_csv(ml_rows_path, parse_dates=["start_date"])

In [None]:
# Chronological split: both sets keep only rows whose match_number exceeds
# MIN_MATCHES; the cutoff date decides which of the two sets a row lands in.
past_min_matches = df["match_number"] > MIN_MATCHES
on_or_before_cutoff = df["start_date"] <= LAST_TRAIN_MATCH_DATE
after_cutoff = df["start_date"] > LAST_TRAIN_MATCH_DATE

df_train = df[on_or_before_cutoff & past_min_matches]
df_test = df[after_cutoff & past_min_matches]
print(f"{df_train.shape=}, {df_test.shape=}")

In [None]:
# Split the training rows into one frame per phase (phases 0, 1 and 2).
phase_dfs = {
    phase_id: df_train.loc[df_train["phase"] == phase_id]
    for phase_id in (0, 1, 2)
}

In [None]:
# Show how many training rows (and columns) each phase received.
phase_shapes = [f"{phase}: {phase_df.shape}" for phase, phase_df in phase_dfs.items()]
print(phase_shapes)

In [None]:
def extract_values(d, phase: int, values: list[str]):
    """Select the entries named in ``values`` from ``d``.

    Args:
        d: Object that supports ``d[values]``; the cells in this notebook
            pass pandas DataFrames.
        phase: Not used by this function.
        values: Names to select from ``d``.

    Returns:
        Whatever ``d[values]`` evaluates to.
    """
    selected = d[values]
    return selected


# Columns used as clustering features.
X_values = [
    "phase",
    "innings",
    "ball_of_innings",
    "wickets_down",
    "run_rate",
    "req_rate",
    "batter_in_first_10",
    "batter_strike_rate",
    "bowler_economy",
    "bowler_wicket_prob",
    "bowler_wide_noball_rate",
]

# Column holding the observed outcome that later cells tabulate per cluster.
y_values = ["outcome"]

# Per-phase feature and outcome frames, keyed by phase number.
X_by_phase = {
    phase_id: extract_values(phase_dfs[phase_id], phase_id, X_values)
    for phase_id in (0, 1, 2)
}

y_by_phase = {
    phase_id: extract_values(phase_dfs[phase_id], phase_id, y_values)
    for phase_id in (0, 1, 2)
}

In [None]:
def clf_filename(phase: int, clusters: int) -> str:
    """Build the file name under which a fitted model is stored.

    Args:
        phase: Phase number the model was fitted for.
        clusters: Number of clusters the model was fitted with.

    Returns:
        The bare file name (no directory) for that phase/cluster combination.
    """
    filename = f"Kmeans_fitted_phase_{phase}_{clusters}_clusters.model"
    return filename

In [None]:
from sklearn.cluster import MiniBatchKMeans as Clustering


def fit_phase(X_scaled, phase: int, clusters: int, random_state=0) -> float:
    """Fit a clustering model for one phase, save it, and return its score.

    Args:
        X_scaled: Feature matrix to cluster; the fitting cell passes the
            StandardScaler-transformed features for the phase.
        phase: Phase number; used only to build the output file name.
        clusters: Number of clusters to fit; also part of the output file name.
        random_state: Seed (int) forwarded to the clustering estimator so that
            re-running the notebook reproduces the same model. Pass ``None``
            to get the previous unseeded behaviour.

    Returns:
        ``clf.score(X_scaled)`` of the fitted model on the data it was fitted on.
    """
    # Without a fixed seed the mini-batch sampling and centre initialisation
    # differ on every run, so saved models and scores are not comparable.
    clf = Clustering(n_clusters=clusters, random_state=random_state)
    clf.fit(X_scaled)
    # Stored under DATA_DIR with the name from clf_filename so load_clf can find it.
    joblib.dump(clf, DATA_DIR / clf_filename(phase, clusters))
    return clf.score(X_scaled)

# Fit a scaler and clustering models for each phase

In [None]:
# For every phase: standardise the features, keep the fitted scaler for reuse
# on the test rows, then fit and save one model per candidate cluster count.
scalers = {}
X_scaled = {}
cluster_counts = (25, 50, 75, 100, 125, 150)

with StopWatch(decimals=2) as stopwatch:
    for phase in (0, 1, 2):
        X = X_by_phase[phase]
        scaler = preprocessing.StandardScaler()
        scaler.fit(X)
        scalers[phase] = scaler
        X_scaled[phase] = scaler.transform(X)
        for clusters in cluster_counts:
            score = fit_phase(X_scaled[phase], phase, clusters)
            stopwatch.report_split(f"fitted {phase=} with {clusters=} {score=}")

In [None]:
def load_clf(phase: int, clusters: int):
    """Load a previously saved model for a phase/cluster-count combination.

    Args:
        phase: Phase number the model was fitted for.
        clusters: Number of clusters the model was fitted with.

    Returns:
        The object joblib reads from the matching file in DATA_DIR.
    """
    model_path = DATA_DIR / clf_filename(phase, clusters)
    return joblib.load(model_path)

In [None]:
# Pick one saved model to inspect: which phase, and how many clusters.
phase, clusters = 0, 25
clf = load_clf(phase, clusters)

# Cluster assignment of every training row, next to its recorded outcome.
actual_outcomes = y_by_phase[phase]["outcome"]
trained_predictions = clf.predict(X_scaled[phase])
print(trained_predictions.shape, actual_outcomes.shape)

In [None]:
from collections import defaultdict

# Count, for every predicted cluster, how often each outcome was observed
# among the training rows.
cluster_preds = defaultdict(lambda: defaultdict(int))
for pred, outcome in zip(trained_predictions, actual_outcomes):
    cluster_preds[pred][outcome] += 1

# Outcome codes 0..10 are tabulated; sums accumulates the per-outcome counts
# over all clusters for the final summary row.
sums: dict[int, float] = {i: 0.0 for i in range(11)}
grand_tot = 0

for idx in range(clusters):
    preds = cluster_preds[idx]
    tot = sum(preds.values())
    grand_tot += tot
    # A cluster may have no rows assigned to it; skip it instead of dividing
    # by zero (the test-set table applies the same guard).
    if tot > 0:
        pcts = [preds[outcome_code] / tot for outcome_code in range(11)]
        for i in range(11):
            sums[i] += preds[i]
        print(f"{idx:3d}", f"{tot:5d}", ", ".join([f"{v:6.2%}" for v in pcts]))

print("sums     ", ", ".join([f"{v/grand_tot:6.2%}" for v in sums.values()]))

In [None]:
# Held-out rows for the selected phase, scaled with the scaler fitted on the
# training rows of that phase, then assigned to clusters by the loaded model.
test_phase_mask = df_test["phase"] == phase
df_test_phase = df_test.loc[test_phase_mask]

X_test = extract_values(df_test_phase, phase, X_values)
y_actuals = extract_values(df_test_phase, phase, y_values)["outcome"]

X_test_scaled = scalers[phase].transform(X_test)
test_predictions = clf.predict(X_test_scaled)

In [None]:
# Tally observed outcomes per predicted cluster for the held-out rows.
test_preds = defaultdict(lambda: defaultdict(int))
for cluster_label, observed in zip(test_predictions, y_actuals):
    test_preds[cluster_label][observed] += 1

# Running per-outcome counts (outcome codes 0..10) across all clusters.
sums: dict[int, float] = dict.fromkeys(range(11), 0.0)
grand_tot = 0

for idx in range(clusters):
    preds = test_preds[idx]
    tot = sum(preds.values())
    grand_tot += tot
    if tot <= 0:
        # No test rows fell into this cluster: nothing to print or accumulate.
        continue

    pcts = []
    for outcome_code in range(11):
        count = preds[outcome_code]
        sums[outcome_code] += count
        pcts.append(count / tot)

    print(f"{idx:3d}", f"{tot:5d}", ", ".join([f"{v:6.2%}" for v in pcts]))

print("sums     ", ", ".join([f"{v/grand_tot:6.2%}" for v in sums.values()]))