## RandomForest Classifier Hyperparameter finetunning

In [3]:
from itertools import product
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from scipy import sparse
import matplotlib.pyplot as plt
import sklearn, sys
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

SEED = 42

In [4]:
pairs_df = pd.read_csv("synth_pairs_large.csv")

# Create short text fields for each side
def make_text_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["name_left"]  = (df["first_name_left"].astype(str)  + " " + df["last_name_left"].astype(str))
    df["name_right"] = (df["first_name_right"].astype(str) + " " + df["last_name_right"].astype(str))
    # Option A (no OHE): also feed country tokens into hashing
    df["country_left_txt"]  = df["country_left"].astype(str)
    df["country_right_txt"] = df["country_right"].astype(str)
    return df

pairs_df = make_text_cols(pairs_df)

# 70/30 first
train_df, temp_df = train_test_split(
    pairs_df, test_size=0.30, stratify=pairs_df["label"], random_state=SEED
)
# 15/15 from the remaining 30%
val_df, test_df = train_test_split(
    temp_df, test_size=0.50, stratify=temp_df["label"], random_state=SEED
)

print("Split sizes -> Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)
print("Train label distribution:\n", train_df["label"].value_counts(normalize=True).round(3))
print("Val   label distribution:\n", val_df["label"].value_counts(normalize=True).round(3))
print("Test  label distribution:\n", test_df["label"].value_counts(normalize=True).round(3))

Split sizes -> Train: (16800, 11) Val: (3600, 11) Test: (3600, 11)
Train label distribution:
 label
0    0.5
1    0.5
Name: proportion, dtype: float64
Val   label distribution:
 label
1    0.5
0    0.5
Name: proportion, dtype: float64
Test  label distribution:
 label
1    0.5
0    0.5
Name: proportion, dtype: float64


In [5]:
# Step 3: Pairwise TF-IDF → cosine similarities (names + countries)
text_cols = ["name_left", "name_right", "country_left_txt", "country_right_txt"]

#  Fit TF-IDF encoders on TRAIN ONLY (concat left/right to build vocab) ---
tfidf_name = TfidfVectorizer(analyzer="char", ngram_range=(2,5), sublinear_tf=True, lowercase=False)
tfidf_country = TfidfVectorizer(analyzer="char", ngram_range=(2,5), sublinear_tf=True, lowercase=False)

tfidf_name.fit(pd.concat([train_df["name_left"], train_df["name_right"]], axis=0))
tfidf_country.fit(pd.concat([train_df["country_left_txt"], train_df["country_right_txt"]], axis=0))

def _pairwise_features_to_sparse(df: pd.DataFrame) -> sparse.csr_matrix:
    # Transform each side
    nl = tfidf_name.transform(df["name_left"])
    nr = tfidf_name.transform(df["name_right"])
    cl = tfidf_country.transform(df["country_left_txt"])
    cr = tfidf_country.transform(df["country_right_txt"])

    # Cosine similarities (as dense 1-D arrays)
    name_sim = cosine_similarity(nl, nr).diagonal()
    country_sim = cosine_similarity(cl, cr).diagonal()

    # Simple auxiliary signals
    fn_len_diff = (df["first_name_left"].astype(str).str.len() - df["first_name_right"].astype(str).str.len()).abs().to_numpy()
    ln_len_diff = (df["last_name_left"].astype(str).str.len()  - df["last_name_right"].astype(str).str.len()).abs().to_numpy()
    country_eq  = (df["country_left_txt"] == df["country_right_txt"]).astype(int).to_numpy()

    # Stack into a sparse CSR (5 feature columns)
    M = np.column_stack([name_sim, country_sim, fn_len_diff, ln_len_diff, country_eq]).astype(float)
    return sparse.csr_matrix(M)

# Build split matrices
X_train_sparse = _pairwise_features_to_sparse(train_df)
X_val_sparse   = _pairwise_features_to_sparse(val_df)
X_test_sparse  = _pairwise_features_to_sparse(test_df)

# Targets (unchanged)
y_train = train_df["label"].astype(int).to_numpy()
y_val   = val_df["label"].astype(int).to_numpy()
y_test  = test_df["label"].astype(int).to_numpy()

# Progress / troubleshooting printouts
def describe_sparse(name, X):
    nnz = X.nnz if sparse.issparse(X) else np.count_nonzero(X)
    total = X.shape[0] * X.shape[1]
    density = nnz / total if total else 0.0
    print(f"{name}: shape={X.shape}, nnz={nnz:,}, density={density:.6f}")

describe_sparse("X_train", X_train_sparse)
describe_sparse("X_val  ", X_val_sparse)
describe_sparse("X_test ", X_test_sparse)
print("Targets -> y_train:", y_train.shape, "y_val:", y_val.shape, "y_test:", y_test.shape)

X_train: shape=(16800, 5), nnz=58,687, density=0.698655
X_val  : shape=(3600, 5), nnz=12,676, density=0.704222
X_test : shape=(3600, 5), nnz=12,571, density=0.698389
Targets -> y_train: (16800,) y_val: (3600,) y_test: (3600,)


In [6]:
n_estimators_list   = [200, 400, 600]
max_depth_list      = [None, 10, 20]
min_samples_leaf_list = [1, 2, 5]
MAX_ERROR_FRACTION = 0.5  # e.g., at most XX % of the batch are errors

def build_class_batch(err_idx, corr_pool, target_n):
    """Always include as many errors as possible up to target_n,
    then fill with correct examples if needed."""
    err_idx = np.asarray(err_idx)
    corr_pool = np.asarray(corr_pool)
    
    # desired number of errors, but not more than we actually have
    max_err_allowed = int(target_n * MAX_ERROR_FRACTION)
    n_err = min(len(err_idx), max_err_allowed)
    n_corr = target_n - n_err
    
    chosen_err = rng.choice(err_idx, size=n_err, replace=False) if n_err > 0 else np.array([], dtype=int)
    chosen_corr = rng.choice(corr_pool, size=n_corr, replace=False) if n_corr > 0 else np.array([], dtype=int)
    
    return np.concatenate([chosen_err, chosen_corr])

# Permutation of hyperparameters including HITL logic
print("Permutations of hyperparameters:\n")

for n_est, depth, leaf in product(n_estimators_list, max_depth_list, min_samples_leaf_list):
    print(f"n_estimators={n_est}, max_depth={depth}, min_samples_leaf={leaf}")
    
    clf = RandomForestClassifier(
    n_estimators=n_est,      # try 200–500
    max_depth=depth,       # or e.g. 10 if you want to regularize
    min_samples_split=2,
    min_samples_leaf=leaf,
    n_jobs=-1,
    random_state=SEED
    )

    clf.fit(X_train_sparse, y_train)

    # ---- TRAIN METRICS ----
    train_probs = clf.predict_proba(X_train_sparse)[:, 1]
    train_preds = (train_probs >= 0.5).astype(int)
    

    # ---- VALIDATION METRICS ----
    val_probs  = clf.predict_proba(X_val_sparse)[:, 1]
    val_preds  = (val_probs >= 0.5).astype(int)
    
    
    # ---- TEST METRICS ---- (baseline generalization)  
    test_probs = clf.predict_proba(X_test_sparse)[:, 1]
    test_preds = (test_probs >= 0.5).astype(int)

    # Build indices for errors and correct predictions
    y_val_np = np.asarray(y_val)
    val_errors_mask = (val_preds != y_val_np)
    
    # Indices of errors per class
    err_0_idx = np.where(val_errors_mask & (y_val_np == 0))[0]
    err_1_idx = np.where(val_errors_mask & (y_val_np == 1))[0]
    
    # Indices of correctly classified examples per class
    corr_0_pool = np.where((~val_errors_mask) & (y_val_np == 0))[0]
    corr_1_pool = np.where((~val_errors_mask) & (y_val_np == 1))[0]
    
    rng = np.random.default_rng(SEED)
    
    # Choose a target number per class.
    # Option A: use the smaller available total to avoid running out.
    max_per_class_0 = len(err_0_idx) + len(corr_0_pool)
    max_per_class_1 = len(err_1_idx) + len(corr_1_pool)
    target_per_class = min(max_per_class_0, max_per_class_1, 200)  # cap at 200 per class, for example 
    
    human_idx_0 = build_class_batch(err_0_idx, corr_0_pool, target_per_class)
    human_idx_1 = build_class_batch(err_1_idx, corr_1_pool, target_per_class)
    
    # Final balanced human batch indices
    human_idx = np.concatenate([human_idx_0, human_idx_1])
    rng.shuffle(human_idx)
    
    X_human = X_val_sparse[human_idx]
    y_human = y_val_np[human_idx]
    
    err_0_contrib = np.intersect1d(human_idx_0, err_0_idx).size
    corr_0_contrib = human_idx_0.size - err_0_contrib
    
    err_1_contrib = np.intersect1d(human_idx_1, err_1_idx).size
    corr_1_contrib = human_idx_1.size - err_1_contrib

    # Combine original train data with human batch
    X_train_ext = sparse.vstack([X_train_sparse, X_human])
    y_train_ext = np.concatenate([y_train, y_human])
    
    # Retrain selected model on extended data
    clf.fit(X_train_ext, y_train_ext)

    # Re-score test (unseen) to measure true generalization change
    test_probs2 = clf.predict_proba(X_test_sparse)[:, 1]
    test_preds2 = (test_probs2 >= 0.5).astype(int)
    
    print("\n=== Test (after human update) ===")
    print(classification_report(y_test, test_preds2, digits=3))
    print("ROC-AUC:", roc_auc_score(y_test, test_probs2))
    print("PR-AUC :", average_precision_score(y_test, test_probs2))


print("\nGrid search completed.")    

Permutations of hyperparameters:

n_estimators=200, max_depth=None, min_samples_leaf=1

=== Test (after human update) ===
              precision    recall  f1-score   support

           0      0.963     0.953     0.958      1800
           1      0.953     0.964     0.959      1800

    accuracy                          0.958      3600
   macro avg      0.958     0.958     0.958      3600
weighted avg      0.958     0.958     0.958      3600

ROC-AUC: 0.983980864197531
PR-AUC : 0.9722960849284576
n_estimators=200, max_depth=None, min_samples_leaf=2

=== Test (after human update) ===
              precision    recall  f1-score   support

           0      0.973     0.951     0.962      1800
           1      0.952     0.974     0.963      1800

    accuracy                          0.962      3600
   macro avg      0.962     0.962     0.962      3600
weighted avg      0.962     0.962     0.962      3600

ROC-AUC: 0.9880751543209876
PR-AUC : 0.9784087021791772
n_estimators=200, max_dep