# train_v1

Trains the v1 classifier on the labeled subset of `silver.word_states` (rows whose `label` is not NULL).

In [0]:
%run "./00_setup"

In [0]:
from src.trainutils import convert_df_to_features_v1, evaluate_thresholds
from src.models.HybridFrequencyBinaryClassifier import HybridFrequencyBinaryClassifier
from src.constants import TRAINED_MODELS_PATH
from src.fileutils import get_local_path

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score
import joblib
from pyspark.sql.types import ArrayType, FloatType
import pyspark.sql.functions as F
from pathlib import Path

In [0]:
# Source table for the training rows
# TODO: Notebook parameters
_SOURCE_DB_NAME = "silver"
_SOURCE_TABLE_NAME = "word_states"

# Model hyperparameters - TODO: Keep here or move?
# C is handed to the LogisticRegression built in train() as its `C` argument.
C = 0.10
# N_COMPONENTS is the number of SVD components requested from decompose_for_training().
N_COMPONENTS = 300

In [0]:
def decompose_for_training(x_train, x_test, n_components=50):
    """Reduce the embedding columns with Truncated SVD, leaving column 0 as is.

    Column 0 of each matrix is treated as the frequency feature and is passed
    through unchanged; all remaining columns are treated as embedding values.
    The SVD is fitted on the training embeddings only and then applied to the
    test embeddings.

    Args:
        x_train: 2-D array of training features, frequency column first.
        x_test: 2-D array of test features with the same column layout.
        n_components: Number of SVD components to keep for the embeddings.

    Returns:
        A tuple ``(x_train_reduced, x_test_reduced, svd)`` where each reduced
        matrix is the frequency column followed by the SVD components, and
        ``svd`` is the fitted ``TruncatedSVD`` instance.
    """
    reducer = TruncatedSVD(n_components=n_components, random_state=0)

    def _split_columns(matrix):
        # Slice with :1 (not 0) so the frequency part stays 2-D for hstack.
        return matrix[:, :1], matrix[:, 1:]

    train_freq, train_emb = _split_columns(x_train)
    test_freq, test_emb = _split_columns(x_test)

    # Fit on train only; the test split is merely projected.
    train_emb_reduced = reducer.fit_transform(train_emb)
    test_emb_reduced = reducer.transform(test_emb)

    x_train_reduced = np.hstack((train_freq, train_emb_reduced))
    x_test_reduced = np.hstack((test_freq, test_emb_reduced))
    return x_train_reduced, x_test_reduced, reducer

In [0]:
def get_frequency_bin_boundaries(freq, labels=None):
    """Choose the low/mid and mid/high frequency thresholds.

    Twenty candidate thresholds are taken at evenly spaced quantiles of
    ``freq`` between the 5th and 95th percentile. The lower ten are offered as
    low/mid candidates and the upper ten as mid/high candidates to
    ``evaluate_thresholds``, which selects the pair to use.

    Args:
        freq: 1-D array-like of frequency values.
        labels: Labels aligned row-for-row with ``freq``. When omitted, the
            notebook-level variable ``y`` is used, which is what this function
            always did before the parameter existed; passing the labels
            explicitly avoids depending on a global defined in a later cell.

    Returns:
        A tuple ``(low_threshold, high_threshold)``. If ``evaluate_thresholds``
        reports ``None`` for either value, the hard-coded pair
        ``(1.68, 4.72)`` is returned instead.

    Raises:
        NameError: If ``labels`` is omitted and no global ``y`` exists yet.
    """
    if labels is None:
        # Backward-compatible fallback for callers that pass only `freq`.
        labels = y

    # Find optimized bin boundaries
    # Choose 20 candidate thresholds and split into low and high ranges
    quantiles = np.linspace(0.05, 0.95, 20)
    candidates = np.quantile(freq, quantiles)

    # Split into low and high ranges, avoiding complete overlap
    low_range = candidates[:10]   # 5th to ~47.6th percentile
    high_range = candidates[10:]  # ~52.4th to 95th percentile

    thresholds, _ = evaluate_thresholds(freq, labels, low_range, high_range)
    low_threshold, high_threshold = thresholds

    if low_threshold is None or high_threshold is None:
        # Substitute hard-coded values when no usable pair was found
        low_threshold = 1.68
        high_threshold = 4.72

    return low_threshold, high_threshold

In [0]:
def train(X_train_reduced, y_train, freq, words):
    """Build and fit the hybrid frequency-binned classifier.

    Bin thresholds are obtained from ``get_frequency_bin_boundaries(freq)``.
    Three scikit-learn estimators are then wrapped in a
    ``HybridFrequencyBinaryClassifier``: Gaussian naive Bayes for the low bin,
    class-balanced logistic regression (regularisation ``C`` taken from the
    notebook constant) for the mid bin, and a class-balanced random forest for
    the high bin.

    Args:
        X_train_reduced: Training feature matrix the classifier is fitted on.
        y_train: Training labels aligned with ``X_train_reduced``.
        freq: Frequency values used only to choose the bin thresholds.
        words: Currently unused; kept so existing callers keep working.

    Returns:
        The fitted ``HybridFrequencyBinaryClassifier``.
    """
    # NOTE(review): `freq` is used as passed in. The notebook passes the
    # frequencies of all labeled rows (train and test), so the thresholds are
    # tuned on data that includes the test split - confirm this is intended.
    low_threshold, high_threshold = get_frequency_bin_boundaries(freq)

    # One estimator per frequency bin
    low_freq_clf = GaussianNB()
    mid_freq_clf = LogisticRegression(
        class_weight="balanced",
        max_iter=1000,
        C=C,
        random_state=0
    )
    high_freq_clf = RandomForestClassifier(
        n_estimators=100,
        max_depth=15,
        class_weight='balanced',
        random_state=0,
        n_jobs=1
    )

    clf = HybridFrequencyBinaryClassifier(
        low_mid_threshold=low_threshold,
        mid_high_threshold=high_threshold,
        low_freq_model=low_freq_clf,
        mid_freq_model=mid_freq_clf,
        high_freq_model=high_freq_clf
    )

    # Fit on the argument; the original referenced the lowercase notebook
    # global `x_train_reduced` here and silently ignored this parameter.
    clf.fit(X_train_reduced, y_train)

    return clf

In [0]:
# Keep only the rows that carry a label; unlabeled rows cannot be trained on.
df = spark.sql(
    f"SELECT * FROM {_SOURCE_DB_NAME}.{_SOURCE_TABLE_NAME} WHERE label is not NULL"
)

In [0]:
# count() runs a Spark job, so evaluate it once and report the result.
sample_count = df.count()
print(f"Loaded {sample_count} training samples from {_SOURCE_DB_NAME}.{_SOURCE_TABLE_NAME}")

In [0]:
# Log-scale the raw frequency; the +1 keeps a frequency of 0 finite (log10(1) = 0).
log_frequency_col = F.log10(F.col("frequency") + 1)
df = df.withColumn("log_frequency", log_frequency_col)

# Build the single feature vector (log_frequency + features) used for training
final_df = convert_df_to_features_v1(df)

In [0]:
# Convert to Pandas df to get X and y
pandas_df = final_df.toPandas()

# Build X and y once (the original converted each of them twice).
X = np.array(pandas_df['features'].tolist())  # Shape: (n_samples, 769)
y = np.array(pandas_df['label'].tolist())     # Shape: (n_samples,)

# words/freq must line up row-for-row with X and y. Separate Spark actions do
# not guarantee a common row order, so prefer the columns already pulled into
# pandas_df. If convert_df_to_features_v1 did not keep them, fall back to one
# combined collect so that words and freq at least stay aligned with each other.
if {"word", "log_frequency"}.issubset(pandas_df.columns):
    words = pandas_df["word"].tolist()
    freq = np.array(pandas_df["log_frequency"].tolist())
else:
    # NOTE(review): this path still relies on Spark returning rows in the same
    # order as toPandas() above - confirm, or carry the columns through.
    word_freq_rows = df.select("word", "log_frequency").collect()
    words = [row.word for row in word_freq_rows]
    freq = np.array([row.log_frequency for row in word_freq_rows])

assert len(words) == len(freq) == len(y), "words/freq/labels are not the same length"

In [0]:
# Stratified 67/33 split; words are split alongside X and y so every row keeps its word
(
    x_train, x_test,
    y_train, y_test,
    words_train, words_test,
) = train_test_split(X, y, words, test_size=0.33, random_state=0, stratify=y)

# Reduce the embedding columns of both splits
x_train_reduced, x_test_reduced, svd = decompose_for_training(
    x_train,
    x_test,
    n_components=N_COMPONENTS,
)

In [0]:
# Fit the classifier on the reduced training split
clf = train(
    X_train_reduced=x_train_reduced,
    y_train=y_train,
    freq=freq,
    words=words,
)

In [0]:
# Predict on the held-out split, then derive the metrics from those predictions
y_pred = clf.predict(x_test_reduced)
y_proba = clf.predict_proba(x_test_reduced)
positive_class_scores = y_proba[:, 1]  # column 1 is used as the AUC score

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
auc = roc_auc_score(y_true=y_test, y_score=positive_class_scores)

print("confusion matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
print(f"accuracy: {acc}, f1: {f1}, AUC: {auc}")

In [0]:
# One row per test sample. "frequency" is column 0 of the reduced features,
# i.e. the frequency column that decompose_for_training passes through unchanged.
results = pd.DataFrame(
    dict(
        word=words_test,
        frequency=x_test_reduced[:, 0],
        true=y_test,
        pred=y_pred,
    )
)

actual_pos = results["true"] == 1
actual_neg = results["true"] == 0
predicted_pos = results["pred"] == 1
predicted_neg = results["pred"] == 0

false_negatives = results[actual_pos & predicted_neg]
true_negatives = results[actual_neg & predicted_neg]
false_positives = results[actual_neg & predicted_pos]
true_positives = results[actual_pos & predicted_pos]

# Show a few examples from each confusion-matrix cell
for heading, subset in (
    ("false negatives", false_negatives),
    ("true negatives", true_negatives),
    ("false positives", false_positives),
    ("true positives", true_positives),
):
    print(f"{heading}\n{subset.head()}")

In [0]:
# Persist the fitted SVD together with the classifier as a single joblib artifact
model = dict(svd=svd, clf=clf)

model_file = f"{TRAINED_MODELS_PATH}/model_v1.joblib"
model_path = Path(get_local_path(model_file))

# Create the target directory if it does not exist yet
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open("wb") as f_model:
    joblib.dump(model, f_model, protocol=None)