In [4]:
import pandas as pd
import numpy as np
import pickle
import json
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sklearn.metrics import f1_score, precision_score, recall_score
from tabulate import tabulate
from tqdm import tqdm

def _load_pickle(path):
    """Return the object unpickled from the file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Preprocessing objects saved earlier; both are used via ``.transform``.
scaler = _load_pickle("scaler.pkl")
pca = _load_pickle("pca.pkl")

# Lookup tables for frequency encoding, indexed by column name.
freq_maps = _load_pickle("frequency_maps.pkl")

# Client for the "kmeans" endpoint; request payloads go out as CSV.
predictor = Predictor(endpoint_name="kmeans", serializer=CSVSerializer())

# Datasets to run inference on, keyed by the name shown in progress output.
datasets = {"test": pd.read_csv("./fraudTest.csv")}

def encode_categorical_features(dataframe, frequency_maps=None):
    """
    Apply frequency encoding using pre-saved frequency maps.

    Every object-dtype column ``col`` is replaced by a ``col + "_freq"``
    column obtained by mapping its values through ``frequency_maps[col]``.
    Values the map does not cover (unseen categories) are filled with 0.
    An object column that has no map at all triggers a printed warning and
    is encoded as all zeros.

    Args:
        dataframe: Input DataFrame. It is not modified.
        frequency_maps: Optional mapping from column name to anything
            ``Series.map`` accepts (e.g. a value -> frequency dict).
            When omitted, the module-level ``freq_maps`` is used.

    Returns:
        A new DataFrame holding the non-object columns in their original
        order, followed by one ``*_freq`` column per object column.
    """
    if frequency_maps is None:
        # Backward-compatible default: fall back to the maps loaded at
        # module level.
        frequency_maps = freq_maps

    cat_cols = dataframe.select_dtypes(include=["object"]).columns
    freq_frames = {}

    for col in cat_cols:
        if col in frequency_maps:
            mapped_col = dataframe[col].map(frequency_maps[col])
            # map() yields NaN for values missing from the table.
            freq_frames[col + "_freq"] = mapped_col.fillna(0)
        else:
            print(f"⚠️ Warning: Column '{col}' not found in frequency map. Filling with 0.")
            freq_frames[col + "_freq"] = pd.Series(0, index=dataframe.index)

    if freq_frames:
        # Build all encoded columns at once and attach them in one concat.
        freq_df = pd.DataFrame(freq_frames, index=dataframe.index)
        dataframe = pd.concat([dataframe, freq_df], axis=1)

    # Non-inplace drop: always hands back a fresh frame and never touches
    # the object the caller passed in.
    return dataframe.drop(columns=cat_cols)


def preprocess(df):
    """
    Convert raw transaction rows into PCA features and labels.

    Keeps a fixed subset of columns, frequency-encodes the object-dtype
    ones, prints the resulting feature column names, and then applies the
    module-level ``scaler`` followed by ``pca``.

    Returns:
        A tuple ``(X_pca, y)``: the output of ``pca.transform`` and the
        ``is_fraud`` column values as an array.
    """
    selected_columns = [
        'trans_date_trans_time', 'cc_num', 'merchant', 'category',
        'amt', 'zip', 'trans_num', 'is_fraud',
    ]
    subset = df[selected_columns].copy()
    encoded = encode_categorical_features(subset.copy())

    # Separate the target from the model inputs.
    labels = encoded['is_fraud'].values
    features = encoded.drop(columns=['is_fraud'])

    print("Columns used for scaling and PCA:")
    print(features.columns.tolist())

    # Scale first, then project with PCA.
    return pca.transform(scaler.transform(features)), labels

# Inference: score a class-balanced sample of each dataset against the endpoint.
all_distances = []
all_labels = []
all_rows = []

for name, df in datasets.items():
    print(f"Processing {name} (original size: {len(df)})...")

    # Filter each class once, then draw at most 100 rows per class with a
    # fixed seed so the sample is reproducible.
    fraud_pool = df[df['is_fraud'] == 1]
    nonfraud_pool = df[df['is_fraud'] == 0]
    fraud_df = fraud_pool.sample(n=min(100, len(fraud_pool)), random_state=42)
    nonfraud_df = nonfraud_pool.sample(n=min(100, len(nonfraud_pool)), random_state=42)
    # Shuffle the combined sample and renumber rows from 0 so positional
    # access below lines up with the preprocessed rows.
    df_sampled = pd.concat([fraud_df, nonfraud_df]).sample(frac=1, random_state=42).reset_index(drop=True)

    print(len(fraud_df))
    print(len(df_sampled))

    X_pca, y_true = preprocess(df_sampled)

    # One request per row: the feature vector is sent as a single CSV line
    # and the first prediction of the JSON response is read back.
    for i, (row, actual) in enumerate(tqdm(zip(X_pca, y_true), total=len(y_true), desc=f"Inferencing {name}")):
        payload = ",".join(map(str, row)) + "\n"
        resp = json.loads(predictor.predict(payload))['predictions'][0]
        distance = resp.get('distance_to_cluster')
        all_distances.append(distance)
        all_labels.append(actual)
        all_rows.append(df_sampled.iloc[i])

print("Inference completed on all datasets.\n")

# Add distances to original DataFrame
df_with_distances = pd.DataFrame(all_rows).reset_index(drop=True)
df_with_distances["distance"] = all_distances
df_with_distances.to_csv("inference_results.csv", index=False)
print("Saved full original data with distances to inference_results.csv")

# Threshold optimization (optional): sweep 100 evenly spaced cut-offs between
# the smallest and largest observed distance; a row is predicted as fraud (1)
# when its distance is strictly greater than the cut-off.
thresholds = np.linspace(min(all_distances), max(all_distances), 100)
metrics = []

for t in thresholds:
    preds = [1 if d > t else 0 for d in all_distances]
    # At the highest cut-off nothing is predicted positive, so precision is
    # undefined. zero_division=0 reports 0 for that case (the same value the
    # default produces) without emitting UndefinedMetricWarning.
    precision = precision_score(all_labels, preds, zero_division=0)
    recall = recall_score(all_labels, preds, zero_division=0)
    f1 = f1_score(all_labels, preds, zero_division=0)
    metrics.append((t, precision, recall, f1))

# max() returns the first maximal entry, i.e. the lowest threshold among ties.
best = max(metrics, key=lambda x: x[3])
best_threshold, best_precision, best_recall, best_f1 = best

print("Best Threshold Found:")
print(f"- Distance Threshold: {best_threshold:.4f}")
print(f"- Precision: {best_precision:.2%}")
print(f"- Recall:    {best_recall:.2%}")
print(f"- F1 Score:  {best_f1:.2%}")

print("\nTop 5 Thresholds by F1 Score:")
top5 = sorted(metrics, key=lambda x: x[3], reverse=True)[:5]
print(tabulate(top5, headers=["Threshold", "Precision", "Recall", "F1 Score"]))

print("\n")
print(all_distances[0:10])


📦 Processing test (original size: 555719)...
100
200
✅ Columns used for scaling and PCA:
['cc_num', 'amt', 'zip', 'trans_date_trans_time_freq', 'merchant_freq', 'category_freq', 'trans_num_freq']


Inferencing test: 100%|██████████| 200/200 [00:02<00:00, 97.04it/s] 


✅ Inference completed on all datasets.

✅ Saved full original data with distances to inference_results.csv
🎯 Best Threshold Found:
- Distance Threshold: 0.4504
- Precision: 50.00%
- Recall:    99.00%
- F1 Score:  66.44%

📊 Top 5 Thresholds by F1 Score:
  Threshold    Precision    Recall    F1 Score
-----------  -----------  --------  ----------
   0.450375     0.5           0.99    0.66443
   0.494215     0.5           0.99    0.66443
   0.538055     0.5           0.99    0.66443
   0.318854     0.497487      0.99    0.662207
   0.362694     0.497487      0.99    0.662207


[3.1374692916870117, 2.9022696018218994, 3.2945199012756348, 3.3673059940338135, 2.9708876609802246, 4.354086399078369, 2.467928647994995, 2.892545461654663, 0.7626126408576965, 2.599315881729126]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
