In [2]:
import numpy as np
import pandas as pd
import os, sys
import seaborn as sns

In [3]:
from bin.baseline_ml import *

In [18]:
# Open the .npz archive that holds the saved test labels and model predictions.
# NOTE(review): for .npz files np.load returns a lazily-read NpzFile that keeps the
# file handle open until it is closed; nothing in the visible cells closes it.
data = np.load('output_training/phase2_dataset1_rus/RandomForestClassifier_rus_predictions.npz')

In [19]:
# Pull the three arrays out of the archive by key: true labels, hard predictions,
# and predicted probabilities.
y_test, y_pred, y_pred_proba = data['y_test'], data['y_pred'], data['y_pred_proba']

In [20]:
# Quick look at the predicted probabilities (a bare last expression is displayed by the notebook).
y_pred_proba

array([0.00189874, 0.031625  , 0.0107921 , ..., 0.        , 0.        ,
       0.06      ])

In [21]:
# Show the true labels and the predicted probabilities together as a tuple.
y_test, y_pred_proba

(array([0, 0, 0, ..., 0, 0, 0]),
 array([0.00189874, 0.031625  , 0.0107921 , ..., 0.        , 0.        ,
        0.06      ]))

In [44]:
def analyze_cost(y_true, y_prob, opt_thresholds, fp_cost, fn_cost):
    """
    Evaluate misclassification cost at each supplied threshold, plus an all-negative baseline.

    The first row of the result is the baseline in which every sample is predicted
    as 0, so its FP count is always 0 and its FN count equals the number of
    positives. Each following row corresponds to one entry of ``opt_thresholds``:
    a sample is predicted positive when its probability is greater than or equal
    to the threshold.

    Parameters:
    y_true (array-like): True binary labels (0/1). Lists, tuples, pandas Series
        and NumPy arrays are accepted; the input is converted with ``np.asarray``.
    y_prob (array-like): Predicted probabilities, same shape as ``y_true``.
    opt_thresholds (dict): Dictionary of beta -> optimal threshold
    fp_cost (float): Cost of a false positive
    fn_cost (float): Cost of a false negative

    Returns:
    pd.DataFrame: One row per scenario with columns Beta, Threshold, FP, FN,
        FP_cost, FN_cost and Total_cost (= FP * fp_cost + FN * fn_cost).

    Raises:
    ValueError: If ``y_true`` and ``y_prob`` do not have the same shape.
    """
    # Coerce to ndarrays so the element-wise comparisons below behave the same for
    # every array-like input. With a plain list, `y_true == 0` is a single False and
    # all counts silently collapse to 0; with pandas Series, `&` would align on the
    # index instead of comparing by position.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)
    if y_true.shape != y_prob.shape:
        # Without this check, shapes such as (n,) and (n, 1) would broadcast to an
        # n x n grid and produce meaningless counts instead of an error.
        raise ValueError(
            f"y_true and y_prob must have the same shape, got {y_true.shape} and {y_prob.shape}"
        )

    def _cost_row(beta, threshold, y_pred):
        # Count both error types for one set of hard predictions and price them.
        fp = np.sum((y_pred == 1) & (y_true == 0))
        fn = np.sum((y_pred == 0) & (y_true == 1))
        return {
            'Beta': beta,
            'Threshold': threshold,
            'FP': fp,
            'FN': fn,
            'FP_cost': fp_cost,
            'FN_cost': fn_cost,
            'Total_cost': fp * fp_cost + fn * fn_cost,
        }

    # Baseline: predict all 0 (no positive predictions).
    rows = [_cost_row('baseline', 'N/A', np.zeros_like(y_true))]

    # Model predictions, one row per beta in the dictionary's iteration order.
    rows.extend(
        _cost_row(beta, threshold, (y_prob >= threshold).astype(int))
        for beta, threshold in opt_thresholds.items()
    )

    return pd.DataFrame(rows)


In [50]:
# Requires y_test and y_pred_proba to already be defined in the kernel.
# For each beta, run the threshold search and record the result as beta -> threshold.
betas = [0.1, 0.5, 1, 2, 5]
opt_thresholds = {}
for beta in betas:
    # optimize_f_thresholds is not defined in this notebook; presumably it is provided by
    # the star import of bin.baseline_ml and returns (best score, DataFrame of scores whose
    # column labels are thresholds) -- TODO confirm. max_score is not used afterwards.
    max_score, df = optimize_f_thresholds(y_test, y_pred_proba, beta=beta)
    # NOTE(review): np.argmax without an axis returns an index into the flattened array,
    # so using it as a column position is only correct if df has a single row -- confirm.
    opt_thresholds[beta] = float(df.columns[np.argmax(df.values)])

# Disabled draft of a grid over several costs. NOTE(review): it passes a list of thresholds
# and lists of costs, which does not match analyze_cost's signature (dict of
# beta -> threshold, scalar fp_cost / fn_cost).
# thresholds_to_test = sorted(set(opt_thresholds.values()))
# fp_costs = [1, 5, 10]
# fn_costs = [1, 5, 10]

# cost_df = analyze_cost(y_test, y_pred_proba, thresholds_to_test, fp_costs, fn_costs)


In [51]:
# Inspect the beta -> threshold mapping.
opt_thresholds

{0.1: 0.99, 0.5: 0.9500000000000001, 1: 0.9, 2: 0.77, 5: 0.49}

In [52]:
# Scenario: equal weighting. fn_cost is set to 1 - fp_cost, so the two costs sum to 1
# and fp_cost = 1/2 prices a false positive the same as a false negative.
fp_cost = 1/2
analyze_cost(y_test, y_pred_proba, opt_thresholds, fp_cost = fp_cost, fn_cost = 1 - fp_cost)

Unnamed: 0,Beta,Threshold,FP,FN,FP_cost,FN_cost,Total_cost
0,baseline,,0,2604,0.5,0.5,1302.0
1,0.1,0.99,132,2513,0.5,0.5,1322.5
2,0.5,0.95,1171,2071,0.5,0.5,1621.0
3,1,0.9,3086,1659,0.5,0.5,2372.5
4,2,0.77,8278,1090,0.5,0.5,4684.0
5,5,0.49,23102,488,0.5,0.5,11795.0


In [60]:
import pandas as pd

# Cost of a false positive; the false-negative cost below is its complement (1 - fp_cost).
fp_cost = 1/2

# Build the cost table for this scenario and keep it in cost_df (analyze_cost returns a DataFrame).
cost_df = analyze_cost(y_test, y_pred_proba, opt_thresholds, fp_cost=fp_cost, fn_cost=1-fp_cost)

# Write the table to a CSV file in the current working directory, without the row index.
cost_df.to_csv("cost_analysis_fp_0_5.csv", index=False)

print("Cost analysis saved to 'cost_analysis_fp_0_5.csv'")

Cost analysis saved to 'cost_analysis_fp_0_5.csv'


In [57]:
# Scenario: fp_cost = 0.1, fn_cost = 0.9 (a false negative costs 9x a false positive).
# NOTE(review): the result is only displayed here; it is not assigned to cost_df.
fp_cost = 1/10
analyze_cost(y_test, y_pred_proba, opt_thresholds, fp_cost = fp_cost, fn_cost = 1 - fp_cost)

Unnamed: 0,Beta,Threshold,FP,FN,FP_cost,FN_cost,Total_cost
0,baseline,,0,2604,0.1,0.9,2343.6
1,0.1,0.99,132,2513,0.1,0.9,2274.9
2,0.5,0.95,1171,2071,0.1,0.9,1981.0
3,1,0.9,3086,1659,0.1,0.9,1801.7
4,2,0.77,8278,1090,0.1,0.9,1808.8
5,5,0.49,23102,488,0.1,0.9,2749.4


In [None]:
# Compute the fp_cost = 1/10 table explicitly before saving it. `cost_df` is only ever
# assigned in the fp_cost = 1/2 export cell, so writing `cost_df` here would store the
# 0.5-scenario numbers under the 0.1 filename.
cost_df_fp_0_1 = analyze_cost(y_test, y_pred_proba, opt_thresholds, fp_cost=1/10, fn_cost=1 - 1/10)
cost_df_fp_0_1.to_csv("cost_analysis_fp_0_1.csv", index=False)

In [64]:
# Scenario: fp_cost = 0.05, fn_cost = 0.95 (a false negative costs 19x a false positive).
fp_cost = 1/20
analyze_cost(y_test, y_pred_proba, opt_thresholds, fp_cost = fp_cost, fn_cost = 1 - fp_cost)


Unnamed: 0,Beta,Threshold,FP,FN,FP_cost,FN_cost,Total_cost
0,baseline,,0,2604,0.05,0.95,2473.8
1,0.1,0.99,132,2513,0.05,0.95,2393.95
2,0.5,0.95,1171,2071,0.05,0.95,2026.0
3,1,0.9,3086,1659,0.05,0.95,1730.35
4,2,0.77,8278,1090,0.05,0.95,1449.4
5,5,0.49,23102,488,0.05,0.95,1618.7


In [66]:
# Scenario: fp_cost = 0.02, fn_cost = 0.98 (a false negative costs 49x a false positive).
fp_cost = 1/50
analyze_cost(y_test, y_pred_proba, opt_thresholds, fp_cost = fp_cost, fn_cost = 1 - fp_cost)


Unnamed: 0,Beta,Threshold,FP,FN,FP_cost,FN_cost,Total_cost
0,baseline,,0,2604,0.02,0.98,2551.92
1,0.1,0.99,132,2513,0.02,0.98,2465.38
2,0.5,0.95,1171,2071,0.02,0.98,2053.0
3,1,0.9,3086,1659,0.02,0.98,1687.54
4,2,0.77,8278,1090,0.02,0.98,1233.76
5,5,0.49,23102,488,0.02,0.98,940.28


In [67]:
# Scenario: fp_cost = 0.6, fn_cost = 0.4 (here a false positive is the more expensive error).
fp_cost = 6/10
analyze_cost(y_test, y_pred_proba, opt_thresholds, fp_cost = fp_cost, fn_cost = 1 - fp_cost)


Unnamed: 0,Beta,Threshold,FP,FN,FP_cost,FN_cost,Total_cost
0,baseline,,0,2604,0.6,0.4,1041.6
1,0.1,0.99,132,2513,0.6,0.4,1084.4
2,0.5,0.95,1171,2071,0.6,0.4,1531.0
3,1,0.9,3086,1659,0.6,0.4,2515.2
4,2,0.77,8278,1090,0.6,0.4,5402.8
5,5,0.49,23102,488,0.6,0.4,14056.4


In [38]:
# NOTE(review): this cell repeats the threshold search from the earlier cell verbatim, and its
# execution count (In [38]) is lower than the cells above it, so the notebook was run out of order.
# Requires y_test and y_pred_proba to already be defined in the kernel.
betas = [0.1, 0.5, 1, 2, 5]
opt_thresholds = {}
for beta in betas:
    # optimize_f_thresholds is not defined in this notebook; presumably it comes from the
    # star import of bin.baseline_ml -- TODO confirm its return contract there.
    max_score, df = optimize_f_thresholds(y_test, y_pred_proba, beta=beta)
    # NOTE(review): np.argmax without an axis returns an index into the flattened array,
    # so using it as a column position is only correct if df has a single row -- confirm.
    opt_thresholds[beta] = float(df.columns[np.argmax(df.values)])

In [39]:
# NOTE(review): same fp_cost = 1/20 scenario as an earlier cell in this notebook
# (fn_cost = 0.95, i.e. a false negative costs 19x a false positive).
fp_cost = 1/20
analyze_cost(y_test, y_pred_proba, opt_thresholds, fp_cost = fp_cost, fn_cost = 1 - fp_cost)

Unnamed: 0,Beta,Threshold,FP,FN,FP_cost,FN_cost,Total_cost
0,baseline,,0,2604,0.05,0.95,2473.8
1,0.1,0.99,132,2513,0.05,0.95,2393.95
2,0.5,0.95,1171,2071,0.05,0.95,2026.0
3,1,0.9,3086,1659,0.05,0.95,1730.35
4,2,0.77,8278,1090,0.05,0.95,1449.4
5,5,0.49,23102,488,0.05,0.95,1618.7
