# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, average_precision_score
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from sklearn.utils.class_weight import compute_sample_weight

# --- Read the train and test splits from Parquet files ---
train_df = pd.read_parquet('train_file.parquet')
test_df = pd.read_parquet('test_file.parquet')

# Separate each frame into its label column and the remaining feature columns.
# NOTE: 'target' is a placeholder; replace it with the actual label column name.
y_train, X_train = train_df['target'], train_df.drop(columns=['target'])
y_test, X_test = test_df['target'], test_df.drop(columns=['target'])

# --- Function to calculate various metrics ---
def calculate_metrics(y_true, y_pred, y_proba):
    """Compute binary-classification metrics from labels, predictions and scores.

    Classes are expected to be encoded as 0 (negative) and 1 (positive).

    Args:
        y_true: Ground-truth labels.
        y_pred: Hard class predictions, aligned with ``y_true``.
        y_proba: Scores for the positive class, aligned with ``y_true``.

    Returns:
        A dict with the keys 'Precision', 'Recall', 'Accuracy', 'TNR', 'FNR',
        'TPR', 'FPR', 'ROC AUC' and 'PR AUC'. 'ROC AUC' is NaN when ``y_true``
        contains fewer than two distinct classes, because the score is
        undefined in that case.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, a
    # sample in which only one class occurs yields a 1x1 matrix and the
    # four-way unpacking below raises.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # Small constant that keeps every ratio finite when its denominator is 0.
    eps = 1e-10

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    accuracy = (tp + tn) / (tp + tn + fp + fn + eps)
    tnr = tn / (tn + fp + eps)  # True Negative Rate
    fnr = fn / (fn + tp + eps)  # False Negative Rate
    tpr = recall  # True Positive Rate (same as Recall)
    fpr = fp / (fp + tn + eps)  # False Positive Rate

    # ROC AUC needs both classes in y_true; report NaN rather than failing
    # so the confusion-matrix metrics above are still returned.
    if len(np.unique(y_true)) < 2:
        roc_auc = float('nan')
    else:
        roc_auc = roc_auc_score(y_true, y_proba)
    pr_auc = average_precision_score(y_true, y_proba)

    return {
        'Precision': precision,
        'Recall': recall,
        'Accuracy': accuracy,
        'TNR': tnr,
        'FNR': fnr,
        'TPR': tpr,
        'FPR': fpr,
        'ROC AUC': roc_auc,
        'PR AUC': pr_auc,
    }

# --- Define the hyperparameter search space ---
# Every dimension is categorical: hp.choice picks one of the listed values
# for each trial.
space = {
    'n_estimators': hp.choice('n_estimators', [50, 100, 200]),
    'max_depth': hp.choice('max_depth', [10, 20, None]),
    'min_samples_split': hp.choice('min_samples_split', [2, 5, 10]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 4]),
    # 'auto' is intentionally not offered: scikit-learn deprecated and then
    # removed it for RandomForestClassifier, where it was an alias of 'sqrt'.
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
    # No 'class_weight' dimension: the objective in evaluate_imbalance_ratios
    # always passes its own ratio-derived class_weight, so sampling one here
    # would only add a dimension that has no effect on the model.
}

# --- Function to evaluate performance for different imbalance ratios ---
def evaluate_imbalance_ratios(X_train, y_train, X_test, y_test, imbalance_ratios,
                              *, max_evals=50, val_size=0.2):
    """Tune a random forest per imbalance ratio and collect test-set metrics.

    For each ratio the forest is trained with ``class_weight={0: ratio,
    1: 1 - ratio}`` and Hyperopt (TPE) searches the module-level ``space``.
    Hyperparameters are selected by true negative rate on a stratified
    hold-out taken from the training data; the test set is only used to
    report metrics, never to steer the search.

    NOTE(review): the objective is TNR alone, which a model predicting only
    the negative class maximises trivially. Confirm this is the intended
    selection criterion.

    Args:
        X_train: Training features.
        y_train: Training labels, encoded as 0/1.
        X_test: Test features.
        y_test: Test labels, encoded as 0/1.
        imbalance_ratios: Iterable of weights in [0, 1] given to class 0.
        max_evals: Number of Hyperopt trials per ratio.
        val_size: Share of the training data held out for model selection
            (passed to ``train_test_split`` as ``test_size``).

    Returns:
        A DataFrame with one row per trial holding the test-set metrics from
        ``calculate_metrics`` plus 'Validation TNR' (the selection score)
        and 'Imbalance Ratio'.

    Raises:
        ValueError: If a ratio lies outside [0, 1], or if the stratified
            split cannot be made (for example a class with a single sample).
    """
    # Imported locally so this function does not rely on an extra
    # module-level import being present.
    from sklearn.model_selection import train_test_split

    ratios = list(imbalance_ratios)
    for ratio in ratios:
        # A ratio outside [0, 1] would produce a negative class weight.
        if not 0 <= ratio <= 1:
            raise ValueError(
                f"imbalance ratio must be within [0, 1], got {ratio!r}"
            )

    # One fixed split shared by all ratios, so trials differ only in the
    # class weights and hyperparameters, not in the data they are scored on.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=val_size, stratify=y_train, random_state=42
    )

    results = []

    for ratio in ratios:
        # Adjust class weights based on imbalance ratio (for simulation)
        class_weights = {0: ratio, 1: 1 - ratio}

        def objective(params):
            """Fit one candidate forest and score it for Hyperopt."""
            rf_model = RandomForestClassifier(
                n_estimators=params['n_estimators'],
                max_depth=params['max_depth'],
                min_samples_split=params['min_samples_split'],
                min_samples_leaf=params['min_samples_leaf'],
                max_features=params['max_features'],
                class_weight=class_weights,  # Adjusted based on imbalance ratio
                random_state=42,
            )
            rf_model.fit(X_fit, y_fit)

            # Selection score: computed on the hold-out only.
            val_metrics = calculate_metrics(
                y_val,
                rf_model.predict(X_val),
                rf_model.predict_proba(X_val)[:, 1],
            )
            # Reported metrics: computed on the untouched test set.
            test_metrics = calculate_metrics(
                y_test,
                rf_model.predict(X_test),
                rf_model.predict_proba(X_test)[:, 1],
            )

            return {
                'loss': -val_metrics['TNR'],  # Hyperopt minimizes, so negate TNR
                'status': STATUS_OK,
                'metrics': test_metrics,
            }

        # Run Hyperopt to optimize hyperparameters for the current ratio.
        # Only the recorded trials are needed, so fmin's return is ignored.
        trials = Trials()
        fmin(
            fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials,
        )

        # Build fresh row dicts instead of writing into the dicts stored
        # inside `trials`.
        for trial in trials.trials:
            outcome = trial['result']
            results.append({
                **outcome['metrics'],
                'Validation TNR': -outcome['loss'],
                'Imbalance Ratio': ratio,
            })

    # Convert results to DataFrame for easy visualization
    return pd.DataFrame(results)

# --- Define class imbalance ratios to evaluate ---
imbalance_ratios = [0.9, 0.8, 0.7, 0.6, 0.5]  # Majority class ratio (90%, 80%, etc.)

# --- Evaluate model performance across different imbalance ratios ---
results_df = evaluate_imbalance_ratios(X_train, y_train, X_test, y_test, imbalance_ratios)

# --- Reduce the per-trial results to one row per imbalance ratio ---
# results_df holds one row for every tuning trial, so plotting it directly
# would draw a line zig-zagging through many points at each ratio. Keep only
# the best trial per ratio, ranked by a 'Validation TNR' column when the
# results provide one and by 'TNR' otherwise.
selection_metric = 'Validation TNR' if 'Validation TNR' in results_df.columns else 'TNR'
best_trial_index = results_df.groupby('Imbalance Ratio')[selection_metric].idxmax()
best_per_ratio = results_df.loc[best_trial_index].sort_values('Imbalance Ratio')

# --- Plot model performance for each imbalance ratio ---
metrics_to_plot = ['TNR', 'FNR', 'ROC AUC', 'PR AUC']
plt.figure(figsize=(12, 8))

for metric in metrics_to_plot:
    plt.plot(
        best_per_ratio['Imbalance Ratio'],
        best_per_ratio[metric],
        marker='o',
        label=metric,
    )

plt.xlabel('Class Imbalance Ratio (Majority Class Proportion)')
plt.ylabel('Metric Value')
plt.title('Model Performance at Different Class Imbalance Ratios with Hyperparameter Tuning')
plt.legend()
plt.grid(True)
plt.show()