# Plot results

In [1]:
import math
import re
import ast
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
# NOTE(review): hard-coded absolute path ties this notebook to one machine.
# The chdir runs before the `train_eval` import below and before 'results.txt'
# is opened by relative path, so presumably both live in this directory —
# confirm before making the location configurable.
os.chdir('/home/mrsergazinov/TabLLM/')
from train_eval import TASK_TYPE

# Parse the experiment log into one row per (parameters, summary) pair.
#
# Expected log layout (as matched by the code below):
#   Parameters: {<python dict literal>}
#   ...
#   Overall Average Metric over <n> runs: <mean>% ± <std>%
results = []

# The log path is relative to the current working directory.
# The encoding is explicit because the summary lines contain the non-ASCII '±'.
with open('results.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# The run count is captured so the standard error below uses the real number
# of runs instead of assuming a fixed value.
summary_pattern = re.compile(
    r'Overall Average Metric over (\d+) runs: ([\d\.]+)% ± ([\d\.]+)%'
)

# Parameters of the block currently being parsed (None until a
# 'Parameters:' line has been seen, and again after its summary is consumed).
current_params = None

for line in lines:
    line = line.strip()
    if line.startswith('Parameters:'):
        # Drop the prefix without assuming exactly one space follows the colon.
        params_str = line[len('Parameters:'):].strip()
        # literal_eval only accepts Python literals, so log text is never executed.
        current_params = ast.literal_eval(params_str)
    elif line.startswith('Overall Average Metric'):
        match = summary_pattern.search(line)
        if match and current_params is not None:
            n_runs = int(match.group(1))
            mean_accuracy = float(match.group(2))
            std_accuracy = float(match.group(3))
            # Standard error of the mean; undefined when the log reports 0 runs.
            std_error = std_accuracy / math.sqrt(n_runs) if n_runs > 0 else float('nan')
            result = {
                'dataset_name': current_params.get('dataset_name'),
                'model_name': current_params.get('model_name'),
                'num_encoder': current_params.get('num_encoder'),
                'num_encoder_trainable': current_params.get('num_encoder_trainable'),
                'num_encoder_scale': current_params.get('scaler'),
                'Average Accuracy (%)': mean_accuracy,
                # Column name kept for compatibility; the value is std / sqrt(n_runs),
                # i.e. the standard error rather than the raw standard deviation.
                'Std Dev (%)': std_error,
            }
            results.append(result)
            current_params = None  # A summary line closes the current block

# One row per parsed experiment block
df = pd.DataFrame(results)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Order rows by dataset first, then by model within each dataset, and display the table
df = df.sort_values(['dataset_name', 'model_name'])
df

Unnamed: 0,dataset_name,model_name,num_encoder,num_encoder_trainable,num_encoder_scale,Average Accuracy (%),Std Dev (%)
0,adult,MLP,,True,,84.19,0.056921
18,adult,MLP,FourierFeatures,False,SquareScalingFeatures,84.42,0.056921
22,adult,MLP,BinningFeatures,False,SquareScalingFeatures,84.32,0.06957
23,adult,MLP,ComboFeatures,False,SquareScalingFeatures,84.51,0.075895
28,adult,MLP,FourierFeatures,True,,84.63,0.082219
3,adult,ModernNCA,,True,,83.98,0.031623
27,adult,ModernNCA,FourierFeatures,False,SquareScalingFeatures,84.44,0.050596
40,adult,ModernNCA,FourierFeatures,True,,84.55,0.04111
45,adult,ModernNCA,ComboFeatures,False,SquareScalingFeatures,84.4,0.056921
46,adult,ModernNCA,BinningFeatures,False,SquareScalingFeatures,84.24,0.034785


In [3]:
def compute_win_rate(df, encoders_to_compare, task_type=None):
    """Print how often numerical encoders beat the raw (encoder-free) model.

    Rows are grouped by (dataset_name, model_name). Within each group the raw
    baseline (rows whose ``num_encoder`` is missing) is compared against the
    best row among ``encoders_to_compare``. "Best" is the lowest metric for
    datasets whose task type is 'regression' and the highest metric otherwise;
    the same rule is applied to the raw baseline when several raw rows exist.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'dataset_name', 'model_name', 'num_encoder',
        'num_encoder_trainable' and 'Average Accuracy (%)'. It is not modified.
    encoders_to_compare : list of str
        Encoder keys of the form '<num_encoder>_<num_encoder_trainable>',
        e.g. 'FourierFeatures_False'.
    task_type : mapping, optional
        Maps a dataset name to its task type string. Defaults to the
        module-level ``TASK_TYPE``.

    Returns
    -------
    None
        All results are printed.
    """
    task_types = TASK_TYPE if task_type is None else task_type
    metric = "Average Accuracy (%)"

    # Build the '<encoder>_<trainable>' key on a new frame so the caller's data is
    # untouched and re-running never double-suffixes the column. Raw rows keep a
    # missing value so they can still be located with isna().
    is_raw = df["num_encoder"].isna()
    encoder_key = df["num_encoder"].astype(str) + "_" + df["num_encoder_trainable"].astype(str)
    df = df.assign(num_encoder=encoder_key.mask(is_raw))

    total_wins = 0
    total_comparisons = 0
    total_improvement = []  # relative gains (%), recorded for wins only

    for (dataset, model), group in df.groupby(["dataset_name", "model_name"]):
        # Skip pairs that have no usable raw baseline
        raw_scores = group.loc[group["num_encoder"].isna(), metric]
        if raw_scores.isna().all():
            continue

        # Regression metrics are errors (lower is better); everything else is a score
        lower_is_better = task_types[dataset] == 'regression'

        # Skip pairs that have no row for any of the requested encoders
        encoder_scores = group.loc[group["num_encoder"].isin(encoders_to_compare), metric]
        if encoder_scores.isna().all():
            continue

        if lower_is_better:
            raw_perf = raw_scores.min()
            encoder_perf = encoder_scores.min()
            improvement = (raw_perf - encoder_perf) / raw_perf * 100
            encoder_won = encoder_perf < raw_perf
        else:
            raw_perf = raw_scores.max()
            encoder_perf = encoder_scores.max()
            improvement = (encoder_perf - raw_perf) / raw_perf * 100
            encoder_won = encoder_perf > raw_perf

        total_comparisons += 1
        if encoder_won:
            total_wins += 1
            total_improvement.append(improvement)

        # `improvement` is computed for every pair, so a loss shows its own signed
        # value instead of a stale one from a previous pair (or an unbound name).
        print(f"Dataset: {dataset}, Model: {model}, Raw: {raw_perf:.4f}, Encoder: {encoder_perf:.4f}")
        print(f"Improvement: {improvement:.4f}%")
        print(f"Total wins: {total_wins} / {total_comparisons}")
        print('---')

    print(f"Out of {total_comparisons} dataset-model pairs, encoders improved performance {total_wins} times.")
    if total_improvement:
        print(f"Average improvement when encoders win: {np.mean(total_improvement):.4f}%")
        print(f"Max improvement when encoders win: {np.max(total_improvement):.4f}%")
    else:
        # np.max on an empty list raises, so report the absence of wins explicitly
        print("No encoder wins, so there are no improvement statistics to report.")

# Encoder keys have the form "<EncoderName>_<trainable flag>"; only the
# non-trainable ("_False") variants are compared against the raw model here.
encoders_to_compare = [
    f"{encoder_name}_False"
    for encoder_name in ("FourierFeatures", "ComboFeatures", "BinningFeatures")
]
compute_win_rate(df.copy(), encoders_to_compare)

Dataset: adult, Model: MLP, Raw: 84.1900, Encoder: 84.5100
Improvement: 0.3801%
Total wins: 1 / 1
---
Dataset: adult, Model: ModernNCA, Raw: 83.9800, Encoder: 84.4400
Improvement: 0.5477%
Total wins: 2 / 2
---
Dataset: adult, Model: TabTransformer, Raw: 83.8700, Encoder: 85.3000
Improvement: 1.7050%
Total wins: 3 / 3
---
Dataset: california_housing, Model: MLP, Raw: 0.4600, Encoder: 0.4100
Improvement: 10.8696%
Total wins: 4 / 4
---
Dataset: california_housing, Model: ModernNCA, Raw: 0.4200, Encoder: 0.4000
Improvement: 4.7619%
Total wins: 5 / 5
---
Dataset: california_housing, Model: TabTransformer, Raw: 0.5200, Encoder: 0.4400
Improvement: 15.3846%
Total wins: 6 / 6
---
Dataset: higgs, Model: MLP, Raw: 70.6400, Encoder: 71.2500
Improvement: 0.8635%
Total wins: 7 / 7
---
Dataset: higgs, Model: ModernNCA, Raw: 71.8000, Encoder: 72.2800
Improvement: 0.6685%
Total wins: 8 / 8
---
Dataset: higgs, Model: TabTransformer, Raw: 72.0700, Encoder: 72.2200
Improvement: 0.2081%
Total wins: 9 / 9
