In [None]:
# Function: Take detector confidences that do not aggregate to original category proportions
# and rescale them (very simple constant addition scaling) to original category proportions

# Use: a set of detectors developed with SMOTE or other rebalancing algorithms will no longer
# match original proportions of categories, and will be scaled differently from each other.
# This code provides offsets that can be added to confidences to make them comparable.

# Instructions: Run each cell of this Jupyter Notebook one after the other

# Input: A file with confidences for each data point for each category, 
# and a file with original proportions for each category
# Output: Rescaled confidences, final proportions

# By Ryan S. Baker, 7/11/2024
# MIT License

In [199]:
def calculate_category_proportions(df_orig):
    """
    Compute, for each column, the share of rows in which that column holds the row-wise maximum.

    Args:
    df_orig (DataFrame): One row per data point and one numeric column per category.
        The input is not modified.

    Returns:
    DataFrame: Indexed by the columns of df_orig, with a single 'Proportion' column giving the
    fraction of rows whose highest value falls in that column. Columns that are never the
    maximum get a proportion of 0.
    """
    # Label of the highest-valued column in each row
    # (per the pandas docs, idxmax reports the first occurrence when values tie).
    max_column = df_orig.idxmax(axis=1)

    # Fraction of all rows "won" by each column
    proportions = max_column.value_counts() / len(df_orig)

    # Build the table over every original column. reindex(fill_value=...) supplies the zeros for
    # categories that never win, instead of calling fillna(inplace=True) on a column selection,
    # which is a chained assignment that does not update the parent frame under copy-on-write.
    proportion_table = pd.DataFrame(index=df_orig.columns)
    proportion_table['Proportion'] = proportions.reindex(df_orig.columns, fill_value=0.0)

    return proportion_table


In [200]:
def apply_offsets_to_dataframe(df_orig, offsets):
    """
    Return a copy of the DataFrame with a constant offset added to each column that has one.

    Args:
    df_orig (DataFrame): The DataFrame to shift; it is copied, not modified.
    offsets (dict): Maps column labels to the constant to add to that column.

    Returns:
    DataFrame: A new DataFrame with the offsets applied. Columns without an entry in
    offsets are left unchanged, and a notice is printed for each of them except 'MaxColumn'.
    """
    shifted = df_orig.copy()

    for name in shifted.columns:
        if name not in offsets:
            # Nothing to add for this column; report it unless it is the helper 'MaxColumn'
            if name != 'MaxColumn':
                print(f"No offset provided for column {name}, leaving unchanged.")
            continue

        shifted[name] = shifted[name] + offsets[name]

    return shifted

In [201]:
def compare_tables(table1, table2):
    """
    Subtract the 'Proportion' column of table2 from that of table1, label by label.

    Args:
    table1 (DataFrame): Table indexed by category, with a 'Proportion' column.
    table2 (DataFrame): Table indexed by category, with a 'Proportion' column.

    Returns:
    DataFrame: One row per category with columns 'Category' and 'Difference'
    (table1 minus table2). A category present in only one table is treated as 0 in the other.
    """
    minuend = table1['Proportion']
    subtrahend = table2['Proportion']

    # Align on the index; a label missing from one side counts as 0
    differences = minuend.sub(subtrahend, fill_value=0)

    # Move the category labels out of the index into an ordinary column
    result = differences.reset_index()
    result.columns = ['Category', 'Difference']

    return result

In [202]:
def sum_abs_diff_tables(table1, table2):
    """
    Total absolute difference between the 'Proportion' columns of two tables.

    Args:
    table1 (DataFrame): Table indexed by category, with a 'Proportion' column.
    table2 (DataFrame): Table indexed by category, with a 'Proportion' column.

    Returns:
    float: The sum over categories of |table1 - table2|. A category present in only one
    table is treated as 0 in the other.

    Absolute differences are used rather than squared differences to avoid focusing
    the fit on the majority class. In practice this makes limited difference and could
    easily be substituted.
    """
    # Align on the index, with missing labels counted as 0, then take magnitudes
    gaps = table1['Proportion'].sub(table2['Proportion'], fill_value=0)
    magnitudes = gaps.abs()

    return magnitudes.sum()

In [203]:
import pandas as pd

def get_table_diffs(confidences, original_prop, offsets):
    """
    Measure how far the offset-adjusted confidences are from the original category proportions.

    Args:
    confidences (DataFrame): One row per data point, one column per category.
    original_prop (DataFrame): Indexed by category, with a 'Proportion' column of target proportions.
    offsets (dict): Maps category column labels to the constant to add to that column.

    Returns:
    float: Sum of absolute differences between the target proportions and the proportions
    obtained after applying the offsets.

    Label mismatches between the two inputs are reported with a printed message but do not
    stop the computation.
    """
    conf_col_names = set(confidences.columns)
    orig_row_labels = set(original_prop.index)

    # Check for columns in confidences not in original_prop rows
    missing_in_orig = conf_col_names - orig_row_labels
    if missing_in_orig:
        print("Error: The following columns in the confidences table do not appear as rows in the original proportions table:", missing_in_orig)

    # Check for rows in original_prop not in confidences columns
    missing_in_conf = orig_row_labels - conf_col_names
    if missing_in_conf:
        print("Error: The following rows in the original proportions table do not appear as columns in the confidences table:", missing_in_conf)

    # Only the adjusted proportions are needed for the score. This function is called on every
    # optimizer step, so the unadjusted proportions and the per-category difference table are
    # deliberately not computed here.
    modified_df = apply_offsets_to_dataframe(confidences, offsets)
    modified_category_proportions = calculate_category_proportions(modified_df)

    return sum_abs_diff_tables(original_prop, modified_category_proportions)


In [209]:
from scipy.optimize import minimize

def make_offsets(confidences, x):
    """Map each column label of confidences to the optimization variable at the same position."""
    offsets = {}
    for position, label in enumerate(confidences.columns):
        offsets[label] = x[position]
    return offsets

def objective(x, confidences, original_prop):
    """Score a candidate offset vector x: the value returned here is what the optimizer minimizes."""
    candidate_offsets = make_offsets(confidences, x)
    return get_table_diffs(confidences, original_prop, candidate_offsets)

def fit_offsets(confidence_filename, original_prop_filename,
                initial_step_size=0.1, xatol=0.001, fatol=0.001, maxiter=10000):
    """
    Fit one additive offset per category so the rescaled confidences reproduce the original proportions.

    Args:
    confidence_filename (str): CSV file with one column of confidences per category.
    original_prop_filename (str): CSV file whose first column holds the category labels (used as
        the index) and which has a 'Proportion' column with the original proportions.
    initial_step_size (float): Distance from the all-zero starting point to each other vertex
        of the initial Nelder-Mead simplex.
    xatol (float): Nelder-Mead absolute tolerance on the offsets.
    fatol (float): Nelder-Mead absolute tolerance on the objective value.
    maxiter (int): Maximum number of Nelder-Mead iterations.

    Returns:
    DataFrame: The actual rescaled detector confidences (original confidences plus the fitted
    offsets), ready for use.

    The fitted offsets, the resulting category proportions, and the remaining difference are
    printed, along with a warning if the optimizer reports that it did not converge.
    """
    # Load the CSV files
    confidences = pd.read_csv(confidence_filename)
    original_prop = pd.read_csv(original_prop_filename, index_col=0)  # First column is the index

    # Initial guess for offsets (all zeros)
    initial_guess = [0.0] * len(confidences.columns)

    # Initial simplex: the starting point plus one vertex stepped along each dimension
    initial_simplex = [initial_guess]
    for i in range(len(initial_guess)):
        simplex_point = initial_guess.copy()
        simplex_point[i] += initial_step_size
        initial_simplex.append(simplex_point)

    # Perform minimization using the Nelder-Mead method
    result = minimize(
        objective,
        initial_guess,
        args=(confidences, original_prop),
        method='Nelder-Mead',
        options={'initial_simplex': initial_simplex, 'xatol': xatol, 'fatol': fatol, 'maxiter': maxiter},
    )

    # Surface a failed fit instead of silently presenting it as the best solution
    if not result.success:
        print("Warning: optimizer did not converge:", result.message)

    # Extract the best offsets found
    best_offsets = make_offsets(confidences, result.x)

    # Print the best offsets and the resulting diff
    print("Best Offsets:", best_offsets)
    diff = get_table_diffs(confidences, original_prop, best_offsets)

    modified_df = apply_offsets_to_dataframe(confidences, best_offsets)

    modified_category_proportions = calculate_category_proportions(modified_df)

    print(modified_category_proportions)
    print()

    print("Minimum Difference:", diff)

    return modified_df

In [210]:
# Input CSV locations -- edit these paths to point at the copies on your own machine
confidence_filename = 'C:/july2024/confidences-dwd.csv'
original_prop_filename = 'C:/july2024/original-prop-dwd.csv'

# Fit the offsets; modified_df holds the rescaled confidences
modified_df = fit_offsets(confidence_filename, original_prop_filename)

# print(modified_df)

Best Offsets: {'ConcentrationBROMP': 0.2556039080825493, 'ConfusionBROMP': 0.04017999129957831, 'BoredomBROMP': -0.09996964472004519, 'FrustrationBROMP': -0.0008127388961018402, 'DelightBROMP': -0.14630263091234566}
                    Proportion
ConcentrationBROMP    0.822303
ConfusionBROMP        0.065773
BoredomBROMP          0.047312
FrustrationBROMP      0.051260
DelightBROMP          0.013352

Minimum Difference: 0.0002673859692325798
       ConcentrationBROMP  ConfusionBROMP  BoredomBROMP  FrustrationBROMP  \
0                0.732189        0.593115      0.222694          0.367793   
1                0.763120        0.593115      0.143797          0.368300   
2                0.902952        0.426395      0.133699          0.368534   
3                0.881862        0.451860      0.189924          0.426728   
4                0.894440        0.446724      0.156182          0.414395   
...                   ...             ...           ...               ...   
17221           