In [3]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize

# Load the pairwise comparison table and drop every row in which either side
# is an 'ssim' variant. regex=False makes the substring match literal.
data = pd.read_csv('pairwise_results.csv')
data = data[
    ~data['Model A'].str.contains('ssim', regex=False)
    & ~data['Model B'].str.contains('ssim', regex=False)
]

# Collect every model that appears on either side of a comparison.
unique_models = set(data['Model A']).union(set(data['Model B']))
# Sort the names so each model gets the same index on every run: iterating a
# set of strings directly yields an order that can change between
# interpreter sessions (hash randomisation), which would shuffle the layout
# of the parameter vector handed to the optimiser.
model_list = sorted(unique_models)
model_index = {model: idx for idx, model in enumerate(model_list)}
num_models = len(model_list)

# Every model starts from the same rating of 1000.
initial_ratings = np.full(num_models, 1000.0)

# Function to compute negative log-likelihood
def neg_log_likelihood(ratings):
    """Return the negative log-likelihood of the pairwise results.

    Reads the module-level ``data`` table (columns 'Model A', 'Model B',
    'Model A Wins', 'Model B Wins', 'Ties') and the module-level
    ``model_index`` mapping from model name to position in ``ratings``.

    For each row the expected score of model A is the Elo logistic
    ``1 / (1 + 10 ** ((Rb - Ra) / 400))``; a tie contributes half a point
    to each side. The whole table is evaluated with array operations
    rather than a per-row Python loop, because the optimiser calls this
    function many times.

    Args:
        ratings: sequence of ratings, indexed by ``model_index`` values.

    Returns:
        The negative log-likelihood as a float (0.0 for an empty table).

    Raises:
        KeyError: if a model name in ``data`` is missing from
            ``model_index``.
    """
    ratings = np.asarray(ratings, dtype=float)
    n_rows = len(data)

    # Plain dict indexing keeps the KeyError for unknown model names.
    a_idx = np.fromiter(
        (model_index[name] for name in data['Model A']),
        dtype=np.intp,
        count=n_rows,
    )
    b_idx = np.fromiter(
        (model_index[name] for name in data['Model B']),
        dtype=np.intp,
        count=n_rows,
    )

    ties = data['Ties'].to_numpy(dtype=float)
    # Observed scores: a tie is worth half a point to each side, so
    # score_a + score_b equals the number of comparisons in the row.
    score_a = data['Model A Wins'].to_numpy(dtype=float) + 0.5 * ties
    score_b = data['Model B Wins'].to_numpy(dtype=float) + 0.5 * ties

    # Expected score for model A under the Elo model.
    expected_a = 1.0 / (1.0 + 10.0 ** ((ratings[b_idx] - ratings[a_idx]) / 400.0))

    # Keep the probability strictly inside (0, 1) so neither log hits 0.
    epsilon = 1e-10
    expected_a = np.clip(expected_a, epsilon, 1 - epsilon)

    log_likelihood = np.sum(
        score_a * np.log(expected_a) + score_b * np.log(1 - expected_a)
    )
    return float(-log_likelihood)

# Constraint: Mean rating is 1000
def mean_rating_constraint(ratings):
    """Return how far the average of ``ratings`` lies from 1000.

    The value is zero exactly when the mean rating equals 1000, which is
    the form an equality constraint function takes.
    """
    target_mean = 1000
    average = np.mean(ratings)
    return average - target_mean

# Optimization
# Minimise the negative log-likelihood with SLSQP, subject to the equality
# constraint defined by mean_rating_constraint.
constraints = {'type': 'eq', 'fun': mean_rating_constraint}
result = minimize(
    neg_log_likelihood,
    initial_ratings,
    method='SLSQP',
    constraints=constraints,
)

# A failed solve is reported but not fatal: result.x is still used below.
if not result.success:
    print("Optimization failed:", result.message)

# Solution vector, laid out according to model_index.
optimized_ratings = result.x

# Attach each fitted rating to its model name.
ratings = dict(
    (name, optimized_ratings[model_index[name]]) for name in model_list
)

# Report the models from highest to lowest rating.
print("Estimated Elo Ratings:")
leaderboard = sorted(ratings.items(), key=lambda entry: entry[1], reverse=True)
for model_name, rating in leaderboard:
    print(f"{model_name}: {rating:.2f}")


Estimated Elo Ratings:
cosine: 1212.04
aurora: 1020.17
hive: 1006.01
ip2p: 1004.19
mb: 961.23
null_text: 926.55
sdedit: 869.81
