In [13]:
import pandas as pd


df1 = pd.read_csv("data/summac.csv")
df2 = pd.read_csv("data/referee.csv")
df3 = pd.read_csv("data/questeval.csv", encoding="ISO-8859-1")
df4 = pd.read_csv("data/salsa.csv")



merged_df = pd.concat([df1, df2, df3, df4], axis=1)


df = merged_df.loc[:, ~merged_df.columns.duplicated()]

# Identify all columns related to model evaluation
model_prefixes = ["access", "keep_it_simple", "swipe_v5", "swipe_v5_clean", "gpt3", "bartl_wikilarge"]
metric_names = ["sari", "fkgl", "referee_score", "salsa_reference_free", "questeval_reference_based",
                "questeval_reference_free", "SummaCZS", "SummaCConv"]

# Generate the list of all metric columns
numeric_columns = [f"{metric}_{prefix}" for prefix in model_prefixes for metric in metric_names]

# Add reference-related columns
numeric_columns += ["fkgl_reference", "referee_score_reference", "salsa_reference_free_reference",
                    "questeval_reference_free_reference", "SummaCZS_reference", "SummaCConv_reference"]

# Convert all relevant columns to numeric
for col in numeric_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Extract models dynamically
models = {}

for prefix in model_prefixes:
    model_name = prefix.upper().replace("_", "-")
    model_metrics = {}

    for metric in metric_names:
        column_name = f"{metric}_{prefix}"
        if column_name in df.columns:
            model_metrics[metric.upper()] = df[column_name].mean()  # Compute mean score

    models[model_name] = model_metrics

# Add Reference model metrics
reference_metrics = {
    "SARI": None,
    "FKGL": df["fkgl_reference"].mean() if "fkgl_reference" in df.columns else None,
    "REFEREE_SCORE": df["referee_score_reference"].mean() if "referee_score_reference" in df.columns else None,
    "SALSA_REFERENCE_FREE": df["salsa_reference_free_reference"].mean() if "salsa_reference_free_reference" in df.columns else None,
    "QUESTEVAL_REFERENCE_BASED": None,
    "QUESTEVAL_REFERENCE_FREE": df["questeval_reference_free_reference"].mean() if "questeval_reference_free_reference" in df.columns else None,
    "SUMMACZS": df["SummaCZS_reference"].mean() if "SummaCZS_reference" in df.columns else None,
    "SUMMACCONV": df["SummaCConv_reference"].mean() if "SummaCConv_reference" in df.columns else None,
}
models["REFERENCE"] = reference_metrics

# Convert to DataFrame and display results
results_df = pd.DataFrame(models).T
print(results_df)

# Save to CSV
results_df.to_csv("data/model_evaluation_results.csv", index=True)




                     SARI       FKGL  REFEREE_SCORE  SALSA_REFERENCE_FREE  \
ACCESS           0.387576  10.413900      -0.184373             48.510300   
KEEP-IT-SIMPLE   0.339022   8.684647       0.511984             65.785754   
SWIPE-V5         0.477764   7.707676       0.464713             64.892294   
SWIPE-V5-CLEAN   0.455523   8.236722       0.449690             62.072245   
GPT3             0.351999   9.470332       0.489155             63.584523   
BARTL-WIKILARGE  0.378713   9.716823       0.162640             56.530410   
REFERENCE             NaN   8.842739       0.317441             63.688621   

                 QUESTEVAL_REFERENCE_BASED  QUESTEVAL_REFERENCE_FREE  \
ACCESS                            0.561528                  0.636953   
KEEP-IT-SIMPLE                    0.476347                  0.541861   
SWIPE-V5                          0.593995                  0.622487   
SWIPE-V5-CLEAN                    0.591827                  0.633329   
GPT3                   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = pd.to_numeric(df[col], errors='coerce')
