
"""
# LLM Essay Scoring Evaluation (Zero-Shot & Few-Shot)

This notebook evaluates the scoring performance of the LLMs across the two datasets (`FCE`, `ASAP`) and two learning settings (`0-Shot`, `1-Shot`). 

## Instructions:
- Choose `dataset_name`: `"FCE"` or `"ASAP"`
- Choose `setting`: `"0-Shot"` or `"1-Shot"`

The script computes:
- MSE, MAE
- Micro and Macro Quadratic Weighted Kappa (QWK)
- Pearson and Spearman Correlations
"""


In [13]:
# Set Configuration
dataset_name = "ASAP"       # "FCE" or "ASAP"
setting = "0-Shot"          # "0-Shot" or "1-Shot"

file_path = f"./Data/{dataset_name}.xlsx"
output_prefix = f"/Users/koketch/Desktop/{setting}-{dataset_name}"


In [14]:
# mport Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score
from scipy.stats import pearsonr, spearmanr


In [15]:
# Load Data
combined_df = pd.read_excel(file_path)
combined_df.head()


Unnamed: 0,Prompt_ID,Test_Bed,Prompt_Type,Student_Essay,Human,0-Shot Rubric,GPT4_0-Shot,GPT4o_0-Shot,GPT3.5_0-Shot,Llama2_0-Shot,...,GPT4_1-Shot,GPT4o_1-Shot,GPT3.5_1-Shot,Llama3_1-Shot,Llama2_1-Shot,Llama3.1_1-Shot,Deepseek-R1_1-Shot,Qwen2.5_1-Shot,Llama3-8B_1-Shot,Prometheus-13b_1-Shot
0,1,ASAP,ARG,"Dear local newspaper, I think effects computer...",4.0,"Instruction\nAs a virtual assessor, your respo...",2,2,3,6,...,3.0,2.0,4.5,4.0,4.5,4.0,4.0,4.0,2.0,3.0
1,1,ASAP,ARG,"Dear @CAPS1 @CAPS2, I believe that using compu...",4.5,"Instruction\nAs a virtual assessor, your respo...",2,2,3,4,...,4.0,4.0,4.0,4.0,4.5,4.0,4.0,4.0,3.5,5.0
2,1,ASAP,ARG,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",3.5,"Instruction\nAs a virtual assessor, your respo...",3,2,2,6,...,3.0,3.0,3.5,3.5,4.5,4.0,4.0,3.0,4.5,2.0
3,1,ASAP,ARG,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,"Instruction\nAs a virtual assessor, your respo...",3,2,3,4,...,5.0,4.0,5.5,4.0,4.5,5.5,4.0,4.5,4.5,4.0
4,1,ASAP,ARG,"Dear @LOCATION1, I know having computers has a...",4.0,"Instruction\nAs a virtual assessor, your respo...",3,3,3,4,...,4.0,4.0,4.0,4.0,4.5,4.0,4.0,4.5,4.0,5.0


In [16]:
# Define model names 
base_models = [
    'GPT3.5', 'GPT4', 'GPT4o', 'Llama2',
    'Llama3', 'Llama3.1', 'Deepseek-R1',
    'Qwen2.5', 'Llama3-8B', 'Prometheus-13b'
]
model_names = [f"{m}_{setting}" for m in base_models]


In [17]:
# Initialize containers
results = []
per_prompt_qwk_all = []


In [18]:
# Metric computation
for model in model_names:
    filtered_df = combined_df[['Prompt_ID', 'Human', model]].dropna()

    if filtered_df.empty:
        print(f"Skipping {model} due to insufficient data.")
        continue

    y_true_scaled_list = []
    y_pred_scaled_list = []
    y_true_original_list = []
    y_pred_original_list = []

    grouped = filtered_df.groupby('Prompt_ID')

    for prompt_id, group in grouped:
        scaler = MinMaxScaler()

        y_true = group['Human'].values
        y_true_scaled = scaler.fit_transform(y_true.reshape(-1, 1)).flatten()
        y_true_scaled_list.extend(y_true_scaled)

        y_pred = group[model].values
        y_pred_scaled = scaler.transform(y_pred.reshape(-1, 1)).flatten()
        y_pred_scaled_list.extend(y_pred_scaled)

        y_true_original_list.extend(y_true)
        y_pred_original_list.extend(y_pred)

        y_true_int = group['Human'].astype(int)
        y_pred_int = group[model].astype(int)

        if len(np.unique(y_true_int)) > 1 and len(np.unique(y_pred_int)) > 1:
            qwk_prompt = cohen_kappa_score(y_true_int, y_pred_int, weights='quadratic')
            per_prompt_qwk_all.append({
                'Model': model.replace(f'_{setting}', ''),
                'Prompt_ID': prompt_id,
                'QWK': round(qwk_prompt, 4),
                'Setting': setting
            })

    y_true_scaled = np.array(y_true_scaled_list)
    y_pred_scaled = np.array(y_pred_scaled_list)
    y_true_original = np.array(y_true_original_list)
    y_pred_original = np.array(y_pred_original_list)

    micro_qwk = round(cohen_kappa_score(y_true_original.astype(int), y_pred_original.astype(int), weights='quadratic'), 4)
    model_qwks = [entry['QWK'] for entry in per_prompt_qwk_all if entry['Model'] == model.replace(f'_{setting}', '')]
    macro_qwk = round(np.mean(model_qwks), 4) if model_qwks else np.nan

    mse = round(mean_squared_error(y_true_scaled, y_pred_scaled), 4)
    mae = round(mean_absolute_error(y_true_scaled, y_pred_scaled), 4)
    pcc, _ = pearsonr(y_true_scaled, y_pred_scaled)
    src, _ = spearmanr(y_true_scaled, y_pred_scaled)

    results.append({
        'Model': model.replace(f'_{setting}', ''),
        'MSE': mse,
        'MAE': mae,
        'Micro QWK': micro_qwk,
        'Macro QWK': macro_qwk,
        'PCC': round(pcc, 4),
        'SRC': round(src, 4),
        'Setting': setting
    })


In [19]:
# Convert to DataFrames
results_df = pd.DataFrame(results)
per_prompt_qwk_df = pd.DataFrame(per_prompt_qwk_all)


In [20]:
# Display Results
print("Overall Metrics:")
display(results_df)

print("\nSample Per-Prompt QWKs:")
display(per_prompt_qwk_df.head())


Overall Metrics:


Unnamed: 0,Model,MSE,MAE,Micro QWK,Macro QWK,PCC,SRC,Setting
0,GPT3.5,0.2331,0.3957,0.2057,0.1271,0.178,0.1336,0-Shot
1,GPT4,0.3083,0.4517,0.8889,0.2699,0.4958,0.4441,0-Shot
2,GPT4o,0.2539,0.4228,0.1924,0.1431,0.241,0.2091,0-Shot
3,Llama2,1.2315,0.9558,0.1753,0.0049,-0.0338,0.0024,0-Shot
4,Llama3,0.2501,0.4207,0.8828,0.2143,0.443,0.4026,0-Shot
5,Llama3.1,0.2875,0.447,0.8536,0.1841,0.4376,0.3822,0-Shot
6,Deepseek-R1,0.283,0.442,0.8283,0.1797,0.3754,0.3265,0-Shot
7,Qwen2.5,0.2535,0.4323,0.8734,0.1845,0.4415,0.4027,0-Shot
8,Llama3-8B,0.3085,0.3968,0.2532,0.2048,0.3457,0.3365,0-Shot
9,Prometheus-13b,0.3422,0.4392,0.5494,0.0597,0.1053,0.0962,0-Shot



Sample Per-Prompt QWKs:


Unnamed: 0,Model,Prompt_ID,QWK,Setting
0,GPT3.5,1,0.0962,0-Shot
1,GPT3.5,2,0.1736,0-Shot
2,GPT3.5,3,0.0543,0-Shot
3,GPT3.5,4,0.1274,0-Shot
4,GPT3.5,5,0.2815,0-Shot


In [21]:
# Display results 
print("Full Results DataFrame:")
print(results_df)

print("\nPer-Prompt QWK DataFrame:")
print(per_prompt_qwk_df)

# Save to CSV
results_df.to_csv(f"{output_prefix}mQWK.csv", index=False)
per_prompt_qwk_df.to_csv(f"{output_prefix}-PerPromptQWKs.csv", index=False)
print("Results saved to CSV.")



Full Results DataFrame:
            Model     MSE     MAE  Micro QWK  Macro QWK     PCC     SRC  \
0          GPT3.5  0.2331  0.3957     0.2057     0.1271  0.1780  0.1336   
1            GPT4  0.3083  0.4517     0.8889     0.2699  0.4958  0.4441   
2           GPT4o  0.2539  0.4228     0.1924     0.1431  0.2410  0.2091   
3          Llama2  1.2315  0.9558     0.1753     0.0049 -0.0338  0.0024   
4          Llama3  0.2501  0.4207     0.8828     0.2143  0.4430  0.4026   
5        Llama3.1  0.2875  0.4470     0.8536     0.1841  0.4376  0.3822   
6     Deepseek-R1  0.2830  0.4420     0.8283     0.1797  0.3754  0.3265   
7         Qwen2.5  0.2535  0.4323     0.8734     0.1845  0.4415  0.4027   
8       Llama3-8B  0.3085  0.3968     0.2532     0.2048  0.3457  0.3365   
9  Prometheus-13b  0.3422  0.4392     0.5494     0.0597  0.1053  0.0962   

  Setting  
0  0-Shot  
1  0-Shot  
2  0-Shot  
3  0-Shot  
4  0-Shot  
5  0-Shot  
6  0-Shot  
7  0-Shot  
8  0-Shot  
9  0-Shot  

Per-Prompt QWK Da

# Summarization Quality Evaluation (Few-shot & Zero-shot)
This section evaluates the content quality of summaries generated by different LLMs using two common text generation settings:

Few-shot: Models were given three carefully selected examples before generating their own summary.

Zero-shot: Models generated summaries without seeing any examples.

For each model and setting, the evaluation compares the generated summary to a human-written reference summary using:

ROUGE-1: Overlap of unigrams (single words)

ROUGE-2: Overlap of bigrams (two-word sequences)

ROUGE-L: Longest common subsequence

METEOR: Alignment-based score that incorporates synonyms and stemming


In [27]:
import pandas as pd
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
from tqdm import tqdm
import nltk

# Optional: download if not already done
# nltk.download('wordnet')
# nltk.download('omw-1.4')

def compute_metrics(refs, preds):
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    metrics = {
        "rouge1": [],
        "rouge2": [],
        "rougeL": [],
        "meteor": [],
    }

    for ref, pred in tqdm(zip(refs, preds), total=len(refs), desc="Scoring ROUGE and METEOR"):
        r_scores = rouge.score(str(ref), str(pred))
        metrics["rouge1"].append(r_scores["rouge1"].fmeasure)
        metrics["rouge2"].append(r_scores["rouge2"].fmeasure)
        metrics["rougeL"].append(r_scores["rougeL"].fmeasure)
        metrics["meteor"].append(meteor_score([str(ref).split()], str(pred).split()))

    return pd.DataFrame(metrics)


def main(csv_path, output_prefix):
    df = pd.read_csv(csv_path)

    reference_column = "Reference Summary"
    if reference_column not in df.columns:
        print("Missing 'Reference Summary' column.")
        return

    # Get all model output columns (exclude the reference)
    prediction_columns = [col for col in df.columns if col != reference_column]

    # Split into Few-shot and Zero-shot
    categorized_results = {"Few-shot": {}, "Zero-shot": {}}

    for col in prediction_columns:
        if "Few-shot" in col:
            setting = "Few-shot"
        elif "Zero-shot" in col:
            setting = "Zero-shot"
        else:
            print(f"Skipping column '{col}' (no 'Few-shot' or 'Zero-shot' in name)")
            continue

        print(f"\nEvaluating model: {col} ({setting})")
        ref = df[reference_column].astype(str).tolist()
        pred = df[col].astype(str).tolist()
        metrics_df = compute_metrics(ref, pred)
        categorized_results[setting][col] = metrics_df.mean()

    for setting, model_dict in categorized_results.items():
        if not model_dict:
            print(f"No {setting} columns found.")
            continue

        final_results = pd.DataFrame(model_dict).T
        final_results = final_results[["rouge1", "rouge2", "rougeL", "meteor"]]
        final_results.columns = ["ROUGE-1", "ROUGE-2", "ROUGE-L", "METEOR"]
        final_results = final_results.round(4)

        print(f"\nFinal {setting} Evaluation Results:")
        print(final_results)

        save_path = f"{output_prefix}_{setting}.csv"
        final_results.to_csv(save_path)
        print(f"Saved to {save_path}")


if __name__ == "__main__":
    csv_path = "/Users/koketch/Desktop/Summarizations.csv"
    output_prefix = "/Users/koketch/Desktop/summary_eval_results"

    main(csv_path, output_prefix)


Skipping column 'Article' (no 'Few-shot' or 'Zero-shot' in name)

Evaluating model: GPT-3.5 Zero-shot  (Zero-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:02<00:00, 892.68it/s] 



Evaluating model: GPT-4 Zero-shot  (Zero-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:04<00:00, 440.60it/s]


Skipping column 'GPT-4o Zer0-shot ' (no 'Few-shot' or 'Zero-shot' in name)

Evaluating model: Llama-3-70B Zero-shot (Zero-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:05<00:00, 380.31it/s]



Evaluating model: Llama-2-70B Zero-shot  (Zero-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:05<00:00, 355.05it/s]



Evaluating model: Llama-3-8B Zero-shot  (Zero-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:05<00:00, 386.31it/s]



Evaluating model: Llama3.1 Zero-shot  (Zero-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:05<00:00, 356.37it/s]



Evaluating model: Qwen2.5-72B Zero-shot  (Zero-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:05<00:00, 388.81it/s]



Evaluating model: Prometheus Zero-shot  (Zero-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:05<00:00, 380.55it/s]


Skipping column 'Unnamed: 11' (no 'Few-shot' or 'Zero-shot' in name)

Evaluating model: GPT-3.5 Few-shot  (Few-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:04<00:00, 463.51it/s]



Evaluating model: GPT-4 Few-shot  (Few-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:04<00:00, 469.60it/s]



Evaluating model: GPT-4o Few-shot  (Few-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:04<00:00, 439.13it/s]



Evaluating model: Llama-3-8B Few-shot  (Few-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:04<00:00, 408.25it/s]



Evaluating model: Llama-3-70B Few-shot  (Few-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:04<00:00, 409.04it/s]



Evaluating model: Llama3.1 Few-shot  (Few-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:05<00:00, 381.89it/s]



Evaluating model: Qwen2.5-72B Few-shot  (Few-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:04<00:00, 456.09it/s]



Evaluating model: Llama-2-70B Few-shot  (Few-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:04<00:00, 403.98it/s]



Evaluating model: Prometheus Few-shot  (Few-shot)


Scoring ROUGE and METEOR: 100%|██████████| 2000/2000 [00:04<00:00, 435.47it/s]


Final Few-shot Evaluation Results:
                       ROUGE-1  ROUGE-2  ROUGE-L  METEOR
GPT-3.5 Few-shot        0.3605   0.1316   0.2362  0.2718
GPT-4 Few-shot          0.3707   0.1457   0.2479  0.2826
GPT-4o Few-shot         0.3538   0.1250   0.2270  0.2677
Llama-3-8B Few-shot     0.3524   0.1344   0.2308  0.2864
Llama-3-70B Few-shot    0.3612   0.1375   0.2347  0.2933
Llama3.1 Few-shot       0.2328   0.0642   0.1543  0.1897
Qwen2.5-72B Few-shot    0.3629   0.1327   0.2348  0.2685
Llama-2-70B Few-shot    0.3422   0.1286   0.2249  0.2778
Prometheus Few-shot     0.3454   0.1269   0.2266  0.2692
Saved to /Users/koketch/Desktop/summary_eval_results_Few-shot.csv

Final Zero-shot Evaluation Results:
                        ROUGE-1  ROUGE-2  ROUGE-L  METEOR
GPT-3.5 Zero-shot        0.1158   0.0428   0.0775  0.0895
GPT-4 Zero-shot          0.3671   0.1453   0.2437  0.2855
Llama-3-70B Zero-shot    0.3511   0.1324   0.2245  0.2926
Llama-2-70B Zero-shot    0.3339   0.1253   0.2167  0.2855
L


