In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
def load_json(file_path):
    """Read a JSON file and return the decoded Python object.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file (typically a dict or list).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    # Pin the codec: without it open() falls back to the platform locale
    # encoding, which mangles or rejects non-ASCII text in UTF-8 JSON files.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
from datetime import datetime

def filter_latest_by_user(records):
    """
    Reduce a list of submissions to one per user: the most recent one.

    Each record is a dict carrying a 'user_id' and a 'date_time' string in
    '%Y-%m-%d %H:%M:%S' form. If a user has several records with the same
    timestamp, the one appearing first in `records` is kept. The returned
    list is ordered by each user_id's first appearance in `records` and
    holds the original record objects (no copies).
    """
    newest_record = {}  # user_id -> record currently considered latest
    newest_stamp = {}   # user_id -> parsed timestamp of that record
    for candidate in records:
        uid = candidate['user_id']
        stamp = datetime.strptime(candidate['date_time'], '%Y-%m-%d %H:%M:%S')
        previous = newest_stamp.get(uid)
        # Only a strictly newer timestamp displaces what is already stored.
        if previous is not None and stamp <= previous:
            continue
        newest_stamp[uid] = stamp
        newest_record[uid] = candidate
    return list(newest_record.values())

# Example:
# filtered_list = filter_latest_by_user(data)
# print(filtered_list)


In [4]:
import numpy as np
import krippendorff

def calculate_ordinal_kpf_alpha(reocrds):
    """Krippendorff's alpha (ordinal) over the 'informativeness' labels.

    Every record acts as one rater, identified by its 'user_id'; each entry
    of its 'target_utterances' supplies that rater's label for one utterance.
    Returns whatever ``krippendorff.alpha`` returns for those ratings.
    """
    ratings = []
    for annotation in reocrds:
        rater = annotation['user_id']
        ratings.extend(
            {
                'utterance_id': target['utterance_id'],
                'user_id': rater,
                'label': target['labels']['informativeness'],
            }
            for target in annotation['target_utterances']
        )

    # The pivot is utterances × raters; krippendorff.alpha takes its
    # reliability data as raters × units, hence the transpose. A
    # (utterance, rater) pair with no label comes out of the pivot as NaN.
    by_rater = (
        pd.DataFrame(ratings)
        .pivot(index='utterance_id', columns='user_id', values='label')
        .T
    )

    return krippendorff.alpha(
        reliability_data=by_rater.values,
        level_of_measurement='ordinal',
    )



In [5]:
from sklearn.metrics import cohen_kappa_score

def compute_weighted_accuracy(truths, preds, max_distance=3):
    """Mean distance-weighted agreement between predictions and true labels.

    Each (truth, pred) pair scores ``1 - |truth - pred| / max_distance``:
    1.0 for an exact match, decreasing linearly to 0.0 when the two labels
    are ``max_distance`` apart. The result is the mean of those scores.

    Parameters
    ----------
    truths : iterable of numbers
        Reference labels.
    preds : iterable of numbers
        Predicted labels, aligned one-to-one with ``truths``.
    max_distance : number, default 3
        Largest gap two labels can have (3 for a 1-4 scale).

    Returns
    -------
    float
        Mean per-pair score.

    Raises
    ------
    ValueError
        If the inputs are empty, differ in length, or ``max_distance`` is
        not positive.
    """
    truth_list = list(truths)
    pred_list = list(preds)

    # zip() would silently truncate to the shorter input while the mean was
    # still taken over len(truths), giving a wrong score; fail loudly instead.
    if len(truth_list) != len(pred_list):
        raise ValueError(
            f"truths and preds differ in length: {len(truth_list)} != {len(pred_list)}"
        )
    if not truth_list:
        raise ValueError("cannot compute weighted accuracy of empty input")
    if max_distance <= 0:
        raise ValueError("max_distance must be positive")

    total = sum(
        1 - abs(truth - pred) / max_distance
        for truth, pred in zip(truth_list, pred_list)
    )
    return total / len(truth_list)

def compute_performance(records, ground_truth):
    """Average the per-record weighted accuracy over all records.

    For every record, its 'informativeness' labels are scored against
    ``ground_truth`` (looked up by each target's 'utterance_id') with
    ``compute_weighted_accuracy``; the unweighted mean of those per-record
    scores is returned, so every record counts equally.
    """
    def score(annotation):
        targets = annotation['target_utterances']
        predicted = [target['labels']['informativeness'] for target in targets]
        expected = [ground_truth[target['utterance_id']] for target in targets]
        return compute_weighted_accuracy(expected, predicted)

    per_record = [score(annotation) for annotation in records]
    return sum(per_record) / len(per_record)

In [6]:


def print_annotation_details(utt_id, records, ground_truth, utterances):
    """
    Print one utterance's text, its ground-truth label and every vote for it.

    Given:
      - utt_id (int or str): id of the utterance to inspect
      - records: list of annotation dicts, each with 'user_id' and
        'target_utterances' (entries carry 'utterance_id' and
        'labels' -> 'informativeness')
      - ground_truth: dict mapping str(utt_id) -> true label
      - utterances: list of dicts with keys 'utterance_id' and 'utterance_text'
    Prints:
      - the utterance text
      - the ground truth label
      - one line per record that labelled the utterance

    No de-duplication happens here: to show only each annotator's latest
    submission, pass records already reduced to one per user.
    """
    # Ground truth and votes are matched on the string form of the id.
    wanted = str(utt_id)

    # Build a lookup for utterance text
    utt_map = {int(u['utterance_id']): u['utterance_text'] for u in utterances}

    # Fetch and print text
    text = utt_map.get(int(utt_id), "[Text not found]")
    print(f"Utterance {utt_id} text:\n{text}\n")

    # Print ground truth
    gt = ground_truth.get(wanted, "[No ground truth]")
    print(f"Ground truth informativeness: {gt}\n")

    # Collect and print annotator votes
    print("Annotator votes:")
    for rec in records:
        # Normalise ids to str so a record storing them as ints still matches.
        preds = {
            str(t['utterance_id']): t['labels']['informativeness']
            for t in rec['target_utterances']
        }
        if wanted in preds:
            print(f" - {rec['user_id']}: {preds[wanted]}")


In [7]:
# Load the prescreen annotation records from the JSON export; the path is
# relative, so it resolves against the notebook's working directory.
prescreen_data = load_json("infogain_annotation.prescreen.json")

In [8]:
# Reference answers for the prescreen sample, keyed by the utterance id as a
# string so lookups do not depend on how the id is typed in the JSON file.
ground_truth = load_json("../static/prescreen_sample.json")
ground_truth_ans = {
    str(sample['utterance_id']): sample['answers']['informativeness']
    for sample in ground_truth['target_utterances']
}

In [9]:
# Keep only submissions that labelled exactly 6 utterances and whose user id
# is not on the exclusion list (presumably test/internal accounts - confirm).
prescreen_data = [
    submission
    for submission in prescreen_data
    if len(submission['target_utterances']) == 6
    and submission["user_id"] not in ["maychill", "test", "123", "345", "678", "67fd46db088825e305648e1a"]
]

In [10]:
# Collapse repeat submissions: keep one record per user_id, the one with the
# most recent 'date_time'.
prescreen_data = filter_latest_by_user(prescreen_data)

In [11]:
# Inter-annotator agreement on the 'informativeness' labels across the
# retained submissions, treating the labels as ordinal.
α = calculate_ordinal_kpf_alpha(prescreen_data)
print("Krippendorff’s α (ordinal):", α)

Krippendorff’s α (ordinal): 0.8684054694229113


In [12]:
# compute_performance averages a distance-weighted score (near misses earn
# partial credit), so the figure printed as "Accuracy" is not an exact-match rate.
acc = compute_performance(prescreen_data, ground_truth_ans)
print(f"Mean Accuracy = {acc:.3f}")

Mean Accuracy = 0.806


In [13]:
# Build a flat DataFrame with one row per (utterance, annotator) pair holding
# the annotator's label and the ground-truth label.
rows = []
for rec in prescreen_data:
    for t in rec['target_utterances']:
        qid  = int(t['utterance_id'])
        pred = t['labels']['informativeness']
        true = ground_truth_ans[str(qid)]
        rows.append({'utterance_id': qid,
                     'user_id'     : rec['user_id'],
                     'pred'        : pred,
                     'truth'       : true})

df = pd.DataFrame(rows)

# Per-annotation weight: 1 for an exact match, falling linearly to 0 at the
# largest possible gap. The normaliser is the span of the 1-4 label scale, the
# same value compute_weighted_accuracy uses. Deriving it from the observed
# truths would shrink it whenever the sample misses an end of the scale, and
# make it 0 if every truth were equal.
maxd = 3
df['w_acc'] = 1 - (df['pred'] - df['truth']).abs() / maxd

# Group by utterance_id to get the adjusted accuracy per utterance.
breakdown = df.groupby('utterance_id').agg(
    n_annotators     = ('user_id', 'nunique'),
    adjusted_accuracy = ('w_acc',  'mean')
).reset_index()

print(breakdown)

   utterance_id  n_annotators  adjusted_accuracy
0           101             6           0.444444
1           102             6           1.000000
2           103             6           1.000000
3           104             6           0.888889
4           105             6           0.555556
5           106             6           0.944444


In [16]:
# Drill into utterance 101, the lowest-scoring item in the per-utterance
# breakdown printed earlier: show its text, the ground truth and each vote.
print_annotation_details(
    utt_id=101,
    records=prescreen_data,
    ground_truth=ground_truth_ans,
    utterances=ground_truth["target_utterances"]
)

Utterance 101 text:
It’s misleading to suggest that <strong>aid</strong> inherently undermines <strong>accountability</strong>. Take the substantial reductions in <strong>HIV/AIDS</strong> cases, for instance—<strong>aid</strong> supported government-run programs that dramatically reduced the spread of the disease. What matters is how <strong>aid</strong> is designed and monitored. Blanket critiques overlook the difference between strategic investment and careless spending.

Ground truth informativeness: 2

Annotator votes:
 - 67edb75b0744561f1cf6ff9d: 4
 - 677c4ac5878568e38dae405a: 4
 - 67aa21a12b1a6e9475bc61b2: 4
 - Nakita: 4
 - 63af557b3d4f219c3226b7d6: 2
 - 654455ad0b76675379c66db4: 4
