In [None]:
import pandas as pd
# Read the comma-separated annotation table and display it as the cell output.
annotations_path = 'Annotations_2.csv'
df = pd.read_csv(annotations_path, sep=',')
df

In [None]:
def _split_into_lines(cell):
    """Return the set of newline-separated pieces of `cell`.

    Non-string cells (e.g. NaN for a missing value) yield an empty set.
    """
    if not isinstance(cell, str):
        return set()
    return set(cell.split('\n'))


# One derived column per annotator, each holding a set of raw annotation lines.
df['splitted_annotation'] = df['Annotation'].apply(_split_into_lines)
df['splitted_annotation_2'] = df['Annotation 2'].apply(_split_into_lines)
df.loc[0]

In [None]:
# Display the row labelled 19 — the same label at which the agreement loop
# further down stops iterating.
df.loc[19]

In [None]:
def jaccard_similarity(set1, set2):
    """Jaccard index of two sets: |set1 ∩ set2| / |set1 ∪ set2|.

    Returns 0 when both sets are empty (the union has no elements).
    """
    shared = set1 & set2
    combined = set1 | set2
    if not combined:
        return 0
    return len(shared) / len(combined)

def dice_coefficient(set1, set2):
    """Dice coefficient of two sets: 2 * |set1 ∩ set2| / (|set1| + |set2|).

    Returns 0 when both sets are empty.
    """
    shared = set1 & set2
    total_size = len(set1) + len(set2)
    if not (set1 or set2):
        return 0
    return 2 * len(shared) / total_size

def overlap_coefficient(set1, set2):
    """Overlap coefficient: |set1 ∩ set2| / min(|set1|, |set2|).

    Returns 0 when either set is empty.
    """
    shared = set1 & set2
    if not set1 or not set2:
        return 0
    smaller_size = min(len(set1), len(set2))
    return len(shared) / smaller_size

# Clean and split malformed entries
def clean_malformed_entries(annotation):
    """Normalise raw annotation lines into a set of individual entries.

    A line may hold several entries run together and separated by ``') '``.
    Each line is split on that separator, and the closing parenthesis that
    the split removes is put back on every piece that lacks one.

    Surrounding whitespace (spaces, tabs, a stray ``'\\r'`` from Windows line
    endings) is stripped from every piece, so entries that differ only in
    padding compare equal. Pieces that are empty after stripping are dropped.

    Parameters
    ----------
    annotation : iterable of str
        Raw annotation lines.

    Returns
    -------
    set of str
        The cleaned, de-duplicated entries.
    """
    cleaned = set()
    for entry in annotation:
        for part in entry.split(') '):
            part = part.strip()
            if not part:
                # Empty or whitespace-only fragment: nothing to record.
                continue
            if not part.endswith(')'):
                # The split consumed this piece's closing parenthesis.
                part += ')'
            cleaned.add(part)
    return cleaned

In [None]:
import numpy as np
# Number of leading rows of `df` (annotation sets) included in the comparison.
NUM_SETS = 20

indices = []
jacs = []
dices = []
overlaps = []
num_triples_annotator1 = 0
num_triples_annotator2 = 0

# Take the first NUM_SETS rows by position instead of breaking on a
# hard-coded index label, so the limit lives in one named place.
for index, row in df.iloc[:NUM_SETS].iterrows():
    set1 = clean_malformed_entries(row['splitted_annotation'])
    set2 = clean_malformed_entries(row['splitted_annotation_2'])
    num_triples_annotator1 += len(set1)
    num_triples_annotator2 += len(set2)

    jac = jaccard_similarity(set1, set2)
    dice = dice_coefficient(set1, set2)
    overlap = overlap_coefficient(set1, set2)

    jacs.append(jac)
    dices.append(dice)
    overlaps.append(overlap)
    # 1-based set number, used as the x coordinate when plotting.
    indices.append(index + 1)

    print(f"Set:{index} Jaccard:{jac:.2f} Dice coeff.:{dice:.2f} Overlap coeff.: {overlap:.2f}")

# Convert to arrays so later cells can call .mean() and do arithmetic on them.
jacs = np.array(jacs)
dices = np.array(dices)
overlaps = np.array(overlaps)
indices = np.array(indices)

print(f"Number of triples by annotator 1:{num_triples_annotator1}")
print(f"Number of triples by annotator 2:{num_triples_annotator2}")


In [None]:

import matplotlib.pyplot as plt
import numpy as np

# Define colors
colors = ['plum', 'lightgreen', 'gold', 'lightcoral', 'skyblue']

# Create the plot
fig, ax = plt.subplots(figsize=(10, 6))

# Plot each metric
ax.plot(indices, jacs, marker='o', label='Jaccard Index', color=colors[1], linestyle='-', linewidth=2)
ax.plot(indices, dices, marker='o', label='Dice coefficient', color=colors[3], linestyle='-', linewidth=2)
ax.plot(indices, overlaps, marker='o', label='Overlap coefficient', color=colors[4], linestyle='-', linewidth=2)

# Set titles and labels
ax.set_title('Inter-annotator agreements over 20 sets of annotations', fontsize=16)
ax.set_xlabel('Annotation sets', fontsize=12)
ax.set_ylabel('Inter-annotator agreement', fontsize=12)

# Set x-axis ticks to discrete values
ax.set_xticks(indices)
ax.set_xticklabels([str(i) for i in indices])

# Add legend and grid
ax.legend(title="Metrics", loc='best', fontsize=12, title_fontsize=14)
ax.grid(True)

# Adjust x-axis limits to the plotted x values, with half a unit of padding.
# Fixed limits of [-0.5, 19.5] would cut off the last point, because the
# x values built by the scoring loop are 1-based.
if indices.size:
    ax.set_xlim([indices.min() - 0.5, indices.max() + 0.5])
ax.set_ylim([0.0, 1.03])
# Display the plot
plt.tight_layout()
plt.show()


In [None]:

# Number of sets
num_sets = len(jacs)
# 0-based bar positions. Kept under their own name so the 1-based `indices`
# array used by the line plot is not overwritten when this cell runs.
bar_positions = np.arange(num_sets)

# Bar width
bar_width = 0.25

# Create the plot
fig, ax = plt.subplots(figsize=(8, 6))

# Plot each array as a group of bars: one bar per metric, side by side.
ax.bar(bar_positions - bar_width, jacs, width=bar_width, label='Jaccard Index', color=colors[1])
ax.bar(bar_positions, dices, width=bar_width, label='Dice Coefficient', color=colors[3])
ax.bar(bar_positions + bar_width, overlaps, width=bar_width, label='Overlap Coefficient', color=colors[4])

# Set labels and title
ax.set_xlabel('Set Index')
ax.set_ylabel('Agreement Score')
ax.set_title('Multi-bar Graph of Agreement Metrics')
ax.set_xticks(bar_positions)
# Tick labels are 1-based set numbers.
ax.set_xticklabels([f'{i+1}' for i in bar_positions])
ax.legend(loc='lower right')
ax.grid(True)

# Display the plot
plt.tight_layout()
plt.show()


In [None]:
import os

colors = ['plum', 'lightgreen', 'gold', 'lightcoral', 'skyblue']

# Combine data into a list, one entry per metric
data = [jacs, dices, overlaps]

# Create the box plot
fig, ax = plt.subplots()
box = ax.boxplot(data, patch_artist=True)

# Use a separate name for the three box colors. Rebinding `colors` to a
# 3-element list would make `colors[3]` / `colors[4]` in the bar-plot cell
# raise IndexError if that cell is re-run afterwards.
box_colors = [colors[1], colors[3], colors[4]]
for patch, color in zip(box['boxes'], box_colors):
    patch.set_facecolor(color)

# Set labels and title
ax.set_xticklabels(['Jaccard Index', 'Dice Coefficient', 'Overlap Coefficient'])
ax.set_title('Inter-annotator agreements over 20 sets of annotations')

ax.set_ylim([0.0, 1.03])
ax.grid(True)

# Make sure the output directory exists; savefig does not create it.
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/IAA.pdf", format="pdf", bbox_inches="tight")
# Show the plot
plt.show()
plt.close()

In [None]:
# Print the mean of each metric, in the order Jaccard, Dice, overlap.
for metric_scores in (jacs, dices, overlaps):
    print(metric_scores.mean())

