In [1]:
import argparse
import pandas as pd
import numpy as np
from itertools import compress
from sklearn.cluster import KMeans

In [2]:
SEED = 0
np.random.seed(SEED)

### Load Dataset

In [3]:
dataset = "data/vehicle-simulated-x.csv"

In [4]:
data = pd.read_csv(dataset)

In [5]:
data.head()

Unnamed: 0,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,...,x_13,x_14,x_15,x_16,x_17,y,y_1,y_2,y_3,y_4
0,95.0,48.0,83.0,178.0,72.0,10.0,162.0,42.0,20.0,159.0,...,70.0,6.0,16.0,187.0,197.0,3,1.0,3.0,3.0,1.0
1,91.0,41.0,84.0,141.0,57.0,9.0,149.0,45.0,19.0,143.0,...,72.0,9.0,14.0,189.0,199.0,3,0.0,3.0,3.0,3.0
2,104.0,50.0,106.0,209.0,66.0,10.0,207.0,32.0,23.0,158.0,...,73.0,14.0,9.0,188.0,196.0,2,2.0,2.0,3.0,2.0
3,93.0,41.0,82.0,159.0,63.0,9.0,144.0,46.0,19.0,143.0,...,63.0,6.0,10.0,199.0,207.0,3,0.0,3.0,3.0,3.0
4,85.0,44.0,70.0,205.0,103.0,52.0,149.0,45.0,19.0,144.0,...,127.0,9.0,11.0,180.0,183.0,0,0.0,0.0,0.0,0.0


In [6]:
columns = list(data.columns.values)
feature_cols = list(compress(columns, [c.startswith('x_') for c in columns]))
ground_truth_labels = data["y"].to_numpy()
n_classes = len(np.unique(ground_truth_labels))

In [7]:
n_classes

4

### Random Annotators

In [8]:
n_annotators_random = 3
annotator_random_probs = [0.6, 0.5, 0.4]

In [9]:
# Generate annotations for random annotators
random_annotations = []
for i in range(n_annotators_random):
    annotations = [label if np.random.rand() < annotator_random_probs[i] else np.random.choice(list(set(range(n_classes)) - set([label]))) for label in ground_truth_labels]
    random_annotations.append(annotations)

In [10]:
# Actual random annotator accuracies
(np.array(random_annotations) == ground_truth_labels).mean(axis=1)

array([0.59810875, 0.49054374, 0.39952719])

### Instance Dependent Annotators

For instance dependent annotators, we do the following:
1. We group the data into n clusters.
2. Each annotator is assigned a cluster.
3. If an annotator gets an instance with the assigned cluster, he correctly predicts the label with give probability.
4. Else he randomly predicts the label.

In [11]:
n_annotators_inst_dep = 10
n_clusters = n_annotators_inst_dep // 2 # We assign to annotators to one cluster.
annotator_inst_dep_probs = [0.6, 0.7, 0.7, 0.85, 0.65, 0.9, 0.78, 0.95, 0.8, 0.68]

In [12]:
# Cluster the data and get cluster labels
model = KMeans(n_clusters=n_clusters)
cluster_labels = model.fit_predict(data[feature_cols])

  super()._check_params_vs_input(X, default_n_init=10)


In [13]:
# Assign a cluster to each annotator
annotator_inst_dep_clusters = [i%n_clusters for i in range(n_annotators_inst_dep)]

In [14]:
inst_dep_annotations = []
for i in range(n_annotators_inst_dep):
    annotations = []
    for idx, label in enumerate(ground_truth_labels):
        if cluster_labels[idx] == annotator_inst_dep_clusters[i]: # instance belongs to the cluster specialised by annotator
            annotations.append(label if np.random.rand() < annotator_inst_dep_probs[i] else np.random.choice(list(set(range(n_classes)) - set([label]))))
        else:
            annotations.append(np.random.choice(list(range(n_classes))))
    inst_dep_annotations.append(annotations)

In [15]:
# Actual instance dependent annotator accuracies
annotator_accuracies = []
annotator_correct_cluster_accuracies = []
annotator_incorrect_cluster_accuracies = []
for i in range(n_annotators_inst_dep):
    annotator_accuracies.append((inst_dep_annotations[i] == ground_truth_labels).mean())
    annotator_correct_cluster_accuracies.append((inst_dep_annotations[i] == ground_truth_labels)[cluster_labels == annotator_inst_dep_clusters[i]].mean())
    annotator_incorrect_cluster_accuracies.append((inst_dep_annotations[i] == ground_truth_labels)[cluster_labels != annotator_inst_dep_clusters[i]].mean())

print(f"Annotator Accuracies: {annotator_accuracies}")
print(f"Annotator Correct Cluster Accuracies: {annotator_correct_cluster_accuracies}")
print(f"Annotator Incorrect Cluster Accuracies: {annotator_incorrect_cluster_accuracies}")

Annotator Accuracies: [0.28250591016548465, 0.3321513002364066, 0.3404255319148936, 0.2635933806146572, 0.41252955082742315, 0.35224586288416077, 0.3617021276595745, 0.3747044917257683, 0.2765957446808511, 0.4160756501182033]
Annotator Correct Cluster Accuracies: [0.5491803278688525, 0.631578947368421, 0.6867469879518072, 0.875, 0.6482558139534884, 0.9016393442622951, 0.8210526315789474, 0.9518072289156626, 0.7916666666666666, 0.6773255813953488]
Annotator Incorrect Cluster Accuracies: [0.23756906077348067, 0.24542682926829268, 0.25588235294117645, 0.24574209245742093, 0.250996015936255, 0.2596685082872928, 0.22865853658536586, 0.2338235294117647, 0.2615571776155718, 0.23705179282868527]


### Class Dependent Annotators

In [16]:
n_annotators_class_dep = 8
annotator_class_dep_probs = [0.7, 0.85, 0.9, 0.9, 0.8, 0.82, 0.75, 0.95]
annotator_class_dep_classes = [0, 1, 3, 2, 2, 1, 0, 3]

In [17]:
# Generate annotations for class dependent annotators
class_dep_annotations = []
for i in range(n_annotators_class_dep):
    annotations = []
    for label in ground_truth_labels:
        if label == annotator_class_dep_classes[i]: # instance belongs to the class specialised by annotator
            annotations.append(label if np.random.rand() < annotator_class_dep_probs[i] else np.random.choice(list(set(range(n_classes)) - set([label]))))
        else:
            annotations.append(np.random.choice(list(range(n_classes))))
    class_dep_annotations.append(annotations)

In [18]:
# Actual class dependent annotator accuracies
annotator_accuracies = []
annotator_correct_class_accuracies = []
annotator_incorrect_class_accuracies = []
for i in range(n_annotators_class_dep):
    annotator_accuracies.append((class_dep_annotations[i] == ground_truth_labels).mean())
    annotator_correct_class_accuracies.append((class_dep_annotations[i] == ground_truth_labels)[ground_truth_labels == annotator_class_dep_classes[i]].mean())
    annotator_incorrect_class_accuracies.append((class_dep_annotations[i] == ground_truth_labels)[ground_truth_labels != annotator_class_dep_classes[i]].mean())

print(f"Annotator Accuracies: {annotator_accuracies}")
print(f"Annotator Correct Cluster Accuracies: {annotator_correct_class_accuracies}")
print(f"Annotator Incorrect Cluster Accuracies: {annotator_incorrect_class_accuracies}")

Annotator Accuracies: [0.37115839243498816, 0.3806146572104019, 0.3983451536643026, 0.4243498817966903, 0.38652482269503546, 0.35933806146572106, 0.3735224586288416, 0.408983451536643]
Annotator Correct Cluster Accuracies: [0.6972477064220184, 0.839622641509434, 0.9095477386934674, 0.9170506912442397, 0.783410138248848, 0.75, 0.7752293577981652, 0.949748743718593]
Annotator Incorrect Cluster Accuracies: [0.25796178343949044, 0.22712933753943218, 0.241112828438949, 0.2543720190779014, 0.24960254372019078, 0.22870662460567823, 0.2340764331210191, 0.2426584234930448]


### Save the new annotations data

In [19]:
save_name = "data/new/vehicle.csv"

In [20]:
annotated_data = data[feature_cols + ["y"]].copy()
annotator_idx = 0
# Random Annotators
for annotations in random_annotations:
    annotated_data[f"y_{annotator_idx}"] = annotations
    annotator_idx += 1
# Instance Dependent Annotators
for annotations in inst_dep_annotations:
    annotated_data[f"y_{annotator_idx}"] = annotations
    annotator_idx += 1
# Class Dependent Annotators
for annotations in class_dep_annotations:
    annotated_data[f"y_{annotator_idx}"] = annotations
    annotator_idx += 1

annotated_data.to_csv(save_name, index=False)