In [1]:
import collections
import glob
import json
import openreview
import pandas as pd
import random
import tqdm

random.seed(37)

In [2]:
SUBSETS = "train dev test".split()
TOTAL_REQUIRED_MAP = {
    "train":1,
    "dev":1,
    "test":2
}

# Continuing annotators: anno3, anno10

ExistingAnnotation = collections.namedtuple("ExistingAnnotation",
                                            "subset forum_id review_id anno".split())
existing_annotation_dicts = []
for subset in SUBSETS:
    subset_dir = "".join(["/Users/nnayak/Downloads/0517_split_2/",
                          subset, "/*"])
    for filename in sorted(glob.glob(subset_dir)):
        with open(filename, 'r') as f:
            obj = json.load(f)
            existing_annotation_dicts.append(
            ExistingAnnotation(subset,obj["metadata"]["forum_id"],
                              obj["metadata"]["review"],
                              obj["metadata"]["anno"]))
existing_annotations = pd.DataFrame.from_dict(existing_annotation_dicts)

In [3]:
guest_client = openreview.Client(baseurl='https://api.openreview.net')

def get_new_annotator(existing_annotators):
    if set('anno3 anno10'.split()) <= set(existing_annotators):
        return "ERROR"
    elif 'anno3' in existing_annotators:
        return 'anno10'
    elif 'anno10' in existing_annotators:
        return 'anno3'
    else:
        return random.choice(['anno3', 'anno10'])
    
def get_top_level_reviews(guest_client, forum_id):
    return set([note.id
            for note in guest_client.get_notes(forum=forum_id)
            if note.replyto == forum_id 
            and "AnonReviewer" in note.signatures[0]])
        
ToBeAnnotated = collections.namedtuple("ToBeAnnotated",
                                       "subset forum_id review_id already_annotated assigned_to".split())
to_be_annotated_dicts = []
for subset in SUBSETS:
    total_required = TOTAL_REQUIRED_MAP[subset]
    this_subset_annotations = existing_annotations[existing_annotations["subset"]  == subset]
    forum_ids = sorted(this_subset_annotations["forum_id"].unique())
    for forum_id in forum_ids:
        this_forum_annotations = existing_annotations[
            existing_annotations["forum_id"] == forum_id]
        new_annotator = get_new_annotator(this_forum_annotations["anno"].unique())
        reviews_annotated = this_forum_annotations["review_id"].unique()
        reviews_to_annotate = get_top_level_reviews(guest_client, forum_id) - set(reviews_annotated)
        if subset == 'test':
            for review in reviews_annotated:
                if len(this_forum_annotations[this_forum_annotations["review_id"] == review]) < 2:
                    to_be_annotated_dicts.append(
                    ToBeAnnotated(subset, forum_id, review, "n/a", new_annotator))
                else:
                    print("yay")
            for review in reviews_to_annotate:
                if new_annotator == "ANY":
                    for annotator in ["anno3", "anno10"]:
                        to_be_annotated_dicts.append(
                            ToBeAnnotated(subset, forum_id, review, "n/a", annotator))
                else:
                    already_annotated_by = this_forum_annotations[this_forum_annotations["review_id"] == review]["anno"].unique()
                    if not already_annotated_by.size:   
                        to_be_annotated_dicts.append(
                            ToBeAnnotated(subset, forum_id, review, "n/a", 'ANY'))
                    elif already_annotated_by[0] == 'anno3':
                        to_be_annotated_dicts.append(
                            ToBeAnnotated(subset, forum_id, review, "n/a", 'anno10'))
                    else:
                        to_be_annotated_dicts.append(
                            ToBeAnnotated(subset, forum_id, review, "n/a", 'anno3'))
        else:
            for review in reviews_to_annotate:
                to_be_annotated_dicts.append(
                    ToBeAnnotated(subset, forum_id, review, "n/a", new_annotator))
                
tba_df = pd.DataFrame.from_dict(to_be_annotated_dicts)

In [4]:
tba_df

Unnamed: 0,subset,forum_id,review_id,already_annotated,assigned_to
0,train,B14ejsA5YQ,BkeWgLUpnX,,anno3
1,train,B14ejsA5YQ,BJlOnG0gTX,,anno3
2,train,B1esx6EYvr,Ske6OaZ-qr,,anno3
3,train,B1fpDsAqt7,H1gUmqkh3Q,,anno10
4,train,B1fpDsAqt7,SJgyyPTv3m,,anno10
...,...,...,...,...,...
298,test,ryxUMREYPr,S1xrU4J2FS,,anno10
299,test,ryxUMREYPr,SJlp3GIRtr,,ANY
300,test,ryxhynC9KX,SygnL1qDhX,,anno10
301,test,ryxhynC9KX,rJxd44Xt27,,ANY


In [12]:
REV_MAP = {
    'anno3': 'KG',
    'anno10': 'MAD'
}

for _, row in tba_df.iterrows():
    if 'anno' in row[-1]:
        anno = REV_MAP[row[-1]]
    elif row[-1] == 'ANY':
        anno = random.choice(["KG", 'MAD'])
    else:
        continue
    print(row[2] + "\t" + anno)

BkeWgLUpnX	KG
BJlOnG0gTX	KG
Ske6OaZ-qr	KG
H1gUmqkh3Q	MAD
SJgyyPTv3m	MAD
ByxzpzkyT7	MAD
S1xcRXhFnQ	MAD
ByezUKSq2Q	KG
r1l7E6X22m	KG
S1gaWerP2X	KG
rJxr_fmNFS	MAD
H1lRMzRpYr	MAD
SyeJ_2r0tH	KG
S1g3KSyaqr	KG
rkx6mDnd37	MAD
BkemLDvAn7	MAD
S1xGt0Rq3m	MAD
BklqO2_AKH	MAD
SyeS7CQ83m	KG
SyeR04XiuS	MAD
BkgV-wG5FH	KG
H1xeyoxG9S	KG
SygQJELD3X	KG
Hye-MhpZT7	KG
BylSI7T6qr	KG
BkgYfE5o3Q	MAD
SJllCbbahQ	MAD
BkxPCGTX9B	MAD
rJe-UgPqnX	MAD
HJlAUngO2X	MAD
SJgAEEpDhQ	MAD
ByxVe3vnOS	KG
Sklm0w_6tr	KG
HkeVjIX9hX	KG
SJlld01kcS	KG
BygwEyIPKB	KG
BkerfQqaFH	KG
rJg8esmZcB	KG
SJxy9Ndy5r	KG
Sye1Z22R5B	KG
B1xXNM8qpX	MAD
HJeXDu9h2X	MAD
rygkDfsDtH	MAD
S1xAWnjRFH	KG
SkedBT5ntB	KG
SkeyBm0VnQ	KG
S1xUCqYpYH	MAD
HkglHuj3FS	KG
S1gd_SqAKr	KG
r1gbbP6TKB	KG
rylTafBXqB	KG
S1eIP8MpYB	MAD
SkgI3OVTYH	MAD
BkxRG1XIhX	KG
Skl-BtQq6X	KG
Byl0ceB1cH	KG
BklLqYip9B	KG
BJeididLqS	MAD
BJl-jwU6tB	MAD
SkgS2HI5hQ	MAD
SyeASuDp27	MAD
Skx-jEBr5B	MAD
ryxWDI_Gsm	KG
rJeG6QIc3X	KG
BJx9Y2wuiB	KG
BkxWG9hlTQ	MAD
HygPsSJLpQ	KG
SJe4yM7lcr	MAD
BJl5fZS6YS	MAD
rke