In [1]:
# Put the parent directory on the import path (only once, so re-running the
# cell does not add duplicates). Presumably this is what makes the
# project-local `mturk` package imported further down resolvable -- verify
# against the repository layout.
import sys
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
# CSV file that the mention table (including the prefilled-form links) is
# written to and later read back from.
data_path = '../output/guessing_names.csv'

In [3]:
%matplotlib inline
import os
from glob import glob
import re
import pandas as pd
from pyhocon import ConfigFactory
from cort.core.corpora import Corpus
import urllib
from IPython.core.display import display, HTML
from mturk.score_submissions import evaluate_submissions, adhoc_fix
from mturk.gformtools import visualize, extract_errors, unpack_json, chains_str_from_events
from collections import Counter
from tqdm import tqdm

In [4]:
# Annotation exports to load; each one is passed through adhoc_fix() before
# being parsed as CSV.
anns_paths = [
    '../data/annotations/Coref-annotation.csv',
    '../data/annotations/Coref-annotation-too-long.csv',
]

# Stack all exports and keep only the last row for every
# (Document, Username) pair.
anns_gform = (
    pd.concat([pd.read_csv(adhoc_fix(path)) for path in anns_paths], sort=False)
    .drop_duplicates(subset=['Document', 'Username'], keep='last')
)

# Unpack every row's `Annotations` cell with unpack_json() and attach the
# result as extra columns next to the original ones.
anns_chains = anns_gform['Annotations'].apply(unpack_json)
anns = pd.concat([anns_gform, anns_chains], axis=1)

# Auto vs. gold makes no difference here: the human annotators never see our
# syntactic and semantic annotations. Some *.auto_conll files are missing, so
# every path is pointed at its *.gold_conll equivalent instead.
anns['conll_file'] = anns['conll_file'].str.replace('auto_conll', 'gold_conll')

In [5]:
# Base directory that each row's `conll_file` path is joined onto in
# iterate_masked_name_mentions().
root_dir = '..'

In [6]:
def iterate_masked_name_mentions(df):
    """Yield one record per repeated `_XXX_`-style mention of each document.

    For every row of ``df`` the CoNLL file at ``root_dir``/``row['conll_file']``
    is considered, but only when its path contains a ``no-external`` or
    ``no-internal`` directory component.  The file is parsed with
    ``Corpus.from_file`` and the surface strings of its annotated mentions are
    counted.  A mention string is reported when

    * every token has the form ``_word_`` (tokens separated by single spaces;
      presumably these are the masked names -- confirm against the data
      transformation), and
    * it is annotated at least twice in the document.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``conll_file``, ``Document`` and ``Username``.

    Yields
    ------
    dict
        Keys ``mention``, ``conll_path``, ``Document`` and ``Username``.

    Raises
    ------
    ValueError
        If a parsed file does not contain exactly one document (from the
        single-element unpacking below).
    """
    # The slash closing the directory name belongs *after* the alternation;
    # with it inside the group only `no-internal` had to be a full path
    # component, while `/no-external<anything>` matched as well.
    masked_dir_re = re.compile(r'/no-(?:external|internal)/')
    masked_name_re = re.compile(r'_\w+_(?: _\w+_)*$')

    # Passing `total` lets tqdm show progress without copying all rows
    # into a list first.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        conll_path = os.path.join(root_dir, row['conll_file'])
        if not masked_dir_re.search(conll_path):
            continue

        with open(conll_path) as f:
            corpus = Corpus.from_file('', f)
        doc, = corpus.documents  # exactly one document per file is expected

        # span.end is inclusive, hence the +1 when slicing the tokens.
        mention_counts = Counter(
            ' '.join(doc.tokens[m.span.begin:m.span.end + 1])
            for m in doc.annotated_mentions
        )
        for mention, count in mention_counts.items():
            if count >= 2 and masked_name_re.match(mention):
                yield {'mention': mention, 'conll_path': conll_path,
                       'Document': row['Document'], 'Username': row['Username']}

In [7]:
# Materialise the generator: every yielded record (mention, conll_path,
# Document, Username) becomes one row.
df = pd.DataFrame(iterate_masked_name_mentions(anns))

100%|██████████| 1034/1034 [00:37<00:00, 27.76it/s]


In [8]:
def format_url(row):
    """Build the HTML list item linking to the prefilled form for one mention.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or a dict)
        Must provide the string values ``row['Document']`` and
        ``row['mention']``.

    Returns
    -------
    str
        ``<li><a href="URL">Document: mention</a></li>`` where URL is the form
        address with the percent-encoded document name in
        ``entry.366340186`` and the percent-encoded mention in
        ``entry.1671245065``.
    """
    # Imported here on purpose: a bare `import urllib` does not guarantee that
    # the `urllib.parse` submodule has been loaded, so `urllib.parse.quote`
    # only worked when some other import happened to load it.
    from urllib.parse import quote

    prefilled_form_url_template = 'https://docs.google.com/forms/d/e/1FAIpQLSfk04beuK-ZwD9j2twk4hIZDNy-UPxpKX5jXaPQRj1iGTqDmg/viewform?usp=pp_url&entry.366340186=%s&entry.1671245065=%s'
    url = prefilled_form_url_template % (quote(row['Document']), quote(row['mention']))
    return '<li><a href="%s">%s: %s</a></li>' % (url, row['Document'], row['mention'])

# Row-wise: store the ready-made <li> link for each mention in a new column.
df['prefilled_form'] = df.apply(format_url, axis=1)

In [9]:
# Peek at ten random rows; the fixed seed makes the preview identical on
# every re-run of the notebook.
df.sample(10, random_state=0)

Unnamed: 0,Document,Username,conll_path,mention,prefilled_form
177,student2-2019-03-29-question-220.html,student2,../output/conll-2012-transformed/no-external/t...,_PHEAM_ _KE_,"<li><a href=""https://docs.google.com/forms/d/e..."
289,student3-2019-03-29-question-234.html,student3,../output/conll-2012-transformed/no-internal/t...,_BUR18_,"<li><a href=""https://docs.google.com/forms/d/e..."
92,student3-2019-03-29-question-062.html,student3,../output/conll-2012-transformed.v2/no-interna...,_MO43_,"<li><a href=""https://docs.google.com/forms/d/e..."
93,student3-2019-03-29-question-062.html,student3,../output/conll-2012-transformed.v2/no-interna...,_SYC_ _KUG5_,"<li><a href=""https://docs.google.com/forms/d/e..."
266,student2-2019-03-29-question-327.html,student2,../output/conll-2012-transformed/no-internal/t...,_NOO_ _CHAD_,"<li><a href=""https://docs.google.com/forms/d/e..."
13,student3-2019-03-29-question-017.html,student3,../output/conll-2012-transformed.v2/no-interna...,_FEAD42_,"<li><a href=""https://docs.google.com/forms/d/e..."
118,student2-2019-03-29-question-082.html,student2,../output/conll-2012-transformed.v2/no-externa...,_TIET_,"<li><a href=""https://docs.google.com/forms/d/e..."
283,student1-2019-03-14-question-250.html,student1,../output/conll-2012-transformed/no-external/t...,_CHE14_ _XU95_ _TRAD65_,"<li><a href=""https://docs.google.com/forms/d/e..."
265,student2-2019-03-29-question-327.html,student2,../output/conll-2012-transformed/no-internal/t...,_LY_,"<li><a href=""https://docs.google.com/forms/d/e..."
212,student3-2019-03-29-question-170.html,student3,../output/conll-2012-transformed/no-external/d...,_CEE_,"<li><a href=""https://docs.google.com/forms/d/e..."


In [10]:
# Write the table only if it has not been saved yet; an existing file is never
# overwritten. An explicit check is used instead of `assert` so that the
# notebook still runs top to bottom once the file exists, and so that the
# protection is not lost when Python runs with assertions disabled (-O).
if os.path.exists(data_path):
    print('%s already exists; keeping the saved copy' % data_path)
else:
    df.to_csv(data_path, index=False)

In [10]:
# Continue from the copy on disk: `df` is replaced by the contents of the
# saved CSV file.
df = pd.read_csv(data_path)

In [15]:
# Lexicographically smallest and largest document name per annotator.
# The string aliases 'min'/'max' replace the Python builtins, which recent
# pandas versions warn about when passed to agg(); result and column labels
# are the same.
df.groupby('Username').agg({'Document': ['min', 'max']}).drop_duplicates(keep='last')

Unnamed: 0_level_0,Document,Document
Unnamed: 0_level_1,min,max
Username,Unnamed: 1_level_2,Unnamed: 2_level_2
student1,student1-2019-03-14-question-001.html,student1-2019-03-14-question-254.html
student2,student2-2019-03-29-question-001.html,student2-2019-03-29-question-381.html
student3,student3-2019-03-29-question-001.html,student3-2019-03-29-question-247.html


### Student 1

In [11]:
def format_url_list(df, username):
    """Display one annotator's prefilled-form links as a shuffled ordered list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Username`` and ``prefilled_form`` (the
        latter holding ready-made ``<li>...</li>`` strings).
    username : str
        Annotator whose rows are shown; compared for exact equality.

    Returns
    -------
    None
        The HTML ``<ol>`` is rendered through IPython's ``display``.
    """
    # Exact comparison instead of str.contains(): the latter does a
    # regex/substring search, so 'student1' would also pick up 'student10',
    # and it yields NaN (unusable as a mask) for missing usernames.
    selected = df[df.Username == username]
    # frac=1 shuffles all selected rows; the fixed seed keeps the order stable
    # across runs.
    shuffled = selected.sample(frac=1, random_state=348292)
    items = ['<ol>'] + list(shuffled.prefilled_form) + ['</ol>']
    display(HTML('\n'.join(items)))

In [12]:
# Render the shuffled link list for student1.
format_url_list(df, 'student1')

### Student 2

In [13]:
# Render the shuffled link list for student2.
format_url_list(df, 'student2')

### Student 3

In [14]:
# Render the shuffled link list for student3.
format_url_list(df, 'student3')