In [1]:
import sys

# Put the parent directory on the import path (once) so that modules living
# one level above this notebook can be imported by the cells below.
parent_dir = '..'
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

In [2]:
import pandas as pd
from mturk.gformtools import visualize, extract_errors, unpack_json, chains_str_from_events
%matplotlib inline
import seaborn as sns
from glob import glob
import os
from tqdm import tqdm
from mturk.extract_results import extract_results_from_log_file
from mturk.score_submissions import evaluate_submissions, adhoc_fix
import re
from cort.core.corpora import Corpus
import urllib
from IPython.core.display import display, HTML

In [3]:
%%sh
cd ..
# anonymize.py is deliberately not committed to git (to protect the privacy of
# the annotators), so it is only run when present; without it this step is
# skipped instead of failing the cell.
if [ -f anonymize.py ]; then
    python3 anonymize.py
else
    echo "anonymize.py not found - skipping anonymization"
fi

In [4]:
# CSV export(s) of the submitted answers.
anns_paths = ['../data/annotations/Name-quiz.csv']
raw_frames = [pd.read_csv(csv_path) for csv_path in anns_paths]
anns_gform = pd.concat(raw_frames, sort=False)

# The same (document, token, user) answer is sometimes submitted more than
# once; only the last occurrence in file order is kept.
submission_key = ['Document', 'Token', 'Username']
anns = anns_gform.drop_duplicates(subset=submission_key, keep='last').copy()
anns['Timestamp_dt'] = pd.to_datetime(anns['Timestamp'])



## Fix some annotation mistakes (ad-hoc)

At the beginning, some annotators misunderstood the instructions and entered descriptions (e.g. "An asian country") instead of actual names. We need to filter these out.

In [5]:
# Answers that describe the entity instead of naming it. They are matched
# verbatim against the 'Guessed name' column, so the spelling (typos included)
# must stay exactly as the annotators typed it.
problematic_names = [
    'An asian country',
    'A political figure',
    'car maker brand',
    'A company name',
    'No idea, the sentences work withouth this word',
    'approval',
    'social aid organisation (red crescent, red cross)',
    'political party',
    'Name of a journalist',
    'devil',
    'State of the USA',
]
shown_cols = ['Document', 'Guessed name', 'Guessed characteristics']
anns.loc[anns['Guessed name'].isin(problematic_names), shown_cols]

Unnamed: 0,Document,Guessed name,Guessed characteristics
75,student3-2019-03-29-question-170.html,"No idea, the sentences work withouth this word","attribute, adjective, modal for time or place"
76,student3-2019-03-29-question-025.html,A company name,"Described in the text as ""a collection of comp..."
77,student3-2019-03-29-question-180.html,A political figure,person
78,student3-2019-03-29-question-051.html,An asian country,country
80,student3-2019-03-29-question-234.html,State of the USA,"state, location/governmental entity"
82,student3-2019-03-29-question-062.html,"social aid organisation (red crescent, red cross)",organisation
83,student3-2019-03-29-question-180.html,political party,"group, party"
88,student3-2019-03-29-question-118.html,Name of a journalist,Person
89,student3-2019-03-29-question-017.html,car maker brand,a company
203,student1-2019-03-14-question-203.html,approval,


In [6]:
# Set every column of the rows with a bogus guessed name to NaN (the rows stay
# in the frame, they are not dropped).
bogus_name_rows = anns['Guessed name'].isin(problematic_names)
anns[bogus_name_rows] = float('nan')

Midway through, I found out that the names of the speakers (typically broadcasters) are shown in the documents and annotators just copied them into the answer sheet. This is not intended, because I want to check whether annotators already knew about the entities before the task, not what they learned during it. Also, they typically don't know anything about the broadcasters, so the names would be useless anyway. I'll filter them out.

In [7]:
# Speaker names that are visible in the documents themselves.
speaker_names = ['Martha Stewart', 'Stacy Brown', 'Linda Walker', 'Shane Sellers']
is_speaker_name = anns['Guessed name'].isin(speaker_names)
anns.loc[is_speaker_name, ['Document', 'Guessed name', 'Guessed characteristics']]

Unnamed: 0,Document,Guessed name,Guessed characteristics
3,student2-2019-03-29-question-021.html,Stacy Brown,
13,student2-2019-03-29-question-017.html,Martha Stewart,
30,student2-2019-03-29-question-028.html,Linda Walker,
59,student2-2019-03-29-question-011.html,Shane Sellers,
166,student2-2019-03-29-question-017.html,Martha Stewart,


In [8]:
# Set every column of the rows whose guessed name is a speaker name to NaN
# (the rows stay in the frame, they are not dropped).
copied_speaker_rows = anns['Guessed name'].isin(speaker_names)
anns[copied_speaker_rows] = float('nan')

## Stats

In [9]:
# Number of rows; rows that were set to NaN earlier are still counted.
anns.shape[0]

310

In [10]:
# Fraction of rows that have no guessed name.
anns['Guessed name'].isna().mean()

0.7645161290322581

In [11]:
# Fraction of rows in which both the guessed name and the guessed
# characteristics are missing.
anns[['Guessed name', 'Guessed characteristics']].isna().all(axis=1).mean()

0.0935483870967742

In [12]:
# How often each distinct guessed name occurs. Missing values are left out by
# value_counts, and spellings are compared verbatim, so variants such as
# 'God' / 'god' are counted separately.
anns['Guessed name'].value_counts()

Taiwan                10
Israel                 4
Hong Kong              4
Bush                   4
god                    3
Al Gore                3
God                    3
China                  3
North Korea            2
Serbia                 2
Florida                2
Greece                 2
Robert Mugabe          1
President Clinton      1
Wall Street            1
Zimbabwe               1
New York               1
Bonnaroo               1
Bill Clinton           1
South Korea            1
Michael Jackson        1
United Nations         1
Greeks                 1
Irak                   1
monday                 1
Russia                 1
the Treasury           1
Belgrade               1
Yemen                  1
Slobodan Milosevic     1
Texas                  1
jugoslawia             1
Vladimir Putin         1
George W. Bush         1
Dakar                  1
Dongguan               1
Washington, D.C.       1
iraq                   1
Gore                   1
Senegal                1


In [13]:
# Per annotator: number of rows (len of the Timestamp group) and the number of
# non-missing guessed names / guessed characteristics ('count' skips NaN).
anns.groupby('Username').agg({'Timestamp': len, 'Guessed name': 'count', 'Guessed characteristics': 'count'})

Unnamed: 0_level_0,Timestamp,Guessed name,Guessed characteristics
Username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
student1,107,8,87
student2,131,51,81
student3,56,14,55


In [14]:
# Guessed characteristics entered by the annotator 'student3'.
anns.loc[anns['Username'] == 'student3', 'Guessed characteristics']

74                                     place, country
79                                      country/place
81     Either a political party or a political person
84                            a person or institution
85        either a location or a date (year or month)
86                                    day of the week
87                        A location, event or person
90                                     person, entity
91                                      american city
92                                    god, bible text
93                                            US city
94                                      event or date
95                                            airline
96                                       chinese city
97                                           country 
98                              greek political party
99                                       US president
177                                           country
178                         

# Generate spreadsheet for annotation

In [20]:
def extract_conll_path(path):
    """Return the value of the ``conll_file`` form field of an HTML document.

    Parameters
    ----------
    path : str
        Path of the HTML file to read.

    Returns
    -------
    str
        Text of the ``value`` attribute that directly follows
        ``name="conll_file"``.

    Raises
    ------
    ValueError
        If the file does not contain exactly one such field.
    """
    with open(path) as f:
        html = f.read()
    # ``[^"]+`` ends the capture at the attribute's own closing quote. A greedy
    # ``.+`` would run to the last quote on the line and swallow any attribute
    # that follows ``value="..."``.
    # The one-element unpacking enforces "exactly one match" (ValueError otherwise).
    conll_path, = re.findall(r'name="conll_file" value="([^"]+)"', html)
    return conll_path

In [56]:
def file_contains(path, s):
    """Return True if the text of the file at ``path`` contains the substring ``s``."""
    with open(path) as handle:
        contents = handle.read()
    return s in contents
    
def file_equal(path1, path2):
    """Tell whether two files contain the same sequence of placeholder tokens.

    Only tokens of the form ``_XYZ1_`` (an underscore, an upper-case letter,
    one or more upper-case letters or digits, an underscore) are compared; all
    other text is ignored. The token lists must match in order and length.
    """
    placeholder_re = r'\b_[A-Z][A-Z0-9]+_\b'
    with open(path1) as first, open(path2) as second:
        first_tokens, second_tokens = (
            re.findall(placeholder_re, handle.read()) for handle in (first, second)
        )
    return first_tokens == second_tokens
    
def _file_url(path):
    """Return a percent-encoded ``file://`` URL for the absolute form of ``path``."""
    # Imported explicitly: a bare ``import urllib`` does not guarantee that the
    # ``urllib.parse`` submodule has been loaded.
    from urllib.parse import quote
    return 'file://%s' % quote(os.path.abspath(path))


def add_urls(row):
    """Attach links to the annotated document(s) and their originals to ``row``.

    Parameters
    ----------
    row : mapping-like (a DataFrame row when used with ``apply(axis=1)``)
        Must provide ``'Document'`` (HTML file name) and ``'Token'`` (text that
        has to occur in the file). It is modified in place.

    Returns
    -------
    The same ``row``. When matching files are found it gains ``url0``,
    ``url1``, ... (the matching HTML files) and ``orig0``, ``orig1``, ... (the
    HTML pages derived from each file's ``conll_file`` field). The row is
    returned unchanged when ``'Document'`` is missing or nothing matches.

    Notes
    -----
    Candidate files and derived originals are sorted, so the numbering of the
    ``url<i>`` / ``orig<i>`` keys is the same on every run.
    """
    from glob import escape

    doc = row['Document']
    token = row['Token']
    if pd.isna(doc):
        return row

    # The document name is matched literally (escape), and glob() yields its
    # matches in arbitrary order, hence the sort.
    candidates = sorted(glob('../data/annotations/documents/*/' + escape(doc)))
    paths = [p for p in candidates if file_contains(p, token)]
    if not paths:
        print('Warning: no document found for %s - %s' %(doc, token))
        return row

    if any(not file_equal(paths[0], p) for p in paths[1:]):
        # Several differing candidates: fall back to the single one whose path
        # contains 'name', if there is exactly one.
        paths_related_to_name = [p for p in paths if 'name' in p]
        if len(paths_related_to_name) == 1:
            paths = paths_related_to_name
        else:
            print('Warning: found two or more non-equivalent files: %s' %paths)

    for i, path in enumerate(paths):
        row['url%d' %i] = _file_url(path)

    # Map each referenced CoNLL path to the HTML page of its original version.
    orig_html_paths = []
    for conll_path in (extract_conll_path(p) for p in paths):
        orig_path = re.sub(r'/no-(?:external|internal)/', '/orig/', conll_path)
        orig_path = orig_path.replace('auto_conll', 'gold_conll')
        orig_path = orig_path.replace('transformed.v2', 'transformed')
        orig_html_paths.append(orig_path + '.html')

    # De-duplicate; sorting (instead of relying on set iteration order) keeps
    # the orig<i> numbering reproducible.
    for i, path in enumerate(sorted(set(orig_html_paths))):
        row['orig%d' %i] = _file_url(os.path.join('..', path))
    return row

In [58]:
# Run add_urls on every row (axis=1); the resulting frame, including any
# url<i>/orig<i> keys the function added to the rows, replaces `anns`.
anns = anns.apply(add_urls, axis=1)



In [59]:
# Link columns first (every column whose name contains 'url' or 'orig'),
# followed by the token and the two answer columns.
link_cols = [c for c in anns.columns if 'url' in c or 'orig' in c]
cols = link_cols + ['Token', 'Guessed name', 'Guessed characteristics']
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('../output', exist_ok=True)
anns[cols].to_csv('../output/entity-guessing-score-sheet.csv')