In [1]:
import sys
# Add the parent directory to the import search path (once), so that modules
# living one level above this notebook can be imported by later cells.
if '..' not in sys.path:
    sys.path.append('..')

In [6]:
from cort.core.corpora import Corpus
import os
from docopt import docopt
import sys
import re
from collections import defaultdict
from utils import map_values
import codecs
from collections import Counter

In [2]:
def read_dataset(inp_dir):
    """Collect mentions, mention tokens and proper-name tokens from a directory.

    Every file in ``inp_dir`` whose name ends with ``_gold_conll`` is parsed
    with ``Corpus.from_file``. Each document that has at least one annotated
    mention is handed to ``read_document`` and the three resulting lists are
    concatenated over all documents.

    Files that fail to parse with a ``KeyError`` are reported on stderr and
    skipped entirely. Documents without annotated mentions are reported on
    stderr and skipped as well.

    Args:
        inp_dir: Directory containing the ``*_gold_conll`` files.

    Returns:
        A ``(mentions, tokens, names)`` tuple of lists.
    """
    mentions, tokens, names = [], [], []
    for fname in os.listdir(inp_dir):
        if not fname.endswith('_gold_conll'):
            continue
        path = os.path.join(inp_dir, fname)
        try:
            with codecs.open(path, 'r', 'utf-8') as f:
                corpus = Corpus.from_file('', f)
        except KeyError:
            # https://github.com/dmcc/PyStanfordDependencies/issues/24
            sys.stderr.write("Ignored due to parsing error: %s\n" %path)
            # Skip this file. Falling through would iterate over the corpus
            # of the previously parsed file a second time (duplicating its
            # data), or fail on an unbound name if the first file is broken.
            continue
        for doc in corpus:
            if not doc.annotated_mentions:
                sys.stderr.write("Document doesn't contain any mention: %s in %s\n" %(doc.identifier, path))
                continue
            my_mentions, my_tokens, my_names = read_document(doc)
            mentions.extend(my_mentions)
            tokens.extend(my_tokens)
            names.extend(my_names)
    return mentions, tokens, names


def read_document(doc):
    """Extract the annotated mentions of one document and the tokens they cover.

    Args:
        doc: Object exposing ``annotated_mentions`` (each with an inclusive
            ``span.begin``/``span.end``), ``tokens`` and ``pos``.

    Returns:
        A ``(mentions, tokens, names)`` tuple: the list of annotated
        mentions, every token inside any mention span (a token covered by
        several mentions appears once per mention), and the subset of those
        tokens whose POS tag is ``NNP`` or ``NNPS``.
    """
    mentions = list(doc.annotated_mentions)
    tokens, names = [], []
    for mention in mentions:
        # span.end is inclusive, hence the +1.
        for idx in range(mention.span.begin, mention.span.end + 1):
            word = doc.tokens[idx]
            tokens.append(word)
            if doc.pos[idx] in ('NNP', 'NNPS'):
                names.append(word)
    return mentions, tokens, names

In [3]:
# Root directory of the dataset (relative path); its 'train' subdirectory is read here.
inp_dir = '../data/conll-2012-flat'
# Gold mentions, the tokens inside them, and the proper-name tokens of the training split.
train_mentions, train_tokens, train_names = read_dataset(os.path.join(inp_dir, 'train'))

Document doesn't contain any mention: (bn/voa/00/voa_0043); part 000 in ../data/conll-2012-flat/train/bn_voa_0043.v4_gold_conll
Document doesn't contain any mention: (nw/wsj/10/wsj_1088); part 000 in ../data/conll-2012-flat/train/nw_wsj_1088.v4_gold_conll
Document doesn't contain any mention: (nw/wsj/13/wsj_1384); part 000 in ../data/conll-2012-flat/train/nw_wsj_1384.v4_gold_conll
Document doesn't contain any mention: (bc/cnn/00/cnn_0001); part 009 in ../data/conll-2012-flat/train/bc_cnn_0001.v4_gold_conll
Document doesn't contain any mention: (nw/xinhua/02/chtb_0205); part 000 in ../data/conll-2012-flat/train/nw_chtb_0205.v4_gold_conll
Ignored due to parsing error: ../data/conll-2012-flat/train/tc_ch_0005.v4_gold_conll
Document doesn't contain any mention: (bn/voa/00/voa_0017); part 000 in ../data/conll-2012-flat/train/bn_voa_0017.v4_gold_conll
Document doesn't contain any mention: (bn/abc/00/abc_0024); part 000 in ../data/conll-2012-flat/train/bn_abc_0024.v4_gold_conll
Document doesn

In [4]:
# Same for the 'dev' subdirectory; the mention list itself is discarded, only tokens and names are kept.
_, dev_tokens, dev_names = read_dataset(os.path.join(inp_dir, 'dev'))

Document doesn't contain any mention: (bn/cnn/02/cnn_0240); part 000


In [5]:
def is_numeric(token):
    """Return True if ``token`` has the shape of a number-like expression.

    Accepted shape, anchored at the start and end of the token: an optional
    ``mid-``, ``'`` or ``"`` prefix, an optional sign, digits that may contain
    ``.`` and ``,`` separators but must end in a digit, an optional
    ``/``-separated second number, and an optional ``th``/``st``/``nd``/
    ``rd``/``s`` suffix (e.g. ``1,000.5``, ``1/2``, ``3rd``, ``mid-1990s``).
    """
    pattern = r'(mid-|\'|")?(\+|-)?[\d\.,]*\d+(/[\d\.,]*\d+)?(th|st|nd|rd|s)?$'
    return re.match(pattern, token) is not None

In [11]:
def compute_stats(tokens, names):
    """Summarise how many mention tokens are numbers, names or common words.

    All tokens and names are lowercased first. A token is a *number* if
    ``is_numeric`` accepts it, and a *common* token if its lowercased form is
    neither a number nor present anywhere in ``names``.

    Note that the occurrence counts are not a partition of ``tokens``:
    ``num_names`` is simply ``len(names)``, while ``common`` drops every
    occurrence of a word that appears as a name at least once. Hence
    ``num_numbers + num_names + num_commons`` need not equal ``num_total``.

    Args:
        tokens: Iterable of token strings.
        names: Iterable of the token strings that were tagged as names.

    Returns:
        A dict with occurrence counts (``num_*``), their share of all tokens
        (``pct_*``), unique-type counts (``num_unique_*``) and their share of
        all unique tokens (``pct_unique_*``). Shares are 0.0 when the
        corresponding denominator is empty.
    """
    tokens = [t.lower() for t in tokens]
    names = [n.lower() for n in names]

    numbers = [t for t in tokens if is_numeric(t)]
    uniq_numbers = set(numbers)
    uniq_names = set(names)
    common = [t for t in tokens if not (t in uniq_numbers or t in uniq_names)]
    uniq_common = set(common)
    unique_tokens = set(tokens)
    # Sanity check: the three unique sets together cover exactly the vocabulary.
    assert uniq_numbers.union(uniq_names).union(uniq_common) == unique_tokens

    def share(part, whole):
        # An empty dataset has no meaningful proportion; report 0.0 instead
        # of failing with ZeroDivisionError.
        return len(part) / len(whole) if whole else 0.0

    stats = {
        'num_numbers': len(numbers),
        'num_names': len(names),
        'num_commons': len(common),
        'num_total': len(tokens),
        'pct_numbers': share(numbers, tokens),
        'pct_names': share(names, tokens),
        'pct_commons': share(common, tokens),

        'num_unique_numbers': len(uniq_numbers),
        'num_unique_common': len(uniq_common),
        'num_unique_names': len(uniq_names),
        'num_unique_total': len(unique_tokens),
        'pct_unique_numbers': share(uniq_numbers, unique_tokens),
        'pct_unique_common': share(uniq_common, unique_tokens),
        'pct_unique_names': share(uniq_names, unique_tokens),
    }
    return stats

In [12]:
# Token/name/number statistics over the tokens inside dev-split mentions.
dev_stats = compute_stats(dev_tokens, dev_names)

In [13]:
# Display the statistics dictionary as the cell output.
dev_stats

{'num_numbers': 400,
 'num_names': 10586,
 'num_commons': 24971,
 'num_total': 45397,
 'pct_numbers': 0.008811154922131418,
 'pct_names': 0.233187215014208,
 'pct_commons': 0.5500583739013591,
 'num_unique_numbers': 150,
 'num_unique_common': 3447,
 'num_unique_names': 2056,
 'num_unique_total': 5653,
 'pct_unique_numbers': 0.02653458340704051,
 'pct_unique_common': 0.6097647266937909,
 'pct_unique_names': 0.3637006898991686}

In [14]:
# Export the share of unique name tokens as a whole-number percentage with a
# LaTeX-escaped percent sign (the format string produces text like "36\%").
with open('../output/pct_unique_names.tex', 'w') as f:
    f.write('%.0f\\%%' %(100*dev_stats['pct_unique_names']))

In [15]:
# Export the share of name occurrences among all mention tokens, in the same
# rounded, LaTeX-escaped percentage format.
with open('../output/pct_names.tex', 'w') as f:
    f.write('%.0f\\%%' %(100*dev_stats['pct_names']))

In [17]:
# Vocabulary of the training split, as sets for fast membership tests.
# No lowercasing is applied here, so the comparison below is case-sensitive.
uniq_train_names = set(train_names)
uniq_train_tokens = set(train_tokens)
# Dev occurrences (duplicates kept) whose exact string was also seen in train.
dev_names_in_train = [n for n in dev_names if n in uniq_train_names]
dev_tokens_in_train = [n for n in dev_tokens if n in uniq_train_tokens]

In [18]:
# Fraction of dev name occurrences that also occur as a name in train.
len(dev_names_in_train) / len(dev_names)

0.8286416021160022

In [19]:
# Fraction of dev mention-token occurrences that also occur among train mention tokens.
len(dev_tokens_in_train) / len(dev_tokens)

0.9318457166773135

In [22]:
# The 100 most frequent dev name tokens that were also seen as names in train.
Counter(dev_names_in_train).most_common(100)

[('Taiwan', 241),
 ('Mr.', 200),
 ('China', 177),
 ('God', 165),
 ('Hong', 143),
 ('Kong', 143),
 ('Jesus', 136),
 ('President', 115),
 ('Bush', 92),
 ('U.S.', 83),
 ('New', 82),
 ('Ye', 60),
 ('Iraq', 59),
 ('News', 57),
 ('Sony', 56),
 ('York', 53),
 ('Chen', 52),
 ('Lord', 50),
 ('NBC', 48),
 ('Court', 46),
 ('Florida', 44),
 ('Israel', 44),
 ('Peter', 43),
 ('Gore', 43),
 ('Clinton', 41),
 ('Congress', 39),
 ('Milosevic', 38),
 ('Christ', 36),
 ('Japan', 36),
 ('Association', 36),
 ('Qingqing', 36),
 ('Washington', 35),
 ('Bank', 35),
 ('Supreme', 35),
 ('Cross', 35),
 ('United', 34),
 ('World', 34),
 ('Mao', 34),
 ('Cole', 34),
 ('Straits', 34),
 ('Beijing', 33),
 ('Red', 33),
 ('US', 32),
 ('ANC', 31),
 ('Saddam', 30),
 ('West', 30),
 ('Kostunica', 30),
 ('Father', 29),
 ('Friday', 29),
 ('Michael', 28),
 ('States', 27),
 ('John', 27),
 ('Inc.', 27),
 ('Chinese', 26),
 ('Europe', 26),
 ('Senate', 26),
 ('Justin', 26),
 ('Jackson', 25),
 ('South', 25),
 ('Secretary', 24),
 ('Serbi

In [4]:
def read_dataset2(inp_dir):
    """Yield every entry of ``doc.speakers`` for all documents in a directory.

    Every file in ``inp_dir`` whose name ends with ``_gold_conll`` is parsed
    with ``Corpus.from_file``. Files that fail to parse with a ``KeyError``
    are reported on stderr and skipped entirely.

    Args:
        inp_dir: Directory containing the ``*_gold_conll`` files.

    Yields:
        The items of ``doc.speakers``, one at a time, for each document of
        each successfully parsed file.
    """
    for fname in os.listdir(inp_dir):
        if not fname.endswith('_gold_conll'):
            continue
        path = os.path.join(inp_dir, fname)
        try:
            with codecs.open(path, 'r', 'utf-8') as f:
                corpus = Corpus.from_file('', f)
        except KeyError:
            # https://github.com/dmcc/PyStanfordDependencies/issues/24
            sys.stderr.write("Ignored due to parsing error: %s\n" %path)
            # Skip this file. Falling through would yield the speakers of the
            # previously parsed file a second time, or fail on an unbound
            # name if the first file is broken.
            continue
        for doc in corpus:
            for speaker in doc.speakers:
                yield speaker

In [7]:
# Count how often each speaker value occurs across the dev-split documents.
speakers = Counter(read_dataset2(os.path.join(inp_dir, 'dev')))