In [1]:
%cd '/Users/max/Projects/Coreference/'

/Users/max/Projects/Coreference


In [2]:
%cd 'rucoref'
from anaphoralib.corpora import rueval
from anaphoralib.tagsets import multeast
from anaphoralib.tagsets.utils import same_grammemmes
from anaphoralib.experiments import mentionpair
from anaphoralib.experiments import coref_utils
from anaphoralib import utils
from anaphoralib.experiments import utils as exp_utils
%cd '..'
#%load_ext autoreload
#%autoreload 2

scorer_path = 'rucoref/external/reference-coreference-scorers/scorer.pl'

/Users/max/Projects/Coreference/rucoref
/Users/max/Projects/Coreference


Initialization
--------------

In [3]:
rucoref_train = rueval.RuCorefCorpus(multeast, rueval)
rucoref_test = rueval.RuCorefCorpus(multeast, rueval)

In [4]:
exp_utils.load_corpus(rucoref_train, 'Corpus-2015/Tokens.train.fixmorph.txt.parsed', 'Corpus-2015/Groups.train.txt')

Number of texts: 126
Number of GS texts: 126
Number of chains in a corpus: 2515
Number of words in all chains: 11453


In [5]:
exp_utils.load_corpus(rucoref_test, 'Corpus-2015/Tokens.test.fixmorph.txt.parsed', 'Corpus-2015/Groups.test.txt')

Number of texts: 55
Number of GS texts: 55
Number of chains in a corpus: 1123
Number of words in all chains: 5104


Defining the classifiers
--------------------

In [6]:
class BaselineAllSingletonsClassifier(mentionpair.MentionPairClassifier):
    def pair_coreferent(self, pair, groups, words):
        return False

In [7]:
class BaselineAllInOneClassifier(mentionpair.MentionPairClassifier):
    def pair_coreferent(self, pair, groups, words):
        return True

In [8]:
class BaselineStrMatchClassifier(mentionpair.MentionPairClassifier):
    def pair_coreferent(self, pair, groups, words):
        is_pronoun = rucoref_test.tagset.pos_filters['pronoun'](pair[0])
        is_personal_pronoun = rucoref_test.tagset.extract_feature('person', pair[0]) in ('1', '2')
        
        return (not is_pronoun or is_personal_pronoun) and ' '.join(pair[0].lemma) == ' '.join(pair[1].lemma)

In [9]:
class BaselineHeadMatchClassifier(mentionpair.MentionPairClassifier):
    def pair_coreferent(self, pair, groups, words):
        is_pronoun = rucoref_test.tagset.pos_filters['pronoun'](pair[0])
        is_personal_pronoun = rucoref_test.tagset.extract_feature('person', pair[0]) in ('1', '2')
        
        return (not is_pronoun or is_personal_pronoun) and pair[0].lemma[pair[0].head] == pair[1].lemma[pair[1].head]

In [10]:
class BaselineHeadMatchProClassifier(mentionpair.MentionPairClassifier):
    def __init__(self, scorer_path):
        super(BaselineHeadMatchProClassifier, self).__init__(scorer_path)
        self.groups_match = lambda pair: pair[0].lemma[pair[0].head] == pair[1].lemma[pair[1].head]
    def pair_coreferent(self, pair, groups, words):
        tagset = rucoref_test.tagset
        
        is_pronoun = lambda w: tagset.pos_filters['pronoun'](w)
        is_deictic_pronoun = lambda w: tagset.extract_feature('person', w) in ('1', '2')
        
        number_agrees = lambda p: same_grammemmes('number', p, tagset)
        gender_agrees = lambda p: same_grammemmes('gender', p, tagset)
        
        if is_pronoun(pair[1]):
            heads = [np.words[np.head] if np.type != 'word' else np for np in pair]
            heads_indices = [words.index(head) for head in heads]

            nouns_agr_between = [word for word in words[heads_indices[0]+1:heads_indices[1]]
                                     if tagset.pos_filters['noun'](word)
                                     and number_agrees((word, pair[1]))
                                     and gender_agrees((word, pair[1]))
                                ]
        
        return (
                (is_deictic_pronoun(pair[0]) and self.groups_match(pair))
               or
                (not is_pronoun(pair[0]) and pair[0].lemma[pair[0].head] == pair[1].lemma[pair[1].head])
               or
               (
                not is_pronoun(pair[0]) and is_pronoun(pair[1])
                and number_agrees(pair)
                and gender_agrees(pair)
                and len(nouns_agr_between) == 0
               )
        )

In [11]:
class BaselineStrMatchProClassifier(BaselineHeadMatchProClassifier):
    def __init__(self, scorer_path):
        super(BaselineStrMatchProClassifier, self).__init__(scorer_path)
        self.groups_match = lambda pair: ' '.join(pair[0].lemma) == ' '.join(pair[1].lemma)

In [12]:
good_pronouns = {u'я', u'мы', 
                 u'ты', u'вы', 
                 u'он', u'она', u'оно', u'они', 
                 u'мой', 'наш', 
                 u'твой', u'ваш', 
                 u'его', u'ее', u'их',
                 u'себя', u'свой',
                 u'который'
                }
group_ok = lambda g: g.tag.startswith('N') or (g.tag.startswith('P') and g.lemma[0] in good_pronouns)

In [13]:
gs_mentions, gs_group_ids = coref_utils.get_gs_groups(rucoref_test)
gs_groups = gs_mentions

pred_mentions, pred_group_ids = coref_utils.get_pred_groups(rucoref_test, group_ok)
pred_groups = rucoref_test.groups

pred_mentions_gold_bound, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_test, group_ok)
pred_groups_gold_bound = rucoref_test.groups

In [14]:
print len(gs_mentions[1])
print len(pred_mentions[1])
print len(pred_mentions_gold_bound[1])

102
216
216


In [15]:
pred_mentions_gold_bound[0][:150]

[Неприятности(Ncfpnn, 5),
 конце лета(Ncmsln, 30),
 лета(Ncnsgn, 37),
 старом деревенском доме(Ncmsln, 52),
 кривоногая такса Фунтик(Ncfsnn, 87),
 Фунтик(Ncmsnn, 104),
 Фунтика(Ncfsnn, 112),
 Москвы(Ncfsgn, 132),
 черный кот Степан(Ncmsny, 154),
 Степан(Npmsny, 165),
 крыльце(Ncnsln, 194),
 Он(P-3msnn, 228),
 растопыренную пятерню(Ncfsan, 237),
 всей силы(Ncfsgn, 290),
 обслюненной лапой(Ncfsin, 300),
 себя(P----gn, 320),
 ухом(Ncnsin, 328),
 Степан(Npmsny, 344),
 чей - то пристальный взгляд(Ncmsnn, 365),
 Он(P-3msnn, 392),
 лапой(Ncfsin, 416),
 ухо(Ncnsan, 438),
 Степана(Npmsgy, 449),
 Степана(Npmsgy, 449),
 злости(Ncfsgn, 469),
 Маленький рыжий пес(Ncmsny, 477),
 Одно ухо у него(Ncnsan, 510),
 него(P-3msgn, 521),
 любопытства(Ncnsgn, 549),
 пес(Ncmsny, 563),
 мокрым носом(Ncmsin, 576),
 Степану(Npmsdy, 592),
 этого загадочного зверя(Ncmsgy, 618),
 Степан(Npmsny, 670),
 Фунтика(Npmsay, 697),
 вывернутому уху(Ncnsdn, 708),
 Война(Ncfsnn, 731),
 тех пор(Ncfpgn, 759),
 Степана(Npmsgy, 77

In [16]:
gs_mentions_train, gs_group_ids_train = coref_utils.get_gs_groups(rucoref_train)
gs_groups_train = gs_mentions_train

pred_mentions_train, pred_group_ids_train = coref_utils.get_pred_groups(rucoref_train, group_ok)
pred_groups_train = rucoref_train.groups

pred_mentions_gold_bound_train, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_train, group_ok)
pred_groups_gold_bound_train = rucoref_train.groups

Testing the baseline classifiers:

In [17]:
coref_utils.get_score_table(BaselineAllInOneClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineAllSingletonsClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineStrMatchClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineStrMatchProClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineHeadMatchClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)
coref_utils.get_score_table(BaselineHeadMatchProClassifier(scorer_path), rucoref_test, gs_mentions, gs_groups, False)

\textsc{BaselineAllInOneClassifier} &  $99.89$  & $78.59$ & $99.84$ & $87.95$  & $13.48$ & $99.79$ & $23.76$  & $27.73$ \\
\textsc{BaselineAllSingletonsClassifier} &  $100.00$  & $0.00$ & $0.00$ & $0.00$  & $100.00$ & $22.15$ & $36.27$  & $22.15$ \\
\textsc{BaselineStrMatchClassifier} &  $100.00$  & $94.29$ & $37.36$ & $53.52$  & $97.09$ & $38.19$ & $54.82$  & $45.43$ \\
\textsc{BaselineStrMatchProClassifier} &  $100.00$  & $84.90$ & $52.42$ & $64.82$  & $89.34$ & $43.35$ & $58.37$  & $49.72$ \\
\textsc{BaselineHeadMatchClassifier} &  $100.00$  & $87.78$ & $47.06$ & $61.27$  & $92.11$ & $43.64$ & $59.22$  & $50.77$ \\
\textsc{BaselineHeadMatchProClassifier} &  $100.00$  & $84.89$ & $52.50$ & $64.87$  & $89.29$ & $43.38$ & $58.40$  & $49.76$ \\


In [18]:
coref_utils.get_score_table(BaselineAllInOneClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineAllSingletonsClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineStrMatchClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineStrMatchProClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineHeadMatchClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)
coref_utils.get_score_table(BaselineHeadMatchProClassifier(scorer_path), rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)

\textsc{BaselineAllInOneClassifier} &  $50.49$  & $26.61$ & $88.90$ & $40.96$  & $1.71$ & $85.89$ & $3.36$  & $13.97$ \\
\textsc{BaselineAllSingletonsClassifier} &  $51.38$  & $0.00$ & $0.00$ & $0.00$  & $35.52$ & $20.84$ & $26.27$  & $12.12$ \\
\textsc{BaselineStrMatchClassifier} &  $51.38$  & $52.86$ & $32.29$ & $40.09$  & $33.54$ & $34.04$ & $33.79$  & $23.34$ \\
\textsc{BaselineStrMatchProClassifier} &  $51.19$  & $34.40$ & $45.46$ & $39.16$  & $26.89$ & $39.58$ & $32.02$  & $25.81$ \\
\textsc{BaselineHeadMatchClassifier} &  $51.19$  & $35.26$ & $41.38$ & $38.07$  & $29.57$ & $38.88$ & $33.59$  & $26.12$ \\
\textsc{BaselineHeadMatchProClassifier} &  $51.19$  & $34.40$ & $45.49$ & $39.18$  & $26.89$ & $39.58$ & $32.02$  & $25.81$ \\


In [30]:
coref_utils.get_score_table(BaselineAllInOneClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineAllSingletonsClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineStrMatchClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineStrMatchProClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineHeadMatchClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)
coref_utils.get_score_table(BaselineHeadMatchProClassifier(scorer_path), rucoref_test, pred_mentions, pred_groups, False)

\textsc{BaselineAllInOneClassifier} &  $37.66$  & $18.55$ & $65.65$ & $28.92$  & $1.04$ & $58.44$ & $2.04$  & $10.87$ \\
\textsc{BaselineAllSingletonsClassifier} &  $37.66$  & $0.00$ & $0.00$ & $0.00$  & $25.64$ & $15.66$ & $19.44$  & $10.52$ \\
\textsc{BaselineStrMatchClassifier} &  $37.66$  & $42.08$ & $23.35$ & $30.04$  & $23.96$ & $24.56$ & $24.26$  & $18.64$ \\
\textsc{BaselineStrMatchProClassifier} &  $37.66$  & $22.80$ & $31.42$ & $26.43$  & $18.21$ & $27.90$ & $22.04$  & $19.80$ \\
\textsc{BaselineHeadMatchClassifier} &  $37.66$  & $23.01$ & $28.23$ & $25.35$  & $21.01$ & $27.04$ & $23.65$  & $19.95$ \\
\textsc{BaselineHeadMatchProClassifier} &  $37.66$  & $22.82$ & $31.45$ & $26.45$  & $18.21$ & $27.90$ & $22.04$  & $19.80$ \\


In [19]:
scores, groups, chains_base = BaselineHeadMatchProClassifier(scorer_path).score(rucoref_test, 
                                                                                pred_mentions_gold_bound, 
                                                                                pred_groups_gold_bound, 
                                                                                metrics=('muc',), heads_only=False)

In [29]:
coref_utils.print_chains_in_text(rucoref_test, 1, chains_base, pred_mentions_gold_bound)

-- SYS --
Мальчик:мальчик(Ncmsny, 17)
он:он(P-3msnn, 58)

школу:школа(Ncfsan, 43)
новую хорошую школу:новый хороший школа(Ncfsan, 323)
новую хорошую школу:новый хороший школа(Ncfsan, 323)
этой школы:этот школа(Ncfsgn, 570)
этой школы:этот школа(Ncfsgn, 570)
прежней школе:прежний школа(Ncfsln, 607)
новую школу с гуманитарным уклоном:новый школа с гуманитарный уклон(Ncfsan, 801)
музыкальную школу:музыкальный школа(Ncfsan, 872)
новую школу:новый школа(Ncfsan, 2411)
музыкальной школы:музыкальный школа(Ncfsgn, 3636)
музыкальной школы:музыкальный школа(Ncfsgn, 3993)
музыкальной школы:музыкальный школа(Ncfsgn, 3993)
школу:школа(Ncfsan, 4475)

метро:метро(Ncnsln, 84)
метро:метро(Ncnsln, 893)

две остановки на троллейбусе ( трудный путь:два остановка на троллейбус ( трудный путь(Ncfsgn, 126)
две остановки на автобусе:два остановка на автобус(Ncfsgn, 904)

троллейбусе ( трудный путь:троллейбус ( трудный путь(Ncmsln, 143)
троллейбусе:троллейбус(Ncmsln, 939)

трудный путь:трудный путь(Ncmsnn, 156)