In [1]:
%cd '/Users/max/Projects/Coreference/'

/Users/max/Projects/Coreference


In [2]:
%cd 'rucoref'
from anaphoralib.corpora import rueval
from anaphoralib.tagsets import multeast
from anaphoralib.tagsets.utils import same_grammemmes
from anaphoralib.experiments import mentionpair
from anaphoralib.experiments import coref_utils
from anaphoralib import utils
from anaphoralib.experiments import utils as exp_utils
%cd '..'
#%load_ext autoreload
#%autoreload 2

scorer_path = 'rucoref/external/reference-coreference-scorers/scorer.pl'

/Users/max/Projects/Coreference/rucoref
/Users/max/Projects/Coreference


In [3]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import export_graphviz

Initializing:
---------------------

In [4]:
rucoref_train = rueval.RuCorefCorpus(multeast, rueval)
rucoref_test = rueval.RuCorefCorpus(multeast, rueval)

In [5]:
exp_utils.load_corpus(rucoref_train, 'Corpus-2015/Tokens.train.fixmorph.txt.parsed', 'Corpus-2015/Groups.train.txt')

Number of texts: 126
Number of GS texts: 126
Number of chains in a corpus: 2515
Number of words in all chains: 11453


In [6]:
exp_utils.load_corpus(rucoref_test, 'Corpus-2015/Tokens.test.fixmorph.txt.parsed', 'Corpus-2015/Groups.test.txt')

Number of texts: 55
Number of GS texts: 55
Number of chains in a corpus: 1123
Number of words in all chains: 5104


In [7]:
group_ok = lambda g: g.tag.startswith('N') or (g.tag.startswith('P') and g.lemma[0] in multeast.coref_pronouns)

In [8]:
gs_mentions, gs_group_ids = coref_utils.get_gs_groups(rucoref_test)
gs_groups = gs_mentions

pred_mentions, pred_group_ids = coref_utils.get_pred_groups(rucoref_test, group_ok)
pred_groups = rucoref_test.groups

pred_mentions_gold_bound, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_test, group_ok)
pred_groups_gold_bound = rucoref_test.groups

In [9]:
gs_mentions_train, gs_group_ids_train = coref_utils.get_gs_groups(rucoref_train)
gs_groups_train = gs_mentions_train

pred_mentions_train, pred_group_ids_train = coref_utils.get_pred_groups(rucoref_train, group_ok)
pred_groups_train = rucoref_train.groups

pred_mentions_gold_bound_train, pred_gold_bounds_ids = coref_utils.get_pred_groups_gold_boundaries(rucoref_train, group_ok)
pred_groups_gold_bound_train = rucoref_train.groups

Defining the classifier
--------------------

In [10]:
class MLMentionPairClassifier(mentionpair.MentionPairClassifier):
    NEEDS_TRAINING = True
    def __init__(self, scorer_path=None):
        self.scorer_path = scorer_path
    
    def train(self, clf, corpus, mentions):
        self.data_x = []
        self.data_y = []
        self.appositives = []
        
        self.tagset = corpus.tagset
        
        for i, text in enumerate(corpus.texts):
            all_mentions = utils.find_mentions(corpus.groups[i], corpus.tagset)
            gs = corpus.gs[i]
            words_index = corpus.words_index[i]

            
            for chain_id in gs['chains']:
                chain = gs['chains'][chain_id]
                for pair in ((chain[i], chain[i+1]) for i in range(len(chain)-1)):
                    text_groups = []
                    for pair_elem in pair:
                        gs_group = gs['groups'][pair_elem]
                        
                        words = [text[words_index[shift]] for shift in gs_group['tokens_shifts']]
                        head = text[words_index[gs_group['head_shift'][0]]]
                        text_groups.append(coref_utils.create_gs_group(gs_group, words, head))
                    
                    self.data_x.append(self.get_feature_vector(corpus.texts[i], corpus.parses[i] if corpus.parses else None, *text_groups))
                    self.data_y.append(True)
                    
                    neg_first = None
                    neg_last = None

                    for i_mention, mention in enumerate(all_mentions):
                        if mention.offset == text_groups[0].offset:
                            neg_first = i_mention
                        if mention.offset == text_groups[1].offset:
                            neg_last = i_mention
                        if neg_first and neg_last:
                            break
                    
                    if not neg_first or not neg_last:
                        continue
                        
                    neg_text_groups = all_mentions[neg_first+1:neg_last]
                    for neg_pair in ((neg_text_groups[i], neg_text_groups[i+1]) for i in range(len(neg_text_groups)-1)):
                        self.data_x.append(self.get_feature_vector(corpus.texts[i], *neg_pair))
                        self.data_y.append(False)
        
        self.clf = clf
        self.clf.fit(self.data_x, self.data_y)
    
    def pair_coreferent(self, pair, groups, words, parse):
        vctr = self.get_feature_vector(words, parse, *pair)
        return self.clf.predict([vctr])[0]
    
    def get_feature_vector(self, words, group_1, group_2):
        # group_1 — possible antecedent
        # group_2 — anaphor
        
        head_1 = group_1.words[group_1.head] if group_1.type != 'word' else group_1
        head_2 = group_2.words[group_2.head] if group_2.type != 'word' else group_2
        
        is_appo = False
        
        if not head_1 in words or not head_2 in words:
            n_sentences = -1
            print 'no alignment found'
        else:
            i = words.index(head_1)
            j = words.index(head_2)
            
            between_groups = words[i+1:j]
            n_sentences = sum(1 for gr in between_groups if gr.tag == 'SENT')
            
            if j - i == 2 and words[i+1].tag.startswith(',') \
                and same_grammemmes('case', (group_1, group_2), self.tagset) \
                and same_grammemmes('number', (group_1, group_2), self.tagset) \
                and same_grammemmes('gender', (group_1, group_2), self.tagset) \
                and group_1.tag.startswith('N') and group_2.tag.startswith('N'):
                #and self.tagset.extract_feature('animacity', group_1) == 'y':
                is_appo = True
                self.appositives.append((group_1, group_2, i, j))
        
        is_demonstrative = lambda w: [tag.startswith('Pd') or w.lemma[i] in {u'этот', u'тот'} for i, tag in enumerate(w.tags)]
        demonstr_1 = is_demonstrative(group_1) if len(group_1.lemma) > 1 else [0]
        demonstr_2 = is_demonstrative(group_2) if len(group_2.lemma) > 1 else [0]
        
        filtered_lemma_1 = ' '.join(lemma for (i, lemma) in enumerate(group_1.lemma) if not demonstr_1[i])
        filtered_lemma_2 = ' '.join(lemma for (i, lemma) in enumerate(group_2.lemma) if not demonstr_2[i])
        
        vctr = []
        feat_names = []
        
        pronoun_1 = self.tagset.pos_filters['pronoun'](group_1) and group_ok(group_1)
        pronoun_2 = self.tagset.pos_filters['pronoun'](group_2) and group_ok(group_1)
        
        vctr.append(pronoun_2 and n_sentences == 1)
        feat_names.append('dist==1')
        
        #vctr.append(pronoun_2 and n_sentences == 0)
        #feat_names.append('dist==0')
        
        vctr.append(not pronoun_1 and not pronoun_2 and filtered_lemma_1 == filtered_lemma_2)
        feat_names.append('str_match')
        
        is_animate_1 = self.tagset.extract_feature('animate', group_1) in ('y', 'a')
        is_animate_2 = self.tagset.extract_feature('animate', group_2) in ('y', 'a')
        sem_class_agreement = (is_animate_1 and is_animate_2) or (not is_animate_1 and not is_animate_2)
        
        if not pronoun_1:
            sem_class_agreement &= group_1.lemma[group_1.head] == group_2.lemma[group_2.head]
        
        vctr.append(sem_class_agreement)
        feat_names.append('sem_class_agreement')
        
        vctr.append(pronoun_1)# and group_ok(group_1))
        vctr.append(pronoun_2)# and group_ok(group_2))
        feat_names.extend(('i_pronoun', 'j_pronoun'))
        
        vctr.append(vctr[-1] and vctr[-2])
        feat_names.append('both_pronouns')
        
        vctr.append(self.tagset.extract_feature('number', group_1) == self.tagset.extract_feature('number', group_2))
        vctr.append(self.tagset.extract_feature('gender', group_1) == self.tagset.extract_feature('gender', group_2))
        feat_names.extend(('number-agr', 'gender-agr'))
        
        vctr.append(self.tagset.extract_feature('proper', group_1) == 'p' 
                    and self.tagset.extract_feature('proper', group_2) == 'p')
        feat_names.append('both-proper')
        vctr.append(any(demonstr_2[:group_2.head+1]))
        feat_names.append('anaphor-is-demonstrative')
        
        vctr.append(is_appo)
        feat_names.append('appositive')
        
        self.feat_names = feat_names
        return vctr

Testing the classifier:
-----------------------

### Gold mentions:

In [11]:
clf = MLMentionPairClassifier(scorer_path)
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)

clf = MLMentionPairClassifier(scorer_path)
clf.train(LinearSVC(random_state=42), rucoref_train, gs_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, gs_mentions, gs_groups, False)

\textsc{MLMentionPairClassifier} &  $100.00$  & $72.62$ & $56.02$ & $63.25$  & $77.27$ & $44.46$ & $56.45$  & $50.94$ \\
\textsc{MLMentionPairClassifier} &  $100.00$  & $71.97$ & $47.19$ & $57.00$  & $80.15$ & $38.95$ & $52.42$  & $45.84$ \\


In [12]:
pairs, disc_pairs = clf.predict_pairs(gs_mentions[0], rucoref_test.groups[0], rucoref_test.texts[0], None, 0)

In [13]:
pairs[:10]

[(Он(P-3msnn, 228), Он(P-3msnn, 392)),
 (ухом(Ncnsin, 328), ухо(Ncnsan, 438)),
 (Степан(Npmsny, 344), Степана(Npmsgy, 449)),
 (Он(P-3msnn, 392), него(P-3msgn, 521)),
 (Степана(Npmsgy, 449), Степану(Npmsdy, 592)),
 (Степану(Npmsdy, 592), Степан(Npmsny, 670)),
 (Фунтика(Ncfsnn, 112), Фунтика(Npmsay, 697)),
 (Степан(Npmsny, 670), Степана(Npmsgy, 779)),
 (Фунтика(Npmsay, 697), Фунтика(Ncmsgn, 1089)),
 (Степана(Npmsgy, 779), Степана(Npmsgy, 1106))]

In [14]:
disc_pairs[:10]

[(старом деревенском доме(Ncmsln, 52),
  кривоногая такса Фунтик(Ncfsnn, 87)),
 (кривоногая такса Фунтик(Ncfsnn, 87),
  Фунтика(Ncfsnn, 112)),
 (старом деревенском доме(Ncmsln, 52),
  Фунтика(Ncfsnn, 112)),
 (Фунтика(Ncfsnn, 112), черный кот Степан(Ncmsny, 154)),
 (кривоногая такса Фунтик(Ncfsnn, 87),
  черный кот Степан(Ncmsny, 154)),
 (старом деревенском доме(Ncmsln, 52),
  черный кот Степан(Ncmsny, 154)),
 (черный кот Степан(Ncmsny, 154), Он(P-3msnn, 228)),
 (Фунтика(Ncfsnn, 112), Он(P-3msnn, 228)),
 (кривоногая такса Фунтик(Ncfsnn, 87),
  Он(P-3msnn, 228)),
 (старом деревенском доме(Ncmsln, 52),
  Он(P-3msnn, 228))]

In [19]:
rucoref_test.gs[0]['groups'][rucoref_test.gs[0]['chains'][1018][0]]

{'attributes': {u'ref': u'def', u'str': u'noun', u'type': u'coref'},
 'head_lengths': [6],
 'head_shift': [4470],
 'length': 6,
 'parent': 0,
 'tokens_lengths': [6],
 'tokens_shifts': [4470]}

### Gold boundaries:

In [12]:
clf = MLMentionPairClassifier(scorer_path)
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)

clf = MLMentionPairClassifier(scorer_path)
clf.train(LinearSVC(random_state=42), rucoref_train, pred_mentions_gold_bound_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions_gold_bound, pred_groups_gold_bound, False)

\textsc{MLMentionPairClassifier} &  $51.19$  & $37.36$ & $50.19$ & $42.83$  & $24.30$ & $40.06$ & $30.25$  & $26.32$ \\
\textsc{MLMentionPairClassifier} &  $51.37$  & $48.39$ & $42.59$ & $45.31$  & $27.36$ & $35.84$ & $31.03$  & $24.14$ \\


### Predicted mentions:

In [13]:
clf = MLMentionPairClassifier(scorer_path)
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, pred_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions, pred_groups, False)

clf = MLMentionPairClassifier(scorer_path)
clf.train(LinearSVC(random_state=42), rucoref_train, pred_mentions_train)
coref_utils.get_score_table(clf, rucoref_test, pred_mentions, pred_groups, False)

\textsc{MLMentionPairClassifier} &  $37.67$  & $26.84$ & $37.01$ & $31.12$  & $16.23$ & $28.72$ & $20.74$  & $20.25$ \\
\textsc{MLMentionPairClassifier} &  $37.67$  & $40.27$ & $32.80$ & $36.15$  & $18.39$ & $26.33$ & $21.66$  & $18.76$ \\


Analyzing the results:
----------------------

In [14]:
clf = MLMentionPairClassifier(scorer_path)
clf.train(DecisionTreeClassifier(random_state=42), rucoref_train, gs_mentions_train)

In [15]:
print '\n'.join('{}: {:.3f}'.format(*pair) for pair in zip(clf.feat_names, clf.clf.feature_importances_))

dist==1: 0.007
str_match: 0.463
sem_class_agreement: 0.019
i_pronoun: 0.130
j_pronoun: 0.197
both_pronouns: 0.086
number-agr: 0.030
gender-agr: 0.059
both-proper: 0.003
anaphor-is-demonstrative: 0.005
appositive: 0.001


In [18]:
scores, test_groups, test_chains = clf.score(rucoref_test, gs_mentions, gs_groups)
coref_utils.print_chains_in_text(rucoref_test, 1, test_chains, gs_mentions)

-- SYS --
Мальчик:мальчик(Ncmsny, 17)
мальчика:мальчик(Ncmsgy, 386)
Мальчик:мальчик(Ncmsny, 843)
мальчик:мальчик(Ncmsny, 1089)
мальчик:мальчик(Ncmsny, 1327)
Мальчик:мальчик(Ncmsny, 2122)
Мальчик:мальчик(Ncmsny, 2330)
Мальчик:мальчик(Ncmsny, 2508)
Мальчик:мальчик(Ncmsny, 2616)
мальчика:мальчик(Ncmsgy, 2967)
Мальчика:мальчик(Ncmsay, 3192)
Мальчик:мальчик(Ncmsny, 3348)
Мальчик:мальчик(Ncmsny, 3495)
Мальчик:мальчик(Ncmsny, 4217)
мальчик:мальчик(Ncmsny, 4539)

он:он(P-3msnn, 58)
своего:свой(P--msga, 1692)
Его:его(P-3msan, 1718)
Он:он(P-3msnn, 2160)
его:его(P-3msan, 2296)

новую хорошую школу:новый хороший школа(Ncfsan, 323)
этой школы:этот школа(Ncfsgn, 570)
новую школу с гуманитарным уклоном:новый школа с гуманитарный уклон(Ncfsan, 801)
музыкальную школу:музыкальный школа(Ncfsan, 872)
новую школу:новый школа(Ncfsan, 2411)
музыкальной школы:музыкальный школа(Ncfsgn, 3636)
музыкальной школы:музыкальный школа(Ncfsgn, 3993)
школу:школа(Ncfsan, 4475)

отец:отец(Ncmsny, 2312)
отцу:отец(Ncmsdy, 3

In [20]:
coref_utils.print_chains_in_text(rucoref_test, 1, test_chains, gs_mentions)

-- SYS --
Мальчик:мальчик(Ncmsny, 17)
мальчика:мальчик(Ncmsgy, 386)
Мальчик:мальчик(Ncmsny, 843)
мальчик:мальчик(Ncmsny, 1089)
мальчик:мальчик(Ncmsny, 1327)
Мальчик:мальчик(Ncmsny, 2122)
Мальчик:мальчик(Ncmsny, 2330)
Мальчик:мальчик(Ncmsny, 2508)
Мальчик:мальчик(Ncmsny, 2616)
мальчика:мальчик(Ncmsgy, 2967)
Мальчика:мальчик(Ncmsay, 3192)
Мальчик:мальчик(Ncmsny, 3348)
Мальчик:мальчик(Ncmsny, 3495)
Мальчик:мальчик(Ncmsny, 4217)
мальчик:мальчик(Ncmsny, 4539)

он:он(P-3msnn, 58)
своего:свой(P--msga, 1692)
Его:его(P-3msan, 1718)
Он:он(P-3msnn, 2160)
его:его(P-3msan, 2296)

новую хорошую школу:новый хороший школа(Ncfsan, 323)
этой школы:этот школа(Ncfsgn, 570)
новую школу с гуманитарным уклоном:новый школа с гуманитарный уклон(Ncfsan, 801)
музыкальную школу:музыкальный школа(Ncfsan, 872)
новую школу:новый школа(Ncfsan, 2411)
музыкальной школы:музыкальный школа(Ncfsgn, 3636)
музыкальной школы:музыкальный школа(Ncfsgn, 3993)
школу:школа(Ncfsan, 4475)

отец:отец(Ncmsny, 2312)
отцу:отец(Ncmsdy, 3