In [1]:
import subprocess
import os
import sys
import pandas as pd
import wikiapi
import re
from collections import defaultdict
import gensim
import requests
from string import punctuation
from lxml import html
import random
import math

In [2]:
sys.path.append('/home/john/diploma/ner')
cwd = os.getcwd()
if cwd == '/home/john/diploma/ner':
    pass
else:
    os.chdir('/home/john/diploma/ner')
import ner.NER

In [36]:
with open('../coref/rucoref_texts/fiction/67_zamiatin_kolumb.txt', 'r') as file:
    inpt = file.read()
    out = ner.NER.process_input_(inpt)
    print(out)

[{'text': 'Колумб', 'len': 6, 'index': 128, 'type': 'PER'}, {'text': 'Колумб', 'len': 6, 'index': 380, 'type': 'PER'}, {'text': 'Груздев', 'len': 7, 'index': 853, 'type': 'PER'}, {'text': 'Груздев', 'len': 7, 'index': 1209, 'type': 'PER'}, {'text': 'Колумба', 'len': 7, 'index': 1311, 'type': 'PER'}, {'text': 'Груздева', 'len': 8, 'index': 1483, 'type': 'PER'}, {'text': 'Груздев', 'len': 7, 'index': 1761, 'type': 'PER'}, {'text': 'Колумб', 'len': 6, 'index': 1799, 'type': 'PER'}, {'text': 'Фоминой', 'len': 7, 'index': 1868, 'type': 'PER'}, {'text': 'Колумб', 'len': 6, 'index': 1890, 'type': 'PER'}, {'text': 'Колумб', 'len': 6, 'index': 2040, 'type': 'PER'}, {'text': 'Колумбу Катя', 'len': 12, 'index': 2797, 'type': 'PER'}, {'text': 'Яновского', 'len': 9, 'index': 2936, 'type': 'PER'}, {'text': 'Америку', 'len': 7, 'index': 3167, 'type': 'LOC'}, {'text': 'Колумб', 'len': 6, 'index': 3480, 'type': 'PER'}, {'text': 'Колумба', 'len': 7, 'index': 3550, 'type': 'PER'}]


In [159]:
class Corefsetup:
    
    
    def __init__(self):
        cwd = os.getcwd()
        if cwd == '/home/john/diploma':
            pass
        else:
            os.chdir('/home/john/diploma')
        self.docs = pd.read_csv('coref/Documents.txt', delimiter='\t') #init pandas df for document ids and names.
        self.groups = pd.read_csv('coref/Groups.txt', delimiter='\t') #init coreference groups into pandas df.
        self.tokens = pd.read_csv('coref/Tokens.txt', delimiter='\t') #init tokens df.
        

    def prepdf(self):
        '''
        Specifc to the structure of rucoref data. Creates table with doc ids and names.
        '''          
        self.newdf = pd.DataFrame(columns=['docid', 'docname'])
        for path in self.docs['path']:
            name = path.split('/')[1]
            did = self.docs.loc[self.docs.path.values == path, 'doc_id'].values[0]
            self.newdf = self.newdf.append(pd.Series([did, name], index=['docid', 'docname']), ignore_index=True)
        return self.newdf

    def formtoks(self, tokens, file, path):
        '''
        format token file for UDpipe tagging and parsing
        '''
        newdoc = 'newdoc id = {0}'.format(path)
        holder = '# {0}\n# newpar\n'.format(newdoc, )
        did = int(self.newdf[self.newdf['docname'] == file]['docid'])
        doc_toks = tokens[tokens['doc_id'] == did]
        doc_toks = doc_toks.replace('\s+', r'"', regex=True) #quotations are not read in properly.
        tid = 1
        sid = 1
        for i, trow in doc_toks.iterrows():
            sentid = '# sent_id = {0}\n'.format(sid)
            line = '{0}\t{1}\t_\t_\t_\t_\t_\t_\t_\t_\n'.format(tid, trow.token)
            if tid == 1:
                holder += sentid
            holder += line
            if not trow.gram == 'SENT':
                tid += 1
            else:
                tid = 1
                sid += 1
                holder += '\n'
        return holder
               
    
    def create_dep(self, textp, modp, outp):
        '''
        For creating new dependecies trees through UDpipe. Not necessary if pre-parsed. Texts should be in
        CoNLL-U format. Will be updated using python bindings for UDpipe.
        
        textp - location of folder containing texts
        
        modp - location of UDpipe language model. Not used in current form.
        
        outp - location and name of output folder. Will be created if none exists.
        '''
        for root, dirs, files in os.walk(textp):
            for f in files:
                path = os.path.join(root, f)
                tokenized = self.formtoks(self.tokens, f, path)
                with open ('temp.txt', 'w', encoding='UTF-8') as temp:
                    temp.write(tokenized)
#                 tokenized = subprocess.call('udpipe/src/udpipe --tokenize \
#                 udpipe/src/russian-syntagrus-ud-2.0-conll17-170315.udpipe {0} > temp.txt'.format(path), shell=True)
                tagged = subprocess.call('udpipe/src/udpipe --tag \
                udpipe/src/russian-syntagrus-ud-2.0-conll17-170315.udpipe temp.txt > temp2.txt', shell=True)
                parsed = subprocess.check_output('udpipe/src/udpipe --parse \
                udpipe/src/russian-syntagrus-ud-2.0-conll17-170315.udpipe temp2.txt', shell=True)
                if not os.path.exists('{0}'.format(outp)):
                    os.makedirs('{0}'.format(outp))
                with open('{0}/{1}'.format(outp, f), 'w', encoding='UTF-8') as stdout:
                    stdout.write(parsed.decode('utf-8'))
   

    def get_vars(self, srow):
        '''
        attain variables for mention extraction
        '''
        deprel = srow['deprel'] #attached to dependencies of head noun
        skip = False
        try:
            srow_h = int(srow['head']) 
        except ValueError:  # can't specify quotechars when creating a df.
            skip = True
            srow_h = None
        spos = srow['udpos']
        form = srow['form']
        return deprel, spos, form, srow_h, skip
            
            
    def quotations(self, frame, dep_form, head, dep, dep_tid_list, srow):
        '''
        matches words in french quotes if in certain range of head and depending words.
        Only captures short mentions, to avoid longer quotations being captured.
        '''
        if dep_form == "»" and 0 < (dep - head) < 5: #catch ending quotes 
            try:
                start = frame[frame['form'] == "«"]
                st_ind = start.index.tolist()[0]
                if 0 < (head -  st_ind) < 3:
                    dep_range = [x for x in range(st_ind, dep+1)]
                    for x in dep_range:
                        dep_tid = frame.ix[x]['tid']
                        dep_tid_list.append(dep_tid)
            except IndexError:
                pass
        return dep_tid_list
    
    
    def adj_participle(self, head, dep, dep_tid_list, srow, dep_spos):
        '''
        matches participles if they appear before head nouns in adjective-like position.
        '''
        if dep_spos in ('VERB') and (head - dep) > 0:
            dep_tid = srow['tid']
            dep_tid_list.append(dep_tid)
        return dep_tid_list

     
    def string_series(self, frame, dep_form, head, dep, dep_tid_list, srow, dep_spos, dep_dep_rel, cycle, indv_mention):
        '''
        matches string series mentions, such as lists of objects. 
        Also matches prepositions and conjunctions inside of phrases.
        Lastly, creates seperate column for words conected by conj deprel, for individual mentions.
        '''
        srow_plus_one = srow['tid'] + 1.0
        srow_min_one = srow['tid'] - 1.0
        dep_plus_one = frame[frame['tid'] == srow_plus_one]
        dep_min_one = frame[frame['tid'] == srow_min_one]
        try:
            dep_plus_one_pos = dep_plus_one['udpos'].values[0]
            dep_min_one_pos = dep_min_one['udpos'].values[0]
        except IndexError:
            pass
        if dep_spos in ('PROPN', 'NOUN') and dep_dep_rel in ('conj') and (dep - head) > 0:
            dep_tid = srow['tid']
            dep_tid_list.append(dep_tid)
            indv_mention = True
        elif (dep_form == ',' and (dep_plus_one['deprel'].values[0] in ('conj', 'amod', 'advmod') or dep_plus_one_pos == 'CCONJ'))\
        or (dep_form == ',' and dep_plus_one_pos == 'ADJ' and dep_min_one_pos == 'ADJ'):
            dep_tid = srow['tid']
            dep_tid_list.append(dep_tid)
        elif cycle == False and dep_spos in ('CCONJ', 'ADP') and (dep - head) > 0: #stricter for first run through.
            dep_tid = srow['tid']
            dep_tid_list.append(dep_tid)
        elif cycle == True and dep_spos in ('CCONJ', 'ADP'):
            dep_tid = srow['tid']
            dep_tid_list.append(dep_tid)
        return dep_tid_list, indv_mention
        
    
    def find_locs (self, sent_frame, dep_form, head, dep, dep_tid_list, srow, dep_spos, dep_dep_rel, cycle, indv_mention):
        '''
        initiates specific mention detection sieves.
        '''
        dep_tid_list = self.quotations(sent_frame, dep_form, head, dep, dep_tid_list, srow) #functions for specific parts and types of mentions
        dep_tid_list = self.adj_participle(head, dep, dep_tid_list, srow, dep_spos)
        dep_tid_list, indv_mention = self.string_series(sent_frame, dep_form, head, dep, dep_tid_list, srow, dep_spos, dep_dep_rel, cycle, indv_mention)   
        return dep_tid_list, indv_mention
    
    
    def cycling(self, sent_frame, dep_form, head, dep, dep_tid_list, srow, dep_spos, dep_dep_rel, cycle, indv_mention):
        '''
        cycle through the dependency location algorithm.
        '''
        dep_tid_list, indv_mention = self.find_locs(sent_frame, dep_form, head, dep, dep_tid_list, srow, dep_spos, dep_dep_rel, cycle, indv_mention)
        dep_tid = srow['tid']
        cur_tid = int(dep_tid)
        cycle = True
        if not dep_form in (',', '«', '»') and not dep_dep_rel == 'case' and dep_spos not in ('VERB'):
            dep_tid_list.append(dep_tid)
        return cur_tid, dep_tid_list, cycle, indv_mention
    
    
    def last_comma(self, word, dep_tid_list):
        '''
        remove comma and tid if mention ends or starts with a comma, period, or hyphen.
        '''
        if word.endswith(' ,') or word.endswith(' .') or word.endswith(' -'):
            word = word[:-2]
            del dep_tid_list[-1]
        elif word.endswith(',') or word.endswith('.') or word.endswith('-'):
            word = word[:-1]
            del dep_tid_list[-1]
        if word.startswith(', ') or word.startswith('. ') or word.startswith('- '):
            word = word[2:]
            del dep_tid_list[0]
        elif word.startswith(',') or word.startswith('.') or word.startswith('-'):
            word = word[1:]
            del dep_tid_list[0]
        return word, dep_tid_list
    
    
    def append_mention(self, cur_tid, dep_tid_list, dep_tid_list_indv, deps, sent_frame, i3, indv_mention):
        '''
        create the mention and tk_shifts from stored index values.
        '''
        dep_tid_list.append(cur_tid)
        dep_tid_list = list(set(dep_tid_list))
        dep_tid_list.sort()
        dep_tid_list_indv.append(cur_tid)
        dep_tid_list_indv = list(set(dep_tid_list_indv))
        dep_tid_list_indv.sort()
        word = ''
        word_indv = ''
        for x in dep_tid_list:
            frame = sent_frame[sent_frame['tid'] == float(x)]
            word += '{0} '.format(frame['form'].values[0])
        word = word[:-1]
        if indv_mention == True:  # for extracting individual mentions from series and lists.
            for x in dep_tid_list_indv:
                frame = sent_frame[sent_frame['tid'] == float(x)]
                word_indv += '{0} '.format(frame['form'].values[0])
            word_indv, dep_tid_list_indv = self.last_comma(word_indv, dep_tid_list_indv)
            deps.ix[i3, 'part_men'] = word_indv
            deps.ix[i3, 'part_shifts'] = ", ".join(str(int(sent_frame['shift'][sent_frame['tid'] == x])) for x in dep_tid_list_indv)
            deps.ix[i3, 'series'] = True
        word, dep_tid_list = self.last_comma(word, dep_tid_list)
        deps.ix[i3, 'full_men'] = word 
        deps.ix[i3, 'tk_shifts'] = ", ".join(str(int(sent_frame['shift'][sent_frame['tid'] == x])) for x in dep_tid_list)
        return deps
        
    
    def mention_det(self, prev_sid, deps, poss):
        '''
        creates a full mention column and tk_shifts for every NP and specific pronouns and determiners.
        For training this all real mentions replaced with actual mentions. This module is used for
        actual mention extraction.
        '''
        sent_frame = deps[deps['sid'] == prev_sid]
        for i3, srow in sent_frame.iterrows(): #srow is an idividual token in sentence.
            indv_mention = False
            dep_tid_list = []
            dep_tid_list_indv = []
            spos = srow['udpos']
            slem = srow['lemma']
            sdep = srow['deprel']
            try:
                if spos == 'PRON' and slem in poss: #catch incorrect parsing of его, ее, их. need sentence parsed to find.
                    head = sent_frame['udpos'][sent_frame['tid'] == float(srow['head'])].values[0]
                    head_rel = sent_frame['deprel'][sent_frame['tid'] == float(srow['head'])].values[0] 
                    if head == 'NOUN' and not head_rel == 'root': #if the head of pronoun is noun
                        deps.ix[i3, 'type'] = 'poss'
            except IndexError:
                pass
            if spos in ('PROPN', 'NOUN') or (spos == 'NUM' and sdep in 'nsubj' or 'root') or (spos == 'ADJ' and sdep in ('nsubj', 'nmod', 'root')):
                cur_tid = int(srow['tid'])
                for i4, srow2 in sent_frame.iterrows():
                    cycle = False
                    deprel2, spos2, form2, srow2_h, skip2 = self.get_vars(srow2) #gets vars for booleans.
                    if skip2 == False and srow2_h == cur_tid and not srow2['type'] =='rel' and\
                    (deprel2 in ('nmod', 'flat:name', 'nummod:gov', 'nummod', 'amod', 'conj', 'nummod:entity', 'flat:foreign') or form2 in (',', '«', '»', '\"', '-')):
                        cur_tid2, dep_tid_list, cycle, indv_mention2 = self.cycling(sent_frame, form2, i3, i4, dep_tid_list, srow2, spos2, deprel2, cycle, indv_mention)
                        if indv_mention2 == True:
                            indv_mention = True
                        for i5, srow3 in sent_frame.iterrows():# second check for prepositions and adjectives. Adverbs also added at this stage.
                            deprel3, spos3, form3, srow3_h, skip3 = self.get_vars(srow3)
                            if skip3 == False and srow3_h == cur_tid2 and not srow3['type'] =='rel' and \
                            (deprel3 in ('amod', 'nmod', 'nummod', 'case', 'conj', 'advmod', 'nummod:entity', 'flat:foreign') or form3 in (',', '«', '»', '\"', '-') or spos3 in ('CCONJ', 'PROPN')):
                                cur_tid3, dep_tid_list, cycle, indv_mention3 = self.cycling(sent_frame, form3, i4, i5, dep_tid_list, srow3, spos3, deprel3, cycle, indv_mention)
                                if indv_mention3 == True:
                                    indv_mention = True
                                for i6, srow4 in sent_frame.iterrows(): #third check for mostly adjectives, and list series
                                    deprel4, spos4, form4, srow4_h, skip4 = self.get_vars(srow4)
                                    if skip4 == False and srow4_h == cur_tid3 and not srow4['type'] =='rel' and \
                                    (deprel4 in ('case', 'amod', 'nmod', 'nummod', 'conj', 'advmod', 'flat:foreign') or spos4 in ('CCONJ', 'PROPN') or form4 in ('«', '»', ',', '\"', '-')): 
                                        cur_tid4, dep_tid_list, cycle, indv_mention4 = self.cycling(sent_frame, form4, i5, i6, dep_tid_list, srow4, spos4, deprel4, cycle, indv_mention)
                                        if indv_mention4 == True:
                                            indv_mention = True
                                        for i7, srow5 in sent_frame.iterrows(): #forth check for list series
                                            deprel5, spos5, form5, srow5_h, skip5 = self.get_vars(srow4)
                                            if skip5 == False and srow5_h == cur_tid4 and not srow5['type'] =='rel' and \
                                            (deprel5 in ('case', 'amod', 'nmod', 'nummod', 'conj', 'advmod', 'flat:foreign') or spos5 == 'CCONJ' or form5 in ('«', '»', ',')): 
                                                cur_tid5, dep_tid_list, cycle, indv_mention5 = self.cycling(sent_frame, form5, i6, i7, dep_tid_list, srow5, spos5, deprel5, cycle, indv_mention)
                                                if indv_mention5 == True:
                                                    indv_mention = True
                if indv_mention == True: #create single mentions for parts of series lists
                    for i4, srow2 in sent_frame.iterrows():
                        cycle = False
                        deprel2, spos2, form2, srow2_h, skip2 = self.get_vars(srow2) #gets vars for booleans.
                        if skip2 == False and srow2_h == cur_tid and not srow2['type'] =='rel' and\
                        (deprel2 in ('nmod', 'flat:name', 'nummod:gov', 'nummod', 'amod')):
                            cur_tid2, dep_tid_list_indv, cycle, indv_mention2 = self.cycling(sent_frame, form2, i3, i4, dep_tid_list_indv, srow2, spos2, deprel2, cycle, indv_mention)
                deps = self.append_mention(cur_tid, dep_tid_list, dep_tid_list_indv, deps, sent_frame, i3, indv_mention)
            elif (spos in ('DET', 'PRON') and srow['type'] in ('refl', 'pron', 'rel', 'poss')):#pronouns and adjectives as nouns.
                dep_tid = srow['tid']
                cur_tid = int(dep_tid)
                dep_tid_list.append(dep_tid)
                deps = self.append_mention(cur_tid, dep_tid_list, dep_tid_list_indv, deps, sent_frame, i3, indv_mention)
        return deps

    
    def find_type(self, s, drow, deps, depend, docdf, ner_inds, dind, sloc):
        '''
        find types of NEs, and pronouns, ect. Used later for coref sieves.
        '''
        refl = ['себя', 'свой']
        poss = ['мой', 'он', 'она', 'они', 'наш', 'ваш', 'твой']  
        if drow.tid == 1: 
            prev_sid = s # first ittereates through previous sentence to create full mentions and tk_shifts.
            if prev_sid != 0:
                deps = self.mention_det(prev_sid, deps, poss)
            s += 1
        drow['sid'] = s
        drow['length'] = docdf.get_value(sloc, 'length')
        drow['shift'] = docdf.get_value(sloc, 'shift')
        stop = False
        pos = drow['udpos']
        lem = drow['lemma']
        if pos == 'PROPN': # this section appends NE values for all words
            if stop == False:
                indx = drow['shift']
                idces = [x for x in range(indx,indx-5, -1)] + [x for x in range(indx, indx+5)] #because of indexing mismatch.
                for ind in idces:
                    if ind in ner_inds:
                            drow['type'] = 'NE'
                            typ = dind[ner_inds.index(ind)]['type']
                            drow['ne_type'] = typ
                            stop = True
        if pos == 'PROPN' and not 'type' in drow: #grabs people, that NER index misses
            drow['type'] = 'NE'      
            if drow['morph'].split('|')[0] == 'Animacy=Anim':
                drow['ne_type'] = 'PER'
            else:
                drow['ne_type'] = '_'
            stop = True
        else:  # sets types for other than NE's, for seperation later.
            if pos == 'NOUN':
                drow['type'] = 'noun'
            elif pos == 'PRON':
                if lem in refl:
                    drow['type'] = 'refl'
                elif lem == 'который':
                    drow['type'] = 'rel'
                else:
                    drow['type'] = 'pron'
            elif pos == 'DET':
                if lem in refl:
                    drow['type'] = 'refl'
                elif lem in poss:
                    drow['type'] = 'poss' 
        return s, drow, deps
    
    
    def fix_deps(self):
        '''
        creates final version of ready-to-use dependencies for feature creation and testing, including
        ref type and NE type determination. Also calls function for mention detection.
        '''
        with open('ner_dict.txt', 'r') as dic:
            ner_dict = eval(dic.read())
        headers = ['tid', 'form', 'lemma', 'udpos', 'xpos', 'morph', 'head', 'deprel', 'deps', 'misc']
        for i, nrow in self.newdf.iterrows():
            print(nrow.docname)
            dind = ner_dict[nrow.docname]  #ner list for specific doc
            ner_inds = [x['index'] for x in dind] #list of all inds that are NE's
            depend = pd.read_csv('coref/new_parsed_texts/{0}'.format(nrow.docname), names=headers, \
                                 delimiter='\t', comment='#', quotechar = '"')
            depend = depend.fillna('_')
            deps = pd.DataFrame(columns=headers)
            docdf = self.tokens[self.tokens['doc_id'] == nrow.docid]
            sloc = docdf[docdf['shift'] == docdf['shift'].min()].index.tolist()[0] 
            s = 0
            for i2, drow in depend.iterrows():
                s, drow, deps = self.find_type(s, drow, deps, depend, docdf, ner_inds, dind, sloc) #finds types of NEs and pronouns
                sloc += 1
                deps = deps.append(drow, ignore_index=True)
            deps = deps.fillna('_')
            if not os.path.exists('coref/df_ready'):
                os.makedirs('coref/df_ready')
            deps.to_csv('coref/df_ready/{0}'.format(nrow.docname), sep='\t')
                        
    
    def make_ner_dict(self, textp):
        '''
        Print out dictionary with NE locations and attributes of each text. Prints dictionary due to memory issues.
        '''
        self.ner_dict = defaultdict(list)
        cwd = os.getcwd()
        if cwd == '/home/john/diploma/ner':
            pass
        else:
            os.chdir('/home/john/diploma/ner')
        for root, dirs, files in os.walk('../{0}'.format(textp)):
            for f in files:
                path = os.path.join(root, f)
                with open('{0}'.format(path), 'r') as text:
                    text = text.read()
                    out = ner.NER.process_input_(text)
                    self.ner_dict[f] = out
        self.ner_dict = dict(self.ner_dict)
        with open('../ner_dict.txt', 'w') as file:
            file.write(str(self.ner_dict))
    
    
    def create_mentions(self):
        '''
        Creates Gold mentions docs.
        '''
        for i, nrow in self.newdf.iterrows():
            print(nrow.docname)
            if not os.path.exists('coref/mentions/{0}'.format(nrow.docname)):
                hold = pd.DataFrame()
                deps = pd.read_csv('coref/df_ready/{0}'.format(nrow.docname), delimiter='\t')
                docrefs = self.groups[self.groups['doc_id'] == nrow['docid']]
                for i, ref in docrefs.iterrows():
                    wlist = ref['content'].split(' ')
                    if len(wlist) > 1:
                        found = False
                        for sft in ref['tk_shifts'].split(','):
                            if found == False:
                                head_test = deps[deps['shift'] == int(sft)]
                                udpos = head_test['udpos'].values[0]
                                if udpos == 'PROPN' or udpos == 'NOUN':
                                    men = head_test
                                    found = True
                    else:
                        men = deps[deps['shift'] == ref['shift']]
                    men['chain_id'] = ref['chain_id']
                    men['tk_shifts'] = ref['tk_shifts']
                    men['full_men'] = ref['content']
                    hold = hold.append(men)
                hold = hold.fillna('_')
                if not os.path.exists('coref/mentions'):
                    os.mkdir('coref/mentions')
                hold.to_csv('coref/mentions/{0}'.format(nrow.docname), sep='\t')
            else:
                pass
    
    
    def old_morph(self, headmen, head_pos):
        '''
        gathers morphological info from head mention.
        '''
        leng = len(headmen['morph'].values[0].split('|'))
        if leng == 4:
            ment_anim = headmen['morph'].values[0].split('|')[0].replace('Animacy=', '').split(',')
            ment_case = headmen['morph'].values[0].split('|')[1].replace('Case=', '') #only one possible value.
            ment_gen = headmen['morph'].values[0].split('|')[2].replace('Gender=', '').split(',')
            ment_num = headmen['morph'].values[0].split('|')[3].replace('Number=', '').split(',')
        elif leng == 3:
            ment_anim = None
            ment_case = headmen['morph'].values[0].split('|')[0].replace('Case=', '') #only one possible value.
            ment_gen = headmen['morph'].values[0].split('|')[1].replace('Gender=', '').split(',')
            ment_num = headmen['morph'].values[0].split('|')[2].replace('Number=', '').split(',')
        else:
            ment_anim = None
            ment_case = None
            ment_gen = None
            ment_num = None
        return ment_anim, ment_case, ment_gen, ment_num
    
    
    def new_morph(self, new_anim, new_gen, new_num, ment_anim, ment_gen, ment_num, mer_men):
        '''
        combines morphological features of mentions with head mention.
        '''
        if not ment_anim == None:
            if len(mer_men['morph'].values[0].split('|')) == 4:
                mer_anim = mer_men['morph'].values[0].split('|')[0].replace('Animacy=', '').split(', ')
                mer_gen = mer_men['morph'].values[0].split('|')[2].replace('Gender=', '').split(', ')
                mer_num = mer_men['morph'].values[0].split('|')[3].replace('Number=', '').split(', ')
                for mer_a in mer_anim:
                    if mer_a not in ment_anim: #merging morphological features, if different.
                        new_anim.append(mer_a)
                for mer_g in mer_gen:
                    if mer_g not in ment_gen:
                        new_gen.append(mer_g)
                for mer_n in mer_num:
                    if mer_n not in ment_num:
                        new_num.append(mer_n)
                no_merge = False
            else:
                no_merge = True
        elif not ment_gen == None:
            mer_gen = mer_men['morph'].values[0].split('|')[1].replace('Gender=', '').split(', ')
            mer_num = mer_men['morph'].values[0].split('|')[2].replace('Number=', '').split(', ')
            new_anim = None
            for mer_g in mer_gen:
                if mer_g not in ment_gen:
                    new_gen.append(mer_g)
            for mer_n in mer_num:
                if mer_n not in ment_num:
                    new_num.append(mer_n)
            no_merge = False
        else:
            new_anim = None
            new_gen = None
            new_num = None
            no_merge = True
        return new_anim, new_gen, new_num, no_merge
          

# merge types
        
   
    def morph_merge(self, headmen, merges, mens):
        '''
        merges morphology of mentions to head mention.
        '''
        head_pos = headmen['udpos']
        ment_anim, ment_case, ment_gen, ment_num = self.old_morph(headmen, head_pos) #creates list of all morph features of head mention.
        new_anim = []
        new_gen = []
        new_num = []
        if not ment_anim == None:
            new_anim.extend(ment_anim)
        elif not ment_case == None:
            new_gen.extend(ment_gen)
            new_num.extend(ment_num)
        for mer in merges:
            mer_men = mens[mens['shift'] == mer]
            new_anim, new_gen, new_num, no_merge = self.new_morph(new_anim, new_gen, new_num, ment_anim, ment_gen, ment_num, mer_men)
        if not new_anim == None and no_merge == False:
            new_anim = ", ".join(list(set(new_anim))) #remove repeating features.
            new_gen = ", ".join(list(set(new_gen)))
            new_num = ", ".join(list(set(new_num)))
            new_mor = 'Animacy={0}|Case={1}|Gender={2}|Number={3}'.format(new_anim, ment_case, new_gen, new_num)
        elif no_merge == False and not new_gen == None:
            new_gen = ", ".join(list(set(new_gen)))
            new_num = ", ".join(list(set(new_num)))
            new_mor = 'Case={0}|Gender={1}|Number={2}'.format(ment_case, new_gen, new_num)
        else:
            new_mor = headmen['morph'].values[0]
        return new_mor   
    
    
    def sid_merge(self, headmen, headmen_id, merges, mens):
        '''
        merges sentence id info, for later calculation of distances, as program chooses closest distance
        when considering sentence range. Creates clust_head category for referencing back to head in order to
        gather cluster level info.
        '''
        head_sid = headmen['sid'].values #extract all values
        new_sids = []
        new_sids.extend(head_sid)
        clust_head_of_head = mens.ix[headmen_id, 'clust_head'] # find head of head mention.
        for mer in merges:
            mer_men = mens[mens['shift'] == mer]
            mer_sid = mer_men['sid'].values
            mer_id = mer_men.index.tolist()[0] #check functioning!
            new_sids.extend(mer_sid)
            if not clust_head_of_head == '_': # adds head_clust of headmen to mention if it exists.
                mens.ix[mer_id, 'clust_head'] = clust_head_of_head
            else:
                mens.ix[mer_id, 'clust_head'] = headmen['shift'].values[0] #shift head
        new_sids = ', '.join(list(set([str(x) for x in new_sids])))
        mens.ix[headmen_id, 'sid_corefs'] = new_sids
        return mens
            
            
            
    def merge(self, mens, men_dict, manual, morph_merge, gold): # main merge function.
        '''
        merges data for corefering mentions(i.e. sid, shift of all), appending necessary 
        info to head mention (often most informative mention.), removes all child 
        mentions from mention list, replaces child mention pairings in candidate list with head word. 
        Because features are aggregated across all mentions in cluster, will not make impact on performance.
        '''      
        if manual == True:
            for ment, merges in men_dict.items():
                skip = False
                headmen = mens[mens['shift'] == ment]
                if gold == True: #to select for merging in gold model, only those that exist.
                    indexes = []
                    if not headmen.empty:
                        indexes.append(ment)
                    for mer in merges:
                        mermen = mens[mens['shift'] == mer]
                        if not mermen.empty:
                            indexes.append(mer)
                    if len(indexes) > 0:
                        indexes.sort() # to get the most early occuring mention first in order.
                        head_sh = indexes[0]
                        headmen = mens[mens['shift'] == head_sh]
                        headmen_id = headmen.index.tolist()[0]
                        del indexes[0]
                        merges = indexes 
                    else:
                        skip = True
                else:
                    try: #because of trying to train mens and deps at same time
                        headmen_id = headmen.index.tolist()[0]
                    except IndexError:
                        print('first ment error!', ment)               
                if skip == False:
                    mens.ix[headmen_id, 'corefs'] = ', '.join([str(x) for x in merges]) # add shifts of all corefering mentions
                    if len(merges) > 0:
                        if morph_merge == True:
                            new_mor = self.morph_merge(headmen, merges, mens)
                            mens.ix[headmen_id, 'morph'] = new_mor
                        mens = self.sid_merge(headmen, headmen_id, merges, mens) 
                    mens = mens.fillna('_')
            return mens
            
    
    def manual_sieves(self, mens, deps, check, docname):
        '''
        cycles through all docs for the manual sieves and prints output to folder.
        '''
        manual = True
        mentions_final = pd.DataFrame(columns = ['clust_head', 'part_men'])
        mentions_final = self.head_match_sieve(mens, mentions_final, deps, manual, gold=True)
        deps = self.head_match_sieve(deps, mentions_final, deps, manual, gold=False)
        deps, mentions_final = self.discourse_sieve(mentions_final, deps, manual) #change to mentions_final in final version!!!
        deps, mentions_final = self.rel_sieve(mentions_final, deps, manual)
        deps, mentions_final = self.alias_sieve(mentions_final, deps, manual)
        if check == True:
            if not os.path.exists('coref/sieves/'):
                os.mkdir('coref/sieves/')
            deps.to_csv('coref/sieves/{0}'.format(docname), sep='\t')
            if not os.path.exists('coref/men_sieves/'):
                os.mkdir('coref/men_sieves/')
            mentions_final.to_csv('coref/men_sieves/{0}'.format(docname), sep='\t')
        
            
            
#all manual sieves


    def head_match_sieve(self, mens, mentions_final, deps, manual, gold): # need to test
        '''
        matches head words and merges on match. !!!If there is a head noun than pair that instead.
        if gender same on dep.
        '''
        morph_merge = True
        men_dict = defaultdict(list)
        men_past = []
        for i2, mention1 in mens.iterrows():
            skip = False
            men1_sh = mention1['shift']
            sent1 = deps[deps['sid'] == mention1['sid']]
            for i3, check in sent1.iterrows():
                if men1_sh in check['tk_shifts'].split(',') and not check['series'] == 'True' and skip == False:
                    mention1 = check
                    skip = True #only once, first mention found will always be longest
            men1_pos = mention1['udpos']
            men1_dep = mention1['deprel']
            men_lem = mention1['lemma']
            if men1_pos in ('PROPN') and men1_sh not in men_past: #avoid repeats
                for i4, mention2 in mens.iterrows():
                    skip2 = False
                    men2_sh = mention2['shift']
                    sent2 = deps[deps['sid'] == mention2['sid']]
                    men_lem2 = mention2['lemma']
                    for i4, check2 in sent1.iterrows():
                        if men2_sh in check2['tk_shifts'].split(',') and not check2['series'] == 'True'\
                        and skip2 == False:
                            mention2 = check2
                            skip2 = True
                    if men_lem == men_lem2 and not men1_sh == men2_sh and men2_sh not in men_past:
                        men_dict[mention1['shift']].append(mention2['shift'])
                        templist = [mention1['shift'], mention2['shift']]
                        men_past.extend(templist)
                        men_past = list(set(men_past)) #remove copies
        mentions_final = self.merge(mens, men_dict, manual, morph_merge, gold) #merge morph feats
        return mentions_final
    
    
    def find_quote(self, token, sent, deps, sid, mentions_final, prev_nar, found, new_par, first):
        '''
        Finds the narrator of discourse. Also marks new_par false only once new narrator is found to avoid
        the second boolean comparison being triggered early. Assumes paragraph seperation for new dialogue
        narrators.
        '''
        for si2, token2 in sent.iterrows():
            try:
                tok2_head = float(token2['head'])
                if found == False:
                    if token2['deprel'] == 'nsubj': #if there is a nsubj then it will be narrator, if head root.
                        rootv = sent[sent['tid'] == tok2_head] # get head of nsubj noun.
                        rootv_dep =  rootv['deprel'].values[0] 
                        if rootv_dep == 'root': #check if head verb of nsubj is root.
                            nar = token2['shift']
                            deps.ix[deps['sid'] == sid, 'nar'] = nar # append to deps and mentions
                            mentions_final.ix[mentions_final['sid'] == sid, 'nar'] = nar
                            if first == True and sid > 1: #if symbol first and behind it is a connected clause then 
                                sent_min_one = deps[deps['sid'] == (sid - 1)] # that clasue also gets identical nar.
                                sent_min_one_end = sent_min_one.ix[sent_min_one['tid'].idxmax()] #extend to go until hits par.
                                sent_min_one_end_sh = sent_min_one_end['shift']
                                sent_start_sh = token['shift']
                                sent_dif = sent_start_sh - sent_min_one_end_sh #check shift distance btw end of current sent and next
                                sent_min_one_nar = sent_min_one['nar']
                                if sent_dif < 3 and not sent_min_one_nar == '_':
                                    deps.ix[deps.sid == sid, 'nar'] = sent_min_one_nar
                                    mentions_final.ix[mentions_final.sid == sid, 'nar'] = sent_min_one_nar
                                elif sent_dif < 3 and sent_min_one_nar == '_':   
                                    deps.ix[deps.sid == (sid - 1), 'nar'] = nar # append to deps and mentions
                                    mentions_final.ix[mentions_final.sid == (sid - 1), 'nar'] = nar
                            found = True
                            new_par = False #set to false
                            prev_nar = nar
                        else:
                            nar = '_'
                            prev_nar = nar
                            deps.ix[deps.sid == sid, 'nar'] = nar # if first mention matched. Need to fix!!!
                            mentions_final.ix[mentions_final.sid == sid, 'nar'] = nar
                    else:
                        nar = '_'
                        prev_nar = '_'
                        deps.ix[deps.sid == sid, 'nar'] = nar
                        mentions_final.ix[mentions_final.sid == sid, 'nar'] = nar
            except ValueError:
                pass
        return deps, mentions_final, nar, new_par, found
                            
                                
    
    def find_dialogue(self, sid, sent, deps, mentions_final, prev_nar, new_par, sent_end, shift_dif_start):
        '''
        Finds sentence with dialogue. If there is indication of non dialouge in same sentence
        takes the subject of the root verb as the narrator.
        '''
        found = False
        for si, token in sent.iterrows():
            tok_tid = token['tid']
            tok_form = token['form']
            sent_end_tid = sent_end['tid']
            if tok_tid == 1.0 and tok_form in ('-', '\"') and shift_dif_start < 3 and found == False:
                first = True
                deps, mentions_final, nar, new_par, found = self.find_quote(token, sent, deps, sid, mentions_final, prev_nar, found, new_par, first)                
            elif not tok_tid in (1.0, sent_end_tid) and tok_form in ('-', '\"') and found == False: #check all other than first token
                tok_min_one_pos = sent.ix[si-1]['udpos']
                tok_plus_one_pos = sent.ix[si+1]['udpos']
                first = False
                if tok_form == '-' and tok_min_one_pos == 'PUNCT': #check if there is a shift from dialogue to text in same sentence.
                    deps, mentions_final, nar, new_par, found = self.find_quote(token, sent, deps, sid, mentions_final, prev_nar, found, new_par, first)
                elif tok_form == '\"' and (tok_min_one_pos or tok_plus_one_pos) == 'PUNCT': #check before and after for punctuation.
                    deps, mentions_final, nar, new_par, found = self.find_quote(token, sent, deps, sid, mentions_final, prev_nar, found, new_par, first)
        if found == False and new_par == False: #if nothing is found in sentence, but their was no indication of a new line.
            nar = prev_nar
            deps.ix[deps.sid == sid, 'nar'] = nar # append to deps and mentions
            mentions_final.ix[mentions_final.sid == sid, 'nar'] = nar
            found = True
        elif found == False:
            nar = prev_nar
            deps.ix[deps.sid == sid, 'nar'] = '_'
            mentions_final.ix[mentions_final.sid == sid, 'nar'] = '_'
        return deps, mentions_final, nar, new_par
                
                
    def find_new_par(self, deps, sent, sent_end, new_par, sid, prev_nar):
        '''
        Set new_par to true if new paragraph or line break found.
        '''
        sent2 = deps[deps['sid'] == (sid + 1)]
        sent2_start = sent2[sent2['tid'] == 1.0]
        sent2_start_sh = sent2_start['shift'].values[0]
        sent1_end_sh = sent_end['shift']
        sent_dif = sent2_start_sh - sent1_end_sh #check shift distance btw end of current sent and next
        if sent_dif > 3:
            new_par = True
            prev_nar = '_' #resets prev_nar after new paragraph.
        return new_par, prev_nar
    
    
    def pro_disc(self, mens, manual, gold):
        '''
        merge pronouns in first person, where the speaker is identified.
        '''
        morph_merge = False
        men_dict = defaultdict(list)
        men_past = []
        for i, mention in mens.iterrows():
            men1_sh = mention['shift']
            if mention['lemma'] in ('я', 'мой') and men1_sh not in men_past and not mention['nar'] == '_': #avoid repeats
                men_dict[mention['nar']].append(men1_sh)
                templist = [men1_sh]
                men_past.extend(templist)
                men_past = list(set(men_past)) #remove copies  
        mentions_final = self.merge(mens, men_dict, manual, morph_merge, gold)
        return mentions_final 
        
    
    def discourse_sieve(self, mentions_final, deps, manual):
        '''
        Finds speaker for mentions found in discourse models. Calls pro_discourse
        in order to resolve pronouns in discourse. (1st and 2nd singular and plural)
        '''
        sids = list(deps['sid'].unique())
        if '_' in sids: sids.remove('_')
        prev_nar = '_' 
        new_par = True
        last_sent_id = deps.ix[deps['sid'].idxmax()]
        last_sent = last_sent_id['sid']
        for sid in sids:
            sent = deps[deps['sid'] == sid]
            if len(sent.index) > 2:
                sent_end = sent.ix[sent['tid'].idxmax()]
                start = sent[sent['tid'] == 1.0]
                start_form = start['form'].values[0]
                start_plus_one = sent['tid'][sent['tid'] == 2.0].values[0]      
                shift_dif_start =  start_plus_one - 1 #get space difference
                deps, mentions_final, prev_nar, new_par = self.find_dialogue(sid, sent, deps, mentions_final, prev_nar, new_par, sent_end, shift_dif_start) # for dialouge inside sentences.
                if not sid == last_sent:
                    new_par, prev_nar = self.find_new_par(deps, sent, sent_end, new_par, sid, prev_nar) #analyze if new paragraph.
        deps = deps.fillna('_')
        mentions_final = mentions_final.fillna('_')
        mentions_final = self.pro_disc(mentions_final, manual, gold=True)
        deps = self.pro_disc(deps, manual, gold=False) # for ease of use currently. DELETE LATER!
        return deps, mentions_final     


    def rel_sieve(self, mentions_final, deps, manual): #excluded for now. Too many errors. Better to include in ML model.
        '''
        sieve for labeling all possible morph features of relative pronouns. (aka который)
        Info will be used later in ML pronoun resolution.
        '''
        men_dict = defaultdict(list)
        morph_merge = True
        sids = list(deps['sid'].unique()) #possibly combine later so not to reiterate sents twice.
        if '_' in sids: sids.remove('_')
        masc = ['ый', 'ому', 'ом', 'ого']
        fem = ['ой', 'ую', 'ая']
        plur = ['ые', 'ых', 'ыми']
        for sid in sids:
            stop = False
            sent = deps[deps['sid'] == sid]
            for i, token in sent.iterrows():
                if stop == False:
                    t_lem = token['lemma']
                    t_shift = token['shift']
                    if t_lem == 'который':
                        t_form = token['form']
                        prev_morph = token['morph']
                        for x in masc: #determine morph info for rel mention
                            if t_form.endswith(x):
                                t_gen = 'Gender=Masc, Neut'
                                t_num = 'Number=Sing'
                        for x in fem:
                            if t_form.endswith(x):
                                t_gen = 'Gender=Fem'
                                t_num = 'Number=Sing'
                        for x in plur:
                            if t_form.endswith(x):
                                t_gen = 'Gender=Masc, Neut, Fem'
                                t_num = 'Number=Plur'
                        if t_form.endswith('ым'):
                            if token['morph'] == 'Case=Ins':
                                t_gen = 'Gender=Masc, Neut'
                                t_num = 'Number=Sing'
                            else:
                                t_gen = 'Gender=Masc, Neut, Fem'
                                t_num = 'Number=Plur'
                        elif t_form.endswith('ое'):
                            t_gen = 'Gender=Neut'
                            t_num = 'Number=Sing'
                        n_morph = '{0}|{1}|{2}'.format(prev_morph, t_gen, t_num)
                        deps.ix[i, 'morph'] = n_morph
                        try:
                            men_id = mentions_final[mentions_final['shift'] == t_shift].index.tolist()[0]
                            mentions_final.ix[men_id, 'morph'] = n_morph
                            men_dict[t_shift].append(t_shift)
                        except:
                            pass
#         deps = self.merge(deps, men_dict, manual, morph_merge, gold=False)  #excess head ref info to self won't matter.
#         mentions_final = self.merge(mentions_final, men_dict, manual, morph_merge, gold=True)                                 
        return deps, mentions_final
    
    
    def make_alias_dict(self, i, alias, deps, alias_dict, men_past):
        '''
        make_dictionary for merging
        '''
        word_sh = deps.ix[i, 'shift']
        for i2, ment2 in deps.iterrows():
            ment2_sh = ment2['shift']
            ment2_anim = ment2['morph'].split('|')[0]
            if (ment2['full_men'] in alias or 'part_men' in ment2 and ment2['part_men'] in alias or ment2['lemma'] in alias) and \
            ment2_sh not in men_past and word_sh not in men_past and not ment2_sh == word_sh and \
            not ment2_anim == 'Animacy=Anim': #also check for animacy. Should only be handled in head_match sieve
                sent = ment2[ment2['sid'] == ment2['sid']]
                ment2_hid = float(ment2['head'])
                try: #because of quotations
                    ment_head = sent[sent['tid'] == ment2_hid]
                    if ment_head['udpos'] not in ('NOUN', 'PROPN'):
                        alias_dict[word_sh].append(ment2_sh)
                        men_past.append(ment2_sh)
                    else:
                        pass
                except TypeError:
                    pass
        return alias_dict, men_past
             
        
    def remove_accents(self, word):
        '''
        remove pronunciation accents on wikpedia
        '''
        replacements = [('е́', 'e'), ('и́', 'и'), ('а́', 'а'), ('о́', 'о'), ('у́', 'у'), ('ы́', 'ы'), ('э́', 'э'), \
                    ('ю́', 'ю'), ('я́', 'я'), ('А́', 'А'), ('Е́','Е'), ('И́','И'), ('О́','О'), ('У́','У'), ('Ы́','Ы'), \
                    ('Э́','Э'), ('Ю́','Ю'), ('Я́','Я')]
        for a, b in replacements:
            word = word.replace(a, b)
        return word    
    
    
    def wiki_alias(self, word, r, punct):
        '''
        find aiases from wikipedia
        '''
        tree = html.fromstring(r.text)
        bolds = tree.xpath(".//div[@class='mw-content-ltr']/p[1]/b/text()") #bold words in summary often acronyms.
        if len(bolds) < 1:
            bolds = tree.xpath("(.//div[@class='mw-content-ltr']//p[1])[1]/b/text()")
        appos = tree.xpath(".//div[@class='mw-content-ltr']/p[1]/text()|.//div[@class='mw-content-ltr']/p[1]/a/text()")
        if len(appos) < 1:
            appos = tree.xpath("(.//div[@class='mw-content-ltr']//p[1])[1]/text()| \
            (.//div[@class='mw-content-ltr']//p[1])[1]/a/text()")
        stop = False
        for w in appos: #capture first link-word group after — which is indicative of a noun and apposition. Can improve.
            if '\xa0—' in w and stop == False:
                i = appos.index(w)
                i2 = i + 1
                try:
                    bolds.append(appos[i2])
                except IndexError:
                    pass
                stop = True
        for p in punct:
            for w in bolds:
                if '\xa0— ' in w:
                    del bolds[bolds.index(w)]
                if p in w:
                    words = w.split(p)
                    i = bolds.index(w)
                    del bolds[i]
                    for wd in words:
                        if not wd == '':
                            bolds.append(wd)
        new_words = [self.remove_accents(x) for x in bolds]
        if ' ' in new_words:
            del new_words[new_words.index(' ')]
        if 'нет статьи' in new_words:
            new_words = []
        return new_words
            
           
    def make_alias(self, word, punct):
        '''
        makes potential acronym by taking words in mention and combining all first letters.
        '''
        stop = False
        for p in punct:
            if p in word:
                n_word = ''
                stop = True
        if ' ' in word and stop == False:
            words = word.split(' ') #removing spaces
            n_word = []
            if ' ' in words:
                del words[words.index(' ')]
            for w in words:
                if ' ' in w:
                    w = w.replace(' ', '')
                n_word += w[0]
            n_word = "".join(n_word)
        else:
            n_word = ''
        return n_word
            
            
    def alias_sieve(self, mens, deps, manual):
        '''
        Finds acronyms and other potential names for mentions.
        '''
        morph_merge = True
        alias_dict = defaultdict(list)
        men_past = []
        for i, ment in deps.iterrows():
            alias_sh = []
            ment_pos = ment['udpos']
            ment_anim = ment['morph'].split('|')[0]
            if ment_pos in ('PROPN') and not ment_anim == 'Animacy=Anim': #so that alias doesn
                if 'series' in ment and ment['series'] == True:
                    word = ment['part_men'] 
                else:
                    word = ment['full_men']
                r = requests.get('https://ru.wikipedia.org/wiki/{0}'.format(word))
                status = r.status_code
                punct = punctuation + '«»'
                alias = []
                if status == 200:
                    alias.extend(self.wiki_alias(word, r, punct))
                else:
                    word = word.upper()
                    alias.extend(self.wiki_alias(word, r, punct))
                alias.append(self.make_alias(word, punct))
                if '' in alias:
                    del alias[alias.index('')]
                alias = list(set(alias)) # remove repeats
                if len(alias) > 0:
                    alias_dict, men_past = self.make_alias_dict(i, alias, deps, alias_dict, men_past)
        if len(alias_dict) > 0:
            deps = self.merge(deps, alias_dict, manual, morph_merge, gold=False)
            mentions_final = self.merge(mens, alias_dict, manual, morph_merge, gold=True)
        else:
            mentions_final = mens
        return deps, mentions_final
            
        
    
    def create_data(self, create_men=True):
        '''
        pass through manual sieves, collect mention pairs, and create final variable list for all 
        training and test data sets.
        '''
        cwd = os.getcwd()
        if cwd == '/home/john/diploma':
            pass
        else:
            os.chdir('/home/john/diploma')
        check = True #for printing results if True
        for i, nrow in self.newdf.iterrows():
            docname = nrow.docname
            print(docname)
            docid = nrow.docid
            mens = pd.read_csv('coref/mentions/{0}'.format(nrow.docname), delimiter='\t')
            deps = pd.read_csv('coref/df_ready/{0}'.format(nrow.docname), delimiter='\t')
#             mens = pd.read_csv('coref/mentions/73_ilf_schastlivy_otec.txt', delimiter='\t') #for doc testing
#             deps = pd.read_csv('coref/df_ready/73_ilf_schastlivy_otec.txt', delimiter='\t')
            mens['clust_head'] = '_' # create clust_head column.
            deps['clust_head'] = '_'
            self.manual_sieves(mens, deps, check, docname)        
        


In [162]:
inpt = '../../coref/rucoref_texts/fiction/5_petrushevskaya_v_detstve.txt'
out = '../../output.text'
path1 = 'syntaxnet/models/parsey_universal/parse.sh'    #path to parsh.sh script
path2 = 'syntaxnet/models/parsey_universal/Russian-SynTagRus'    #path to your dependencies model

modp = 'udpipe/src/russian-syntagrus-ud-2.0-conll17-170315.udpipe'
outp = 'coref/new_parsed_texts'
textp = 'coref/rucoref_texts'

example1 = Corefsetup()
newdf = example1.prepdf()
# example1.create_dep(textp, modp, outp)
# example1.make_ner_dict(textp)
# example1.fix_deps()
# example1.create_mentions()
example1.create_data()



102_beliajev_nad_bezdnoj.txt
107_dragunsky_volshebnaja_sila_iskusstva.txt
15_paustovsky_zhilcy_starogo_doma.txt
2_astafiev_zhizn_prozhit.txt
30_dojl_sluchaj.txt
34_kassil_solnce_svetit.txt
43_musatov_stozhary.txt
44_nagibin_siren.txt
53_beliajev_dom_s_prividenijami.txt
5_petrushevskaya_v_detstve.txt
67_zamiatin_kolumb.txt
73_ilf_schastlivy_otec.txt
andersen_motylek.txt
bazhov_travyanaja_zapadenka.txt
bunin_skazka.txt
dostojevskij_podrostok.txt
dovlatov_kompromiss_6.txt
fet_knyaginya.txt
gilyarovskij_moi_skitanija.txt
gogol_zapiski_3.txt
harms_upadanije.txt
korolenko_mgnovenije.txt
turgenev_veshnije_vody.txt
2013_04_11_dotless_.txt
2013_07_31_krebs_.txt
lenta.ru-news-2014-01-19-cutshort.txt
lenta.ru-news-2014-01-24-if.txt
lenta.ru-news-2014-01-30-crimea.txt
lenta.ru-news-2014-02-03-capitanic.txt
lenta.ru-news-2014-02-03-london.txt
lenta.ru-news-2014-02-03-name1.txt
lenta.ru-news-2014-02-03-rucksack.txt
lenta.ru-news-2014-02-04-party.txt
lenta.ru-news-2014-02-04-pyramid.txt
448-done.txt


KeyboardInterrupt: 

In [163]:
from sklearn.tree import DecisionTreeClassifier

In [1]:
class Coreftrain:
    
    def __init__(self, newdf):
        self.train_noun = pd.DataFrame(columns=['docid', 'start_ref', 'cor', 'incor'])
        self.gold_noun =  pd.DataFrame(columns=['docid', 'start_ref', 'refs'])
        self.no_gold_noun =  pd.DataFrame(columns=['docid', 'start_ref', 'refs'])
        self.train_ne = pd.DataFrame(columns=['docid', 'start_ref', 'cor', 'incor'])
        self.gold_ne =  pd.DataFrame(columns=['docid', 'start_ref', 'refs'])
        self.no_gold_ne =  pd.DataFrame(columns=['docid', 'start_ref', 'refs'])
        self.train_pron = pd.DataFrame(columns=['docid', 'start_ref', 'cor', 'incor'])
        self.gold_pron =  pd.DataFrame(columns=['docid', 'start_ref', 'refs'])
        self.no_gold_pron =  pd.DataFrame(columns=['docid', 'start_ref', 'refs'])
        self.answers_noun = pd.DataFrame() # hold answers for training sets. separtatedby type. 9 total.
        self.answers_ne = pd.DataFrame()
        self.answers_pron = pd.DataFrame()
        self.train_var_noun = pd.DataFrame()#training data
        self.train_var_ne = pd.DataFrame()
        self.train_var_pron = pd.DataFrame()
        self.gold_var_noun = pd.DataFrame()#start vars for nouns
        self.no_gold_var_noun = pd.DataFrame()
        self.gold_var_ne = pd.DataFrame() #start vars for NEs
        self.no_gold_var_ne = pd.DataFrame()
        self.gold_var_pron = pd.DataFrame()#start vars for pronouns
        self.no_gold_var_pron = pd.DataFrame()
        self.idstr_noun = pd.DataFrame() # holds ids for merge reference.
        self.idstr_ne = pd.DataFrame()
        self.idstr_pron = pd.DataFrame()
        self.ids_ne_gold = pd.DataFrame()# ne ids
        self.ids_ne_nogold = pd.DataFrame()
        self.ids_noun_gold = pd.DataFrame()# ne noun
        self.ids_noun_nogold = pd.DataFrame()
        self.ids_pron_gold = pd.DataFrame()# ne pron
        self.ids_pron_nogold = pd.DataFrame()
        self.newdf = newdf
        
    
        #merge functions from previous doc. Was decided that corefsetup already to large, hence the repetition.
    
    def old_morph(self, headmen, head_pos):
        '''
        gathers morphological info from head mention.
        '''
        leng = len(headmen['morph'].values[0].split('|'))
        if leng == 4:
            ment_anim = headmen['morph'].values[0].split('|')[0].replace('Animacy=', '').split(',')
            ment_case = headmen['morph'].values[0].split('|')[1].replace('Case=', '') #only one possible value.
            ment_gen = headmen['morph'].values[0].split('|')[2].replace('Gender=', '').split(',')
            ment_num = headmen['morph'].values[0].split('|')[3].replace('Number=', '').split(',')
        elif leng == 3:
            ment_anim = None
            ment_case = headmen['morph'].values[0].split('|')[0].replace('Case=', '') #only one possible value.
            ment_gen = headmen['morph'].values[0].split('|')[1].replace('Gender=', '').split(',')
            ment_num = headmen['morph'].values[0].split('|')[2].replace('Number=', '').split(',')
        else:
            ment_anim = None
            ment_case = None
            ment_gen = None
            ment_num = None
        return ment_anim, ment_case, ment_gen, ment_num
    
    
    def new_morph(self, new_anim, new_gen, new_num, ment_anim, ment_gen, ment_num, mer_men):
        '''
        combines morphological features of mentions with head mention.
        '''
        if not ment_anim == None:
            if len(mer_men['morph'].values[0].split('|')) == 4:
                mer_anim = mer_men['morph'].values[0].split('|')[0].replace('Animacy=', '').split(', ')
                mer_gen = mer_men['morph'].values[0].split('|')[2].replace('Gender=', '').split(', ')
                mer_num = mer_men['morph'].values[0].split('|')[3].replace('Number=', '').split(', ')
                for mer_a in mer_anim:
                    if mer_a not in ment_anim: #merging morphological features, if different.
                        new_anim.append(mer_a)
                for mer_g in mer_gen:
                    if mer_g not in ment_gen:
                        new_gen.append(mer_g)
                for mer_n in mer_num:
                    if mer_n not in ment_num:
                        new_num.append(mer_n)
                no_merge = False
            else:
                no_merge = True
        elif not ment_gen == None:
            mer_gen = mer_men['morph'].values[0].split('|')[1].replace('Gender=', '').split(', ')
            mer_num = mer_men['morph'].values[0].split('|')[2].replace('Number=', '').split(', ')
            new_anim = None
            for mer_g in mer_gen:
                if mer_g not in ment_gen:
                    new_gen.append(mer_g)
            for mer_n in mer_num:
                if mer_n not in ment_num:
                    new_num.append(mer_n)
            no_merge = False
        else:
            new_anim = None
            new_gen = None
            new_num = None
            no_merge = True
        return new_anim, new_gen, new_num, no_merge
          

# merge types
        
   
    def morph_merge(self, headmen, merges, mens):
        '''
        merges morphology of mentions to head mention.
        '''
        head_pos = headmen['udpos']
        ment_anim, ment_case, ment_gen, ment_num = self.old_morph(headmen, head_pos) #creates list of all morph features of head mention.
        new_anim = []
        new_gen = []
        new_num = []
        if not ment_anim == None:
            new_anim.extend(ment_anim)
        elif not ment_case == None:
            new_gen.extend(ment_gen)
            new_num.extend(ment_num)
        for mer in merges:
            mer_men = mens[mens['shift'] == mer]
            new_anim, new_gen, new_num, no_merge = self.new_morph(new_anim, new_gen, new_num, ment_anim, ment_gen, ment_num, mer_men)
        if not new_anim == None and no_merge == False:
            new_anim = ", ".join(list(set(new_anim))) #remove repeating features.
            new_gen = ", ".join(list(set(new_gen)))
            new_num = ", ".join(list(set(new_num)))
            new_mor = 'Animacy={0}|Case={1}|Gender={2}|Number={3}'.format(new_anim, ment_case, new_gen, new_num)
        elif no_merge == False and not new_gen == None:
            new_gen = ", ".join(list(set(new_gen)))
            new_num = ", ".join(list(set(new_num)))
            new_mor = 'Case={0}|Gender={1}|Number={2}'.format(ment_case, new_gen, new_num)
        else:
            new_mor = headmen['morph'].values[0]
        return new_mor   
    
    
    def sid_merge(self, headmen, headmen_id, merges, mens):
        '''
        merges sentence id info, for later calculation of distances, as program chooses closest distance
        when considering sentence range. Creates clust_head category for referencing back to head in order to
        gather cluster level info.
        '''
        head_sid = headmen['sid'].values #extract all values
        new_sids = []
        new_sids.extend(head_sid)
        clust_head_of_head = mens.ix[headmen_id, 'clust_head'] # find head of head mention.
        for mer in merges:
            mer_men = mens[mens['shift'] == mer]
            mer_sid = mer_men['sid'].values
            mer_id = mer_men.index.tolist()[0] #check functioning!
            new_sids.extend(mer_sid)
            if not clust_head_of_head == '_': # adds head_clust of headmen to mention if it exists.
                mens.ix[mer_id, 'clust_head'] = clust_head_of_head
            else:
                mens.ix[mer_id, 'clust_head'] = headmen['shift'].values[0] #shift head
        new_sids = ', '.join(list(set([str(x) for x in new_sids])))
        mens.ix[headmen_id, 'sid_corefs'] = new_sids
        return mens
            
            
            
    def merge(self, mens, men_dict, manual, morph_merge, gold): # main merge function.
        '''
        merges data for corefering mentions(i.e. sid, shift of all), appending necessary 
        info to head mention (often most informative mention.), removes all child 
        mentions from mention list, replaces child mention pairings in candidate list with head word. 
        Because features are aggregated across all mentions in cluster, will not make impact on performance.
        '''      
        if manual == True:
            for ment, merges in men_dict.items():
                skip = False
                headmen = mens[mens['shift'] == ment]
                if gold == True: #to select for merging in gold model, only those that exist.
                    indexes = []
                    if not headmen.empty:
                        indexes.append(ment)
                    for mer in merges:
                        mermen = mens[mens['shift'] == mer]
                        if not mermen.empty:
                            indexes.append(mer)
                    if len(indexes) > 0:
                        indexes.sort() # to get the most early occuring mention first in order.
                        head_sh = indexes[0]
                        headmen = mens[mens['shift'] == head_sh]
                        headmen_id = headmen.index.tolist()[0]
                        del indexes[0]
                        merges = indexes 
                    else:
                        skip = True
                else:
                    try: #because of trying to train mens and deps at same time
                        headmen_id = headmen.index.tolist()[0]
                    except IndexError:
                        print('first ment error!', ment)               
                if skip == False:
                    mens.ix[headmen_id, 'corefs'] = ', '.join([str(x) for x in merges]) # add shifts of all corefering mentions
                    if len(merges) > 0:
                        if morph_merge == True:
                            new_mor = self.morph_merge(headmen, merges, mens)
                            mens.ix[headmen_id, 'morph'] = new_mor
                        mens = self.sid_merge(headmen, headmen_id, merges, mens) 
                    mens = mens.fillna('_')
            return mens  
    
    
    def form_frame(self, deps, mens, type_all, l_sids, r_sids, sid, typ, goldm):
        if type_all == 'pron':
            if typ in ('pron', 'poss'):
                rng = 3
                frame = self.create_frames(l_sids, r_sids, sid, rng, deps, goldm, mens)#sentence frames for candidate searching.
            elif typ in ('refl', 'rel'):
                if goldm == False:
                    frame = deps[deps['sid'] == sid]
                else:
                    frame = mens[mens['sid'] == sid]
            else:
                frame = []
        elif type_all == 'ne':
            if typ == 'NE':
                rng = 5
                frame = self.create_frames(l_sids, r_sids, sid, rng, deps, goldm, mens)
            else:
                frame = []
        elif type_all == 'noun':
            if typ == 'noun':
                rng = 4
                frame = self.create_frames(l_sids, r_sids, sid, rng, deps, goldm, mens)
            else:
                frame = [] #catch gold mens to deps descrepancies for training set. Just skip.
        return frame
    
            
    def set_up_frames(self, ment, typ, type_all, deps, mens, train, goldm):  
        '''
        sort frame ranges and frames for different mentions.
        '''
        sid = int(ment['sid'])
        sid_max = int(deps['sid'][deps['sid'].idxmax()])
        ment_sh = ment['shift']
        l_sids = 0
        r_sids = 0
        if sid < 4: #finds left and right boundaries if sentence at beginning or end of doc.
            l_sids = [x for x in range(1, sid+1)]
        elif (sid_max - sid) < 4:
            r_sids = [x for x in range(sid, sid_max+1)]
        frames = self.form_frame(deps, mens, type_all, l_sids, r_sids, sid, typ, goldm)
        return frames
    

    def create_frames(self, l_sids, r_sids, sid, rng, deps, goldm, mens):
        '''
        creates frames for mention search.
        '''
        if goldm == False:
            if not l_sids == 0 and not r_sids == 0:
                l_len = len(l_sids)
                r_len = len(r_sids)
                if l_len > 0 and r_len > 0:
                    frame = deps.query('{0} <= sid <= {1}'.format(l_sids, r_sids))
                elif l_len > 0 and not r_len > 0:
                    frame = deps.query('{0} <= sid <= {1}'.format(l_sids, sid+rng))
                elif r_len > 0 and not l_len > 0:
                    frame = deps.query('{0} <= sid <= {1}'.format(sid-rng, r_sids))
            else:
                frame = deps.query('{0} <= sid <= {1}'.format(sid-rng, sid+rng))
        else:
            if not l_sids == 0 and not r_sids == 0:
                l_len = len(l_sids)
                r_len = len(r_sids)
                if l_len > 0 and r_len > 0:
                    frame = mens.query('{0} <= sid <= {1}'.format(l_sids, r_sids))
                elif l_len > 0 and not r_len > 0:
                    frame = mens.query('{0} <= sid <= {1}'.format(l_sids, sid+rng))
                elif r_len > 0 and not l_len > 0:
                    frame = mens.query('{0} <= sid <= {1}'.format(sid-rng, r_sids))
            else:
                frame = mens.query('{0} <= sid <= {1}'.format(sid-rng, sid+rng))
        return frame

    
    def create_test_sets(self, ment, deps, frame, docid, df, ment_ser, goldm):
        '''
        appends rows to test sets for each ment
        '''
        ments = []
        ment_sh = ment['shift']
        ment_shs = ment['tk_shifts'].split(', ')
        if not len(frame) == 0: #aviod skipped mentions.
            for i, m in frame.iterrows():
                m_sh = m['shift']
                if m_sh in ment_shs or m_sh == ment_sh: # so not to capture mentions in mentions as possibly coreferent if they overlap at all.
                    pass
                elif not m['full_men'] == '_':
                    ments.append(m_sh)
            ments = list(set(ments))
            tempval = {'docid':[docid], 'start_ref':[ment_sh], 'refs':[ments], 'series':[ment_ser]}
            tempdf = pd.DataFrame(tempval)
            df = df.append(tempdf)
        return df
    
    
    def create_candidates(self, mens, deps, train, gold, no_gold, docid, \
                          train_ids, test_ids, type_all):
        '''
        creates initial candidate lists for training, gold mentions, and no gold mentions.
        '''
        if docid in test_ids:
            for i, ment in deps.iterrows():
                if not ment['full_men'] == '_' and not ment['udpos'] == 'PUNCT': #avoid non mentions and parsing errors.
                    ment_typ = ment['type']
                    if 'series' not in ment:
                        ment['series'] = '_'
                    ment_ser = ment['series']
                    frame = self.set_up_frames(ment, ment_typ, type_all, deps, mens, train=False, goldm=False)
                    no_gold = self.create_test_sets(ment, deps, frame, docid, no_gold, ment_ser, goldm=False)
        for i, ment in mens.iterrows():
            if not ment['udpos'] == 'PUNCT' and not ment['full_men'] =='_':
                if 'series' not in ment:
                    ment['series'] = '_'
                if docid in test_ids:
                    ment_typ = ment['type']
                    ment_ser = ment['series']
                    frame = self.set_up_frames(ment, ment_typ, type_all, deps, mens, train=False, goldm=True)
                    gold = self.create_test_sets(ment, deps, frame, docid, gold, ment_ser, goldm=True)
                else:
                    sid = int(ment['sid'])
                    ment_sh = ment['shift']
                    ment_typ = ment['type']
                    ment_ser = ment['series']
                    frame = self.set_up_frames(ment, ment_typ, type_all, deps, mens, train=True, goldm=False) # create frame.
                    if len(frame) == 0:
                        pass
                    else:
                        cor_id = ment['chain_id']
                        cor_men = mens['shift'][mens['chain_id'] == cor_id].tolist()
                        cor_ments = []
                        for cor in cor_men:
                            ment_temp = frame[frame['shift'] == cor]
                            cor_ments.append(ment_temp)
                        n_cor_ments = []
                        for df in cor_ments:
                            if df.empty:
                                pass
                            elif df['shift'].values[0] == ment_sh:
                                pass
                            else:
                                n_cor_ments.append(df)
                        if len(n_cor_ments) == 0: #for refs that can't be found within the given frame.
                            pass
                        else:
                            cor = random.choice(n_cor_ments) # in order to randomly choose mentions.
                            cor_sh = cor['shift'].values[0]
                            if cor_sh > ment_sh: #select incorrect only if it's between correct
                                incors = frame[(ment_sh < frame['shift']) & (frame['shift'] < cor_sh)]
                            elif ment_sh > cor_sh:
                                incors = frame[(cor_sh < frame['shift']) & (frame['shift'] < ment_sh)]
                            incor_ments = []
                            for i, incorr in incors.iterrows():
                                if incorr['shift'] in cor_men or incorr['shift'] == ment_sh:
                                    pass
                                elif not incorr['full_men'] == '_': #weed out non mentions
                                    incor_ments.append(incorr)
                            if len(incor_ments) == 0: # if no possible mentions in between.
                                if cor_sh > ment_sh: #select any possible mention in frame.
                                    incors = frame[(cor_sh < frame['shift']) | (frame['shift'] < ment_sh)]
                                elif ment_sh > cor_sh:
                                    incors = frame[(ment_sh < frame['shift']) | (frame['shift'] < cor_sh)]
                                for i, incorr in incors.iterrows():
                                    if incorr['shift'] in cor_men or incorr['shift'] == ment_sh:
                                        pass
                                    elif not incorr['full_men'] == '_': #weed out non mentions
                                        incor_ments.append(incorr)
                            if len(incor_ments) == 0: # If no other posible mentions in frame besides correct men
                                incors = deps[deps['sid'] == sid + 1] # add following sent so there is a value
                                for i, incorr in incors.iterrows():
                                    if incorr['shift'] in cor_men or incorr['shift'] == ment_sh:
                                        pass
                                    elif not incorr['full_men'] == '_': #weed out non mentions
                                        incor_ments.append(incorr)
                                if len(incor_ments) == 0: # If no other posible mentions in frame besides correct men
                                    incors = deps[deps['sid'] == sid - 1] # add following sent so there is a value
                                    for i, incorr in incors.iterrows():
                                        if incorr['shift'] in cor_men or incorr['shift'] == ment_sh:
                                            pass
                                        elif not incorr['full_men'] == '_': #weed out non mentions
                                            incor_ments.append(incorr)
                            incor = random.choice(incor_ments)
                            incor_sh = incor['shift']
                            tempval = {'docid':[docid], 'start_ref':[ment_sh], 'cor':[cor_sh], 'incor':[incor_sh], 'series':[ment_ser]}
                            tempdf = pd.DataFrame(tempval)
                            train = train.append(tempdf)
        return train, gold, no_gold
      
    
    
    def compare(self, title, val, temp_ref, start_ment, ref_ment):
        '''
        module for comparing values of two mentions. Appends 1 for True, 0 for False to given title.
        '''
        if val == 'morph': #because need to split morph values.
            start_morph = start_ment[val].values[0].split('|')
            try:
                ref_morph = ref_ment[val].values[0].split('|')
            except IndexError:
                print(ref_ment)
                print(title)
                sys.exit()
            stopg = False # set stops to false default
            stopa = False
            stopn = False
            stopp = False
            stopgr = False
            stopar = False
            stopnr = False
            stoppr = False
            for morph in start_morph: # determine existance of morph
                if title == 'genderm':
                    if morph.startswith('Gender=') and stopg == False:
                        stopg = True
                elif title == 'animacym':
                    if morph.startswith('Animacy=') and stopa == False:
                        stopa = True
                elif title == 'numberm':
                    if morph.startswith('Number=') and stopn == False:
                        stopn = True
                elif title == 'person_match':
                    if morph.startswith('Person=') and stopp == False:
                        stopp = True
                        
            for morph in ref_morph:
                if title == 'genderm':
                    if morph.startswith('Gender=') and stopgr == False:
                        stopgr = True
                elif title == 'animacym':
                    if morph.startswith('Animacy=') and stopar == False:
                        stopar = True
                elif title == 'numberm':
                    if morph.startswith('Number=') and stopnr == False:
                        stopnr = True
                elif title == 'person_match':
                    if morph.startswith('Person=') and stoppr == False:
                        stoppr= True
                        
                        
            if val == 'clustagree':   
                if stopg == False:
                    start_val_gen = None
                if stopa == False:
                    start_val_ani = None
                if stopn == False:
                    start_val_num = None
                if stopgr == False:
                    ref_val_gen = None
                if stopar == False:
                    ref_val_ani = None
                if stopnr == False:
                    ref_val_num = None
                    
                for morph in start_morph:
                    if morph.startswith('Gender='):
                        start_val_gen = morph
                    if morph.startswith('Animacy='):
                        start_val_ani = morph
                    if morph.startswith('Number='):
                        start_val_num = morph
                        
                    
                for morph in ref_morph:
                    if morph.startswith('Gender='):
                        ref_val_gen = morph
                    if morph.startswith('Animacy='):
                        ref_val_ani = morph
                    if morph.startswith('Number='):
                        ref_val_num = morph
                if start_val_gen == ref_val_gen and start_val_ani == ref_val_ani and start_val_num == ref_val_num \
                and not (start_val_gen == None or start_val_ani == None or start_val_num == None or ref_val_gen == None \
                or ref_val_ani == None or ref_val_num == None):
                    temp_ref[title] = [1]
                else:
                    temp_ref[title] = [0]
                return temp_ref

                
            else:
                if (stopg or stopa or stopa or stopp) == False:
                    start_val = None
                
                if (stopgr or stopar or stopar or stoppr) == False:
                    ref_val = None

                    
                for morph in start_morph:
                    if title == 'genderm':
                        if morph.startswith('Gender='):
                            start_val = morph
                    elif title == 'animacym':
                        if morph.startswith('Animacy='):
                            start_val = morph
                    elif title == 'numberm':
                        if morph.startswith('Number='):
                            start_val = morph
                    elif title == 'person_match':
                        if morph.startswith('Person='):
                            start_val = morph
                            
                    
                for morph in ref_morph:
                    if title == 'genderm':
                        if morph.startswith('Gender='):
                            ref_val = morph
                    elif title == 'animacym':
                        if morph.startswith('Animacy='):
                            ref_val = morph
                    elif title == 'numberm':
                        if morph.startswith('Number='):
                            ref_val = morph
                    elif title == 'person_match':
                        if morph.startswith('Person='):
                            ref_val = morph
            
        else: # for all other compares, without special format
        
            start_val = start_ment[val].values[0]
            ref_val = ref_ment[val].values[0]
#         if ('start_val' in locals() or 'start_val' in globals()) \
#         and ('ref_val' in locals() or 'ref_val' in globals()):
        
        if not title == 'clustagree':
            try:
                if ref_val == start_val and not (ref_val or start_val) == None:
                    temp_ref[title] = [1]
                else:
                    temp_ref[title] = [0]
            except UnboundLocalError:
                print(start_morph)
                print(ref_morph)
                print(title)
                print(val)
                sys.exit()

#         else:
#             temp_ref[title] = [0]
        
        return temp_ref
            
            
    def partialheadm(self, title, val, temp_ref, start_ment, ref_ment, deps):
        '''
        relaxed head mention match. If any nouns or proper nouns in mention match returns 1 for true.
        '''
        start_shs = start_ment[val].values[0].split(',')
        ref_shs = ref_ment[val].values[0].split(',')
        if len(ref_shs) > 0:
            for stsh in start_shs:
                st = deps[deps['shift'] == float(stsh)]
                st_pos = st['udpos'].values[0]
                if st_pos in ('NOUN', 'PROPN'): #so that not just pairing anything
                    st_lem = st['lemma'].values[0]
                    for refsh in ref_shs:
                        try:
                            rf = deps[deps['shift'] == float(refsh)]
                        except ValueError:
                            print(ref_shs)
                            print(ref_ment)
                            print(title)
                            sys.exit()
                        rf_lem = rf['lemma'].values[0]
                        if st_lem == rf_lem:  # compare the two
                            temp_ref[title] = [1]
            if temp_ref[title].empty:
                temp_ref[title] = [0]
        else:
            temp_ref[title] = [0]
        return temp_ref
                    
        
    def distance(self, title, val, temp_ref, start_ment, ref_ment):
        '''
        finds distance bewteen given start and finish, depending on variable.
        '''
        start_sid = start_ment[val].values[0]
        ref_sid = ref_ment[val].values[0]
        dif = int(start_sid) - int(ref_sid) #can be negative to account for direction.
        temp_ref[title] = [dif]
        return temp_ref
           
        
    def sing_to_plur(self, morph):
        '''
        Change number of series to plural
        '''
        morph = ['Number=Plur' if x=='Number=' else x for x in morph]
        morph = '|'.join(morph)
        return morph
      
        
    def find_val(self, title, val, temp_ref, start_ment, find):
        '''
        returns 1 for true if head begins with a determiner.
        '''
        if val == 'tk_shifts':
            st_first = start_ment[val].values[0]
            if not type(st_first) == float:
                st_first = st_first.split(',')[0]
            st_f_pos = start_ment['udpos'].values[0]
            if st_f_pos == find:
                temp_ref[title] = [1]
            else:
                temp_ref[title] = [0]
        elif val in ('deprel', 'type', 'shift', 'containnum'):
            st_first = start_ment[val].values[0]
            if st_first == find:
                temp_ref[title] = [1]
            else:
                temp_ref[title] = [0]
        return temp_ref
        
        
    def get_val(self, title, val, temp_ref, start_ment):
        temp_ref[title] = [start_ment[val].values[0]]
        return temp_ref
    
    
    def comb_attr(self, title, val, temp_ref, start_ment, ref_ment):
        start_val = start_ment[val].values[0]
        ref_val = ref_ment[val].values[0]
        new_val = '-'.join((start_val, ref_val))
        temp_ref[title] = [new_val]
        return temp_ref
        
    
    def clust_info(self, title, val, temp_ref, start_ment, ref_ment, deps, act):
        '''
        for gathering and comparing cluster level info.
        '''
        ref_head = deps[deps['shift'] == float(ref_ment['head'].values[0])]
        start_head = deps[deps['shift'] == float(start_ment['head'].values[0])]
        if act == 'compare':
            title = title
            val = 'morph'
            temp_ref.merge(self.compare(title, val, temp_ref, start_ment, ref_ment))
        return temp_ref
           
        
    def make_vars(self, deps, starts, refs, var_list, typ, train):
        '''
        makes vars sets for each mention.
        '''
        all_refs = []#list for temp storing of mention comparisons.
        for start in starts:
            for ref in refs:
                ref_ment = deps[deps['shift'] == ref]
                temp_ref = pd.DataFrame(columns=var_list)
                if 'genderm' in var_list:
                    title = 'genderm'
                    val = 'morph'
                    temp_ref.merge(self.compare(title, val, temp_ref, start, ref_ment))
                if 'numberm' in var_list:
                    title = 'numberm'
                    val = 'morph'
                    temp_ref.merge(self.compare(title, val, temp_ref, start, ref_ment))
                if 'headm' in var_list:
                    title = 'headm'
                    val = 'lemma'
                    temp_ref.merge(self.compare(title, val, temp_ref, start, ref_ment))
                if 'partialheadm' in var_list:
                    title = 'partialheadm'
                    val = 'tk_shifts'
                    temp_ref.merge(self.partialheadm(title, val, temp_ref, start, ref_ment, deps))
                if 'sentdist' in var_list:
                    title = 'sentdist'
                    val = 'sid'
                    temp_ref.merge(self.distance(title, val, temp_ref, start, ref_ment))
                if 'detmen' in var_list:
                    title = 'detmen'
                    val = 'tk_shifts'
                    find = 'DET'
                    temp_ref.merge(self.find_val(title, val, temp_ref, start, find))
                if 'detref' in var_list:
                    title = 'detref'
                    val = 'tk_shifts'
                    find = 'DET'
                    temp_ref.merge(self.find_val(title, val, temp_ref, ref_ment, find))
                if 'nsubjant' in var_list:
                    title = 'nsubjant'
                    val = 'deprel'
                    find = 'nsubj'
                    temp_ref.merge(self.find_val(title, val, temp_ref, ref_ment, find))
                if 'nsubjmen' in var_list:
                    title = 'nsubjmen'
                    val = 'deprel'
                    find = 'nsubj'
                    temp_ref.merge(self.find_val(title, val, temp_ref, start, find))
                if 'nar_match' in var_list:
                    if not start['nar'].values[0] == '_':
                        title = 'nar_match'
                        val = 'nar'
                        temp_ref.merge(self.compare(title, val, temp_ref, start, ref_ment))
                if 'antecedenttype' in var_list:
                    title = 'antecedenttype'
                    val = 'type'
                    temp_ref.merge(self.get_val(title, val, temp_ref, ref_ment))
                if 'namepartref' in var_list:
                    title = 'namepartref'
                    val = 'deprel'
                    find = 'flat:name'
                    temp_ref.merge(self.find_val(title, val, temp_ref, ref_ment, find))
                if 'isrefspeaker' in var_list:
                    title = 'isrefspeaker'
                    val = 'shift'
                    find = start['nar'].values[0]
                    temp_ref.merge(self.find_val(title, val, temp_ref, ref_ment, find))
                if 'deproleant' in var_list:
                    title = 'deproleant'
                    val = 'deprel'
                    temp_ref.merge(self.get_val(title, val, temp_ref, ref_ment))
                if 'deprolemen' in var_list:
                    title = 'deprolemen'
                    val = 'deprel'
                    temp_ref.merge(self.get_val(title, val, temp_ref, start))
                if 'deprolecomb' in var_list:
                    title = 'deprolecomb'
                    val = 'deprel'
                    temp_ref.merge(self.comb_attr(title, val, temp_ref, start, ref_ment))
                if 'samesent' in var_list:
                    title = 'samesent'
                    val = 'sid'
                    temp_ref.merge(self.compare(title, val, temp_ref, start, ref_ment))


                ref_pos = ref_ment['udpos'].values[0]

                if typ == 'noun': #noun specific vars
                    if ref_pos in ('NOUN', 'PROPN'):
                        if 'animacym' in var_list and ref_pos in ('NOUN', 'PROPN'):
                            title = 'animacym'
                            val = 'morph'
                            temp_ref.merge(self.compare(title, val, temp_ref, start, ref_ment))
                        if 'clustagree' in var_list: #cluster level info
                            title = 'clustagree'
                            val = 'morph'
                            act = 'compare'
                            temp_ref.merge(self.clust_info(title, val, temp_ref, start, ref_ment, deps, act))
                        if 'containnum' in var_list: #cluster level info
                            title = 'containnum'
                            val = 'udpos'
                            find = 'NUM'
                            temp_ref.merge(self.find_val(title, val, temp_ref, ref_ment, find))


                elif typ == 'ne': #Ne specific vars
                    if ref_pos in ('NOUN', 'PROPN'):
                        if 'animacym' in var_list:
                            title = 'animacym'
                            val = 'morph'
                            temp_ref.merge(self.compare(title, val, temp_ref, start, ref_ment))
                        if 'clustagree' in var_list: #cluster level info
                            title = 'clustagree'
                            val = 'morph'
                            act = 'compare'
                            temp_ref.merge(self.clust_info(title, val, temp_ref, start, ref_ment, deps, act))
                    if 'NEtype_match' in var_list:
                        title = 'NEtype_match'
                        val = 'ne_type'
                        temp_ref.merge(self.compare(title, val, temp_ref, start, ref_ment))
                    if 'namepart' in var_list: #part of a name, not head. Ideally learns to not pair these.
                        title = 'namepart'
                        val = 'deprel'
                        find = 'flat:name'
                        temp_ref.merge(self.find_val(title, val, temp_ref, start, find))


                elif typ == 'pron': # pronoun specific vars
                    if 'person_match' in var_list:
                        title = 'person_match'
                        val = 'morph'
                        temp_ref.merge(self.compare(title, val, temp_ref, start, ref_ment)) 
                    if 'ptypemen' in var_list:
                        title = 'ptypemen'
                        val = 'deprel'
                        temp_ref.merge(self.get_val(title, val, temp_ref, start, start_ment))
                        
                temp_ref = temp_ref.fillna(0)
                all_refs.append(temp_ref)
        all_refdf = pd.concat(all_refs)
        return all_refdf

            
    
    def set_up_vars(self, docid, row, deps, df, answers, var_list, typ, ids, train):#add docid
        '''
        set up variables adn dfs for creating vars.
        '''
        if train == True:
            start_refs = [deps[deps['shift'] == row.start_ref]] # can't be more than one, but program accepts lists here.
            cor_ref = row.cor
            incor_ref = row.incor
            refs = [cor_ref, incor_ref]
            tempdf = self.make_vars(deps, start_refs, refs, var_list, typ, train) # appends head ref pair variables
            df = pd.concat([df, tempdf])
            
            for start_ref in start_refs:
                iddf_cor = pd.DataFrame({'docid': [docid], 'start': [start_ref], 'ref':[cor_ref]})
                iddf_incor = pd.DataFrame({'docid': [docid], 'start': [start_ref], 'ref':[incor_ref]})
                ids = ids.append(iddf_cor)
                ids = ids.append(iddf_incor)
                answers.append(pd.DataFrame({'answer': [1]}))
                answers.append(pd.DataFrame({'answer': [0]}))
        elif train == False:
            ser = row.series
            if ser == True:#change morph for series to plural and split seperate mention for singular short ref.
                start_ment = deps[deps['shift'] == row.start_ref]
                start_ment_full = start_ment
                start_ment_full['morph'] = self.sing_to_plur(start_ment['morph'].values[0].split('|'))
                start_ment_part = start_ment
                start_ment_part['full_men'] = start_ment_part['part_men']
                start_ment_part['tk_shifts'] = start_ment_part['part_shifts']
                start_refs = [start_ment_full, start_ment_part]
            else:
                start_refs = [deps[deps['shift'] == row.start_ref]]
            refs = row.refs
            tempdf = self.make_vars(deps, start_refs, refs, var_list, typ, train)
            df = pd.concat([df, tempdf])
            for start_ref in start_refs:
                for ref in refs:
                    iddf = pd.DataFrame({'docid': [docid], 'start': [start_ref], 'ref':[ref]})
                    ids = ids.append(iddf)
                               
        return df, answers, ids


                    
                    
    def start_vars(self, deps, answers, train, gold_var, no_gold_var, \
                   train_ids, idstr, ids_gold, \
                   ids_nogold, test_ids, train_var, gold, no_gold, type_all, docid, var_list):
        '''
        create mention variable lists for training and testing.
        '''
        
        if docid in train_ids: #make training data.
            print('train')
            i = 1
            for i, trow in train.iterrows():
                train_var, answers, idstr = self.set_up_vars(docid, trow, deps, train_var, answers, \
                                                                        var_list, type_all, idstr, train=True)
            
            print(i)
            i += 1
            
        elif docid in test_ids: # make test data
            print('test')
            no_answers = None
            q = 1
            for i, rowg in gold.iterrows():
                gold_var_ne, no_answers, ids_gold = self.set_up_vars(docid, rowg, deps, gold_var, no_answers, \
                                                var_list, type_all, ids_gold, train=False)
                print(q)
                q += 1
            
            t = 1
            for i, rowng in no_gold.iterrows():
                no_gold_var, no_answers, ids_nogold = self.set_up_vars(docid, rowg, deps, no_gold_var, no_answers, \
                                                    var_list, type_all, ids_nogold, train=False)
                print(t)
                t += 1
       
        return answers, idstr, ids_gold, \
        ids_nogold, train_var, gold_var, no_gold_var, 
        
        
    def get_train_ids(self, train_per):
        '''
        randomizes training and test sets based on given percentage value, 
        returning the docids in two lists for training and testing.
        '''
        train_per = math.floor(train_per * 181)
        train_ids = random.sample(range(1, 181), train_per) #harcoded quantity of texts.
        test_ids = [x for x in range(1, 181) if x not in train_ids]
        return train_ids, test_ids
        
    
    def create_vars(self, var_list, type_all, train_per, create_men=True):
        '''
        pass through manual sieves, collect mention pairs, and create final variable list for all 
        training and test data sets.
        '''
        self.train_ids, self.test_ids = self.get_train_ids(train_per) #set ids fro training
        for i, nrow in self.newdf.iterrows():
            docname = nrow.docname
            print(docname)
            docid = nrow.docid
            mens = pd.read_csv('coref/men_sieves/{0}'.format(nrow.docname), delimiter='\t') # temp method for testing.
            deps = pd.read_csv('coref/sieves/{0}'.format(nrow.docname), delimiter='\t')
            
            if type_all == 'noun':
                self.train_noun, self.gold_noun, \
                self.no_gold_noun = self.create_candidates(mens, deps, self.train_noun, self.gold_noun, self.no_gold_noun, \
                                                           docid, self.train_ids, self.test_ids, type_all)
                
                self.answers_noun, self.idstr_noun, self.ids_ne_gold, self.ids_ne_nogold, \
                self.train_var_noun, self.gold_var_noun, \
                self.no_gold_var_noun = self.start_vars(deps, self.answers_noun, \
                     self.train_noun, self.gold_var_noun, self.no_gold_var_noun, \
                     self.train_ids, self.idstr_noun, self.ids_noun_gold, \
                     self.ids_noun_nogold, self.test_ids, self.train_var_noun, \
                     self.gold_noun, self.no_gold_noun, type_all, docid, var_list)
            
            
            elif type_all == 'ne':
                self.train_ne, self.gold_ne, \
                self.no_gold_ne = self.create_candidates(mens, deps, self.train_ne, self.gold_ne, self.no_gold_ne, \
                                                           docid, self.train_ids, self.test_ids, type_all)
            
                self.answers_ne, self.idstr_ne, self.ids_ne_gold, self.ids_ne_nogold, \
                self.train_var_ne, self.gold_var_ne, \
                self.no_gold_var_ne = self.start_vars(deps, self.answers_ne, \
                     self.train_ne, self.gold_var_ne, self.no_gold_var_ne, \
                     self.train_ids, self.idstr_ne, self.ids_ne_gold, \
                     self.ids_ne_nogold, self.test_ids, self.train_var_ne, \
                     self.gold_ne, self.no_gold_ne, type_all, docid, var_list)
                
            elif type_all == 'pron':
                self.train_pron, self.gold_pron, \
                self.no_gold_pron = self.create_candidates(mens, deps, self.train_pron, self.gold_pron, self.no_gold_pron, \
                                                           docid, self.train_ids, self.test_ids, type_all)

                self.answers_pron, self.idstr_pron, self.ids_ne_gold, self.ids_ne_nogold, \
                self.train_var_pron, self.gold_var_pron, \
                self.no_gold_var_pron = self.start_vars(deps, self.answers_pron, \
                     self.train_pron, self.gold_var_pron, self.no_gold_var_pron, \
                     self.train_ids, self.idstr_pron, self.ids_pron_gold, \
                     self.ids_pron_nogold, self.test_ids, self.train_var_pron, \
                     self.gold_pron, self.no_gold_pron, type_all, docid, var_listt)
            
            
            if not os.path.exists('coref/sets'): #for testing only
                os.mkdir('coref/sets')

            if type_all == 'ne':
                self.train_ne.to_csv('coref/sets/train_ne', sep='\t')
                self.gold_ne.to_csv('coref/sets/gold_ne', sep='\t') 
                self.no_gold_ne.to_csv('coref/sets/no_gold_ne', sep='\t')
                self.train_var_ne.to_csv('coref/train_var_ne', sep='\t')
                self.gold_var_ne.to_csv('coref/gold', sep='\t')
                self.no_gold_var_ne.to_csv('coref/nogold', sep='\t')

            elif type_all == 'noun':
                self.train_noun.to_csv('coref/sets/train_noun', sep='\t')
                self.gold_noun.to_csv('coref/sets/gold_noun', sep='\t')
                self.no_gold_noun.to_csv('coref/sets/no_gold_noun', sep='\t')
                self.train_var_noun.to_csv('coref/train_var_noun', sep='\t')
                self.gold_var_noun.to_csv('coref/gold', sep='\t')
                self.no_gold_var_noun.to_csv('coref/nogold', sep='\t')

            elif type_all == 'pron':
                self.train_pron.to_csv('coref/sets/train_pron', sep='\t')
                self.gold_pron.to_csv('coref/sets/gold_pron', sep='\t')
                self.no_gold_pron.to_csv('coref/sets/no_gold_pron', sep='\t')
                self.train_var_pron.to_csv('coref/train_var_pron', sep='\t')
                self.gold_var_pron.to_csv('coref/gold', sep='\t')
                self.no_gold_var_pron.to_csv('coref/nogold', sep='\t')
            
        

In [2]:
var_list = ['genderm', 'animacym', 'numberm', 'headm', 'partialheadm', 'sentdist', \
           'nsubjant', 'detant', 'nsubjmen', 'detmen', 'nar_match', 'antecedenttype', 'NEtype_match', \
           'person_match', 'namepart', 'clustagree', 'containnum', 'namepartref', 'deprolemen', 'deproleant', \
           'deprolecomb', 'samesent','ptypemen']

example2 = Coreftrain(newdf)

example2.create_vars(var_list, 'noun', train_per=.7)


NameError: name 'newdf' is not defined

In [86]:
model = gensim.models.keyedvectors.load_word2vec_format('ruwikiruscorpora_0_300_20.bin', binary=True, encoding='utf8')

AttributeError: module 'gensim.models.keyedvectors' has no attribute 'load_word2vec_format'

In [3]:
gensim.models.Word2Vec.load_word2vec_format('../ruwikiruscorpora_0_300_20.bin.gz', binary=True)

NameError: name 'gensim' is not defined

In [487]:
model.similarity('говорить', 'сказать')

KeyError: "word 'говорить' not in vocabulary"

In [None]:
#mistakes in parser
#воздушное пространство , и морскую акваторию ,
# 121	14.0	горный	горный	ADJ	_	Case=Nom|Degree=Pos|Gender=Masc|Number=Sing	15.0	amod	_	_	6.0	701.0	6.0	_	_	_	_
# 122	15.0	кластер	кластер	NOUN	_	Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing	4.0	conj	_	_	7.0	708.0	6.0	noun	_	горный кластер	701, 708

#mistakes in dependencies
# 62	62	6.0	сказал	сказать	VERB	_	Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act	7	parataxis	_	_	6.0	305.0	9.0	_	_	_	_	_	_	_	_
# 63	63	7.0	человек	человек	NOUN	_	Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing	0	root	_	_	7.0

# comma attached to вас makes imposible to parse for discourse.
# 924	924	3.0	благодарю	благодарить	NOUN	_	Animacy=Inan|Case=Dat|Gender=Neut|Number=Sing	1	conj	_	_	9.0	4817.0	66.0	noun	_	благодарю	4817	_	_	_	
# 925	925	4.0	вас,	вас,	PUNCT	_	_	3	punct	_	_	4.0	4827.0	66.0	_	_	_	_	_	_	_	
# 926	926	1.0	-	-	PUNCT	_	_	2	punct	_	_	1.0	4831.0	67.0	_	_	_	_	_	_	_	
# 927	927	2.0	спокойно	спокойно	ADV	_	Degree=Pos	3	advmod	_	_	8.0	4832.0	67.0	_	_	_	_	_	_	_	
# 928	928	3.0	ответил	отвечать	VERB	_	Aspect=Perf|Gender=Masc|Mood=Ind|Number=Sing|Tense=Past|VerbForm=Fin|Voice=Act	0	root	_	_	7.0	4841.0	67.0	_	_	_	_	_	_	_	
# 929	929	4.0	он,	он,	PUNCT	_	_	3	punct	_	_	3.0	4849.0	67.0	_	_	_


# trouble conecting female ending names with masc. nouns.
# 120	29.0	своем	свой	DET	_	Case=Loc|Gender=Masc|Number=Sing	30	amod	_	_	5.0	685.0	8.0	refl	своем	685	_	_	_	_
# 121	30.0	внуке	внук	NOUN	_	Animacy=Inan|Case=Loc|Gender=Masc|Number=Sing	27	obl	_	_	5.0	691.0	8.0	noun	своем внуке	685, 691	_	_	_	_
# 122	31.0	Порфишке	Порфишка	PROPN	_	Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing	27	obj	_	_	8.0	697.0	8.0	NE	Порфишке	697	_	_	_	PER


# trouble separating sentences. Can't resolve который.
# 989	5.0	автор	автор	NOUN	_	Animacy=Anim|Case=Nom|Gender=Masc|Number=Sing	0	root	_	_	5.0	5623.0	92.0	noun	_	_	_	_	единственный русский автор XVIII в	5602, 5615, 5623, 5629, 5635
# 990	6.0	XVIII	xviii	NUM	_	_	7	nummod	_	_	5.0	5629.0	92.0	_	_	_	_	_	_	_
# 991	7.0	в	век	NOUN	_	Animacy=Inan|Case=Gen|Gender=Masc|Number=Sing	5	nmod	_	_	1.0	5635.0	92.0	noun	_	_	_	_	XVIII в	5629, 5635
# 992	8.0	.	.	PUNCT	_	_	7	punct	_	_	1.0	5636.0	92.0	_	_	_	_	_	_	_
# 993	1.0	,	,	PUNCT	_	_	0	root	_	_	1.0	5637.0	93.0	_	_	_	_	_	_	_
# 994	2.0	который	который	PRON	_	Case=Nom	3	nsubj	_	_	7.0	5639.0	93.0	rel	_	_

In [None]:
To_do:
#     1. finish merge and head match
#     2. finish mention pos and neg extraction
    3. train models for NE, common NP, pron (pron, poss, rel and refl)
    4. finish sieves (priority on semantic and alias and set constuctions and then discourse)
#         a. который - to head verb of clause, then grab noun head of verb. deprel verb = acl:relcl
    5. create module for storing res (likely in same format as original)
    6. create module for scoring
    7. Run and test results of model
    8. improve mention detection:
#         a. deprel == case for secone cycle parts of mentions
#         b. adj dependent on anything except nouns and verbs to be rel references or deprel == nsubj
        c. dealing with chains and mentions in chains. # will probably be handled not in mention detection.
        d. i within i relations
#         e. remove end comma.
        f. preprocessing for space between punctuation.
        g. add other punctuation such as -
#         h. deprels - nummod:entity
        i. advmod but not pos PART #maybe. look into it.
#         j. PROPN titles in mention
        k. sentence splits on english words (foregin flat, root)
        l. Make all categories from start and fix exceptions in program. (part_men, )
        m. improve quotation module. (see 2.txt) 
    9. narrator combine in merge
Cleaning list:
    1. change number representations in dfs to single format
    2. Improve speed by reducing redundant loops.