In [224]:
import pandas as pd
import numpy as np
import conllu
from conllu import parse, parse_tree, print_tree
import re
import collections
import os
import itertools
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Create class for NPI candidates

In [162]:
class Candidate:
    def __init__(self, word, pos):
        self.word = word
        self.pos = pos
        self.contexts = {'DN':0, 'DNN':0, 'RESTR': 0, 'COND':0, 'CONDIRR':0, 'QUEST':0, 'MODAL':0, 'IN':0, 'IMP':0, 'IRR': 0}
        self.all_nonver = 0
        self.all_occ = 0
        self.all_other = 0
        
    def spotted(self, cont):
        if cont is not 'VER':
            self.contexts[cont] += 1
            self.all_nonver += 1
            
    def is_npi(self):
        n = self.all_nonver / self.all_occ
        m = self.all_occ - self.all_nonver
        if m > self.all_nonver:
            return False
        else:
            return True
    def set_other_c(self):
        self.all_other = self.all_occ - self.all_nonver

## Turning sentences into a dataframe with relevant information

In [3]:
def parse_sent(sent):
    tokens = []
    parsed = parse(sent)
    for x in parsed[0]:
        if x['upostag'] == 'VERB' or x['upostag'] == 'PRON' or x['upostag'] == 'ADV':
            gram = x['feats']
        else:
            gram = None
        #print(x['form'])
        token = [x['id'], x['head'], x['form'], x['lemma'], x['upostag'], gram, None]
        tokens.append(token)
    return tokens

def parsed_to_df(tokens):
    df = pd.DataFrame(tokens)
    df.columns = ['t_id', 'head', 'form', 'lemma', 'pos', 'gram', 'is_in_scope']
    #df.set_index(['token_id', 'head', 'form', 'lemma', 'pos'])
    return df

## Basic functions for work

### Separate a sentence into clauses if there is direct speech

In [6]:
def separate_clauses(df):
    dash = df.loc[df['lemma']=='-']
    if dash.empty is False:
        for index, row in dash.iterrows():
            punct = df.loc[df['t_id']==(row['t_id']-1), 'pos']
            if punct.max() == 'PUNCT':
                #print(punct)
                df1 = df.loc[df['t_id'] <= row['t_id']]
                df2 = df.loc[df['t_id'] > row['t_id']]
                return [df1, df2]
            else:
                return [df]
    else:
        return [df]

### Collect items if the scope goes down the UD Tree

In [81]:
def scope_down(df, token, depth=2):
    scoped = []
    children = df.loc[df['head']==token]
    for index, row in children.iterrows():
        if row['pos'] == 'VERB' or row['pos'] == 'ADV' or row['pos'] == 'PRON':
            scoped.append([row['lemma'], row['pos'], row['t_id']])
        if depth > 0:
            depth -= 1
            scoped1 = scope_down(df, row['t_id'])
            if len(scoped1) > 0:
                for x in scoped1:
                    scoped.append(x)
    return scoped

In [8]:
def scope_down_gram(df, token, depth=0):
    scoped = []
    children = df.loc[df['head']==token]
    scoped = []
    for index, row in children.iterrows():
        if row['pos'] == 'VERB' or row['pos'] == 'ADV' or row['pos'] == 'PRON':
            scoped.append([row['lemma'], row['pos'], row['gram']])
        if depth > 0:
            depth -= 1
            scoped1 = scope_down(sent, row['t_id'])
            if len(scoped1) > 0:
                for x in scoped1:
                    scoped.append(x)
                scoped = list(itertools.chain(*scoped))
    return scoped

### Collect items if the scope goes up the UD Tree

In [129]:
def scope_up1(df, token): #for now we won't raise any further
    parent_id = int(df.loc[df['t_id']==token, 'head'])
    parent = df.loc[df['t_id']==parent_id]
    return [parent['lemma'].max(), parent['pos'].max(), parent_id]

In [10]:
def scope_up1_gram(df, token): #for now we won't raise any further
    parent_id = int(df.loc[df['t_id']==token, 'head'])
    parent = df.loc[df['t_id']==parent_id]
    scoped = [parent['lemma'].max(), parent['pos'].max(), parent['gram'].max()]
    return scoped

In [80]:
def scope_up_far(df, token): #here we raise (by depth?)
    scoped = []
    depth = 4
    parent_id = int(df.loc[df['t_id']==token, 'head'])
    parent = df.loc[df['t_id']==parent_id]
    for d in range(depth):
        scoped1= scope_up_far(df, int(parent['head']))
    if len(scoped1) > 0:
        for x in scoped1:
            scoped.append(x)
    return scoped

### Count occurences of all lemmas

In [12]:
def count_all_lemmas(parsed_sentences):
    all_lemmas = []
    for x in parsed_sentences:
        all_lemmas.append(list(x['lemma']))
    all_lemmas = list(itertools.chain(*all_lemmas))
    lemmas = collections.Counter(all_lemmas)
    return lemmas

## Special functions for particular contexts (if needed)

### Direct & Indirect Negation

In [103]:
def dir_neg(df, token):
    scoped = []
    if df.loc[df['t_id']==token, 'pos'].max() == 'VERB' and df.loc[df['t_id']==token, 'lemma'].max() != 'быть':
        scoped = scope_down(df, token)
        for s in scoped:
            s.append('DN')
    else:
        negated1 = scope_up1(df, token)
        negated1.append('DN')
        if negated1 is not None:
            scoped.append(negated1)
        nn = df.loc[df['lemma']==negated1[0]]
        for index, row in nn.iterrows():
            if row['head'] == token:
                negated2 = scope_down(df, row['t_id'], 4)
                for x in negated2:
                    x.append('IN')
                if len(negated2) > 0:
                    scoped += negated2
    return scoped

### Imperative

In [141]:
def get_imp(df, candidates):
    verb = df.loc[df['pos']=='VERB']
    if verb.empty is False:
        for index, row in verb.iterrows():
            try:
                i = row['gram']['Mood']
                if i == 'Imp':
                    scoped = scope_down(df, int(row['t_id']))
                    for x in scoped:
                        x.append('IMP')
                    return scoped
                else:
                    return []
            except (KeyError, TypeError):
                return []
    else:
        return []

### Question

In [106]:
def get_quest(df, token):
    scoped = []
    question = df.loc[df['t_id'] < token]
    #print(question)
    scope_border = question.loc[question['lemma']=='-']
    if scope_border.empty is False:
        if len(scope_border) > 1:
            q_ids = list(scope_border['t_id'])
            #print(q_ids[0])
            question = question.loc[question['t_id'] > q_ids[0]]
            #print(question)
        else:
            question = question.loc[question['t_id'] > int(scope_border['t_id'])]
    for index, row in question.iterrows():
        if row['pos'] == 'VERB' or row['pos'] == 'ADV' or row['pos'] == 'PRON':
            scoped.append([row['lemma'], row['pos'], row['t_id'], 'QUEST'])
    return scoped

### Some Irrealis

In [107]:
def get_irr(df, token):
    scoped = scope_up1(df, token)
    scoped.append('IRR')
    return scoped

## All together: check if there is a licenser and what is in its scope

In [85]:
markers_d = dict(не='DN', нет='DN', нету='DN', только='RESTR', некого='DNN', нечего='DNN', если='COND', якобы='IRR', бы='CONDIRR', мочь='MODAL', хотеть='MODAL', должен='MODAL', обязан='MODAL', вынудить='MODAL', вынужденный='MODAL', надо='MODAL', можно='MODAL', хотеться='MODAL', заставлять='MODAL', заставить='MODAL')
markers_l = list(markers_d.keys())
markers_l.append('?')

### Add an item or item's occurrence to the dictionary of candidates

In [146]:
def add_to_candidates(x, candidates, scoped):
    if deja_scoped(x[2], scoped) is False:
        if x[0] not in candidates:
            candidates[x[0]] = Candidate(x[0], x[1])
        candidates[x[0]].contexts[x[3]] += 1
    return candidates

In [177]:
def deja_scoped(token, scoped):
    if token in scoped:
        return True
    else:
        return False

### Apply all to a sentence

In [223]:
sent = parsed_sentences[9000]
sent

Unnamed: 0,t_id,head,form,lemma,pos,gram,is_in_scope
0,1,4,Но,но,CCONJ,,
1,2,4,всё,все,PRON,"{'Animacy': 'Inan', 'Case': 'Nom', 'Gender': '...",
2,3,2,же,же,PART,,
3,4,0,интересно,интересный,ADJ,,
4,5,4,-,-,PUNCT,,
5,6,4,листовки,листовка,NOUN,,
6,7,6,то,то,SCONJ,,
7,8,6,такой,такой,DET,,
8,9,8,не,не,PART,,
9,10,8,было,быть,VERB,"{'Aspect': 'Imp', 'Gender': 'Neut', 'Mood': 'I...",


# Make it work

In [222]:
def is_marked(df, markers_d, markers_l, candidates):
    tokens_scoped = []
    dfs = separate_clauses(df)
    for df in dfs:
        mm = df.loc[df['lemma'].isin(markers_l)]
        if mm.empty is False:
            for n in list(mm['t_id']):
                m = df.loc[df['t_id']==n, 'lemma'].max()
                if m == '?':
                    m_h = df.loc[df['lemma']==m, 't_id']
                    for j in list(m_h):
                        scoped = get_quest(df, j)
                        for x in scoped:
                            candidates = add_to_candidates(x, candidates, tokens_scoped)
                elif markers_d[m] is not None:
                    if markers_d[m] == 'DN':
                        scoped = dir_neg(df, n)
                        for x in scoped:
                            candidates = add_to_candidates(x, candidates, tokens_scoped)
                    elif markers_d[m] == 'DNN':
                        scoped = scope_down(df, n)
                        for x in scoped:
                            x.append('DNN')
                            candidates = add_to_candidates(x, candidates, tokens_scoped)
                    elif markers_d[m] == 'CONDIRR':
                        scoped = scope_up1(df, n)
                        scoped.append('CONDIRR')
                        candidates = add_to_candidates(scoped, candidates, tokens_scoped)  
                    elif markers_d[m] == 'COND':
                        scoped = scope_up1(df, n)
                        scoped.append('COND')
                        candidates = add_to_candidates(scoped, candidates, tokens_scoped)
                    elif markers_d[m] == 'IRR':
                        scoped = get_irr(df, n)
                        if len(scoped) > 0:
                            candidates = add_to_candidates(scoped, candidates, tokens_scoped)
                    elif markers_d[m] == 'MODAL':
                        if m == 'мочь':
                            if df.loc[df['t_id']==(+1), 'lemma'].max() in [',', 'быть']:
                                break
                            else:
                                scoped = scope_down(df, n)
                                for x in scoped:
                                    x.append('MODAL')
                                    candidates = add_to_candidates(x, candidates, tokens_scoped)
                        else:
                            scoped = scope_down(df, n)
                            for x in scoped:
                                x.append('MODAL')
                                candidates = add_to_candidates(x, candidates, tokens_scoped)     
                    elif markers_d[m] == 'RESTR':
                        scoped = [scope_up1(df, n)]
                        if scoped[0][1] != 'VERB':
                            sc = df.loc[df['lemma']==scoped[0][0], 't_id']
                            #print(list(sc))
                            for c in sc:
                                #print(row['t_id'])
                                #scoped_id = int(df.loc[df['lemma']==scoped[0][0], 't_id'])
                                scoped.append(scope_up1(df, c))
                            #print(type(scoped[0][0]))
                            try:
                                if scoped[1][1] != 'VERB' and type(scoped[0][0]) is not float:
                                #if type(scoped[1][1]) is not float:
                                    scoped_id = list(df.loc[df['lemma']==scoped[1][0], 't_id'])
                                    for z in scoped_id:
                                        scoped.append(scope_up1(df, z))
                            except IndexError:
                                pass
                        for x in scoped:
                            if x[1] == 'VERB' or x[1]=='ADV' or x[1]=='PRON':
                                x.append('RESTR')
                                candidates = add_to_candidates(x, candidates, tokens_scoped)   
        impered = get_imp(df, candidates)
        if len(impered) > 0:
            for i in impered:
                candidates = add_to_candidates(i, candidates, tokens_scoped)
                    
    return candidates

### Extract sentences from .conllu files

In [196]:
def get_sents(fname):
    sentences = []
    text= open(fname, 'r', encoding='utf-8').read()
    sents = re.findall('sent_id = (\S+)\n# text = ([^\n]+)\n([^#]+)\n\n', text, flags=re.DOTALL)
    for sent in sents:
        sentence =  sent[2]
        sentences.append(sentence)
    return sentences
#texto = open('tst_conllu.conllu', 'r', encoding='utf-8').read()
sentences = get_sents('rus/ru_syntagrus-ud-dev.conllu')
sentences += get_sents('rus/vktexts.txt')
sentences += get_sents('rus/ru_taiga-ud-train.conllu')
sentences += get_sents('rus/ru_syntagrus-ud-train.conllu')
sentences += get_sents('rus/ru_syntagrus-ud-train.conllu')
sentences += get_sents('rus/ru-ud-train.conllu')
sentences += get_sents('rus/ru-ud-test.conllu')
sentences += get_sents('rus/ru-ud-dev.conllu')

len(sentences)

720587

### ...and do the thing 

In [197]:
parsed_sentences = []
for s in sentences:
    parsed_s = parsed_to_df(parse_sent(s))
    parsed_sentences.append(parsed_s)
len(parsed_sentences)

720587

In [198]:
candidates = {}
for s in range(len(parsed_sentences)):
    if s % 500 == 0:
        print(s)
    #print(s)
    candidates = is_marked(parsed_sentences[s], markers_d, markers_l, candidates)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500
46000
46500
47000
47500
48000
48500
49000
49500
50000
50500
51000
51500
52000
52500
53000
53500
54000
54500
55000
55500
56000
56500
57000
57500
58000
58500
59000
59500
60000
60500
61000
61500
62000
62500
63000
63500
64000
64500
65000
65500
66000
66500
67000
67500
68000
68500
69000
69500
70000
70500
71000
71500
72000
72500
73000
73500
74000
74500
75000
75500
76000
76500
77000
77500
78000
78500
79000
79500
80000
80500
81000
81500
82000
82500
83000
83500
84000
84500
8500

603500
604000
604500
605000
605500
606000
606500
607000
607500
608000
608500
609000
609500
610000
610500
611000
611500
612000
612500
613000
613500
614000
614500
615000
615500
616000
616500
617000
617500
618000
618500
619000
619500
620000
620500
621000
621500
622000
622500
623000
623500
624000
624500
625000
625500
626000
626500
627000
627500
628000
628500
629000
629500
630000
630500
631000
631500
632000
632500
633000
633500
634000
634500
635000
635500
636000
636500
637000
637500
638000
638500
639000
639500
640000
640500
641000
641500
642000
642500
643000
643500
644000
644500
645000
645500
646000
646500
647000
647500
648000
648500
649000
649500
650000
650500
651000
651500
652000
652500
653000
653500
654000
654500
655000
655500
656000
656500
657000
657500
658000
658500
659000
659500
660000
660500
661000
661500
662000
662500
663000
663500
664000
664500
665000
665500
666000
666500
667000
667500
668000
668500
669000
669500
670000
670500
671000
671500
672000
672500
673000
673500
674000
674500

In [199]:
all_lemmas = count_all_lemmas(parsed_sentences)
len(all_lemmas)

355125

In [60]:
def count_more_lemmas(parsed_sentences, parsed_sentences2):
    all_lemmas = []
    for x in parsed_sentences:
        all_lemmas.append(list(x['lemma']))
    for y in parsed_sentences2:
        all_lemmas.append(list(y['lemma']))
    all_lemmas = list(itertools.chain(*all_lemmas))
    lemmas = collections.Counter(all_lemmas)
    return lemmas

In [201]:
punct = ['!', '?', ',', '.', '"', '-', '~', ')', '(', ':', ';']
for p in punct:
    all_lemmas.pop(p, None)

In [202]:
sum(list(all_lemmas.values()))

10082464

In [210]:
cand_lemmas = candidates.keys()
for x in cand_lemmas:
    candidates[x].all_occ = all_lemmas[x]
    candidates[x].all_nonver = sum(list(candidates[x].contexts.values()))
    candidates[x].all_other = candidates[x].all_occ - candidates[x].all_nonver

In [217]:
verbal_candidates = []
for v in cand_lemmas: 
    if candidates[v].pos == 'VERB':
        verbal_candidates.append(candidates[v])
len(verbal_candidates)

19794

In [218]:
apresyan = ['житься', 'запомнить', 'обобраться', 'терпеться', 'подумать', 'посмотреть', 'задуматься', 'замедлить', 'обинуясь', 'преминуть', 'заладиться', 'миновать', 'пара', 'напастись', 'надивиться', 'наздравствоваться', 'сроду', 'выносить', 'плошать', 'скупиться.', 'клеиться', 'взвидеть', 'удосужиться', 'стерпеть', 'притронуться', 'сидеться', 'наготовиться', 'ведать', 'переваривать', 'накупиться', 'рыпаться', 'впервой', 'допроситься', 'дозваться', 'трогать', 'видать', 'видаться', 'поддаваться', 'сметь', 'досмотреть', 'лежаться', 'писаться', 'тронуть', 'прикоснуться', 'доглядеть', 'годиться', 'пристать', 'задаться', 'нюхать', 'сходить', 'выдержать', 'спросить', 'угнаться', 'браться', 'навоевать', 'повинный', 'поверить', 'стесняться', 'вытерпеть', 'уколупнуть', 'укупить', 'справиться', 'гадать', 'пропасть', 'трогать', 'полагаться', 'положить', 'переносить', 'постыдить', 'терпеть']

70

In [192]:
apr_strong = ['житься', 'запомнить', 'обобраться', 'терпеться', 'подумать', 'посмотреть', 'задуматься', 'замедлить', 'обинуясь', 'преминуть', 'заладиться', 'миновать', 'пара', 'напастись', 'надивиться', 'наздравствоваться', 'сроду', 'выносить', 'плошать', 'скупиться.', 'клеиться', 'взвидеть', 'удосужиться', 'стерпеть', 'притронуться', 'сидеться', 'наготовиться', 'ведать', 'переваривать', 'накупиться', 'рыпаться']

In [219]:
k = 0
shared = []
for x in cand_lemmas:
    v = candidates[x]
    if v.word in apr_strong:
        shared.append(v.word)
        print(v.word + ' ' + str(v.all_occ) + ' - ' + str(v.all_nonver))
        print(v.contexts)
        k+=1
k

выносить 463 - 131
{'DN': 42, 'DNN': 0, 'RESTR': 9, 'COND': 19, 'CONDIRR': 8, 'QUEST': 4, 'MODAL': 49, 'IN': 0, 'IMP': 0, 'IRR': 0}
подумать 1039 - 309
{'DN': 73, 'DNN': 0, 'RESTR': 26, 'COND': 22, 'CONDIRR': 18, 'QUEST': 10, 'MODAL': 136, 'IN': 0, 'IMP': 24, 'IRR': 0}
ведать 108 - 51
{'DN': 45, 'DNN': 0, 'RESTR': 3, 'COND': 0, 'CONDIRR': 0, 'QUEST': 0, 'MODAL': 2, 'IN': 0, 'IMP': 0, 'IRR': 1}
задуматься 97 - 21
{'DN': 1, 'DNN': 0, 'RESTR': 4, 'COND': 3, 'CONDIRR': 1, 'QUEST': 5, 'MODAL': 7, 'IN': 0, 'IMP': 0, 'IRR': 0}
стерпеть 4 - 1
{'DN': 0, 'DNN': 0, 'RESTR': 0, 'COND': 0, 'CONDIRR': 0, 'QUEST': 0, 'MODAL': 1, 'IN': 0, 'IMP': 0, 'IRR': 0}
рыпаться 20 - 13
{'DN': 11, 'DNN': 0, 'RESTR': 0, 'COND': 0, 'CONDIRR': 0, 'QUEST': 0, 'MODAL': 2, 'IN': 0, 'IMP': 0, 'IRR': 0}
преминуть 13 - 17
{'DN': 13, 'DNN': 0, 'RESTR': 0, 'COND': 2, 'CONDIRR': 2, 'QUEST': 0, 'MODAL': 0, 'IN': 0, 'IMP': 0, 'IRR': 0}
удосужиться 11 - 12
{'DN': 12, 'DNN': 0, 'RESTR': 0, 'COND': 0, 'CONDIRR': 0, 'QUEST': 0, 'M

21

In [221]:
results = []
for y in cand_lemmas:
    ca = candidates[y]
    info = {'lemma': ca.word, 'pos': ca.pos, 'DNN': ca.contexts['DNN'],'DN': ca.contexts['DN'],
           'RESTR': ca.contexts['RESTR'],'COND': ca.contexts['COND'],'CONDIRR': ca.contexts['CONDIRR'],
            'QUEST': ca.contexts['QUEST'],'MODAL': ca.contexts['MODAL'],'IN': ca.contexts['IN'],
            'IMP': ca.contexts['IMP'],'IRR': ca.contexts['IRR'], 'nonver': ca.all_nonver, 'all_occ': ca.all_occ, 'all_other': ca.all_other}
    results.append(info)
#len(results)
res_df = pd.DataFrame.from_dict(results)
res_file = res_df.to_csv('npi_results2.csv', sep='\t')