Блокнот для написания `NlpAlgorithm`-класса и фичурайзеров

In [2]:
from tg.grammar_ru.common import Separator
import pandas as pd

# Sample text: three sentences, each containing a third-person pronoun
# ("оно", "ее", "он").
s = 'Приглашение, оно было отклонено. Здесь стояла ваза, пока ее не разбили. Слон выронил мяч из хобота, и он покатился.'
# Build the bundle for the text and display its 'src' frame, which holds one
# row per token (words and punctuation) with sentence/paragraph positions.
db = Separator.build_bundle(s)
db.data_frames['src']

Unnamed: 0,word_id,sentence_id,word_index,paragraph_id,word_tail,word,word_type,word_length
0,0,0,0,0,0,Приглашение,ru,11
1,1,0,1,0,1,",",punct,1
2,2,0,2,0,1,оно,ru,3
3,3,0,3,0,1,было,ru,4
4,4,0,4,0,0,отклонено,ru,9
5,5,0,5,0,1,.,punct,1
6,6,1,0,0,1,Здесь,ru,5
7,7,1,1,0,1,стояла,ru,6
8,8,1,2,0,0,ваза,ru,4
9,9,1,3,0,1,",",punct,1


In [3]:
from tg.grammar_ru.ml.features import PyMorphyFeaturizer

# Run the pymorphy featurizer over the bundle, then read the 'pymorphy' frame
# from it: indexed by word_id, with the normal form and grammatical tags
# (POS, gender, number, case, ...) of every token.
pmf = PyMorphyFeaturizer()
pmf.featurize(db)
morphology_df = db.data_frames['pymorphy']
morphology_df

Unnamed: 0_level_0,normal_form,alternatives,score,delta_score,POS,animacy,gender,number,case,aspect,transitivity,person,tense,mood,voice,involvement
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,приглашение,2,0.578947,0.157895,NOUN,inan,neut,sing,nomn,,,,,,,
1,",",1,1.0,1.0,NONE,,,,,,,,,,,
2,оно,1,1.0,1.0,NPRO,,neut,sing,nomn,,,3per,,,,
3,быть,2,0.965311,0.930623,VERB,,neut,sing,,impf,intr,,past,indc,,
4,отклонить,1,1.0,1.0,PRTS,,neut,sing,,perf,,,past,,pssv,
5,.,1,1.0,1.0,NONE,,,,,,,,,,,
6,здесь,1,1.0,1.0,ADVB,,,,,,,,,,,
7,стоять,1,1.0,1.0,VERB,,femn,sing,,impf,intr,,past,indc,,
8,ваза,2,0.625,0.25,NOUN,inan,femn,sing,nomn,,,,,,,
9,",",1,1.0,1.0,NONE,,,,,,,,,,,


In [4]:
def _get_pronoun_filter():
    """Build a row selector for third-person singular pronouns.

    Returns:
        A callable that takes a frame with a ``normal_form`` column and
        returns a boolean mask that is true where the normal form is one of
        'он', 'она', 'оно'. Suitable for passing straight to ``.loc``.
    """
    pronoun_lemmas = ('он', 'она', 'оно')
    return lambda frame: frame.normal_form.isin(pronoun_lemmas)

def _get_candidate_filter():
    """Build a row selector for words that may serve as an antecedent.

    Returns:
        A callable that takes a frame with ``number`` and ``POS`` columns and
        returns a boolean mask that is true for singular words whose part of
        speech is one of the accepted tags listed below.
    """
    candidate_pos = (
        'NOUN', 'PRON', 'ADJF', 'ADJS', 'NPRO',
        'PRCL', 'PRTF', 'PRTS', 'ADVB',
    )
    return lambda frame: (frame.number == 'sing') & frame.POS.isin(candidate_pos)

In [5]:
# Pronoun rows of the morphology frame: keep gender and case, copy the
# word_id index into a regular column, and prefix every column with
# 'pronoun_' so the names stay distinct after merging with candidates.
pronouns_df = (
    morphology_df
    .loc[_get_pronoun_filter(), ['gender', 'case']]
    .assign(word_id=lambda frame: frame.index)
    .add_prefix('pronoun_')
)
pronouns_df

Unnamed: 0_level_0,pronoun_gender,pronoun_case,pronoun_word_id
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,neut,nomn,2
11,femn,accs,11
22,masc,nomn,22


In [6]:
# Antecedent-candidate rows of the morphology frame: keep the tags used as
# features, copy the word_id index into a regular column, and prefix every
# column with 'candidate_'.
antecedent_candidates_df = (
    morphology_df
    .loc[_get_candidate_filter(), ['POS', 'animacy', 'gender', 'case']]
    .assign(word_id=lambda frame: frame.index)
    .add_prefix('candidate_')
)
antecedent_candidates_df

Unnamed: 0_level_0,candidate_POS,candidate_animacy,candidate_gender,candidate_case,candidate_word_id
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,NOUN,inan,neut,nomn,0
2,NPRO,,neut,nomn,2
4,PRTS,,neut,,4
8,NOUN,inan,femn,nomn,8
11,NPRO,,femn,accs,11
15,NOUN,anim,masc,nomn,15
17,NOUN,inan,masc,accs,17
19,NOUN,inan,masc,gent,19
22,NPRO,,masc,nomn,22


In [7]:
# Cartesian product: one row for every (pronoun, candidate) combination.
merged_df = pd.merge(pronouns_df, antecedent_candidates_df, how='cross')
merged_df.head()

Unnamed: 0,pronoun_gender,pronoun_case,pronoun_word_id,candidate_POS,candidate_animacy,candidate_gender,candidate_case,candidate_word_id
0,neut,nomn,2,NOUN,inan,neut,nomn,0
1,neut,nomn,2,NPRO,,neut,nomn,2
2,neut,nomn,2,PRTS,,neut,,4
3,neut,nomn,2,NOUN,inan,femn,nomn,8
4,neut,nomn,2,NPRO,,femn,accs,11


In [8]:
# Keep only pairs where the candidate stands before the pronoun and agrees
# with it in gender; the gender columns are redundant afterwards and are
# dropped, and the row index is renumbered from zero.
merged_df = (
    merged_df
    .loc[lambda frame: (frame['candidate_word_id'] < frame['pronoun_word_id'])
                       & (frame['candidate_gender'] == frame['pronoun_gender'])]
    .drop(columns=['pronoun_gender', 'candidate_gender'])
    .reset_index(drop=True)
)
merged_df

Unnamed: 0,pronoun_case,pronoun_word_id,candidate_POS,candidate_animacy,candidate_case,candidate_word_id
0,nomn,2,NOUN,inan,nomn,0
1,accs,11,NOUN,inan,nomn,8
2,nomn,22,NOUN,anim,nomn,15
3,nomn,22,NOUN,inan,accs,17
4,nomn,22,NOUN,inan,gent,19


In [9]:
# Number each pronoun's candidates in reverse row order: the last-listed
# candidate of a pronoun gets 0, the one before it 1, and so on.
# NOTE(review): this is a rank among the remaining candidates, not a distance
# in tokens, and it relies on rows being ordered by candidate_word_id within
# each pronoun - confirm that ordering is guaranteed by the steps above.
merged_df['candidate_distance'] = merged_df.groupby(['pronoun_word_id']).cumcount(ascending=False)
merged_df

Unnamed: 0,pronoun_case,pronoun_word_id,candidate_POS,candidate_animacy,candidate_case,candidate_word_id,candidate_distance
0,nomn,2,NOUN,inan,nomn,0,0
1,accs,11,NOUN,inan,nomn,8,0
2,nomn,22,NOUN,anim,nomn,15,2
3,nomn,22,NOUN,inan,accs,17,1
4,nomn,22,NOUN,inan,gent,19,0


In [10]:
from tg.grammar_ru.ml.features import SlovnetFeaturizer

# Run the Slovnet featurizer over the same bundle and pick up the 'slovnet'
# frame from it.
slvnt = SlovnetFeaturizer()
slvnt.featurize(db)
# NOTE(review): read as db['slovnet'] here, while the pymorphy frame is read
# via db.data_frames['pymorphy'] - presumably equivalent; confirm and use one
# form consistently.
slovnet = db['slovnet']

In [11]:
# Display the Slovnet frame: per-word tags indexed by word_id, including the
# `relation` and `syntax_parent_id` columns used below.
slovnet

Unnamed: 0_level_0,POS,Animacy,Case,Gender,Number,Person,Aspect,Mood,Tense,VerbForm,Voice,Variant,Degree,Polarity,relation,syntax_parent_id
word_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,NOUN,Inan,Nom,Neut,Sing,,,,,,,,,,nsubj:pass,4
1,PUNCT,,,,,,,,,,,,,,punct,0
2,PRON,,Nom,Neut,Sing,3.0,,,,,,,,,nsubj:pass,4
3,AUX,,,Neut,Sing,,Imp,Ind,Past,Fin,Act,,,,aux:pass,4
4,VERB,,,Neut,Sing,,Perf,,Past,Part,Pass,Short,,,root,-1
5,PUNCT,,,,,,,,,,,,,,punct,4
6,ADV,,,,,,,,,,,,Pos,,advmod,7
7,VERB,,,Fem,Sing,,Imp,Ind,Past,Fin,Act,,,,root,-1
8,NOUN,Inan,Nom,Fem,Sing,,,,,,,,,,nsubj,7
9,PUNCT,,,,,,,,,,,,,,punct,13


In [142]:
# For every pronoun present in merged_df, take its syntax_parent_id from the
# Slovnet frame and attach it to each of that pronoun's candidate pairs as
# 'pronoun_parent_id'.
parent_ids = slovnet.loc[slovnet.index.isin(merged_df['pronoun_word_id']), 'syntax_parent_id']
parent_df = parent_ids.reset_index().rename(
    columns={'word_id': 'pronoun_word_id', 'syntax_parent_id': 'pronoun_parent_id'}
)
merged_df = pd.merge(merged_df, parent_df, on='pronoun_word_id')
merged_df

Unnamed: 0,pronoun_case,pronoun_word_id,candidate_POS,candidate_animacy,candidate_case,candidate_word_id,candidate_distance,pronoun_parent_id
0,nomn,2,NOUN,inan,nomn,0,0,4
1,accs,11,NOUN,inan,nomn,8,0,13
2,nomn,22,NOUN,anim,nomn,15,2,23
3,nomn,22,NOUN,inan,accs,17,1,23
4,nomn,22,NOUN,inan,gent,19,0,23


Пытаемся добавить признак на основе частотного словаря биграмм:

In [25]:
import os
import sys

# The bigram dictionary lives two directories above the working directory;
# abspath resolves the relative path against the current working directory.
file_path = os.path.abspath(
    os.path.join('..', '..', 'tg', 'grammar_ru', 'ml', 'features', 'bigrams.csv')
)
# The file is space-separated; the 'nans?' column is not needed and is dropped.
bigrams = pd.read_csv(file_path, sep=" ").drop(columns='nans?')
bigrams.head()

Unnamed: 0,first,second,abs,mil
0,и,в,1346,1083
1,в,том,1256,1010
2,российской,федерации,1253,1008
3,и,не,1139,916
4,а,также,1041,837


In [26]:
import pymorphy2

# Lemmatize both words of every bigram, taking the first parse variant
# returned by the analyzer, so they can be matched against normal forms.
analyzer = pymorphy2.MorphAnalyzer()
bigrams['first_norm'] = bigrams['first'].map(lambda word: analyzer.parse(word)[0].normal_form)
bigrams['second_norm'] = bigrams['second'].map(lambda word: analyzer.parse(word)[0].normal_form)
bigrams.head()

Unnamed: 0,first,second,abs,mil,first_norm,second_norm
0,и,в,1346,1083,и,в
1,в,том,1256,1010,в,тот
2,российской,федерации,1253,1008,российский,федерация
3,и,не,1139,916,и,не
4,а,также,1041,837,а,также


In [120]:
def filter(df, filter_df, required_col_name, index_col_name):
    """Look up one column of ``df`` for the ids listed in ``filter_df``.

    Args:
        df: Frame whose index holds the ids to search in.
        filter_df: Frame whose ``index_col_name`` column lists the wanted ids.
        required_col_name: Column of ``df`` to return.
        index_col_name: Column of ``filter_df`` with the ids; also the name
            given to the id column of the result.

    Returns:
        A frame with two columns, ``index_col_name`` (the matching index
        values of ``df``) and ``required_col_name``, one row per matching
        row of ``df``.
    """
    # NOTE: the name shadows the builtin ``filter``; it is kept so existing
    # callers continue to work.
    wanted_ids = filter_df[index_col_name]
    selected = df.loc[df.index.isin(wanted_ids), required_col_name]
    # Name the index after the id column before turning it into a column, so
    # the result does not depend on df's index being called 'word_id'.
    return selected.rename_axis(index_col_name).reset_index()

# Collect, for every (pronoun, candidate) pair, the normal form of the
# candidate and the normal form of the pronoun's syntactic parent.
pairs = merged_df.loc[:, ['pronoun_word_id', 'candidate_word_id', 'pronoun_parent_id']]

candidates = filter(morphology_df, pairs, 'normal_form', 'candidate_word_id')
parents = filter(morphology_df, pairs, 'normal_form', 'pronoun_parent_id')

# Both lookups contribute a 'normal_form' column, so pandas suffixes them:
# normal_form_x is the candidate's lemma, normal_form_y the parent's lemma.
pairs = (
    pairs
    .merge(candidates, on='candidate_word_id')
    .merge(parents, on='pronoun_parent_id')
)
pairs

Unnamed: 0,pronoun_word_id,candidate_word_id,pronoun_parent_id,normal_form_x,normal_form_y
0,2,0,4,приглашение,отклонить
1,11,8,13,ваза,разбить
2,22,15,23,слон,покатиться
3,22,17,23,мяч,покатиться
4,22,19,23,хобот,покатиться


In [121]:
def get_pair(first, second):
    """Mask the bigram rows whose lemmas both occur in the given pair columns.

    Args:
        first: Column of the global ``pairs`` frame matched against ``first_norm``.
        second: Column of the global ``pairs`` frame matched against ``second_norm``.

    Returns:
        Boolean Series over the global ``bigrams`` frame. The two membership
        checks are independent, so this is a coarse pre-selection rather than
        an exact pair match.
    """
    first_matches = bigrams['first_norm'].isin(pairs[first])
    second_matches = bigrams['second_norm'].isin(pairs[second])
    return first_matches & second_matches
# Pre-select the dictionary bigrams built from a candidate lemma and a parent
# lemma, in either word order.
dict_pairs = bigrams.loc[
    get_pair('normal_form_x', 'normal_form_y')
    | get_pair('normal_form_y', 'normal_form_x')
]
dict_pairs

Unnamed: 0,first,second,abs,mil,first_norm,second_norm
282642,отклонил,приглашение,1,0,отклонить,приглашение


In [122]:
# Look up each (candidate lemma, parent lemma) pair in the bigram dictionary,
# once per word order: the *_x columns hold the candidate-first match, the
# *_y columns the parent-first match; unmatched pairs get NaN.
#
# The dictionary is keyed by surface forms, so several of its rows can share
# one lemma pair. Merging those rows directly would duplicate rows of `pairs`
# and leave the counts split between the duplicates, so the dictionary rows
# are first collapsed to one row per lemma pair: counts are summed and the
# first surface form seen is kept as a representative.
_lemma_keys = ['first_norm', 'second_norm']
_aggregations = {
    column: 'sum' if column in ('abs', 'mil') else 'first'
    for column in dict_pairs.columns
    if column not in _lemma_keys
}
_lemma_pairs = (
    dict_pairs
    .groupby(_lemma_keys, as_index=False, sort=False)
    .agg(_aggregations)
    .reindex(columns=dict_pairs.columns)  # keep the original column order
)
pairs = pairs.merge(_lemma_pairs, how='left', left_on=['normal_form_x', 'normal_form_y'], right_on=['first_norm', 'second_norm'])
pairs = pairs.merge(_lemma_pairs, how='left', left_on=['normal_form_x', 'normal_form_y'], right_on=['second_norm', 'first_norm'])
pairs

Unnamed: 0,pronoun_word_id,candidate_word_id,pronoun_parent_id,normal_form_x,normal_form_y,first_x,second_x,abs_x,mil_x,first_norm_x,second_norm_x,first_y,second_y,abs_y,mil_y,first_norm_y,second_norm_y
0,2,0,4,приглашение,отклонить,,,,,,,отклонил,приглашение,1.0,0.0,отклонить,приглашение
1,11,8,13,ваза,разбить,,,,,,,,,,,,
2,22,15,23,слон,покатиться,,,,,,,,,,,,
3,22,17,23,мяч,покатиться,,,,,,,,,,,,
4,22,19,23,хобот,покатиться,,,,,,,,,,,,


In [130]:
import numpy as np

# Total dictionary count of the pair over both word orders; a missing match
# (NaN after the left merges) counts as zero.
pairs['dict_bigrams_count'] = pairs['abs_y'].fillna(0) + pairs['abs_x'].fillna(0)
result = pairs.loc[:, ['pronoun_word_id', 'candidate_word_id', 'dict_bigrams_count']]
result

Unnamed: 0,pronoun_word_id,candidate_word_id,dict_bigrams_count
0,2,0,1.0
1,11,8,0.0
2,22,15,0.0
3,22,17,0.0
4,22,19,0.0


In [143]:
# Attach the dictionary count to the feature table, matching on the
# (pronoun, candidate) pair.
merged_df = pd.merge(merged_df, result, on=['pronoun_word_id', 'candidate_word_id'])
merged_df

Unnamed: 0,pronoun_case,pronoun_word_id,candidate_POS,candidate_animacy,candidate_case,candidate_word_id,candidate_distance,pronoun_parent_id,dict_bigrams_count
0,nomn,2,NOUN,inan,nomn,0,0,4,1.0
1,accs,11,NOUN,inan,nomn,8,0,13,0.0
2,nomn,22,NOUN,anim,nomn,15,2,23,0.0
3,nomn,22,NOUN,inan,accs,17,1,23,0.0
4,nomn,22,NOUN,inan,gent,19,0,23,0.0
