In [1]:
import xml.etree.ElementTree as ET
import collections
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pprint

In [2]:
def get_contexts(xml_file, left_window=1, right_window=2):
    """Collect word windows around prepositions and count preposition lemmas.

    The XML is parsed with ElementTree. Sentences are the elements tagged
    'se' found two levels below the first child of the root. Every child of
    a sentence is treated as a token whose first child carries the
    annotation attributes (the code reads 'gr' and 'lex' from it).

    Args:
        xml_file: File name or file object accepted by ``ET.parse``.
        left_window: Number of tokens to keep before the preposition.
        right_window: Number of tokens to keep after the preposition.

    Returns:
        A tuple ``(contexts, all_prepositions)``:
        * contexts -- list of windows; each window is a list of attribute
          dicts of length ``left_window + 1 + right_window`` with the
          preposition at index ``left_window``.
        * all_prepositions -- list of ``(lex, count)`` pairs sorted by count,
          highest first. Every preposition is counted, including those
          whose window does not fit inside the sentence.

    Raises:
        ValueError: If either window size is negative.
    """
    if left_window < 0 or right_window < 0:
        raise ValueError('left_window and right_window must be non-negative')

    tree = ET.parse(xml_file)
    root = tree.getroot()
    all_prepositions = collections.Counter()
    contexts = []

    for paragr in root[0]:
        for sent in paragr:
            if sent.tag != 'se':
                continue
            # Annotation attributes of every token in the sentence.
            words = [token[0].attrib for token in sent]
            for word_id, cur_word in enumerate(words):
                if cur_word['gr'] != 'PR':
                    continue
                # counting all prepositions in corpus
                all_prepositions[cur_word['lex']] += 1
                start = word_id - left_window
                stop = word_id + right_window
                # Skip prepositions whose window would cross a sentence edge.
                if start < 0 or stop > len(words) - 1:
                    continue
                # One slice yields left context, preposition, right context.
                # This stays correct when both window sizes are equal or when
                # left_window is 0, which the identity-based branching
                # (``is``) used previously did not handle.
                contexts.append(words[start:stop + 1])

    # most_common() sorts by count, highest first; ties keep first-seen order.
    return contexts, all_prepositions.most_common()

In [3]:
# Context window: one token before the preposition and two tokens after it.
left_window, right_window = 1, 2
contexts, all_prepositions = get_contexts(
    'discussion.xml',
    left_window=left_window,
    right_window=right_window,
)

In [4]:
# Restrict further work to prepositions that occur at least 10 times.
top_prepositions = [prep for prep in all_prepositions if prep[1] >= 10]
# Build the lemma lookup once as a set rather than rebuilding a list of
# lemmas for every phrase being filtered.
top_lemmas = {lemma for lemma, _count in top_prepositions}
# Each context is indexed at left_window to reach its preposition.
contexts = [phrase for phrase in contexts if phrase[left_window]['lex'] in top_lemmas]
pprint.pprint(top_prepositions)

[('в', 268),
 ('на', 111),
 ('с', 82),
 ('по', 58),
 ('о', 46),
 ('из', 37),
 ('к', 33),
 ('для', 32),
 ('за', 22),
 ('у', 20),
 ('от', 20),
 ('до', 20),
 ('во', 19),
 ('об', 16),
 ('при', 15),
 ('после', 12),
 ('между', 10)]


In [5]:
# Number of contexts left after keeping only the frequent prepositions.
len(contexts)

708

In [6]:
# Workflow for a single preposition ("в")

In [7]:
# Contexts whose preposition lemma is 'в'.
sub_corpus = [
    context
    for context in contexts
    if context[left_window]['lex'] == 'в'
]

In [8]:
# Tally the semantic classes ('sem') of verbs found in the first slot of each
# context, and set aside the phrases whose verb is tagged as a verb of
# movement ('t:move ' -- the trailing space is part of the tag as it appears
# in the printed counter).
# NOTE(review): phrase[0] is the token directly before the preposition only
# while left_window == 1 -- confirm if the window is ever widened.
movement_phrases = []
verb_counter = collections.Counter()
for phrase in sub_corpus:
    # Skip non-verbs and verbs that carry no semantic annotation.
    if phrase[0]['gr'][0] != 'V' or 'sem' not in phrase[0]:
        continue
    verb_counter[phrase[0]['sem']] += 1
    if phrase[0]['sem'] == 't:move ':
        movement_phrases.append(phrase)

pprint.pprint(verb_counter)

Counter({'t:move ': 10,
         't:be:exist ': 5,
         't:poss ': 4,
         't:changest ': 2,
         't:perc ': 2,
         't:speech ': 2,
         't:ment ': 2,
         't:sound ': 2,
         't:be:appear ': 1,
         't:light ': 1})


In [9]:
# Show the full contexts whose leading verb is tagged 't:move '.
pprint.pprint(movement_phrases)

[[{'gr': 'V ipf tran pl act praes 3p indic',
   'lex': 'проходить',
   'sem': 't:move '},
  {'gr': 'PR', 'lex': 'в'},
  {'gr': 'S n inan sg acc', 'lex': 'течение', 'sem': 't:move r:abstr '},
  {'gr': 'APRO m sg gen', 'lex': 'весь', 'sem': 'r:spec '}],
 [{'gr': 'V pf intr med pl praet indic', 'lex': 'разойтись', 'sem': 't:move '},
  {'gr': 'PR', 'lex': 'в'},
  {'gr': 'S f inan sg loc', 'lex': 'оценка', 'sem': 't:ment r:abstr '},
  {'gr': 'APRO', 'lex': 'его', 'sem': 'r:poss '}],
 [{'gr': 'V pf partcp praet pass brev pl', 'lex': 'подвести', 'sem': 't:move '},
  {'gr': 'PR', 'lex': 'в'},
  {'gr': 'A sg m loc plen', 'lex': 'обычный', 'sem': 'r:rel '},
  {'gr': 'S m inan sg loc', 'lex': 'стиль', 'sem': 'pt:aggr r:abstr '}],
 [{'gr': 'V pf intr act sg fut 1p indic', 'lex': 'уехать', 'sem': 't:move '},
  {'gr': 'PR', 'lex': 'в'},
  {'gr': 'S f inan sg acc', 'lex': 'суббота', 'sem': 't:time:week r:abstr '},
  {'gr': 'V ipf intr act m sg praet indic', 'lex': 'воевать'}],
 [{'gr': 'V pf tran act