In [1]:
import logging
logging.root.handlers = []  # Jupyter messes up logging so needs a reset
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
from smart_open import smart_open
import pandas as pd
import numpy as np
import gensim
import nltk
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model

%matplotlib inline

2017-03-30 20:37:50,462 : INFO : 'pattern' package not found; tag filters are not available for English


In [3]:
wv = Word2Vec.load_word2vec_format(
    '../meetin/models/ruscorpora_russe.model.bin.gz',
    binary=True)
wv.init_sims(replace=True)

2017-03-30 20:37:51,173 : INFO : loading projection weights from ../meetin/models/ruscorpora_russe.model.bin.gz
2017-03-30 20:38:10,879 : INFO : loaded (374526, 300) matrix from ../meetin/models/ruscorpora_russe.model.bin.gz
2017-03-30 20:38:10,879 : INFO : precomputing L2-norms of word weight vectors


In [4]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)
        else:
            if word == '_WORD_':
                mean.append(np.full((wv.layer1_size,), -1, dtype=float))

            if word == '_0_':
                mean.append(np.full((wv.layer1_size,), 0, dtype=float))

            if word == '_NUM_':
                mean.append(np.full((wv.layer1_size,), -3, dtype=float))
    
    if not mean:
        return np.full((wv.layer1_size,), 0)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, word) for word in text_list ])

In [5]:
from nltk.corpus import stopwords
nltk.download("stopwords")

import pymorphy2

morph = pymorphy2.MorphAnalyzer()


def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for i, word in enumerate(nltk.word_tokenize(sent)):
            tokens.append(word if i > 0 else word.lower())
    return tokens

2017-03-30 20:39:08,383 : INFO : Loading dictionaries from /usr/local/lib/python3.5/site-packages/pymorphy2_dicts/data
2017-03-30 20:39:08,442 : INFO : format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/enload/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

with open('data/ruscorpora.parsed.txt', 'r') as myfile:
    sentences = [s.split(';\n') for s in myfile.read().split('_')]
    
    
def get_word(s, index):
    if index < 0 or index >= len(s):
        return '_0_'
    
    word = s[index]

    if word.isdigit():
        word = '_NUM_'

    return word.split('/')[0]
    
data_lex = {}
for s in sentences:
    index = 0
    
    for word in s:
        parts = word.split('/')
        
        word = parts[0].lower()
        
        if len(parts) < 3:
            print(parts)
            continue
            
        tags = parts[2].replace(';', '').split(',')
        
        context = []
        
        for i in range(1, 3):
            context += [get_word(s, index - i)]
        
        context += ['_WORD_']
        
        for i in range(1, 3):
            context += [get_word(s, index + i)]
            
        context = ' '.join(context)
        
        index += 1
        
        if word not in data_lex:
            data_lex[word] = []
            
        data_lex[word] += [{
            'tags': tags,
            'context': context
        }]
        
print(data_lex['косой'])

2017-03-30 20:39:10,145 : INFO : Loading dictionaries from /usr/local/lib/python3.5/site-packages/pymorphy2_dicts/data
2017-03-30 20:39:10,202 : INFO : format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


['\n']
[{'context': 'и губой _WORD_ улыбочкой так', 'tags': ['A=f', 'sg', 'ins', 'plen']}, {'context': 'совершенно Правый _WORD_ устрашающий глаз', 'tags': ['A=sg', 'm', 'nom', 'plen']}, {'context': 'глаз один _WORD_ открылся и', 'tags': ['A=sg', 'm', 'nom', 'plen']}, {'context': 'светлой толстой _WORD_ и маленьким', 'tags': ['S', 'f', 'inan=sg', 'ins']}, {'context': 'с богатыря _WORD_ через спину', 'tags': ['S', 'f', 'inan=sg', 'ins']}]


In [7]:
'''
import json

data = json.loads(open('data/data_lex_final.json').read())

data_lex = {}

for word in data:
    [lemma] = word.keys()
    [params] = word.values()
    
    if lemma not in data_lex:
        data_lex[lemma] = []
    
    data_lex[lemma] += [params]
'''

"\nimport json\n\ndata = json.loads(open('data/data_lex_final.json').read())\n\ndata_lex = {}\n\nfor word in data:\n    [lemma] = word.keys()\n    [params] = word.values()\n    \n    if lemma not in data_lex:\n        data_lex[lemma] = []\n    \n    data_lex[lemma] += [params]\n"

In [21]:
data_lex_vectors = {}

for word, params in list(data_lex.items()):
    contexts = [w2v_tokenize_text(x['context']) for x in params]
    data_lex_vectors[word] = word_averaging_list(wv, contexts)

In [33]:
from scipy import spatial

vectors_trees = {}


'''
just build all vectors, but we will build them lazy instead

for word, vectors in data_lex_vectors.items():
    vectors_trees[word] = spatial.KDTree(vectors)
'''

'\njust build all vectors, but we will build them lazy instead\n\nfor word, vectors in data_lex_vectors.items():\n    vectors_trees[word] = spatial.KDTree(vectors)\n'

In [36]:
import pymorphy2

morph = pymorphy2.MorphAnalyzer()


def match(word, context):
    context_tokenized = w2v_tokenize_text(context)
    context_normalized = [morph.normal_forms(w)[0] for w in context_tokenized]
    
    context_vector = word_averaging_list(wv, [context_tokenized])
    
    if word in vectors_trees:
        tree = vectors_trees[word]
    else:
        tree = spatial.KDTree(data_lex_vectors[word])
    
    dist, index = tree.query(context_vector)
    
    return data_lex[word][index]['tags']

2017-03-30 21:01:05,271 : INFO : Loading dictionaries from /usr/local/lib/python3.5/site-packages/pymorphy2_dicts/data
2017-03-30 21:01:05,352 : INFO : format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


In [37]:
 match('берег', 'людей на _WORD_ _0_ _0_')

['S', 'm', 'inan=sg', 'acc']

In [42]:
 match('берегу', 'я _0_ _WORD_ свои воспоминания')

['S', 'm', 'inan=sg', 'loc2']

In [45]:
import re
import analyzer

def tokenize(text):
    return [word.lower() for word in re.split(r'(?!\b-\b)\W', text) if word]


def desambiguate_sent(sent):
    words = tokenize(sent)
    ambig = []
    poses = []
    result = []
    for word in words:
        tags = morph.parse(word)[:3]
        pos = []
        for t in tags:
            if t.tag.POS not in pos:
                if t.tag.POS is None:
                    pos.append('UNKN')
                else:
                    pos.append(t.tag.POS)
        if len(pos) > 1:
            ambig.append(True)
            poses.append('/'.join(pos))
        else:
            poses.append(pos[0])
            ambig.append(False)
    for i in range(len(ambig)):
        if ambig[i] is True:
            context_lex = analyzer.make_context(words, i)
            context_morph = analyzer.make_context(poses, i)
            result.append([words[i], context_lex, poses[i], context_morph])
    return result

2017-03-30 21:03:05,013 : INFO : Loading dictionaries from /usr/local/lib/python3.5/site-packages/pymorphy2_dicts/data
2017-03-30 21:03:05,078 : INFO : format: 2.4, revision: 393442, updated: 2015-01-17T16:03:56.586168


In [47]:
sentences = desambiguate_sent('мы стали печь пирог')

sentences

[['стали',
  '_0_ мы _WORD_ печь пирог',
  'VERB/NOUN',
  '_0_ NPRO _WORD_ INFN/NOUN NOUN'],
 ['печь',
  'мы стали _WORD_ пирог _0_',
  'INFN/NOUN',
  'NPRO VERB/NOUN _WORD_ NOUN _0_']]

In [48]:
for s in sentences:
    [word, context, *rest] = s
    print(word, '-', match(word, context))

стали - ['S', 'f', 'inan=sg', 'gen']
печь - ['S', 'f', 'inan=sg', 'acc']
