In [16]:
from __future__ import print_function
from __future__ import division
from future import standard_library
import time
import sys
import os
import wget
import re
from ufal.udpipe import Model, Pipeline
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from scipy.cluster import *
import word2vec

def log_progress(sequence, every=10):
    """Yield the items of *sequence* while driving a notebook progress bar.

    An ``ipywidgets.IntProgress`` widget with ``max=len(sequence)`` is
    displayed, then refreshed on every ``every``-th item and once more after
    the last item so that a fully consumed sequence shows a full bar.

    :param sequence: iterable that supports ``len()``
    :param every: refresh the widget once per this many items
    :return: generator yielding the items of ``sequence`` in order
    """
    # Imported inside the function, so these packages are only required when
    # a progress bar is actually requested.
    from ipywidgets import IntProgress
    from IPython.display import display

    total = len(sequence)
    progress = IntProgress(min=0, max=total, value=0)
    display(progress)

    for index, record in enumerate(sequence):
        if index % every == 0:
            progress.value = index
        yield record

    # Without this the bar would stop at the last multiple of `every`
    # and never show completion.
    progress.value = total

In [17]:
def process(pipeline, text='коты и собаки бегали по грядке', keep_pos=True, keep_punct=False):
    """Lemmatize and POS-tag *text*, gluing runs of proper nouns together.

    The output of ``pipeline.process(text)`` is read as tab-separated rows;
    lines starting with ``#``, empty lines and rows that do not have exactly
    10 fields are ignored. Every remaining row contributes a
    ``'<lemma>_<POS>'`` entry. Consecutive ``PROPN`` rows sharing the same
    ``Case`` and ``Number`` features are merged into a single
    ``'<lemma1>::<lemma2>_PROPN '`` entry (note the trailing space, kept for
    compatibility). Digit-only ``NUM`` tokens are masked with ``num_replace``.

    A proper-noun run that is still open when the input ends is emitted as
    well (previously it was silently dropped).

    :param pipeline: object with a ``process(text)`` method returning the
        tagged text (presumably a UDPipe ``Pipeline`` with CoNLL-U output)
    :param text: raw text to analyse
    :param keep_pos: if False, strip the ``_<POS>`` suffix from every entry
    :param keep_punct: if False, drop entries tagged ``PUNCT``
    :return: list of strings, one per emitted entry
    """
    entities = {'PROPN'}
    named = False  # True while a proper-noun run is being accumulated
    memory = []  # lemmas of the current proper-noun run
    mem_case = None
    mem_number = None
    tagged_propn = []

    def flush_memory():
        # Emit the accumulated run as one entry and reset the buffer in place.
        tagged_propn.append('::'.join(memory) + '_PROPN ')
        del memory[:]

    # process the text; the result is tab-separated, one token per line:
    processed = pipeline.process(text)

    # skip lines holding service information:
    content = [l for l in processed.split('\n') if not l.startswith('#')]

    # split every non-empty line into its columns
    tagged = [w.split('\t') for w in content if w]

    for t in tagged:
        if len(t) != 10:
            continue
        (word_id, token, lemma, pos, xpos, feats, head, deprel, deps, misc) = t
        token = clean_token(token, misc)
        lemma = clean_lemma(lemma, pos)
        if not lemma or not token:
            continue
        if pos in entities:
            if '|' not in feats:
                tagged_propn.append('%s_%s' % (lemma, pos))
                continue
            # partition() never raises, even for a feature without '='
            morph = {key: value
                     for key, _, value in (el.partition('=') for el in feats.split('|'))}
            if 'Case' not in morph or 'Number' not in morph:
                tagged_propn.append('%s_%s' % (lemma, pos))
                continue
            if not named:
                named = True
                mem_case = morph['Case']
                mem_number = morph['Number']
            if morph['Case'] == mem_case and morph['Number'] == mem_number:
                memory.append(lemma)
                # A line break after the token closes the run.
                if 'SpacesAfter=\\n' in misc or 'SpacesAfter=\\s\\n' in misc:
                    named = False
                    flush_memory()
            else:
                named = False
                flush_memory()
                tagged_propn.append('%s_%s' % (lemma, pos))
        else:
            if named:
                # A non-PROPN token closes the run; the token itself is
                # emitted unchanged (no number masking in this branch).
                named = False
                flush_memory()
            elif pos == 'NUM' and token.isdigit():
                # Replace numbers with 'x' characters of the same length
                lemma = num_replace(token)
            tagged_propn.append('%s_%s' % (lemma, pos))

    if named:
        # Input ended in the middle of a proper-noun run: do not lose it.
        flush_memory()

    if not keep_punct:
        tagged_propn = [word for word in tagged_propn if word.split('_')[1] != 'PUNCT']
    if not keep_pos:
        tagged_propn = [word.split('_')[0] for word in tagged_propn]
    return tagged_propn
def num_replace(word):
    """Return a mask made of one ``'x'`` per character of *word*."""
    return ''.join('x' for _ in word)


def clean_token(token, misc):
    """
    Normalise a token taken from a tagged row.

    :param token: token (string)
    :param misc: contents of the CoNLL-U "MISC" field (string)
    :return: the token with surrounding whitespace and inner spaces removed,
        or None when the raw token is exactly 'Файл' and MISC contains
        'SpaceAfter=No'
    """
    glued_file_marker = token == 'Файл' and 'SpaceAfter=No' in misc
    if glued_file_marker:
        return None
    # Splitting on single spaces and re-joining removes every space.
    return ''.join(token.strip().split(' '))


def clean_lemma(lemma, pos):
    """
    Normalise a lemma taken from a tagged row.

    The lemma is stripped, lower-cased and freed of spaces and underscores.
    For anything but punctuation, one leading and one trailing angle quote
    and then one trailing '!', '?', ',' or '.' are removed as well.

    :param lemma: lemma (string)
    :param pos: part of speech (string)
    :return: cleaned lemma (string), or None when it contains '|' or ends
        with '.jpg' / '.png'
    """
    cleaned = lemma.strip().replace(' ', '').replace('_', '').lower()
    if '|' in cleaned or cleaned.endswith(('.jpg', '.png')):
        return None
    if pos == 'PUNCT':
        # Punctuation lemmas are returned untouched.
        return cleaned

    angle_quotes = ('«', '»')
    if cleaned.startswith(angle_quotes):
        cleaned = cleaned[1:]
    if cleaned.endswith(angle_quotes):
        cleaned = cleaned[:-1]
    if cleaned.endswith(('!', '?', ',', '.')):
        cleaned = cleaned[:-1]
    return cleaned


def list_replace(search, replacement, text):
    """Replace every item of *search* that occurs in *text* with *replacement*.

    Only items present in the text as it was passed in are considered, so
    characters that appear merely as a result of an earlier replacement are
    left alone.

    :param search: iterable of substrings (a string is treated per character)
    :param replacement: string substituted for each found item
    :param text: string to process
    :return: the processed string
    """
    source = text  # membership is always tested against the unmodified input
    for needle in search:
        if needle in source:
            text = text.replace(needle, replacement)
    return text


def unify_sym(text):  # takes a unicode string
    """Unify typographic variants in *text* and drop disallowed characters.

    The replacements are applied in a fixed order via ``list_replace``:
    quotes, dashes, hyphens, spaces, bullet-like dots, asterisk, ellipsis,
    tilde-like signs and Latin letters with diacritics. Finally every
    character outside the allowed set (currency signs, Cyrillic and Latin
    letters, digits, common punctuation and whitespace) is removed.

    The function only returns the result; it no longer prints the whole
    cleaned text to stdout on every call.

    :param text: unicode string to normalise
    :return: the normalised string
    """
    # Typographic quotes -> ASCII double quote.
    text = list_replace(
        '\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019', '\u0022', text)

    # Dashes and overlines -> two hyphens surrounded by U+2003.
    text = list_replace(
        '\u2012\u2013\u2014\u2015\u203E\u0305\u00AF', '\u2003\u002D\u002D\u2003', text)

    # Hyphen variants -> ASCII hyphen-minus.
    text = list_replace('\u2010\u2011', '\u002D', text)

    # Assorted space characters -> U+2002.
    text = list_replace(
        '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
        '\u2002', text)

    # Collapse doubled U+2003 and doubled tabs (a single pass each).
    text = re.sub('\u2003\u2003', '\u2003', text)
    text = re.sub('\t\t', '\t', text)

    # Bullet- and dot-like signs -> full stop.
    text = list_replace(
        '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
        '.', text)

    text = list_replace('\u2217', '\u002A', text)

    text = list_replace('…', '...', text)

    text = list_replace('\u2241\u224B\u2E2F\u0483', '\u223D', text)

    # Latin letters with diacritics -> plain Latin letters.
    latin_map = (
        ('\u00C4', 'A'), ('\u00E4', 'a'),
        ('\u00CB', 'E'), ('\u00EB', 'e'),
        ('\u1E26', 'H'), ('\u1E27', 'h'),
        ('\u00CF', 'I'), ('\u00EF', 'i'),
        ('\u00D6', 'O'), ('\u00F6', 'o'),
        ('\u00DC', 'U'), ('\u00FC', 'u'),
        ('\u0178', 'Y'), ('\u00FF', 'y'),
        ('\u00DF', 's'), ('\u1E9E', 'S'),
    )
    for accented, plain in latin_map:
        text = list_replace(accented, plain, text)

    currencies = (
        '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
    )

    # The backslash is written as '\\' (it used to be the invalid escape '\|');
    # the resulting characters are the same.
    alphabet = (
        '\t\n\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯ,.[]{}()=+-−*&^%$#@!~;:0123456789§/\\|"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ '
    )

    allowed = set(currencies) | set(alphabet) | {"'"}

    return ''.join(sym for sym in text if sym in allowed)


def process(pipeline, text='некий текст', keep_pos=True, keep_punct=False):
    """Return the cleaned lemma of the last token found in *text*.

    The output of ``pipeline.process(text)`` is read as tab-separated rows;
    lines starting with ``#``, empty lines and rows that do not have exactly
    10 fields are ignored. The lemma (third column) of the last remaining row
    is passed through ``clean_lemma`` together with its POS tag (fourth
    column) and returned.

    :param pipeline: object with a ``process(text)`` method returning the
        tagged text (presumably a UDPipe ``Pipeline`` with CoNLL-U output)
    :param text: raw text to analyse (callers in this file pass single words)
    :param keep_pos: accepted for signature compatibility; has no effect
    :param keep_punct: accepted for signature compatibility; has no effect
    :return: whatever ``clean_lemma`` returns for the last token row (a
        string or None), or None when the output has no 10-field row.
        That case used to raise ``UnboundLocalError``.
    """
    # process the text; the result is tab-separated, one token per line:
    processed = pipeline.process(text)

    lemma = None
    for line in processed.split('\n'):
        # skip empty lines and lines holding service information
        if not line or line.startswith('#'):
            continue
        fields = line.split('\t')
        if len(fields) != 10:
            continue
        # columns 2 and 3 are the lemma and the POS tag
        lemma = clean_lemma(fields[2], fields[3])
    return lemma
def tag_ud(text='некий текст'):
    """Run *text* through the UDPipe model and return the result of ``process``.

    The model file is looked up in the current working directory under the
    file name taken from the download URL and is downloaded with ``wget``
    when missing. The loaded model and the pipeline built from it are cached
    on the function object, so the model is read from disk only once per
    session instead of once per call.

    :param text: raw text to analyse
    :return: the value returned by ``process(pipeline, text=text)``
    :raises IOError: if the model file cannot be loaded
    """
    udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
    modelfile = udpipe_model_url.split('/')[-1]

    cached = getattr(tag_ud, '_cache', None)
    if cached is None:
        if not os.path.isfile(modelfile):
            print('UDPipe model not found. Downloading...', file=sys.stderr)
            wget.download(udpipe_model_url)

        model = Model.load(modelfile)
        if model is None:
            # Model.load signals failure by returning None rather than raising.
            raise IOError('Cannot load UDPipe model from %s' % modelfile)
        process_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

        # The model is stored next to the pipeline so that it stays
        # referenced for as long as the cached pipeline is in use.
        cached = (model, process_pipeline)
        tag_ud._cache = cached

    return process(cached[1], text=text)
#print (process(Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu'),"ксюша"))
#https://github.com/akutuzov/webvectors/blob/master/preprocessing/rusvectores_tutorial.ipynb

In [155]:
#model = word2vec.load('/Users/ivan/Downloads/182/withOutTypeModel.txt')

In [270]:
%%time
class wordsClass(object):
    """Map words to clusters of their embeddings and count cluster hits.

    Each instance owns its word -> cluster dictionary and its counter vector
    (they used to be class-level objects shared by every instance).
    """

    # Embedding model; indexed by word and, for clustering, read through its
    # ``vectors`` attribute. Kept at class level so that assigning
    # ``wordsClass.model`` or ``instance.model`` both work.
    model = None
    # Cluster centres: a torch tensor after loadCentroidsFromTXT, whatever
    # ``vq.kmeans`` returns after generateCentroids.
    centroids = None
    # Number of words recognised by textToVec so far.
    nomberKnownOfWords = 0

    def __init__(self, model=None, nomberOfClasters=2000, vectorSize=300):
        """
        :param model: optional embedding model (may also be assigned later)
        :param nomberOfClasters: length of the cluster counter vector
        :param vectorSize: embedding dimensionality, used to reshape centroid
            files that store all values in one row
        """
        if model is not None:
            self.model = model
        self.vectorSize = vectorSize
        self.ClasterOfWord = {}  # word -> cluster id
        self.vectorOfStr = torch.zeros(nomberOfClasters)  # hits per cluster
        self.nomberKnownOfWords = 0

    def _requireCentroids(self):
        # Fail with a clear message instead of an obscure error further down.
        if self.centroids is None:
            raise ValueError('centroids are not set; load or generate them first')
        return self.centroids

    def loadCentroidsFromTXT(self, source):
        """Read centroids from a whitespace-separated text file.

        Accepts one centroid per line as well as a single row holding all
        values; the latter is reshaped to ``(-1, vectorSize)``. The tensor is
        moved to the GPU only when CUDA is available.
        """
        data = np.loadtxt(source, dtype=float, ndmin=2)
        flat = data.shape[0] == 1 and data.shape[1] > self.vectorSize
        if flat and data.shape[1] % self.vectorSize == 0:
            data = data.reshape(-1, self.vectorSize)
        centroids = torch.from_numpy(data)
        if torch.cuda.is_available():
            centroids = centroids.cuda()
        self.centroids = centroids

    def saveCentroidsToTXT(self, source):
        """Write the centroids to *source*, one centroid per line."""
        centroids = self._requireCentroids()
        if isinstance(centroids, torch.Tensor):
            # str() of a tensor element is 'tensor(...)', which cannot be
            # read back; convert to plain numbers first.
            centroids = centroids.detach().cpu().numpy()
        np.savetxt(source, np.asarray(centroids, dtype=float), delimiter=' ')

    def saveDict(self, source):
        """Write the word -> cluster dictionary as '<word> <cluster>' lines."""
        with open(source, 'w') as f:
            for k, v in self.ClasterOfWord.items():
                f.write(str(k) + " " + str(v) + "\n")

    def loadDict(self, source):
        """Read '<word> <cluster>' lines written by saveDict.

        Cluster ids are stored as ints (they used to be kept as strings with
        a trailing newline). Lines without two fields are skipped.
        """
        with open(source, 'r') as f:
            for line in f:
                parts = line.rsplit(None, 1)
                if len(parts) != 2:
                    continue
                self.ClasterOfWord[parts[0]] = int(parts[1])

    def generateCentroids(self, nomberOfClasters, NomberOfIter):
        """Cluster ``self.model.vectors`` with k-means and keep the codebook."""
        self.centroids = vq.kmeans(self.model.vectors, nomberOfClasters, iter=NomberOfIter)[0]

    def generateDict(self, source):  # source = txt file of the model
        """Assign every word listed in *source* to its nearest centroid.

        The first whitespace-separated field of each non-empty line is taken
        as a word. Words the model raises ``KeyError`` for are skipped
        (presumably this also covers a numeric header line - verify against
        the model file format).
        """
        with open(source, 'r') as f:
            words = [line.split()[0] for line in f if line.strip()]

        centroids = torch.as_tensor(self._requireCentroids())
        for word in log_progress(words, every=100):
            try:
                vector = self.model[word]
            except KeyError:
                continue
            # Bring the word vector to the dtype/device of the centroids so
            # the subtraction works for numpy and torch inputs alike.
            vector = torch.as_tensor(vector, dtype=centroids.dtype, device=centroids.device)
            distToCentroids = torch.sum((centroids - vector) ** 2, 1)
            self.ClasterOfWord[word] = int(torch.argmin(distToCentroids, 0))

    def distanceBetween2words(self, word1, word2):
        """Return ``self.model.distance`` for the lemmas of the two words.

        Four spellings are tried in order: both lower-cased, first
        capitalised, second capitalised, both capitalised. The error of the
        last attempt propagates if all of them fail.
        """
        word1 = word1.lower()
        word2 = word2.lower()
        attempts = (
            (word1, word2),
            (word1.capitalize(), word2),
            (word1, word2.capitalize()),
            (word1.capitalize(), word2.capitalize()),
        )
        for first, second in attempts[:-1]:
            try:
                return self.model.distance(tag_ud(text=first), tag_ud(text=second))
            except Exception:
                # Best effort: any failure just moves on to the next spelling.
                continue
        first, second = attempts[-1]
        return self.model.distance(tag_ud(text=first), tag_ud(text=second))

    def coordinate(self, word):
        """Return the embedding of *word* as a torch tensor.

        The lower-cased spelling is tried first, then the capitalised one;
        the error of the second lookup propagates when both fail.
        """
        word = word.lower()
        try:
            coord = self.model[tag_ud(text=word)]
        except Exception:
            coord = self.model[tag_ud(text=word.capitalize())]
        return torch.from_numpy(coord)

    def getKey(self, value):
        """Print and return every word assigned to cluster *value*."""
        found = [k for k, v in self.ClasterOfWord.items() if v == value]
        for k in found:
            print(k)
        return found

    def textToVec(self, text):
        """Count the cluster of every known word of *text* in vectorOfStr.

        A word is looked up as written and, failing that, by the lemma
        returned by ``tag_ud``. Each hit increments ``vectorOfStr`` at the
        cluster index and ``nomberKnownOfWords``; unknown words are reported
        on stdout.

        :return: always 0 (kept for compatibility)
        """
        words = re.findall(r'[A-Za-zА-Яа-яЁё-]+', text)
        for word in words:
            relationID = self.ClasterOfWord.get(word)
            if relationID is None:
                try:
                    relationID = self.ClasterOfWord.get(tag_ud(text=word))
                except Exception:
                    # Best effort, as before: a failing tagger means "unknown".
                    relationID = None
            if relationID is None:
                print("the word is new: " + word)
                continue
            relationID = int(relationID)  # tolerates ids stored as strings
            self.vectorOfStr[relationID] += 1
            self.nomberKnownOfWords = self.nomberKnownOfWords + 1
            print(relationID)
        return 0
# TODO: add handling of capitalised words
# NOTE: the next line rebinds the name `wordsClass` from the class to an
# instance of it; after that the class itself is no longer reachable under
# this name.
wordsClass = wordsClass()
# Fill the word -> cluster dictionary from a file in the working directory.
wordsClass.loadDict('dict words has clasters.txt')

# Example calls, left commented out:
#wordsClass.textToVec("серый синий красный черный ")
#wordsClass.textToVec("hi")
#wordsClass.ClasterOfWord('DVD')
#wordsClass.getKey(1639)

1671
1671
1671
1639
15
CPU times: user 195 ms, sys: 25.8 ms, total: 220 ms
Wall time: 207 ms
