In [41]:
import numpy as np
from bs4 import BeautifulSoup
from heapq import nlargest
import pandas as pd
from CharVectorizer import CharVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm_notebook as tqdm
import re
from collections import defaultdict

**Загружаем таблицы с признаками**

In [3]:
# Both feature tables are read header-less and in chunks of one row, so
# iterating a reader yields one single-row DataFrame per CSV line.
cons_table = pd.read_csv(
    'cons_table.csv',
    encoding='UTF-8',
    na_filter=True,
    header=None,
    chunksize=1,
)
vocs_table = pd.read_csv(
    'voc_table.csv',
    encoding='UTF-8',
    na_filter=True,
    header=None,
    chunksize=1,
)

# Materialise the chunk iterators into lists of single-row DataFrames.
cons_line = [chunk for chunk in cons_table]
vocs_line = [chunk for chunk in vocs_table]

In [4]:
# Maps a letter to its feature vector; filled in by the vectorisation cell.
letter_vec = dict()

**Векторизуем признаки. Словарь формата {'буква': вектор признаков}**

In [5]:
vectorizer = CharVectorizer("bƀcdðfghjklmnpqrstuvwxzȥaâáäæiîíúüûyoöœóôeéêę")


def _add_letter_vectors(rows):
    """Vectorise every letter described by `rows` and store it in `letter_vec`.

    Parameters
    ----------
    rows : iterable of single-row DataFrames
        One element per table line (the tables are read with ``chunksize=1``).
        Column 0 holds the letter; columns 1-3 hold strings that are
        concatenated into the text passed to the vectorizer.

    Side effects
    ------------
    Adds (or overwrites) ``letter_vec[letter]`` for every row.
    """
    for row in rows:
        # .iloc[0] takes the only row of the chunk by position, so the code no
        # longer depends on the chunk's index label matching its list position.
        letter = row[0].iloc[0]
        features = (row[1] + row[2] + row[3]).iloc[0]
        # 100 is passed through unchanged as transform's second argument
        # (its meaning is defined by CharVectorizer -- TODO confirm).
        letter_vec[letter] = vectorizer.transform([features], 100)


# Vowels first, then consonants (same order as before).
_add_letter_vectors(vocs_line)
_add_letter_vectors(cons_line)

# Sanity check, shown as the cell output: both letters must have a vector.
letter_vec['a'].sum(), letter_vec['y'].sum()

**Иголка (выравнивание по алгоритму Нидлмана — Вунша)**

In [6]:
# Alignment scoring constants:
#   match     - identical symbols
#   mismatch  - different symbols (coarse score used while building the matrix)
#   gap       - symbol aligned with a gap
#   dif_first - special penalty, see needle()
pt = {'match': 1.5, 'mismatch': -1, 'gap': -1, 'dif_first': -2}


def mch(alpha, beta):
    """Score one aligned pair of symbols.

    Parameters
    ----------
    alpha, beta : str
        Single characters; '-' denotes a gap.

    Returns
    -------
    (sim, fake_sim) : tuple
        ``sim`` is the fine-grained score: ``pt['match']`` for identical
        symbols, ``pt['gap']`` for a gap (0 when the gap is opposite 'h'),
        otherwise ``cosine_similarity - 1`` of the two letters' feature
        vectors. ``fake_sim`` is the coarse score used for the alignment
        matrix; it differs from ``sim`` only for mismatches, where it is
        ``pt['mismatch']``.

    Raises
    ------
    KeyError
        If two different non-gap symbols are compared and one of them has no
        entry in ``letter_vec``.
    """
    if alpha == beta:
        return pt['match'], pt['match']

    if alpha == '-' or beta == '-':
        # A gap opposite 'h' costs nothing.
        if alpha == 'h' or beta == 'h':
            return 0, 0
        return pt['gap'], pt['gap']

    # Read the single similarity value numerically instead of parsing the
    # array's string form, which loses precision and depends on numpy's
    # print options.
    similarity = cosine_similarity(letter_vec[alpha], letter_vec[beta])
    sim = float(similarity[0, 0]) - 1
    return sim, pt['mismatch']


def needle(s1, s2):
    """Globally align two strings and return the per-column scores.

    The alignment matrix is filled with the coarse scores (``mch(...)[1]``);
    the returned list holds the fine-grained score (``mch(...)[0]``) of every
    column of the resulting alignment.

    Parameters
    ----------
    s1, s2 : str
        Strings to align.

    Returns
    -------
    list
        One score per alignment column. The last score gets a bonus of 2 when
        either aligned string ends in two gaps, or 1 when it ends in exactly
        one gap.

    Raises
    ------
    KeyError
        Propagated from ``mch`` for letters missing from ``letter_vec``.
    RuntimeError
        If the traceback cannot explain a matrix cell (cannot happen with the
        exactly representable values currently in ``pt``).
    """
    m, n = len(s1), len(s2)
    score = np.zeros((m + 1, n + 1))

    # Initialisation: a prefix aligned against nothing is all gaps.
    for i in range(m + 1):
        score[i][0] = pt['gap'] * i
    for j in range(n + 1):
        score[0][j] = pt['gap'] * j

    # Fill.
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if i == 1 and j == 1 and s1[0] != s2[0]:
                # Differing first characters get their own penalty.
                diag = score[0][0] + pt['dif_first']
            else:
                diag = score[i-1][j-1] + mch(s1[i-1], s2[j-1])[1]
            delete = score[i-1][j] + pt['gap']
            insert = score[i][j-1] + pt['gap']
            score[i][j] = max(diag, delete, insert)

    # Traceback (columns are collected back to front). The order of the
    # checks decides ties: gap in s1, gap in s2, then the diagonal.
    cols1, cols2 = [], []
    i, j = m, n
    while i > 0 and j > 0:
        current = score[i][j]
        if current == score[i][j-1] + pt['gap']:
            a1, a2 = '-', s2[j-1]
            j -= 1
        elif current == score[i-1][j] + pt['gap']:
            a1, a2 = s1[i-1], '-'
            i -= 1
        elif current == score[i-1][j-1] + mch(s1[i-1], s2[j-1])[1]:
            a1, a2 = s1[i-1], s2[j-1]
            i, j = i - 1, j - 1
        elif (i == 1 and j == 1 and s1[0] != s2[0]
              and current == score[0][0] + pt['dif_first']):
            a1, a2 = s1[0], s2[0]
            i, j = 0, 0
        else:
            # Previously this case left i and j unchanged and looped forever.
            raise RuntimeError(
                'traceback failed at cell ({}, {})'.format(i, j))
        cols1.append(a1)
        cols2.append(a2)

    # Whatever is left of either string is aligned against gaps.
    while i > 0:
        cols1.append(s1[i-1])
        cols2.append('-')
        i -= 1
    while j > 0:
        cols1.append('-')
        cols2.append(s2[j-1])
        j -= 1

    align1 = ''.join(reversed(cols1))
    align2 = ''.join(reversed(cols2))

    true_score = []
    for idx, (a1, a2) in enumerate(zip(align1, align2)):
        # NOTE(review): the fill step applies 'dif_first' to the first pair of
        # characters, while here it is applied to a non-identical *second*
        # column (idx == 1). Kept unchanged to preserve existing scores --
        # confirm whether idx == 0 was intended.
        if a1 != a2 and idx == 1:
            true_score.append(pt['dif_first'])
        else:
            true_score.append(mch(a1, a2)[0])

    # Bonus for trailing gaps. The single-gap case used to compare a
    # one-character slice with '--' and therefore never fired.
    if align1[-2:] == '--' or align2[-2:] == '--':
        true_score[-1] += 2
    elif align1[-1:] == '-' or align2[-1:] == '-':
        true_score[-1] += 1
    return true_score

In [7]:
# Example call: the per-column score list for aligning these two strings.
needle('argikmuk', 'aerbgze')

[1.5, -2, 1.5, -1, 1.5, -0.22999999999999998, -0.20999999999999996, -1, -1, 1]

In [8]:
# Total alignment score of 'folkes' against two different candidates.
sum(needle('folkes', 'folgen')), sum(needle('folkes', 'folk'))

(5.71, 6.0)

**Данные для кластеризации**

In [13]:
# Read the space-separated word list. The encoding is given explicitly, as for
# the other text files in this notebook, so the result does not depend on the
# platform's default encoding.
with open('norm_words.txt', encoding='UTF-8') as f:
    words = f.read().split(' ')

In [9]:
# Read the prefix table; the context manager closes the file handle.
with open('prefix_with_gi.csv', encoding='UTF-8') as prefix_file:
    prefix_text = prefix_file.read()

# One entry per non-empty line: commas become spaces and every run of
# whitespace collapses to a single space.
prefs = []
for raw_line in prefix_text.split('\n'):
    normalized = ' '.join(raw_line.replace(',', ' ').split())
    # Test the *normalised* text: a line holding only commas or whitespace
    # used to add '' to prefs, and '' is a substring of every word.
    if normalized:
        prefs.append(normalized)

In [10]:
# Parse the dictionary page; BeautifulSoup reads the whole file while it is
# being constructed, so the handle can be closed right afterwards.
with open('dictionary.html', encoding="UTF-8") as html:
    dictionary = BeautifulSoup(html, 'html.parser')

# Collect the text of every <strong> tag, cleaned of parenthesised parts and
# of all non-word characters.
tokenized_dict = []
for txt in dictionary.find_all('strong'):
    entry = re.sub(r'\(.*?\)', '', txt.text)
    entry = re.sub(r'\W', '', entry)
    tokenized_dict.append(entry)

In [35]:
# Remove the first matching prefix from every word.
#
# The previous version used `prefix in word` plus `word.strip(prefix)`.
# str.strip treats its argument as a *set of characters* and removes them from
# both ends, so it could eat unrelated leading and trailing letters, and a
# prefix found in the middle of a word stopped the search without removing
# anything.
norm_words = []
for lemma_raw in words:
    lemma = lemma_raw
    for prefix in prefs:
        # Only a real, non-empty leading prefix counts, and a word that
        # consists of nothing but the prefix is kept unchanged.
        if prefix and lemma_raw.startswith(prefix) and len(lemma_raw) > len(prefix):
            lemma = lemma_raw[len(prefix):]
            break
    norm_words.append(lemma)
# Drop empty strings (e.g. produced by consecutive spaces in the input).
norm_words = list(filter(None, norm_words))

In [11]:
# Number of entries collected from the <strong> tags.
len(tokenized_dict)

3840

**Обработка и запись**

In [28]:
# Total number of entries versus number of distinct entries.
len(tokenized_dict), len(set(tokenized_dict))

(3840, 3698)

In [39]:
# Drop duplicate entries; the resulting order is arbitrary at this point.
tokenized_dict = list(set(tokenized_dict))

In [40]:
# Sort the entries in place.
tokenized_dict.sort()

In [44]:
# Bucket the dictionary entries by first letter so that a word is only
# compared against entries starting with the same letter.
# The key is lowercased because the lookup uses the first character of a
# lowercased word and the comparison lowercases the entry as well; with the
# raw first character as key, capitalised entries could never be reached.
TD = defaultdict(list)
for entry in tokenized_dict:
    if entry:
        TD[entry[0].lower()].append(entry)

In [None]:
# For every word, score it against all dictionary entries that share its first
# letter and differ in length by at most 4 characters.
#   dict_with_norms: word -> {entry: total alignment score}
#   result:          word -> best-scoring entry
result = {}
dict_with_norms = {}
for raw_word in tqdm(norm_words):
    word = raw_word.lower()
    if not word:
        print (word)
        continue
    if word in dict_with_norms:
        # Repeated word: scoring is deterministic, so the stored result stands.
        continue
    try:
        scores = {}
        for entry in tqdm(TD[word[0]], leave=False):
            if abs(len(word) - len(entry)) <= 4:
                scores[entry] = sum(needle(word, entry.lower()))
    except Exception as exc:
        # Catch Exception rather than everything: a bare `except:` also
        # swallowed KeyboardInterrupt, so the loop could not be stopped, and
        # it hid the reason for the failure.
        print ('Exc', word, repr(exc))
        continue
    if scores:
        dict_with_norms[word] = scores
        result[word] = max(scores, key=scores.get)

In [51]:
# Invert `result`: dictionary entry -> list of words whose best match it is.
true_result = {}
for form, best_entry in result.items():
    true_result.setdefault(best_entry, []).append(form)

In [53]:
import csv
# newline='' is required when handing a file to csv.writer; without it the
# writer's own line endings get translated again and blank lines can appear
# between rows on some platforms.
with open('result.csv', mode='w', encoding = 'UTF-8', newline='') as f:
    writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # One row per dictionary entry; the second cell is the Python list of the
    # words assigned to it, written as its string representation.
    for entry, forms in true_result.items():
        writer.writerow([entry, forms])