https://tiefenauer.github.io/blog/smith-waterman/

In [24]:
import itertools
import numpy as np

def build_matrix(a, b, match_score=3, gap_cost=2):
    """Build the local-alignment scoring matrix for sequences ``a`` and ``b``.

    Parameters
    ----------
    a, b : sequences supporting ``len()`` and integer indexing (strings in
        this notebook). ``a`` indexes the rows, ``b`` the columns.
    match_score : score added when ``a[i - 1] == b[j - 1]``; the same amount
        is subtracted on a mismatch.
    gap_cost : amount subtracted when moving down or right without a
        diagonal step.

    Returns
    -------
    numpy.ndarray
        Integer matrix ``H`` of shape ``(len(a) + 1, len(b) + 1)``. Row 0 and
        column 0 stay zero; every other cell is the largest of the diagonal,
        upper and left candidates, floored at 0.
    """
    # `np.int` was only an alias for the builtin `int` and was removed in
    # NumPy 1.24; passing `int` selects the same dtype on every NumPy version.
    H = np.zeros((len(a) + 1, len(b) + 1), dtype=int)

    for i, j in itertools.product(range(1, H.shape[0]), range(1, H.shape[1])):
        # Diagonal step: reward equal symbols, penalise different ones.
        match = H[i - 1, j - 1] + (match_score if a[i - 1] == b[j - 1] else -match_score)
        # Step from the cell above / from the cell to the left.
        delete = H[i - 1, j] - gap_cost
        insert = H[i, j - 1] - gap_cost
        # Scores never drop below zero.
        H[i, j] = max(match, delete, insert, 0)
    return H

In [25]:
def traceback(H, b, b_='', old_i=0):
    """Collect the aligned part of ``b`` by walking back through ``H``.

    Starting from the full matrix, the **last** occurrence of the maximum is
    located, the matching character ``b[j - 1]`` is prepended to the result,
    and the search continues in the sub-matrix above and to the left of that
    cell (``H[0:i, 0:j]``). The walk stops when the maximum found is 0.

    Parameters
    ----------
    H : 2-D numpy array of scores (as produced by ``build_matrix``).
    b : sequence indexed by the columns of ``H``.
    b_ : already collected suffix of the result; new characters are
        prepended to it.
    old_i : row index of the previously visited cell. When the current row
        is more than one below it, a ``'-'`` is inserted after the new
        character.

    Returns
    -------
    tuple
        ``(b_, j)`` where ``b_`` is the collected string and ``j`` is the
        column index (a plain ``int``) of the zero cell at which the walk
        stopped.
    """
    # Loop rather than recurse: one step is taken per collected character,
    # so long alignments would otherwise exceed Python's recursion limit.
    while True:
        # flip H to get index of **last** occurrence of H.max() with np.argmax()
        H_flip = np.flip(np.flip(H, 0), 1)
        i_, j_ = np.unravel_index(H_flip.argmax(), H_flip.shape)
        i, j = np.subtract(H.shape, (i_ + 1, j_ + 1))  # (i, j) are **last** indexes of H.max()
        if H[i, j] == 0:
            # Return a builtin int so the result prints as e.g. ('gtt-ac', 1)
            # regardless of how NumPy renders its scalar types.
            return b_, int(j)
        b_ = b[j - 1] + '-' + b_ if old_i - i > 1 else b[j - 1] + b_
        # Continue in the part of the matrix strictly above-left of (i, j).
        H, old_i = H[0:i, 0:j], i


In [26]:
def smith_waterman(a, b, match_score=3, gap_cost=2):
    """Run the scoring and traceback steps and return a ``(start, end)`` pair.

    The scoring matrix is built with ``build_matrix`` and handed to
    ``traceback`` together with ``b``. ``start`` is the position returned by
    ``traceback`` and ``end`` is ``start`` plus the length of the collected
    string (gap characters included).

    NOTE(review): ``traceback`` returns a column index of the matrix, i.e. a
    position in ``b``, while the callers in this notebook use the result to
    slice ``a`` -- confirm which sequence the indices are meant for.
    """
    scores = build_matrix(a, b, match_score, gap_cost)
    aligned, begin = traceback(scores, b)
    finish = begin + len(aligned)
    return begin, finish

In [27]:
    # Sanity check 1: the scoring matrix for the Wikipedia example sequences
    # (the expected matrix is the cell output below).
    print(build_matrix('GGTTGACTA', 'TGTTACGG'))

    # Sanity check 2: traceback on the same sequences in lower case returns
    # the collected string and the column index where the walk stopped.
    a, b = 'ggttgacta', 'tgttacgg'
    H = build_matrix(a, b)
    print(traceback(H, b)) # ('gtt-ac', 1)

    # Sanity check 3: the combined helper; the returned pair is used to
    # slice `a`.
    a, b = 'GGTTGACTA', 'TGTTACGG'
    start, end = smith_waterman(a, b)
    print(a[start:end])     # GTTGAC

[[ 0  0  0  0  0  0  0  0  0]
 [ 0  0  3  1  0  0  0  3  3]
 [ 0  0  3  1  0  0  0  3  6]
 [ 0  3  1  6  4  2  0  1  4]
 [ 0  3  1  4  9  7  5  3  2]
 [ 0  1  6  4  7  6  4  8  6]
 [ 0  0  4  3  5 10  8  6  5]
 [ 0  0  2  1  3  8 13 11  9]
 [ 0  3  1  5  4  6 11 10  8]
 [ 0  1  0  3  2  7  9  8  7]]
('gtt-ac', 1)
GTTGAC


In [28]:
from difflib import SequenceMatcher
import regex as re
from tqdm import tqdm_notebook as tqdm

In [29]:
# Paths of the two input texts, relative to the notebook's working directory:
# "ref" is the reference text and "trg" the target text.
txt_ref_path = "DATA/SOMUN.TN.txt"
txt_trg_path = "DATA/DYBG.TN.txt"

# Read each file completely as UTF-8 text; the raw strings are filtered in a
# later cell.
with open(txt_ref_path, 'r', encoding="utf-8") as fl:
    txt_ref_raw = fl.read()

with open(txt_trg_path, 'r', encoding="utf-8") as fl:
    txt_trg_raw = fl.read()

In [30]:
# Hanzi range: a character class matching one character of the Han script.
# `\p{Han}` is syntax of the `regex` module (imported as `re` above). The
# pattern is a raw string because "\p" is not a valid Python string escape
# and triggers a warning in a normal string literal.
cjk_range = r"[\p{Han}]"
cjk_range_re = re.compile( cjk_range, re.UNICODE)

# Hanzi extraction: keep only the Han characters of each raw text, then take
# a fixed window of the result (20000 characters of the reference text,
# 10000 characters of the target text at most).
txt_ref = "".join( cjk_range_re.findall( txt_ref_raw ) )[10000:30000]
txt_trg = "".join( cjk_range_re.findall( txt_trg_raw ) )[500000:510000]

# Lengths of the two windows (shorter than the slice width if the source
# text runs out).
txt_ref_len = len( txt_ref )
txt_trg_len = len( txt_trg )

In [31]:
# Align the two Han-only windows. build_matrix allocates a
# (len(txt_ref) + 1) x (len(txt_trg) + 1) integer matrix and fills it cell by
# cell in a Python loop, so this call is slow and memory-hungry for windows of
# up to 20000 x 10000 characters.
start, end = smith_waterman(txt_ref, txt_trg)

In [32]:
# Show the slice of the reference text selected by the returned indices.
# NOTE(review): `start` originates from traceback's column index, which is a
# position in txt_trg (the second argument) -- confirm that slicing txt_ref
# with it is intended.
print(txt_ref[start:end]) 

治之極於一帝曰何謂一歧伯曰一者因得之帝曰奈何歧伯曰閉戶塞牖繫之病者數問其情以從其意得神者昌失神者亡帝曰善湯液醪醴論篇第十四黃帝問曰爲五穀湯液及醪醴奈何歧伯對曰必


In [None]:
# Small hand-made example: two variants of the same passage that differ in a
# few characters and carry an "AAABBB" marker at different positions.
x = "以夜行則喘出於腎淫氣病肺有所墮恐喘出於肝淫氣害AAABBB於肺淫氣傷心度水跌仆喘出於腎與骨當是之時勇者氣行則已怯者則着而爲病也故"
y = "人夜行則喘出於腎淫氣病肺有所墮恐喘出於肝淫氣害脾有所驚恐喘出於肺淫氣傷心渡水跌AAABBB當是之時勇者氣行則已怯者着而爲病也內"
start, end = smith_waterman(x, y)

In [23]:
# Show the part of `x` selected by the indices from the previous cell.
print(x[start:end]) 

夜行則喘出於腎淫氣病肺有所墮恐喘出於肝淫氣害AAABBB於肺淫氣傷心度水跌
