In [1]:
import os
import pickle
import re
import json
import collections
from vncorenlp import VnCoreNLP

   
from collections import Counter
annotator = VnCoreNLP("../../../../VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx2g') 

In [2]:
with open('../../data/vinmec_data_filter.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
def segment_sentence(text):
    text = annotator.tokenize(text)
    sentences = []
    for words in text:
        words = " ".join(words)
        words = words.replace('_', ' ')
        sentences.append(words)
    return " ".join(sentences)

In [4]:
corpus = []
for i in data:
    corpus.append(i['disease'])
    attributes = i['attributes']
    for att in attributes:
        attribute_text = att['attribute_text']
        contents = att['content']
        for c in contents:
            corpus.append(c)
    faq = i['faq']
    corpus.extend(faq)

In [5]:
new_corpus = []
for i in corpus:
    i = segment_sentence(i).lower()
    new_corpus.append(i)

In [15]:
with open('../../data/big.txt', 'w', encoding='utf-8') as f:
    for item in new_corpus:
        f.write('%s\n' %item)

In [12]:
WORDS = []
for i in new_corpus:
    if len(i) < 5:  continue
    WORDS.extend(i.split())

In [13]:
WORDS_MODEL = collections.Counter(WORDS)

In [15]:
def chunks(l, n):
    for i in range(0, len(l) - n + 1):
        yield l[i:i+n]

In [21]:
WORD_TUPLES = list(chunks(WORDS, 2))

In [22]:
WORD_TUPLES

[['áp', 'xe'],
 ['xe', 'áp'],
 ['áp', 'xe'],
 ['xe', 'là'],
 ['là', 'gì'],
 ['gì', '?'],
 ['?', 'áp'],
 ['áp', 'xe'],
 ['xe', 'là'],
 ['là', 'tên'],
 ['tên', 'gọi'],
 ['gọi', 'của'],
 ['của', 'một'],
 ['một', 'tổ'],
 ['tổ', 'chức'],
 ['chức', 'viêm'],
 ['viêm', 'nhiễm'],
 ['nhiễm', ','],
 [',', 'khu'],
 ['khu', 'trú'],
 ['trú', 'thành'],
 ['thành', 'một'],
 ['một', 'khối'],
 ['khối', 'mềm'],
 ['mềm', ','],
 [',', 'bên'],
 ['bên', 'trong'],
 ['trong', 'chứa'],
 ['chứa', 'đầy'],
 ['đầy', 'mủ'],
 ['mủ', 'cấu'],
 ['cấu', 'tạo'],
 ['tạo', 'từ'],
 ['từ', 'vi'],
 ['vi', 'khuẩn'],
 ['khuẩn', ','],
 [',', 'xác'],
 ['xác', 'bạch'],
 ['bạch', 'cầu'],
 ['cầu', 'và'],
 ['và', 'các'],
 ['các', 'mảnh'],
 ['mảnh', 'vụn'],
 ['vụn', '.'],
 ['.', 'áp'],
 ['áp', 'xe'],
 ['xe', 'dễ'],
 ['dễ', 'dàng'],
 ['dàng', 'được'],
 ['được', 'nhận'],
 ['nhận', 'diện'],
 ['diện', 'trên'],
 ['trên', 'lâm'],
 ['lâm', 'sàng'],
 ['sàng', 'với'],
 ['với', 'các'],
 ['các', 'đặc'],
 ['đặc', 'điểm'],
 ['điểm', 'sau'],
 ['sau',

In [23]:
WORD_TUPLES_MODEL = {first:collections.Counter() for first, second in WORD_TUPLES}

In [24]:
for tup in WORD_TUPLES:
    try:
        WORD_TUPLES_MODEL[tup[0]].update([tup[1]])
    except:
        # hack-y fix for uneven # of elements in WORD_TUPLES
        pass

In [43]:
pickle.dump({'words_model': WORDS_MODEL,
                 'word_tuples_model': WORD_TUPLES_MODEL},
                open('models_compressed.pkl', 'wb'),
                protocol=2)

In [26]:
WORD_TUPLES_MODEL['áp'].update(['xe'])

In [38]:
WORD_TUPLES_MODEL['nhức']['đầu']

78

In [47]:
NEARBY_KEYS = {
    'a': 'qwsz',
    'b': 'vghn',
    'c': 'xdfv',
    'd': 'erfcxs',
    'e': 'rdsw',
    'f': 'rtgvcd',
    'g': 'tyhbvf',
    'h': 'yujnbg',
    'j': 'uikmnh',
    'k': 'iolmj',
    'l': 'opk',
    'm': 'njk',
    'n': 'bhjm',
    'o': 'iklp',
    'p': 'ol',
    'q': 'wa',
    'r': 'edft',
    's': 'wedxza',
    't': 'rfgy',
    'u': 'yhji',
    'v': 'cfgb',
    'w': 'qase',
    'x': 'zsdc',
    'y': 'tghu',
    'z': 'asx'
    }

In [48]:
def get_possible_word_from_fat_finger(word):
    #Hidden step
    possible_words = [word[:-1]+char
                             for char in NEARBY_KEYS[word[-1]]
                             if len(word) > 2]

    possible_words.append(word)
    return possible_words

In [55]:
get_possible_word_from_fat_finger('m')

['m']

In [52]:
models = pickle.load(open('models_compressed.pkl','rb'))

In [56]:
probable_words = {w:c for w, c in
                      WORD_TUPLES_MODEL['ho'.lower()].items()
                      for sec_word in ['m']
                      if w.startswith(sec_word)}

In [57]:
probable_words

{'máu': 2, 'mà': 2, 'mãn': 6, 'mang': 1}

In [59]:
Counter(probable_words).most_common(10)

[('mãn', 6), ('máu', 2), ('mà', 2), ('mang', 1)]