In [13]:
import pandas as pd
import pyarrow.parquet as pq
import nltk
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
nltk.download('punkt') #word tokenize
from nltk.metrics.distance import jaccard_distance 
import numpy as np
import math
import itertools
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
import pythainlp


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/wipawineechaiwino/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Load the input dataset from CSV into a DataFrame.
input_data = pd.read_csv('alldata/dataset.csv')

In [7]:
# Dictionary data, one array per vocabulary (kept separate so each can be searched on its own).
# Each column is converted to a plain Python list first and then to a NumPy array.

# SCT terms come from a parquet dataset; np.unique de-duplicates (and sorts) them.
data_SCT = np.unique(
    np.array(pq.read_table(source='alldata/SCTterm').to_pandas()['term'].tolist())
)

# Thai-standard vocabularies: one name/description column is taken from each CSV file.
data_TMT = np.array(
    pd.read_csv("alldata/thaiStandard/TMT.csv")['Name'].tolist()
)
data_TMLT = np.array(
    pd.read_csv("alldata/thaiStandard/TMLT.csv")['TMLT_Name'].tolist()
)
data_device = np.array(
    pd.read_csv("alldata/thaiStandard/OpenFDA_MedicalDevice_UniqueDevice.csv")['device name'].tolist()
)
data_CGD = np.array(
    pd.read_csv("alldata/thaiStandard/CGDandNHSO.csv")['desc'].tolist()
)

In [8]:
# Normalise every dictionary entry to a lower-cased Python string.
# str() is applied first so that non-string entries can be lower-cased too.
# NOTE(review): a missing value would end up as the literal text "nan" - confirm
# whether such rows should be dropped instead.
data_SCT = [str(entry).lower() for entry in data_SCT]
data_TMT = [str(entry).lower() for entry in data_TMT]
data_TMLT = [str(entry).lower() for entry in data_TMLT]
data_device = [str(entry).lower() for entry in data_device]
data_CGD = [str(entry).lower() for entry in data_CGD]

In [9]:
# Resources used to strip noise words from the dictionaries.
# `stopwords` is already imported in the import cell at the top of the notebook,
# so it is not re-imported here.

# English vocabulary from the nltk "words" corpus, kept as a set.
nltk.download('words')
words = set(nltk.corpus.words.words())

# English stop words plus the extra unit / punctuation tokens that should be
# ignored when comparing terms. Kept as a list so the name keeps its original type.
nltk.download("stopwords")
sw_nltk = stopwords.words('english') + [
    "ml", "mg", "mg/ml", "tp", "tpu", "gpu", "gp",
    "(", ")", "/",
    "hours", "patch", "gallon", "tabv", "vial",
]

# function remove stopword
def remove_stopwords(data):
    """Strip stop words from every string in `data`.

    Each entry is lower-cased, tokenised with nltk.word_tokenize, filtered
    against the module-level `sw_nltk` list and re-joined with single spaces.

    Parameters
    ----------
    data : sequence of str
        Texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input entry, in the same order. An entry made
        up only of stop words becomes the empty string.
    """
    # Build the lookup set once per call: membership tests against the raw
    # list are a linear scan for every token, which adds up on large dictionaries.
    stop = set(sw_nltk)

    cleaned = []
    for entry in data:
        # The entry is lower-cased up front, so tokens are compared directly.
        tokens = nltk.word_tokenize(entry.lower())
        cleaned.append(" ".join(tok for tok in tokens if tok not in stop))
    return cleaned

# function remove number
def remove_number(data):
    """Return a new list in which every digit character has been removed from each string of `data`."""
    def _strip_digits(entry):
        # Keep only the characters for which str.isdigit() is false.
        return ''.join(ch for ch in entry if not ch.isdigit())

    return [_strip_digits(entry) for entry in data]


[nltk_data] Downloading package words to
[nltk_data]     /Users/wipawineechaiwino/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wipawineechaiwino/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Stop-word-filtered copies of the SCT, TMT and TMLT dictionaries.
# The device and CGD dictionaries are intentionally left unfiltered here.
data_SCT_new, data_TMT_new, data_TMLT_new = (
    remove_stopwords(entries) for entries in (data_SCT, data_TMT, data_TMLT)
)


In [11]:
# tokenize each row in data function
def tokenize_text(x: str):
    """Tokenise `x` with nltk.word_tokenize and keep only tokens longer than one character."""
    tokens = nltk.word_tokenize(x)
    return list(filter(lambda tok: len(tok) > 1, tokens))

def generate_data_token(data):
    """Apply tokenize_text to every entry of `data` and return the token lists in the same order."""
    return [tokenize_text(entry) for entry in data]

In [12]:
# Token lists for each dictionary; position k in a token list corresponds to
# position k in the matching raw dictionary list.
data_token_SCT = generate_data_token(data_SCT_new)
data_token_TMT = generate_data_token(data_TMT_new)
data_token_TMLT = generate_data_token(data_TMLT_new)
data_token_device = generate_data_token(data_device)

# CGD entries are tokenised with pythainlp instead of nltk; single-space
# tokens are discarded.
data_token_CGD = [
    [tok for tok in pythainlp.word_tokenize(entry) if tok != ' ']
    for entry in data_CGD
]
    

In [18]:
#select data function
def select_data(cat: str):
    """Look up the dictionary for category `cat`.

    Returns a ``(entries, token_lists)`` pair for a known category, or
    ``False`` when `cat` matches none of them.
    """
    # Lookups are wrapped in lambdas so only the requested pair of globals is read.
    table = (
        ('SCTterm', lambda: (data_SCT, data_token_SCT)),
        ('Medicine', lambda: (data_TMT, data_token_TMT)),
        ('Lab', lambda: (data_TMLT, data_token_TMLT)),
        ('Medical supplies', lambda: (data_device, data_token_device)),
        ('CGD', lambda: (data_CGD, data_token_CGD)),
    )
    for label, fetch in table:
        if cat == label:
            return fetch()
    return False

In [23]:
def process2(text, cat, corr_rate=0.9, show=False, top_n=50):
    """Return the dictionary entries closest to `text` by Jaccard distance.

    The query is tokenised, compared as a token set against the token set of
    every entry of the selected dictionary, and the entries with the smallest
    distance are returned.

    Parameters
    ----------
    text : str
        Free-text query.
    cat : str
        Dictionary category understood by ``select_data``.
    corr_rate : float, optional
        Currently unused (the distance-threshold filter is disabled); kept so
        existing calls that pass it keep working.
    show : bool, optional
        When true, print the distance computed for every dictionary entry.
    top_n : int, optional
        Maximum number of entries to return (default 50).

    Returns
    -------
    numpy.ndarray
        Up to `top_n` dictionary entries, ordered by ascending Jaccard distance.

    Raises
    ------
    ValueError
        If `cat` is not a known category.
    """
    # select_data signals an unknown category with False; fail with a clear
    # message instead of an opaque tuple-unpacking error.
    selected = select_data(cat)
    if selected is False:
        raise ValueError(f'unknown category: {cat!r}')
    data, data_token = selected

    text = text.lower()
    if cat != 'CGD':
        # Drop characters with code points 3585..3675 (U+0E01..U+0E5B, the Thai
        # block) and all digits before tokenising with nltk.
        text = ''.join(
            c for c in text
            if not (3585 <= ord(c) <= 3675) and not c.isdigit()
        )
        text = nltk.word_tokenize(text)
        # remove_stopwords maps each token to '' when it is a stop word;
        # the length filter below then discards it.
        text = remove_stopwords(text)
    else:
        # CGD entries were tokenised with pythainlp, so the query is too.
        text = pythainlp.word_tokenize(text)
    # Keep tokens longer than one character (this also drops ' ' and '').
    text = [i for i in text if len(i) > 1]

    print(f'tokenize -> {text}')

    query = set(text)
    # jaccard_distance divides by the size of the union, so two empty token
    # sets would raise ZeroDivisionError; treat that case as maximally distant.
    distances = [
        jaccard_distance(query, entry) if (query or entry) else 1.0
        for entry in map(set, data_token)
    ]

    if show:
        print(distances)

    # Indices of the closest entries, smallest distance first.
    ind = np.argsort(distances)[:top_n]
    results = np.array(data)[ind]
    print(f'total={len(results)}')
    return results



# Readme: Manual
Run process2('text you want to search for', cat='dictionary category'), where cat is one of 'SCTterm', 'Medicine', 'Lab', 'Medical supplies' or 'CGD'.

In [None]:
# Example: search the 'Medicine' dictionary for a free-text drug description.
process2('sYRUp Zithromax 200mg/5ml (Azithromycin)(15 ml)', cat='Medicine')