In [1]:
from evaluation import JSONParseEvaluator

import os, glob, json
import argparse
import numpy as np
from tqdm import tqdm
from os.path import join, basename, splitext

In [2]:
field_list = [ 'current_institute', 'name', 'gender', 'birth', 'age', 'address', 'tel_customer', 'id_bhyt', 'diagnosis', 'date_in', 'doctor_name', 'drug_name', 'drug_dose', 'drug_quantity']

def convert_dict_concern_fileds(raw_dict: dict, concern_fileds=None, is_label=True):
    res = {}
    if concern_fileds is None:
        concern_fileds = list(raw_dict.keys())

    for field,data in raw_dict.items():
        field = field.replace(' ', '_')
        if field in concern_fileds:
            res[field] = data['value'].lower() if is_label else data.lower()
    return res


In [3]:
def convert_output(output):
    list = [ 'current_institute', 'gender', 'birth', 'age', 'address', 'tel_customer', 'id_bhyt', 'diagnosis', 'date_in', 'doctor_name']
    list_drug = ['drug_name', 'drug_dose', 'drug_quantity']
    result = {}
    result['name'] = ''
    if 'name' in output.keys():
        result['name'] = output['name']
    if 'patient_name' in output.keys():
        result['name'] = output['patient_name']
    for field in list:
        try:
            result[field] = output[field]
        except:
            result[field] = ''
    if 'drugs' in output.keys():
        for field in list_drug:
            value = ""
            for drug in output['drugs']:
                for key, val in drug.items():
                    if key == field:
                        value += val
            result[field] = value
    return result

### Read files (labels and predicts)

In [4]:
path_mistral = '/home/vinbig/Documents/PA_Modeling/Prompt/private_test_Pharma_out_full_En/Mistral-7B'
path_mistral_json = '/home/vinbig/Documents/PA_Modeling/Prompt/private_test_Pharma_out_full_En/Mistral-7B-json'

path_labels = '/home/vinbig/Documents/PA_Modeling/Prompt/prescription_label_text/KIEs'

In [5]:
def get_outputs_labels(path_output, path_labels):
    names = os.listdir(path_output)
    name_txts = []
    preds = []
    labels = []
    for name in names:
        # check file json
        if name.split('.')[1] == 'txt':
            name_txts.append(name)
            continue

        # correct -> read output and label
        pred_path = os.path.join(path_output, name)
        with open(pred_path, 'r') as f:
            pred = json.load(f)
            pred = convert_output(pred)

        label_path = os.path.join(path_labels, name)
        with open(label_path, 'r') as f:
            label = json.load(f)
        
        label = convert_dict_concern_fileds(label, concern_fileds=field_list)
        pred  = convert_dict_concern_fileds(pred, concern_fileds=field_list, is_label=False)

        labels.append(label)
        preds.append(pred)
    return preds, labels, name_txts

In [6]:
def compute_acc(preds, labels):
    evaluator = JSONParseEvaluator()
    accs = []
    for i in range (len(labels)):
        acc = evaluator.cal_acc(pred=preds[i], answer=labels[i])
        accs.append(acc)
    return np.mean(accs)

### Read each field


In [7]:
def get_each_field(field, input):
    result = {}
    try:
        result[field] = input[field]
    except:
        result[field] = ""
    return result

# Compute

In [10]:
def compute(preds, labels, name_txts, field_list):
    print("Số lượng file đúng format: ", len(preds), '/249')
    acc_mean = []
    # all field
    acc = compute_acc(preds, labels)
    acc_mean.append(acc)

    #each field
    for field in field_list:
        preds_field = [get_each_field(field=field, input=pred) for pred in preds]
        labels_field = [get_each_field(field=field, input=label) for label in labels]
        acc_mean.append(compute_acc(preds_field, labels_field))

    return acc_mean

In [11]:
print('Mistral')
preds, labels, name_txts = get_outputs_labels(path_labels=path_labels, path_output=path_mistral)
acc_mean = compute(preds, labels, name_txts, field_list)

Mistral
Số lượng file đúng format:  245 /249


  zss.distance(
  zss.distance(


In [15]:
print('Mistral Json')
preds_json, labels_json, name_txts_json = get_outputs_labels(path_labels=path_labels, path_output=path_mistral_json)
acc_mean_json = compute(preds_json, labels_json, name_txts_json, field_list)

Mistral Json
Số lượng file đúng format:  245 /249


  zss.distance(
  zss.distance(


In [16]:
import pandas as pd
model_name = ["mistral", "mistral_json"]
column_name_all = ['all', 'current_institute', 'name', 'gender', 'birth', 'age', 'address', 'tel_customer', 'id_bhyt', 'diagnosis', 'date_in', 'doctor_name', 'drug_name', 'drug_dose', 'drug_quantity']
df = pd.DataFrame([acc_mean, acc_mean_json], columns=column_name_all, index=model_name)
df

Unnamed: 0,all,current_institute,name,gender,birth,age,address,tel_customer,id_bhyt,diagnosis,date_in,doctor_name,drug_name,drug_dose,drug_quantity
mistral,0.701759,0.574971,0.948378,0.797551,0.441965,0.22543,0.852804,0.3864,0.300108,0.836901,0.543002,0.478865,0.734289,0.652821,0.672874
mistral_json,0.701406,0.577779,0.948378,0.797551,0.446046,0.230872,0.854569,0.390482,0.296536,0.837068,0.548889,0.480296,0.732232,0.651322,0.671675


# Drugs

In [9]:

field_list_all = [ 'current_institute', 'name', 'gender', 'birth', 'age', 'address', 'tel_customer', 'id_bhyt', 'diagnosis', 'date_in', 'doctor_name', 'drug_name', 'drug_dose', 'drug_quantity']

path = "/home/vinbig/Documents/PA_Modeling/Prompt/prescription_label_text/KIEs/BV_TH_1001.json"
with open(path, 'r') as f:
    label = json.load(f)

label = convert_dict_concern_fileds(label, field_list_all)
label

{'current_institute': 'bệnh viện đa khoa tỉnh',
 'name': 'nguyễn thị dung',
 'age': '74',
 'tel_customer': '0354 441 597',
 'id_bhyt': 'ht2382799088303',
 'address': 'tổ 16, quảng thắng, thành phố thanh hóa, tinh thanh hóa, việt nam',
 'diagnosis': 'e11 - bệnh đái tháo đường không phụ thuộc insuline / 110 - bệnh lý tăng huyết áp',
 'drug_name': 'beticapc 750 sr - 750mg (metformin) diamicron mr 60mg (gliclazide) lisiplus hct 10/12,5 (lisinopril (dihydrat) 10mg + hydroclorothiazid 12,5mg)',
 'drug_quantity': 'viên 60 viên 120 viên 60',
 'drug_dose': 'uống chiều 1 viên sau ăn ngày uống 02 viên trước ăn sáng uống sáng 1 viên lúc 8h',
 'date_in': 'ngày 18 tháng 10 năm 2022',
 'doctor_name': 'ngày 18 tháng 10 năm 2021'}

In [None]:
preds_json, labels_json, name_txts_json = get_outputs_labels(path_labels=path_labels, path_output=path_mistral)
acc_mean_json = compute(preds_json, labels_json, name_txts_json, field_list_all)

In [31]:
def convert_output(output):
    list = [ 'current_institute', 'gender', 'birth', 'age', 'address', 'tel_customer', 'id_bhyt', 'diagnosis', 'date_in', 'doctor_name']
    list_drug = ['drug_name', 'drug_dose', 'drug_quantity']
    result = {}
    result['name'] = ''
    if 'name' in output.keys():
        result['name'] = output['name']
    if 'patient_name' in output.keys():
        result['name'] = output['patient_name']
    for field in list:
        try:
            result[field] = output[field]
        except:
            result[field] = ''
    if 'drugs' in output.keys():
        for field in list_drug:
            value = ""
            for drug in output['drugs']:
                for key, val in drug.items():
                    if key == field:
                        value += val
            result[field] = value
    return result

In [37]:
def get_outputs_labels_all(path_output, path_labels):
    names = os.listdir(path_output)
    name_txts = []
    preds = []
    labels = []
    for name in names:
        # check file json
        if name.split('.')[1] == 'txt':
            name_txts.append(name)
            continue

        # correct -> read output and label
        pred_path = os.path.join(path_output, name)
        with open(pred_path, 'r') as f:
            pred = json.load(f)
            pred = convert_output(pred)

        label_path = os.path.join(path_labels, name)
        with open(label_path, 'r') as f:
            label = json.load(f)
        
        label = convert_dict_concern_fileds(label, concern_fileds=field_list_all)
        pred  = convert_dict_concern_fileds(pred, concern_fileds=field_list_all, is_label=False)

        labels.append(label)
        preds.append(pred)
    return preds, labels, name_txts

In [38]:
preds_json[5], labels_json[5]

({'current_institute': 'toa thuốc bhyt',
  'name': '',
  'gender': 'nam',
  'birth': '1948',
  'age': '',
  'address': 'xã hưng công- bình lục- hà nam',
  'tel_customer': '',
  'id_bhyt': 'ht 2 35 35 205 86508',
  'diagnosis': 'm47 - hư cột sống (thoái hoá cột sống)',
  'date_in': '03/05/2021 18:03:59',
  'doctor_name': 'bs. trương văn nghĩa',
  'drug_name': 'panactol 500mg slfengshi-opc viên phong thấp sl',
  'drug_dose': 'sáng 2 viên chiều 2 viênsáng 2 viên chiều 2 viên',
  'drug_quantity': '20 viên20 viên'},
 {'current_institute': 'huyện lý nhân',
  'name': 'nguyễn văn tiến',
  'birth': '1948',
  'gender': 'nam',
  'address': 'xã hưng công - bình lục - hà nam',
  'id_bhyt': 'ht 2 35 35 205 86508',
  'diagnosis': 'm47 - hư cột sống (thoái hoá cột sống)',
  'drug_name': 'panactol 500mg fengshi-opc viên phong thấp 0,7mg+852mg+232mg+50mg',
  'drug_quantity': '20 viên 20 viên',
  'drug_dose': 'uống sáng 2 viên chiều 2 viên uống sáng 2 viên chiều 2 viên',
  'date_in': 'ngày 30 / 09 / 2020

In [30]:
os.listdir(path_mistral)[0]

'Long_Chau_275.json'

In [39]:
preds_json, labels_json, name_txts_json = get_outputs_labels_all(path_labels=path_labels, path_output=path_mistral)
acc_mean_json = compute(preds_json, labels_json, name_txts_json, field_list_all)

Số lượng file lỗi format:  18


  zss.distance(
  zss.distance(


In [40]:
acc_mean_json

[0.701759171349908,
 0.574970694442844,
 0.9483783274951847,
 0.7975510204081634,
 0.44196483788320523,
 0.22543030199092626,
 0.8528039194103687,
 0.3864002664002664,
 0.30010767024935986,
 0.8369013801764318,
 0.5430020193340258,
 0.478864786589648,
 0.7342894563652456,
 0.6528209710584773,
 0.672873526040459]

In [41]:
import pandas as pd

In [42]:
column_name_all = ['all', 'current_institute', 'name', 'gender', 'birth', 'age', 'address', 'tel_customer', 'id_bhyt', 'diagnosis', 'date_in', 'doctor_name', 'drug_name', 'drug_dose', 'drug_quantity']

model_name = ["mistral", "mistral_json"]
df = pd.DataFrame([acc_mean_json, acc_mean_json], columns=column_name_all, index=model_name)
df

Unnamed: 0,all,current_institute,name,gender,birth,age,address,tel_customer,id_bhyt,diagnosis,date_in,doctor_name,drug_name,drug_dose,drug_quantity
mistral,0.701759,0.574971,0.948378,0.797551,0.441965,0.22543,0.852804,0.3864,0.300108,0.836901,0.543002,0.478865,0.734289,0.652821,0.672874
mistral_json,0.701759,0.574971,0.948378,0.797551,0.441965,0.22543,0.852804,0.3864,0.300108,0.836901,0.543002,0.478865,0.734289,0.652821,0.672874
