In [5]:
# import numpy pandas matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

In [64]:
# read data from thuoc_raw.json
df = pd.read_json('thuoc_raw.json')
# print first 5 rows
df.head()

Unnamed: 0,id,images,tenThuoc,dotPheDuyet,soQuyetDinh,pheDuyet,hieuLuc,soDangKy,hoatChat,phanLoai,...,nhomThuoc,isHide,rate,rutSdk,rutSdkFile,chuY,ten,meta,rows,state
0,VD-10418-10,[],Amogentine 500mg/125mg,119,42/QĐ-QLD,24/02/2010,,VD-10418-10,"amoxicilin 500mg, acid clavulanic 125mg",,...,Tân dược,Yes,18.5,0,[],,,{},[],202
1,VD-10419-10,[],PQAlevo,119,42/QĐ-QLD,24/02/2010,,VD-10419-10,levofloxacin 250mg,,...,Tân dược,Yes,5.0,0,[],,,{},[],202
2,VD-10420-10,[],"Berberin 0,01g",119,42/QĐ-QLD,24/02/2010,,VD-10420-10,"Berberin clorid 0,01g",,...,Tân dược,Yes,5.0,0,[],,,{},[],202
3,VD-10421-10,[],"Berberin 0,05g",119,42/QĐ-QLD,24/02/2010,,VD-10421-10,"Berberin clorid 0,05g",,...,Tân dược,Yes,5.0,0,[],,,{},[],202
4,VD-10422-10,[],Doxycyclin TW3 100 mg,119,42/QĐ-QLD,24/02/2010,,VD-10422-10,Doxycyclin hydroclorid 100mg,,...,Tân dược,Yes,5.0,0,[],,,{},[],202


In [108]:


class APIRegexHelper:
	def __init__(self, string):
		self.string = string
		
	# remove mass and unit
	def remove_mass_and_unit(self):
		# remove mass
		self.string = re.sub(r'\d+((,|\.)\d+)*\s*(mg|mcg|IU|UI|g|ml|l|kg|mm|\%|đơn vị USP)', '', self.string)
		return self

	# remove leading and trailing spaces
	def remove_space(self):
		self.string = self.string.strip()
		return self

	# remove all the string after the first parenthesis
	def remove_parenthesis(self):
		self.string = re.sub(r'(\(|\)).*', '', self.string)
		return self
	
	# remove all the slash
	def remove_slash(self):
		self.string = re.sub(r'\s*/', '', self.string)
		return self

	# remove all the string before the first colon
	def remove_colon(self):
		self.string = re.sub(r'.*:', '', self.string)
		return self
	
	# remove ratio
	def remove_ratio(self):
		self.string = re.sub(r'\d+:\d+', '', self.string)
		return self
	
	# remove line break
	def remove_line_break(self):
		self.string = re.sub(r'\n', '', self.string)
		return self
	
	# to lower case
	def to_lower_case(self):
		self.string = self.string.lower()
		return self
	
	# remove similar strings
	def remove_similar_string(self):
		self.string = re.sub(r'(tương đương|tương ứng|dưới dạng|dạng).*', '', self.string)
		return self

class UltimateAPIRegex:
	def __init__(self, string):
		self.string = string
		self.apis = []
		
	def get_apis(self):
		# split by comma or semicolon that not followed by digit
		temp_list = re.split(r'(,|;)\s*(?![0-9])', self.string)
		for s in temp_list:
			temp = APIRegexHelper(s) \
					.remove_line_break() \
					.remove_mass_and_unit() \
			 		.remove_parenthesis() \
					.remove_ratio() \
			 		.remove_colon() \
			 		.remove_slash() \
					.remove_similar_string() \
					.remove_space() \
					.to_lower_case()					
			self.apis.append(temp.string)
		self.remove_incorrect_string().add_vitamin()
		return self.apis
	
	# remove incorrect strings
	def remove_incorrect_string(self):
		incorrect_strings = [';', '(', ')', ':', '', ' ', ',', '...', '--', '…']
		self.apis = [s for s in self.apis if s not in incorrect_strings]
		return self

	# add vitamin
	def add_vitamin(self):
		for i in range(len(self.apis)):
			if len(self.apis[i]) <= 2:
				self.apis[i] = 'vitamin ' + self.apis[i]
		return self

In [109]:
hoatChat = df['hoatChat'].tolist()
api_list = []
for s in hoatChat:
	api = UltimateAPIRegex(s).get_apis()
	api_list.append(api)
api_list

[['amoxicilin', 'acid clavulanic'],
 ['levofloxacin'],
 ['berberin clorid'],
 ['berberin clorid'],
 ['doxycyclin hydroclorid'],
 ['paracetamol'],
 ['paracetamol'],
 ['tetracyclin'],
 ['thiamin nitrat'],
 ['ích mẫu', 'hương phụ', 'ngải cứu'],
 ['adrenalin'],
 ['alpha chymotrypsin'],
 ['cefradin'],
 ['cimetidin'],
 ['biphenyl dimethyl dicarboxylat'],
 ['colchicin'],
 ['ephedrin hydroclorid'],
 ['ezetimibe'],
 ['famciclovir'],
 ['famciclovir'],
 ['hyaluronidase'],
 ['piracetam'],
 ['lincomycin'],
 ['galantamin hydrobromid'],
 ['nước cất pha tiêm'],
 ['nước cất pha tiêm'],
 ['nofloxacin'],
 ['orlistat'],
 ['ouabain'],
 ['papaverin hydroclorid'],
 ['paracetamol'],
 ['paracetamol'],
 ['paracetamol'],
 ['paracetamol'],
 ['piracetam'],
 ['methyl prednisolon'],
 ['pravastatin natri'],
 ['pravastatin natri'],
 ['ciprofloxacin'],
 ['seratiopeptidase'],
 ['acid tranexamic'],
 ['cefoperazon am'],
 ['acid acetylsalicylic'],
 ['vitamin b1', 'vitamin b2', 'vitamin b6', 'vitamin pp', 'vitamin c'],
 ['c

In [111]:
# unit test for UltimateAPIRegex
def test(s):
	pattern = r"^(?=.*\d)(?=.*\bvitamin\b)|^(\d|,|-|\s)+(?!\d).*|^(?!\d).*$"
	return re.match(pattern, s)

for idx, l in enumerate(api_list):
	for s in l:
		if not test(s):
			print("Failed at index: {}, string: {}".format(idx, s))
print("Test passed")

Test passed
