# library


In [115]:
import numpy as np
import random

DAUCAU = ",."

def words_to_word( words ):
	return "_".join(words)

def Longest_matching( sen, words_list):
	words = sen_to_words(sen)
	tokens = []
	n = len( words )
	s = 0
	while s < n :
		e = n
		word = words_to_word( words[s:e] )

		while ( not word in words_list ) and e>s+1:
			e -= 1
			word = words_to_word( words[s:e] )

		tokens.append( word )
		if e <= s :
			s += 1
		else:
			s = e

	return tokens

def preprocess_sen(sen):
	sen = sen.lower()
	sen = sen.replace("!", ".")
	sen = sen.replace("?", ".")
	sen = sen.replace(";", "")
	sen = sen.replace(":", "")
	sen = sen.replace("%", "")
	sen = sen.replace("\n", "")

	
	for c in DAUCAU:
		sen = sen.replace(c, " "+c+" ")

	return sen


def file_to_tagged_data( path ):
	file = open(path, "r", encoding="utf8")
	text = file.readlines()

	tagged_data = []

	for line in text:
		tagged_line = []

		line_words = sen_to_words( line )

		for w in line_words:
			if not "/" in w:
				tagged_line.append((w,w))
			else:
				s = w.split("/")
				tagged_line.append((s[0], s[1]))

		tagged_data.append( tagged_line )

	return tagged_data

def sen_to_words( sen ):

	def fullOfSpace( str ):
		for c in str:
			if c != ' ':
				return False 
		
		return True 

	sen = preprocess_sen( sen )

	sen_words = [ w for w in sen.split(" ") if not fullOfSpace(w) ]

	
	return sen_words

def Probality_words_pos( sen_words, sen_pos, hidden_markov_matrix, hidden_to_observed_matrix ):
	if len(sen_words) != len(sen_pos):
		return 0

	pro = 1
	for i in range( len( sen_pos)):
		x_index = tags_list.index( sen_pos[i] )
		y_index = 0
		if i == 0:
			y_index = len( tags_list)
		else:
			y_index = tags_list.index( sen_pos[i-1] )

		pro *= hidden_markov_matrix[y_index,x_index]

	for word, pos in zip( sen_words, sen_pos ):
		y_index = tags_list.index(pos)
		x_index = 0
		if word in words_list:
			x_index = words_list.index( word )
		else:
			x_index = len(words_list)

		pro *= hidden_to_observed_matrix[y_index, x_index]

	return pro

def get_words_list_from_tagged_data( tagged_data ):
	words = []
	for line in tagged_data:
		for w, _ in line :
			if not w in words:
				words.append(w)
	
	return words

def find_pos_for_words( words, hidden_markov_matrix, hidden_to_observed_matrix ):
	n = len(words)
	return_pos = []
	return_pro = 1
	for i in range( n ):
		w = words[i]
		if w in DAUCAU:
			return_pos.append(w)
		else:
			accept_tags_list = [ tag for tag in tags_list if tag not in DAUCAU ]
			pro = 0
			accept_pos = ""
			for pos in accept_tags_list:
				temp_pro = 1
				#calculate prepos-pos
				x_index = tags_list.index( pos )
				y_index = 0
				if i == 0:
					y_index = len(tags_list)
				else:
					y_index = tags_list.index( return_pos[i-1] )

				temp_pro *= hidden_markov_matrix[y_index,x_index]
				#calculate pos-word
				y_index = tags_list.index( pos )
				x_index = 0
				if w in words_list:
					x_index = words_list.index(w)
				else:
					x_index = len(words_list)

				temp_pro *= hidden_to_observed_matrix[y_index, x_index]

				if temp_pro > pro:
					pro = temp_pro
					accept_pos = pos
			
			return_pos.append( accept_pos )
			return_pro *= pro

	return return_pos, return_pro


def recur_find_pos_for_words( words, hidden_markov_matrix, hidden_to_observed_matrix ):
	n = len( words )

	def recur_find_pos( pos_list,words, n, hidden_markov_matrix, hidden_to_observed_matrix):
		# stop
		if len(pos_list) == n:
			return pos_list, Probality_words_pos( words, pos_list, hidden_markov_matrix, hidden_to_observed_matrix )
		
		pro = 0
		return_pos_list = []
		word_index = len(pos_list)

		accept_tags_list = []
		if words[ word_index ] not in DAUCAU:
			accept_tags_list = [ tag for tag in tags_list if tag not in DAUCAU ]
		else:
			accept_tags_list = [words[word_index]]

		for pos in accept_tags_list:
			new_pos_list, new_pro = recur_find_pos( pos_list[:] + [pos], words, n, hidden_markov_matrix, hidden_to_observed_matrix)
			if new_pro > pro:
				return_pos_list = new_pos_list
				pro = new_pro
		
		return return_pos_list, pro
	
	return recur_find_pos([], words, n, hidden_markov_matrix, hidden_to_observed_matrix)
	
def words_and_pos_to_output_sen( sen_words, sen_pos ):
	output = ""
	n = len( sen_words)
	comma = ','
	dot = '.'

	# hide pos of ',' and '.'
	for i in range(n ):
		if i ==0 or sen_words[i] in (comma, dot):
			output += sen_words[i]
		else:
			output += ' ' + sen_words[i]
		
		if not sen_words[i] in (comma,dot):
			output += "/" + sen_pos[i].upper()



	return output
	
def get_only_tags_from_tagged_data ( tagged_data ):
	tags = []
	for line in tagged_data :
		tags_line = []
		for _, tag in line:
			tags_line.append(tag)
		
		tags.append( tags_line )
	
	return tags

def get_only_words_from_tagged_data ( tagged_data ):
	tags = []
	for line in tagged_data :
		tags_line = []
		for tag,_ in line:
			tags_line.append(tag)
		
		tags.append( tags_line )
	
	return tags
def get_tags_list_from_tagged_data( tagged_data ):
	tags = []

	for line in tagged_data:
		for _, tag in line :
			if not tag in tags:
				tags.append(tag)
	
	return tags

# code


## data

In [116]:
tagged_data = file_to_tagged_data( "ready_for_tag.txt")

In [117]:
random.shuffle( tagged_data )
test_size = 5
test_data = tagged_data[-test_size:]
tagged_data = tagged_data[:-test_size]

## hidden markov


In [118]:
#  hidden markov matrix
#
#  ---------------------------
#  	   |tag0|tag1|...|tagN|
#  --------+-------------------
#  tag0    |    |    |   |    |
#  --------+----+----+---+----+
#  ...     |    |    |   |    |
#  --------+----+----+---+----+
#  tagN    |    |    |   |    |
#  --------+----+----+---+----+
#  <s>     |    |    |   |    |
#  ----------------------------


#  hidden to observed matrix
#
#  --------------------------------------+
#  	   |word0|word1|...|wordM|unknown|
#  --------+-----+-----+---+-----+-------+
#  tag0    |     |     |   |     |       |
#  --------+-----+-----+---+-----+-------+
#  ...     |     |     |   |     |       |
#  --------+-----+-----+---+-----+-------+
#  tagN    |     |     |   |     |       |
#  --------+-----+-----+---+-----+-------+

In [119]:
# get words list (dictionary), tags list, tags chains
words_list = get_words_list_from_tagged_data( tagged_data )+ ["unknown"]
tags = get_only_tags_from_tagged_data( tagged_data )
tags_list = get_tags_list_from_tagged_data(tagged_data)


# create new matrix reference to hidden markov chain
n = len(tags_list)
hidden_markov_matrix = np.zeros((n+1,n))
for line in tags:
	i = 0
	n = len( line )
	while i < n:
		if i == 0:
			x_index = tags_list.index(line[i])
			y_index = len(tags_list)
		else:
			x_index = tags_list.index(line[i])
			y_index = tags_list.index(line[i-1])

		hidden_markov_matrix[y_index,x_index] += 1
		i+=1 

# create matrix reference to hidden-to-observed connect
n_tags = len(tags_list)
n_words = len(words_list)

hidden_to_observed_matrix = np.zeros((n_tags,n_words+1))
for line in tagged_data:
	for w, tag in line :
		y_index = tags_list.index(tag)
		x_index = words_list.index(w)

		hidden_to_observed_matrix[y_index,x_index] += 1



## smooth and calculate probality

In [120]:
#smooth
hidden_markov_matrix = hidden_markov_matrix + 1
hidden_to_observed_matrix = hidden_to_observed_matrix + 1

#calculate probality
for i in range( hidden_markov_matrix.shape[0] ):
	total = np.sum( hidden_markov_matrix[i] )
	hidden_markov_matrix[i] = hidden_markov_matrix[i] / total

for i in range( hidden_to_observed_matrix.shape[0] ):
	total = np.sum( hidden_to_observed_matrix[i] )
	hidden_to_observed_matrix[i] = hidden_to_observed_matrix[i] / total


# find pos

In [121]:
# example
sen = "Tôi yêu bóng đá và đất nước"
sen_words = Longest_matching( sen , words_list)
sen_pos, probality = find_pos_for_words( sen_words, hidden_markov_matrix, hidden_to_observed_matrix )

print( sen_words )
print( sen_pos )
print(words_and_pos_to_output_sen( sen_words, sen_pos ))

['tôi', 'yêu', 'bóng_đá', 'và', 'đất_nước']
['n', 'v', 'n', 'q', 'n']
tôi/N yêu/V bóng_đá/N và/Q đất_nước/N


# accuracy

In [122]:
test_pos = get_only_tags_from_tagged_data(test_data)
test_words = get_only_words_from_tagged_data( test_data )

total_n_pos = 0
for sen_pos in test_pos:
	total_n_pos += len(sen_pos)

test_sens = []
for sen_words in test_words:
	sen = " ".join(sen_words)
	sen = sen.replace("_"," ")
	test_sens.append( sen )


correct = 0
for i in range( len( test_sens )):
	print("--------------------------------")
	test_sen_words = Longest_matching( test_sens[i], words_list )
	result_pos, pro = find_pos_for_words( test_sen_words, hidden_markov_matrix, hidden_to_observed_matrix )
	print(words_and_pos_to_output_sen( test_sen_words, result_pos))
	for j in range( len(test_pos[i]) ):
		try:
			if result_pos[j] == test_pos[i][j] and result_pos[j] not in DAUCAU:
				correct += 1
		except:
			pass

correct, total_n_pos


--------------------------------
tôi/N đang/V viết/N những/V dòng/N ngu/V ngốc/N.
--------------------------------
em/N cám/V ơn/N ạ/V.
--------------------------------
đường/N sá/V sài/N gòn/V hôm_nay/N đông/V đúc/N.
--------------------------------
hoa/N hồng/V là/V một/M loài/N hoa/V đẹp/N.
--------------------------------
bố/N mẹ/V là/V tuyệt/N vời/V nhất/N đối_với/Q chúng/N ta/V.


(10, 29)