-
Notifications
You must be signed in to change notification settings - Fork 2
/
test.py
49 lines (44 loc) · 1.7 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import numpy as np
from numpy import load
import argparse
import json
import pickle
from nltk import word_tokenize
from hmm import build_vocab2idx, create_dictionaries, create_transition_matrix, create_emission_matrix, initialize, viterbi_forward, viterbi_backward
from utils import processing
from hmm import training_data
# Path to the WSJ training corpus (Penn Treebank .pos word/tag format).
corpus_path = "WSJ_02-21.pos"
# Laplace-smoothing constant used when building the HMM probability matrices.
alpha = 0.001
def parse_argument(argv=None):
    """Parse command-line arguments for the POS-tag prediction script.

    Parameters
    ----------
    argv : list[str] | None
        Explicit argument list to parse. Defaults to ``None``, in which case
        ``argparse`` falls back to ``sys.argv[1:]`` — preserving the original
        CLI behavior while making the function testable.

    Returns
    -------
    argparse.Namespace
        With attribute ``sent`` (str | None): the sentence to tag.
    """
    parser = argparse.ArgumentParser(description='Predict Part of Speech Tags')
    parser.add_argument('--sent', help='Enter your sentence.')
    return parser.parse_args(argv)
def predict():
    """Tag the sentence supplied via ``--sent`` with part-of-speech tags.

    Loads precomputed artifacts from the working directory (``vocab.pkl``,
    ``A.npy``, ``B.npy``) and the training corpus, runs Viterbi decoding,
    and prints a list of ``(token, tag)`` pairs.
    """
    args = parse_argument()
    # Append the ' #' end-of-sentence marker the HMM pipeline expects.
    # str() keeps the original behavior even when --sent was omitted (None).
    sample = str(args.sent) + ' #'
    tokens = word_tokenize(sample)

    # The vocabulary was precomputed (see build_vocab2idx) and pickled.
    # NOTE(review): pickle.load is unsafe on untrusted input — vocab.pkl
    # must be a trusted local artifact.
    with open('vocab.pkl', 'rb') as fh:
        vocab2idx = pickle.load(fh)

    prep_tokens = processing(vocab2idx, tokens)
    training_corpus = training_data(corpus_path)
    # Only tag_counts is needed below; the count dicts are byproducts.
    emission_counts, transition_counts, tag_counts = create_dictionaries(training_corpus, vocab2idx)
    states = sorted(tag_counts.keys())

    # Transition (A) and emission (B) matrices were precomputed with
    # create_transition_matrix / create_emission_matrix and saved to disk.
    A = load('A.npy')
    B = load('B.npy')

    best_probs, best_paths = initialize(A, B, tag_counts, vocab2idx, states, prep_tokens)
    best_probs, best_paths = viterbi_forward(A, B, prep_tokens, best_probs, best_paths, vocab2idx)
    pred = viterbi_backward(best_probs, best_paths, states)

    # Drop the trailing '#' sentinel token and its predicted tag.
    res = list(zip(prep_tokens[:-1], pred[:-1]))
    print(res)
# Script entry point: tag the sentence supplied via --sent and print the result.
if __name__ == "__main__":
    predict()