In [1]:
import os
import numpy as np
import pandas as pd
import pickle
from HMM import *

def save_pickle(obj, name):
    """Serialize ``obj`` to ``obj/<name>.pkl`` using the highest pickle protocol.

    The ``obj/`` directory is created on demand so that a fresh checkout or a
    Restart & Run All does not fail with ``FileNotFoundError``.

    Args:
        obj: Any picklable Python object.
        name: File stem (str); the target path is ``"obj/" + name + ".pkl"``.
    """
    # exist_ok=True makes repeated calls safe once the folder is there.
    os.makedirs("obj", exist_ok=True)
    with open("obj/" + name + ".pkl", "wb") as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_pickle(name):
    """Read back the object stored in ``obj/<name>.pkl``.

    Args:
        name: File stem (str); the file read is ``"obj/" + name + ".pkl"``.

    Returns:
        The unpickled Python object.
    """
    pickle_path = "obj/" + name + ".pkl"
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # project wrote itself.
    with open(pickle_path, "rb") as handle:
        restored = pickle.load(handle)
    return restored

Obtained list of possible tags from the trng set - 21 tags
Obtained list of vocab from the trng set - 18212 words
               B-NP      I-NP      B-VP    B-ADVP    B-ADJP    I-ADJP  \
##START##  0.648049  0.000000  0.018661  0.054287  0.003262  0.000000   
B-NP       0.028898  0.684706  0.130303  0.009809  0.003213  0.000000   
I-NP       0.047645  0.406679  0.134912  0.015332  0.004103  0.000000   
B-VP       0.345217  0.000000  0.007229  0.031214  0.039209  0.000000   
B-ADVP     0.210379  0.000000  0.215989  0.016269  0.016550  0.000000   
B-ADJP     0.051970  0.000000  0.110794  0.015991  0.001142  0.279840   
I-ADJP     0.088850  0.000000  0.069686  0.013937  0.012195  0.146341   
B-PP       0.928047  0.000000  0.026595  0.003318  0.002611  0.000000   
O          0.347185  0.000000  0.115030  0.029197  0.008755  0.000000   
B-SBAR     0.872565  0.000000  0.038441  0.008952  0.003160  0.000000   
I-VP       0.355350  0.000000  0.008269  0.037602  0.029038  0.000000   
I-ADVP    

## Part 2
This part deals with the estimation of the emission parameters using MLE. Refer to HMM.py for all the functions being referenced.

<img src="screenshots/p2_1.png">

```calc_emission_pairwise(df, word, tag)```

<img src="screenshots/p2_2.png">

```calc_emission_UNK_pairwise(df, word, tag, k=0.5)```

<img src="screenshots/p2_3.png">
Functions referenced can be found in HMM.py. Run the following script:

In [None]:
# Identifiers of the three datasets handled in this notebook.
datasets = ["CN", "EN", "SG"]

In [None]:
# Part 2 for the CN dataset: estimate the emission parameters from the
# training data, derive a lookup from them and write dev-set predictions.
dataset = "CN"
traindf = read_train_data(dataset)
possible_tags = get_tags(traindf)
vocab = get_vocab(traindf)
vocab.append("#UNK#")  # extra vocabulary entry (presumably the stand-in for unseen words)

emission_parameters = calc_all_emission_paras(dataset, traindf, possible_tags, vocab)
predict_dict = get_predictions_dict(possible_tags, vocab, emission_parameters)

dev_in = read_input_data(dataset, vocab)

# Look up a tag for every entry of the "unseen" column; entries with no match
# in predict_dict receive an empty tag instead of NaN.
dev_in["tag"] = dev_in["unseen"].map(predict_dict).fillna("")
preddf = dev_in.drop(columns=["unseen"])

# header/index are documented as booleans in pandas; False suppresses both.
preddf.to_csv(dataset + "/dev.p2.out", header=False, index=False, sep=" ", mode="w")
print(f'Write output for {dataset} done at "{dataset}/dev.p2.out"')

# Optional caching of the results, currently disabled:
# save_pickle(emission_parameters, dataset + "_emission_paras")
# save_pickle(predict_dict, dataset + "_pred_dict")

In [None]:
# Part 2 for the EN dataset: estimate the emission parameters from the
# training data, derive a lookup from them and write dev-set predictions.
dataset = "EN"
traindf = read_train_data(dataset)
possible_tags = get_tags(traindf)
vocab = get_vocab(traindf)
vocab.append("#UNK#")  # extra vocabulary entry (presumably the stand-in for unseen words)

emission_parameters = calc_all_emission_paras(dataset, traindf, possible_tags, vocab)
predict_dict = get_predictions_dict(possible_tags, vocab, emission_parameters)

dev_in = read_input_data(dataset, vocab)

# Look up a tag for every entry of the "unseen" column; entries with no match
# in predict_dict receive an empty tag instead of NaN.
dev_in["tag"] = dev_in["unseen"].map(predict_dict).fillna("")
preddf = dev_in.drop(columns=["unseen"])

# header/index are documented as booleans in pandas; False suppresses both.
preddf.to_csv(dataset + "/dev.p2.out", header=False, index=False, sep=" ", mode="w")
print(f'Write output for {dataset} done at "{dataset}/dev.p2.out"')

# Optional caching of the results, currently disabled.
# NOTE(review): the Viterbi "load parameters" cell reads dataset+'_emissionparas',
# which does not match the "_emission_paras" name used here — confirm where
# that pickle is produced.
# save_pickle(emission_parameters, dataset + "_emission_paras")
# save_pickle(predict_dict, dataset + "_pred_dict")

In [None]:
# Part 2 for the SG dataset: estimate the emission parameters from the
# training data, derive a lookup from them and write dev-set predictions.
dataset = "SG"
traindf = read_train_data(dataset)
possible_tags = get_tags(traindf)
vocab = get_vocab(traindf)
vocab.append("#UNK#")  # extra vocabulary entry (presumably the stand-in for unseen words)

emission_parameters = calc_all_emission_paras(dataset, traindf, possible_tags, vocab)
predict_dict = get_predictions_dict(possible_tags, vocab, emission_parameters)

dev_in = read_input_data(dataset, vocab)

# Look up a tag for every entry of the "unseen" column; entries with no match
# in predict_dict receive an empty tag instead of NaN.
dev_in["tag"] = dev_in["unseen"].map(predict_dict).fillna("")
preddf = dev_in.drop(columns=["unseen"])

# header/index are documented as booleans in pandas; False suppresses both.
preddf.to_csv(dataset + "/dev.p2.out", header=False, index=False, sep=" ", mode="w")
print(f'Write output for {dataset} done at "{dataset}/dev.p2.out"')

# Optional caching of the results, currently disabled:
# save_pickle(emission_parameters, dataset + "_emission_paras")
# save_pickle(predict_dict, dataset + "_pred_dict")

In [None]:
# Run the evaluation script once per dataset, passing <dataset>/dev.out and the
# Part 2 predictions <dataset>/dev.p2.out (dev.out is presumably the gold
# annotation — confirm). The "!" prefix executes the line as a shell command,
# so python3 must be on PATH and the notebook's working directory must contain
# EvalScript/ and the dataset folders.
print('CN')
!python3 EvalScript/evalResult.py CN/dev.out CN/dev.p2.out

print('EN')
!python3 EvalScript/evalResult.py EN/dev.out EN/dev.p2.out

print('SG')
!python3 EvalScript/evalResult.py SG/dev.out SG/dev.p2.out

## Part 3
This part deals with the estimation of the transition parameters using MLE. Refer to HMM.py for all the functions being referenced.

### Estimate transition parameters

<img src="screenshots/p3_1.png">

In [13]:
# Datasets iterated over by the transition-parameter estimation loop.
datasets = ["CN", "EN", "SG"]

In [14]:
# For every dataset: build the transition table from the training tags, split
# it into an initial vector and a state-to-state matrix, and pickle all three.
# (Assign a shorter list to `datasets` beforehand to process fewer datasets.)
for dataset in datasets:
    print(f'-----Estimating transmission parameters for {dataset}-----')

    traindf = read_train_data(dataset)
    possible_tags = get_tags(traindf)
    vocab = get_vocab(traindf)

    # Full list of tags read from the training file.
    full_tags_list = read_train_file(dataset)

    trans_para = transition_para(full_tags_list, possible_tags)
    trans_arr = trans_para.to_numpy()

    # Row 0 (labelled ##START## in the printed EN table) becomes the initial
    # vector; the remaining rows, minus the last column, form the matrix `a`.
    # NOTE(review): pi0 keeps the last column that `a` drops — confirm the
    # consumer expects the two widths to differ.
    pi0 = trans_arr[0]
    a = np.delete(np.delete(trans_arr, 0, axis=0), -1, axis=1)
    print(f'Completed for {dataset}')

    save_pickle(trans_para, dataset + '_transmissionparas')
    save_pickle(a, dataset + '_a')
    save_pickle(pi0, dataset + '_pi0')

-----Estimating transmission parameters for CN-----
Obtained list of possible tags from the trng set - 7 tags
Obtained list of vocab from the trng set - 16935 words
Completed for CN
-----Estimating transmission parameters for EN-----
Obtained list of possible tags from the trng set - 21 tags
Obtained list of vocab from the trng set - 18212 words
Completed for EN
-----Estimating transmission parameters for SG-----
Obtained list of possible tags from the trng set - 7 tags
Obtained list of vocab from the trng set - 42809 words
Completed for SG


## Part 3

### Viterbi algorithm
<img src="screenshots/p3_2.png">

### Load parameters

In [17]:
# Prepare the Viterbi inputs for a single dataset: rebuild the tag/vocabulary
# lists, index them, and reload the pickled parameters.
dataset = 'EN'

traindf = read_train_data(dataset)
possible_tags = get_tags(traindf)
vocab = get_vocab(traindf)
vocab.append('#UNK#')

datalist = read_train_file(dataset)

# Map each word / tag to its position in the corresponding list.
obs_map = {word: position for position, word in enumerate(vocab)}
state_map = {tag: position for position, tag in enumerate(possible_tags)}

pi = load_pickle(dataset+'_pi0')

a = load_pickle(dataset+'_a')
# NOTE(review): no visible cell writes '<dataset>_emissionparas' (the disabled
# saves in Part 2 use "_emission_paras") — confirm this file is produced before
# running from a fresh kernel.
b = load_pickle(dataset+'_emissionparas')

# Integer ids 0 .. len(vocab)-1, one per vocabulary entry.
obs = [*range(len(vocab))]

Obtained list of possible tags from the trng set - 21 tags
Obtained list of vocab from the trng set - 18212 words


In [19]:
# Dump every loaded structure, one per line, for a quick visual check.
print(obs_map, state_map, pi, a, b, obs, sep="\n")

{'B-NP': 0, 'I-NP': 1, 'B-VP': 2, 'B-ADVP': 3, 'B-ADJP': 4, 'I-ADJP': 5, 'B-PP': 6, 'O': 7, 'B-SBAR': 8, 'I-VP': 9, 'I-ADVP': 10, 'B-PRT': 11, 'I-PP': 12, 'B-CONJP': 13, 'I-CONJP': 14, 'B-INTJ': 15, 'I-INTJ': 16, 'I-SBAR': 17, 'B-UCP': 18, 'I-UCP': 19, 'B-LST': 20}
[6.48049067e-01 0.00000000e+00 1.86610988e-02 5.42868328e-02
 3.26242986e-03 0.00000000e+00 1.08704163e-01 1.41850450e-01
 2.25760146e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.60994389e-04 0.00000000e+00 1.30497194e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.04397755e-03 0.00000000e+00]
[[2.88975795e-02 6.84705634e-01 1.30303351e-01 9.80868830e-03
  3.21319099e-03 0.00000000e+00 5.80065532e-02 8.09639573e-02
  3.40344572e-03 0.00000000e+00 0.00000000e+00 3.59370045e-04
  0.00000000e+00 8.45576578e-05 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 2.11394144e-05 0.00000000e+00
  0.00000000e+00]
 [4.76452162e-02 4.06678757e-01 1.34912348e-01 1.53321976e-02
  4.103