In [3]:
import os
import numpy as np
import pandas as pd
import pickle
from HMM import *

def save_pickle(obj, name):
    """Serialize ``obj`` to ``obj/<name>.pkl`` using the highest pickle protocol.

    Args:
        obj: Any picklable Python object.
        name: File stem (without the ``.pkl`` extension) inside the ``obj/`` folder.

    Raises:
        OSError: If the directory or the file cannot be created or written.
        pickle.PicklingError: If ``obj`` cannot be pickled.
    """
    path = "obj/" + name + ".pkl"
    # Create the target folder on first use; otherwise open() fails with
    # FileNotFoundError when the obj/ directory does not exist yet.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_pickle(name):
    """Read back the object stored in ``obj/<name>.pkl``.

    Args:
        name: File stem (without the ``.pkl`` extension) inside the ``obj/`` folder.

    Returns:
        The unpickled Python object.

    Raises:
        FileNotFoundError: If ``obj/<name>.pkl`` does not exist.
    """
    pkl_path = "obj/" + name + ".pkl"
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were written by this project.
    with open(pkl_path, "rb") as handle:
        loaded = pickle.load(handle)
    return loaded

## Part 2
This part deals with the estimation of the emission parameters using MLE. Refer to HMM.py for all the functions being referenced.

<img src="p2_1.png">

```calc_emission_pairwise(df, word, tag)```

<img src="p2_2.png">

```calc_emission_UNK_pairwise(df, word, tag, k=0.5)```

<img src="p2_3.png">
Functions referenced can be found in HMM.py. Run the following script:

In [None]:
# Names of the three datasets handled by this notebook.
datasets = ["CN", "EN", "SG"]

In [None]:
# Part 2 baseline on the CN dataset: estimate the emission parameters from the
# training set, then tag each dev word with the tag predict_dict holds for it.
dataset = "CN"

# Tag set and vocabulary come from the training data. "#UNK#" is appended to the
# vocabulary (presumably as the placeholder for words not seen in training).
traindf = read_train_data(dataset)
possible_tags = get_tags(traindf)
vocab = get_vocab(traindf)
vocab.append("#UNK#")

emission_parameters = calc_all_emission_paras(dataset, traindf, possible_tags, vocab)
predict_dict = get_predictions_dict(possible_tags, vocab, emission_parameters)

# Look up every entry of the "unseen" column in predict_dict; entries without
# a match get an empty tag. The lookup column is dropped before writing.
dev_in = read_input_data(dataset, vocab)
dev_in["tag"] = dev_in["unseen"].map(predict_dict).fillna("")
preddf = dev_in.drop(columns=["unseen"])

# Space-separated output with no header row and no index column.
preddf.to_csv(
    dataset + "/dev.p2.out",
    sep=" ",
    header=None,
    index=None,
    mode="w",
)
print(f'Write output for {dataset} done at "{dataset}/dev.p2.out"')

# Optional: uncomment to persist the fitted objects under obj/.
# save_pickle(emission_parameters, dataset + "_emission_paras")
# save_pickle(predict_dict, dataset + "_pred_dict")

In [None]:
# Part 2 baseline on the EN dataset: estimate the emission parameters from the
# training set, then tag each dev word with the tag predict_dict holds for it.
dataset = "EN"

# Tag set and vocabulary come from the training data. "#UNK#" is appended to the
# vocabulary (presumably as the placeholder for words not seen in training).
traindf = read_train_data(dataset)
possible_tags = get_tags(traindf)
vocab = get_vocab(traindf)
vocab.append("#UNK#")

emission_parameters = calc_all_emission_paras(dataset, traindf, possible_tags, vocab)
predict_dict = get_predictions_dict(possible_tags, vocab, emission_parameters)

# Look up every entry of the "unseen" column in predict_dict; entries without
# a match get an empty tag. The lookup column is dropped before writing.
dev_in = read_input_data(dataset, vocab)
dev_in["tag"] = dev_in["unseen"].map(predict_dict).fillna("")
preddf = dev_in.drop(columns=["unseen"])

# Space-separated output with no header row and no index column.
preddf.to_csv(
    dataset + "/dev.p2.out",
    sep=" ",
    header=None,
    index=None,
    mode="w",
)
print(f'Write output for {dataset} done at "{dataset}/dev.p2.out"')

# Optional: uncomment to persist the fitted objects under obj/.
# save_pickle(emission_parameters, dataset + "_emission_paras")
# save_pickle(predict_dict, dataset + "_pred_dict")

In [None]:
# Part 2 baseline on the SG dataset: estimate the emission parameters from the
# training set, then tag each dev word with the tag predict_dict holds for it.
dataset = "SG"

# Tag set and vocabulary come from the training data. "#UNK#" is appended to the
# vocabulary (presumably as the placeholder for words not seen in training).
traindf = read_train_data(dataset)
possible_tags = get_tags(traindf)
vocab = get_vocab(traindf)
vocab.append("#UNK#")

emission_parameters = calc_all_emission_paras(dataset, traindf, possible_tags, vocab)
predict_dict = get_predictions_dict(possible_tags, vocab, emission_parameters)

# Look up every entry of the "unseen" column in predict_dict; entries without
# a match get an empty tag. The lookup column is dropped before writing.
dev_in = read_input_data(dataset, vocab)
dev_in["tag"] = dev_in["unseen"].map(predict_dict).fillna("")
preddf = dev_in.drop(columns=["unseen"])

# Space-separated output with no header row and no index column.
preddf.to_csv(
    dataset + "/dev.p2.out",
    sep=" ",
    header=None,
    index=None,
    mode="w",
)
print(f'Write output for {dataset} done at "{dataset}/dev.p2.out"')

# Optional: uncomment to persist the fitted objects under obj/.
# save_pickle(emission_parameters, dataset + "_emission_paras")
# save_pickle(predict_dict, dataset + "_pred_dict")

In [None]:
# Score the Part 2 predictions of each dataset by running the evaluation script
# in a shell. It is given <dataset>/dev.out (presumably the reference
# annotations; confirm) followed by the <dataset>/dev.p2.out file written above.
# The "!" lines are IPython shell escapes and only work inside a notebook/IPython.
print('CN')
!python3 EvalScript/evalResult.py CN/dev.out CN/dev.p2.out

print('EN')
!python3 EvalScript/evalResult.py EN/dev.out EN/dev.p2.out

print('SG')
!python3 EvalScript/evalResult.py SG/dev.out SG/dev.p2.out

## Part 3
This part deals with the estimation of the transition parameters. Refer to HMM.py for all the functions being referenced.

<img src="p2_1.png">


```calc_emission_pairwise(df, word, tag)```

<img src="p2_2.png">

```calc_emission_UNK_pairwise(df, word, tag, k=0.5)```

<img src="p2_3.png">
Functions referenced can be found in HMM.py. Run the following script:

In [11]:
# Names of the datasets to process in this part.
datasets = ["CN", "EN", "SG"]

In [15]:
# Estimate the tag-to-tag transition table for every dataset and print it.
for dataset in datasets:
    print(f'-----Estimating transmission parameters for {dataset}-----')

    # Load the training data and derive the tag set from it. get_vocab's result
    # is not used below, but the call is kept because it reports the vocabulary
    # size in the cell output.
    traindf = read_train_data(dataset)
    possible_tags = get_tags(traindf)
    vocab = get_vocab(traindf)

    # Full list of tags from the training file, passed to transition_para.
    full_tags_list = read_train_file(dataset)

    trans_para = transition_para(full_tags_list, possible_tags)
    trans_arr = trans_para.to_numpy()

    # Drop the first row and the last column of the table. In the printed table
    # these are the ##START## row and the ##STOP## column.
    a = np.delete(np.delete(trans_arr, 0, axis=0), -1, axis=1)

    print(trans_para)
    print(f'Completed for {dataset}')

    # Optional: uncomment to persist the results under obj/.
    # save_pickle(trans_para, dataset+'_transmissionparas')
    # save_pickle(a, dataset+'_a')

-----Estimating transmission parameters for CN-----
Obtained list of possible tags from the trng set - 7 tags
Obtained list of vocab from the trng set - 16935 words
                   O  B-neutral  I-neutral  B-positive  I-positive  \
##START##   0.765560   0.136515   0.000000    0.087137    0.000000   
O           0.919671   0.035496   0.000000    0.012552    0.000000   
B-neutral   0.467573   0.017603   0.500000    0.000926    0.000000   
I-neutral   0.435130   0.014827   0.538352    0.001141    0.000000   
B-positive  0.387490   0.000000   0.000000    0.011373    0.588140   
I-positive  0.504342   0.003618   0.000000    0.008683    0.476122   
B-negative  0.655797   0.003623   0.000000    0.000000    0.000000   
I-negative  0.614815   0.000000   0.000000    0.000000    0.000000   

            B-negative  I-negative  ##STOP##  
##START##     0.010788    0.000000  0.000000  
O             0.003062    0.000000  0.029220  
B-neutral     0.000000    0.000000  0.013897  
I-neutral     0.