In [5]:
import numpy as np
import pandas as pd
import time

# 01.112 Machine Learning Design Project

## About the Project

We have 4 datasets in the `/data` folder. For each dataset, there is: 
- a labelled training set train, 
- an unlabelled development set `dev.in`
- a labelled development set `dev.out` 

The labelled data has the format of: `token` `\t` `tag`
- one token per line
- token and tag separated by tab 
- single empty lines that separates sentences

For the labels, they are slightly different for different datasets.
- SG, CN (Entity):
    - B-*: Beginning of entity
    - I-*: Inside of entity
    - O: Outside of any entity
- EN, AL (Phrase):
    - B-VP: Beginning of Verb Phrase
    - I-VP: Inside of Verb Phrase
    - *-NP: Noun Phrase
    - *PP: Propositional Phrase
    - O: Outside of any phrase

*Goal*: Build sequence labelling systems from training data (x) and use it to predict tag sequences for new sentences (y).

## Team members 
- Andri Setiawan Susanto
- Eldon Lim 
- Tey Siew Wen

## Part 1
Already completed individually.

## Part 2

a) Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation):

b)

1. Make a modified training set by replacing those words that appear $<k$ times in the training set with a special word token `#UNK#` before training.
2. During testing phase, ifaworddoesnot appear in the modified training set, we also replace that wordwith `#UNK#`.
3. Compute Emission Paramters with the function in (a)

For all the four datasets EN, AL, CN, and SG, learn these parameters with `train`, and evaluate your
system on the development set `dev.in` for each of the dataset. Write your output to `dev.p2.out`
for the four datasets respectively. Compare your outputs and the gold-standard outputs in `dev.out`
and report the precision, recall and F scores of such a baseline system for each dataset.

In [6]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import time

def emissionPara(df):
    x_y_lists = df['x_y'].str.split(" ")
    x_y_tuples = x_y_lists.apply(lambda x: tuple(x)).to_numpy()
    x_y_counter = Counter(x_y_tuples)
    
    y_counter = Counter(x_y_lists.apply(lambda s: s[1]))
    
    emission_params = {}
    
    for x_y, x_y_count in x_y_counter.items():
        y = x_y[1]
        emission_params[x_y] = x_y_count / y_counter[y]
    return emission_params

k = 3
replaceWord = "#UNK#"
test_data = "./data/EN/train"
test_data = pd.read_csv(test_data, sep='/n', delimiter=None, names=['x_y'],index_col=False, engine="python")

def preprocess(df, k, replaceWord):
    """
    Function to modify train/test data based on the occurence of words. 
    If a word appears <= k times in the data, replace it with the replaceWord.
    Returns a new df.
    """
    x_y_lists = df['x_y'].str.split(" ")
    x = x_y_lists.apply(lambda s: s[0])
    x_counter = Counter(x).keys()
    
    invalid_x = x.value_counts()[x.value_counts() < 3].index.to_list()
    print("There are ", len(x), "observations")
    print("Out of those observations, ", len(invalid_x), "is to be replaced.")
    
    def replace_with_string(s):
        x, y = s
        if x in invalid_x:
            return "{} {}".format(replaceWord, y)
        else:
            return "{} {}".format(x, y)
        
    new_df = pd.DataFrame(x_y_lists.apply(replace_with_string), columns=["x_y"])
    return new_df, x_counter

def preprocess_test(data,k):
    global replaceWord
    
    start = time.process_time()   

    testdf1= pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python")
    testdf= pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False,skip_blank_lines=False, engine="python")

    x_dic = {}

    uniqueX, uniqueCountX= np.unique(testdf1['original'].astype(str),return_counts=True)
    for i in range(len(uniqueX)):
        x_dic[uniqueX[i]] = uniqueCountX[i]

    testdf['modified']=''
#     print(testdf)
    for i, text in enumerate(testdf['original']):
    #         df['x'][i] = replaceWord
        try:
            if text not in xy_pred_dic:
            
                testdf['modified'][i]=testdf['original'][i].replace(text,replaceWord)
            else:
                testdf['modified'][i]=testdf['original'][i]
        except:
            continue
    testdf['predict_label']=''
    for index, word in enumerate(testdf['modified']):
#     print(word)
        try:
            testdf['predict_label'][index]= xy_pred_dic[word]
        except:
            continue
    print("Time taken for test data: ",time.process_time() - start)
    return testdf

In [7]:
data_folders = ["AL", "EN","CN","SG"]
for x in ["EN"]:
    print("Performing sentiment analysis for data folder ", x)
    train_data = "./data/{}/train".format(x)
    test_data = "./data/{}/dev.in".format(x)
    test_result = "./data/{}/dev.out".format(x)
    
    train_data = pd.read_csv(train_data, sep='/n', delimiter=None, names=['x_y'],index_col=False, engine="python")
    predata, x_counter = preprocess(train_data, k, replaceWord)
    print(x_counter)
    emission_dict = emissionPara(predata)
    
    
    testdf = preprocess_test(test_data,k, replaceWord)
    final = pd.DataFrame()
    final['result'] = testdf['modified'] + ' ' + testdf['predict_label']

    print("Writing the final result to dev.p2..out...")
    f = open('./output/{}/dev.p2.out'.format(x) ,'w')
    for word in final['result']:
        f.write(word + '\n')
    f.close()
    
#     print("Writing the final result to dev.p2..out...")
#     testdf.to_csv('./output/{}/dev.p2.out'.format(x))

Performing sentiment analysis for data folder  EN
There are  181628 observations
Out of those observations,  12026 is to be replaced.


TypeError: preprocess_test() takes 2 positional arguments but 3 were given

## Part 3

Write a function that estimates the transition parameters from the training set using MLE (maximum likelihood estimation):

In [None]:
def split_into_columns(df_column):
    new = df_column.str.split(" ", n=1, expand=True)
    return new[0], new[1]

In [None]:
from collections import Counter, defaultdict

def transitionPara(data):
    train_data_blank=pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
    x, y = split_into_columns(train_data_blank["original"])
    xy_dic = dict(zip(x, y))
    
    # Get bottom count (Count(yi))
    y_count = Counter(y)
    
    # Get top count (Count(yi-1, yi))
    subseq_count = defaultdict(int)
    for i in range(len(y)-1):    
        y1 = y[i]
        y2 = y[i+1]
        
        if i == 0:
            subseq_count[("START", y1)] +=1
            y_count["START"] +=1
        if pd.isna(y1):
            subseq_count[("START", y2)] +=1
            y_count["START"] +=1
        elif i == len(y)-1 or pd.isna(y2):
            subseq_count[(y1, "END")] +=1
            y_count["END"] +=1
        else:
            subseq_count[y1,y2] += 1
    
    # Calculation of transition params
    result = np.empty(len(y)+2)
    transition_dict = {}
    
    for k,v in subseq_count.items():
        y1 = k[0]
        y2 = k[1]
        transition_dict[y1,y2] = subseq_count[y1,y2] / y_count[y1]
     
    return transition_dict, subseq_count, y_count

# transition_dic, subseq_count, y_count = transitionPara(test)

# Viterbi

In [None]:
import numpy as np

def viterbi(unique_word_list):
    #This is for the starting for viterbi
    store=[]   #store = the storage for scores for all the nodes.
    scorelist=[]
    
    #This is for the start
    for i in range(len(nodes)):
        emission_score = emission(nodes[i],unique_word_list[0], x_counter)
        transition_score = transition("START",nodes[i])
        score_at_start = np.log(emission_score)+np.log(transition_score)
        store.append(score_at_start)    
        
    scorelist.append(store)
    store=[]
    score_per_node=[]
    
    #This is for the middle portion for viterbi
    if len(unique_word_list) > 1:
        for j in range(len(unique_word_list) - 1): #for the whole length in sentence
            for k in range(len(nodes)): # for each node
                #score per node = prevnode*emission*transition
                
                for l in range(len(nodes)): # l = iterate thru previous node, k= iterate thru current node, j= iterate thru sentence
                    # This is to calculate the current node scores.
                    prev_node = scorelist[j][l]
                    emission_score = emission(nodes[k],unique_word_list[j+1], x_counter)
                    curr_node = nodes[k]
                    score_per_node.append(prev_node+np.log(emission_score)+np.log(transition(nodes[l],curr_node)))
                    
                store.append(max(score_per_node)) # found max path
                
#                 max_score = max(score_per_node)
#                 max_index = np.argmax(score_per_node)                
#                 label_max = labels[max_index]
                score_per_node=[]
            
            #print(store)
            scorelist.append(store) # store the scores for nodes
            store=[]

                      
        score_at_stop=[]
        #This is for the stop for viterbi
        for m in range(len(nodes)):
            score_at_stop.append(np.log(transition(nodes[m],"END"))+ (scorelist[len(unique_word_list)-1][m])) #at stop.
        scorelist.append(max(score_at_stop))
     
    return scorelist

def emission(node,word,x_dict):
    global emission_dict
    pair = word,node
    detector = 0 # this is used to find if word exist in the dictionary
    if pair not in emission_dict.keys(): #if the combination cannot be found in the dictionary
                                         #Either the word exists, or word is new. 
        if word in x_dict:
            score=0   #this means that this node is not the correct node.
        else:
            replaced_text = ("#UNK#", node)
            if replaced_text in emission_dict.keys():
                score = emission_dict[replaced_text] #if label have #unk#
            else:
                score = 0   #if label does not have #unk#, then set to 0.
    else:
        score = emission_dict[pair]
    return score

def transition(x1,x2):
    global transition_dic
    #will use this to search the transition from x1 to x2
    pair = x1,x2
    if pair not in transition_dic.keys():
        score = 0
    else:
        score = transition_dic[x1,x2]
    return score

In [None]:
('#UNK#', "B-NP") in emission_dict

In [None]:
def viterbi_backtrack(scorelist):
    ####### back tracking for viterbi
    # node value*transition = array, then find max, then find position. use position for next step.
    #np.argmax returns index of max in the element.
    # The final score on the score list is for end
    scorelist = scorelist[::-1] #reverse the score list so easier to calculate.
    node_holder=[]
    path = []
    max_node_index=0
    length_of_scorelist=len(scorelist)
    length_of_nodes=len(nodes)

    if (length_of_scorelist == 1):
        for k in range (length_of_nodes):
            calculate_max_node = (scorelist[0][k]) + np.log(transition(nodes[k],"END"))
            node_holder.append(calculate_max_node)
        path.append(nodes[np.argmax(node_holder)])
        node_holder=[]
        return(path[::-1])

    for i in range (1,length_of_scorelist): # for length of sentence

        for j in range(length_of_nodes): #for each node
            #each node*own path, find max
            if (i==1):
                calculate_max_node = (scorelist[i][j]) + np.log(transition(nodes[j],"END"))
                node_holder.append(calculate_max_node)
                #print(np.exp(calculate_max_node))
            else:
                
                calculate_max_node = (scorelist[i][j]) + np.log(transition(nodes[j],nodes[max_node_index]))#
                node_holder.append(calculate_max_node)
        
        max_node_index=(np.argmax(node_holder))
        path.append(nodes[np.argmax(node_holder)])
        node_holder=[]
        

    return(path[::-1])

In [None]:
def preprocess_training_blank_row(data):
    start = time.process_time()   
    
    df= pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False,engine="python",skip_blank_lines=False)
    # dropping null value columns to avoid errors 
    
    # new data frame with split value columns 
    df["x"], df["y"] = split_into_columns(df["original"])
    return df

In [None]:
def sentenceList(data):
    lines=[]
    line=[]
    x= data
    for label in x['x']:
        if pd.isnull(label)==False:
            line.append(label)
        else:
    #         line += ' stop'
            lines.append(line)
            line = []
    return lines


In [None]:
def finalresult(sequence_log,predata_blank):
    dataframe = []
    count=0
    for i in range(len(sequence_log)):
        for text in sequence_log[i]:
            dataframe.append(text)
            count+=1
        dataframe.append("")
    dftest=pd.DataFrame(dataframe)
    final = pd.DataFrame()
    final['result'] = predata_blank['x'] + " " +dftest[0]
    return final

In [None]:
data_folders = ["AL", "EN","CN","SG"]
for x in ["EN"]:
#     print("Performing sentiment analysis for data folder ", x)
    train_data = "./data/{}/train".format(x)
    test_data = "./data/{}/dev.in".format(x)
#     test_result = "./data/{}/dev.out".format(x)
    
#     predata = preprocess_training(train_data)
#     countData=uniqueCount(predata,k,replaceWord)
#     emissiondf = emissionCalcu(countData)
#     emission_dict = emissiondf[1]
#     xy_pred_dic = xyPrediction(emissiondf[0])
#     testdf_unprocess = pd.read_csv(test_data, sep='/n', delimiter=None, names=['x'],index_col=False,skip_blank_lines=False, engine="python")
#     testdf = preprocess_test(test_data,k)
    
#     testresultdf = pd.read_csv(test_result, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python")
#     new = testresultdf["original"].str.split(" ", n=1,expand=True) 

#     # making separate first name column from new data frame 
#     testresultdf["x"]= new[0] 

#     # making separate last name column from new data frame 
#     testresultdf["y"]= new[1]
#     final = pd.DataFrame()
    
#     final['result'] = testdf['modified'] + ' ' + testdf['predict_label']
# #     print(final.head(3))
    
#     print("Writing the final result to dev.p2..out...")
#     f = open('./output/{}/dev.p2.out'.format(x) ,'w')
#     for word in final['result']:
#         f.write(word + '\n')
#     f.close()
    
# ##############################PART 3########################################################
    transition_dic, subseq_count, y_count = transitionPara(train_data)
    predata_blank=preprocess_training_blank_row(train_data)
    node = list(y_count.keys())
#     print(testdf_unprocess)
    testdf_unprocess = pd.read_csv(test_data, sep='/n', delimiter=None, names=['x'],index_col=False,skip_blank_lines=False, engine="python")
    lines= sentenceList(testdf_unprocess)
    
    nodes = node
    log_array =[]
    sequence_log=[]

    for i in range(len(lines)):
        viterbioutput=viterbi(lines[i])
        log_array.append(viterbioutput)
        sequence_log.append(viterbi_backtrack(viterbioutput))
    print(sequence_log)
    
    result = finalresult(sequence_log,testdf_unprocess)
    print(result)
    
    print("Writing the final result to dev.p3.out...")
#     f = open('./dev.p3.out'.format(x) ,'w')
    f = open('./output/{}/dev.p3.out'.format(x) ,'w')
    for word in result['result']:
        if pd.isnull(word) == False:
            f.write(word + '\n')
        else:
            f.write("" +"\n")
    f.close()

In [None]:
x = pd.DataFrame.from_dict(emission_dict, orient='index').reset_index(drop=False)
x

# Part 5

## Multistate Viterbi

In [8]:
import numpy as np
import pandas as pd
import time

In [9]:
def split_into_columns(df_column):
    new = df_column.str.split(" ", n=1, expand=True)
    return new[0], new[1]

In [10]:
from collections import Counter, defaultdict

# funtion to find the transition parameter from current label -> next label -> next next label
def MultistateTransitionPara(data):
    train_data_blank=pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
    train_ycyf_data = pd.read_csv(data, sep='/n', delimiter=None, names=['x_y'],index_col=False, engine="python")
    x, y = split_into_columns(train_data_blank["original"])
#     xy_dic = dict(zip(x, y))
    
    # Get unique label count 
    y_count = Counter(y)
    print("y_count: ",y_count)
    
    
    # yc = current label
    # yf = next label
    # yff = next next label
    
    # Get ycyf count of count(yc,yf)
    ycyf_count =ycyf(train_ycyf_data)
    print("ycyf_count: ", ycyf_count)

    # subseq_count is a dictionary that store the value of count(yc,yf,yff)
    subseq_count = defaultdict(int)
    
    for i in range(len(y)-1):  
       
        y1 = y[i] # third node -> yff
        if i == 1:
            y2 = y[i-1] 
        elif i >1:
            y2 = y[i-1] # second node -> yf
            y3 = y[i-2] # first node -> yc
        
        if i == 0:
            subseq_count[("NONE_S","START", y1)] +=1
            ycyf_count["NONE_S","START"] +=1
        elif i == 1:
            subseq_count[("START", y2, y1)] +=1
            ycyf_count["START", y2] +=1
        elif pd.isna(y1) and pd.isna(y3):
            subseq_count[("NONE_S", "START", y2)] +=1
            ycyf_count["NONE_S", "START"] +=1
        elif pd.isna(y1):
            subseq_count[(y3, y2, "END")] +=1
        
        elif i !=0 and i != 1 and pd.isna(y2):
            subseq_count[("NONE_S","START",y1)] +=1
            ycyf_count["NONE_S","START"] +=1
            subseq_count[(y3,"END","NONE_E")] +=1
            ycyf_count[y3,"END"] +=1
        elif i !=0 and pd.isna(y2):
            subseq_count[("NONE_S","START",y1)] +=1
            ycyf_count["NONE_S","START"] +=1
        elif i == len(y)-2:
            subseq_count[(y1, "END", "NONE_E")] +=1
            ycyf_count[y1, "END"] +=1
        elif pd.isna(y1) and pd.isna(y3):
            subseq_count[("START", y2 ,"END")] +=1
            ycyf_count["START", y2] +=1
        elif pd.isna(y3):
            subseq_count["START",y2,y1] += 1
            ycyf_count["START", y2] +=1
        else:
            subseq_count[y3,y2,y1] += 1
    
    # Calculation of transition params
    result = np.empty(len(y)+2)
    multistate_transition_params = {}
    
    for k,v in subseq_count.items():
        y3 = k[0] # first node
        y2 = k[1] # second node
        y1 = k[2] # third node
        multistate_transition_params[y3,y2,y1] = subseq_count[y3,y2,y1] / ycyf_count[y3,y2]
       
    return multistate_transition_params, subseq_count, y_count,ycyf_count

# transition_dic, subseq_count, y_count = transitionPara(test)

In [11]:
def MultistatePreprocess(df, k, replaceWord):
    """
    Function to modify train/test data based on the occurence of words. 
    If a word appears <= k times in the data, replace it with the replaceWord.
    Returns a new df.
    """
    x_y_lists = df['x_y'].str.split(" ")
    x = x_y_lists.apply(lambda s: s[0])
    x_counter = Counter(x).keys()
    
    invalid_x = x.value_counts()[x.value_counts() < 3].index.to_list()
    print("There are ", len(x), "observations")
    print("Out of those observations, ", len(invalid_x), "is to be replaced.")
    
    def replace_with_string(s):
        x, yc, yp = s
        if x in invalid_x:
            return "{} {} {}".format(replaceWord, yc, yp)
        else:
            return "{} {} {}".format(x, yc, yp)
        
    new_df = pd.DataFrame(x_y_lists.apply(replace_with_string), columns=["x_y"])
    return new_df, x_counter

In [12]:
# import pandas as pd
# train_data ="./data/EN/train"
# train_data = pd.read_csv(train_data, sep='/n', delimiter=None, names=['x_y'],index_col=False, engine="python")
# x_y_lists = train_data
# y_df=split_into_columns(x_y_lists['x_y'])[1]

# def xyy(xydf, ydf):
#     for i in range(len(xydf)):
#         if i == 0:
#             xydf['x_y'][i] = xydf['x_y'][i] + " START"
#         if i != 0:
#             previousLabel = ydf[i-1]
# #             print("previousLabel: ",previousLabel)
#             xydf['x_y'][i] = xydf['x_y'][i] + " " + previousLabel 

#     return xydf

# xylist = xyy(x_y_lists,y_df)

In [13]:
from collections import Counter, defaultdict

def ycyf (data):
    x_y_lists1 = data
    y_df=split_into_columns(x_y_lists1['x_y'])
    y_mod_df = pd.Series(["START"])
    y_df1=pd.Series(y_df[1])
    y2 = pd.concat([y_mod_df,y_df1[:len(y_df1)-1]],axis=0,ignore_index=True)
    xyy = pd.concat([y2,y_df[1]], axis=1)
    xyy['ycyf'] = xyy[0].str.cat(xyy[1],sep=" ")
    print(xyy)
    ycyf_lists = xyy['ycyf'].str.split(" ")
    ycyf_tuples = ycyf_lists.apply(lambda x: tuple(x)).to_numpy()
    ycyf_counter = Counter(ycyf_tuples)
    return ycyf_counter
    

# ycyp_counter=ycyp(train_data)

In [None]:
# from collections import Counter, defaultdict
# train_data = './data/EN/train'

# # funtion to find the transition parameter from current label -> next label 

# def y1y2transitionPara(data):
#     train_data_blank=pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
#     x, y = split_into_columns(train_data_blank["original"])
#     xy_dic = dict(zip(x, y))
    
    
    
#     # Get count(y)
#     y_count = Counter(y)
    
#     #y1 is current label
#     #y2 is next label
    
#     # y1y2subseq_count is a dictionary that store the value of count(y1,y2)
#     y1y2subseq_count = defaultdict(int)
#     for i in range(len(y)-1):    
#         y1 = y[i] #current label
#         y2 = y[i+1] #next label
        
#         if i == 0:
#             y1y2subseq_count[("START", y1)] +=1
#             y_count["START"] +=1
#         if pd.isna(y1):
#             y1y2subseq_count[("START", y2)] +=1
#             y_count["START"] +=1
#         elif i == len(y)-1 or pd.isna(y2):
#             y1y2subseq_count[(y1, "END")] +=1
#             y_count["END"] +=1
#         else:
#             y1y2subseq_count[y1,y2] += 1
    
    
#     # Calculation of transition params
#     result = np.empty(len(y)+2)
#     y1y2transition_dict = {}
#     for k,v in y1y2subseq_count.items():
#         y1 = k[0]
#         y2 = k[1]
#         y1y2transition_dict[y1,y2] = y1y2subseq_count[y1,y2] / y_count[y1]
     
#     return y1y2transition_dict, y1y2subseq_count, y_count

# y1y2transition_dic, y1y2subseq_count, y_count = y1y2transitionPara(train_data)

In [14]:
train_data ="./data/EN/train"
multitransition_dic, subseq_count, y_count, ycyf_count = MultistateTransitionPara(train_data)

y_count:  Counter({'I-NP': 54591, 'B-NP': 47305, 'O': 23872, 'B-PP': 18387, 'B-VP': 18261, 'I-VP': 10159, nan: 7663, 'B-ADVP': 3565, 'B-SBAR': 1899, 'B-ADJP': 1751, 'I-ADJP': 574, 'B-PRT': 468, 'I-ADVP': 363, 'I-PP': 223, 'I-CONJP': 64, 'B-CONJP': 49, 'I-SBAR': 48, 'B-INTJ': 26, 'B-LST': 11, 'I-INTJ': 7, 'I-UCP': 4, 'B-UCP': 1})
             0       1           ycyf
0        START    B-NP     START B-NP
1         B-NP    I-NP      B-NP I-NP
2         I-NP    B-VP      I-NP B-VP
3         B-VP  B-ADVP    B-VP B-ADVP
4       B-ADVP  B-ADJP  B-ADVP B-ADJP
5       B-ADJP  I-ADJP  B-ADJP I-ADJP
6       I-ADJP  I-ADJP  I-ADJP I-ADJP
7       I-ADJP    B-PP    I-ADJP B-PP
8         B-PP    B-NP      B-PP B-NP
9         B-NP    I-NP      B-NP I-NP
10        I-NP    B-PP      I-NP B-PP
11        B-PP    B-NP      B-PP B-NP
12        B-NP    I-NP      B-NP I-NP
13        I-NP       O         I-NP O
14           O       O            O O
15           O  B-ADJP       O B-ADJP
16      B-ADJP  I-ADJP 

# Finding P(w|u,v)
#### Probability for current label given previous 2 labels

In [15]:
from collections import Counter, defaultdict

#v,w
def vwtransitionPara(data):
    train_data_blank=pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
    x, y = split_into_columns(train_data_blank["original"])
    xy_dic = dict(zip(x, y))
    
    # Get count(y)
    y_count = Counter(y)
    
    #subseq_count is a dictionary that store the value of count(current label, next label)
    subseq_count = defaultdict(int)
    for i in range(len(y)-1):    
        y1 = y[i] #current label
        y2 = y[i+1] #next label
        
        if i == 0:
            subseq_count[("START", y1)] +=1
            y_count["START"] +=1
        if pd.isna(y1):
            subseq_count[("START", y2)] +=1
            y_count["START"] +=1
        elif i == len(y)-1 or pd.isna(y2):
            subseq_count[(y1, "END")] +=1
            y_count["END"] +=1
        else:
            subseq_count[y1,y2] += 1
    
    # Calculation of transition params
    result = np.empty(len(y)+2)
    
    # yytransition_dict stores the transition parameter from current label to next label
    yytransition_dict = {}
    
    for k,v in subseq_count.items():
        y1 = k[0] #current label
        y2 = k[1] #next label
        yytransition_dict[y1,y2] = subseq_count[y1,y2] / y_count[y1]
     
    return yytransition_dict, subseq_count, y_count

vwtransition_dic, vwsubseq_count, y_count = vwtransitionPara(train_data)
print(vwtransition_dic)

{('START', 'B-NP'): 0.6480490669450607, ('B-NP', 'I-NP'): 0.6847056336539478, ('I-NP', 'B-VP'): 0.13491234818926195, ('B-VP', 'B-ADVP'): 0.031214062756694597, ('B-ADVP', 'B-ADJP'): 0.016549789621318374, ('B-ADJP', 'I-ADJP'): 0.27984009137635635, ('I-ADJP', 'I-ADJP'): 0.14634146341463414, ('I-ADJP', 'B-PP'): 0.2857142857142857, ('B-PP', 'B-NP'): 0.9280469897209985, ('I-NP', 'B-PP'): 0.15650931472220694, ('I-NP', 'O'): 0.2273268487479621, ('O', 'O'): 0.11352211796246649, ('O', 'B-ADJP'): 0.008755026809651475, ('B-NP', 'B-VP'): 0.13030335059718845, ('B-VP', 'B-PP'): 0.09873500903564975, ('I-NP', 'I-NP'): 0.4066787565715961, ('O', 'END'): 0.3182808310991957, ('B-VP', 'B-SBAR'): 0.025573626855046272, ('B-SBAR', 'B-NP'): 0.8725645076355977, ('B-VP', 'B-NP'): 0.3452165817863206, ('B-VP', 'O'): 0.06741142325173868, ('O', 'B-NP'): 0.34718498659517427, ('I-NP', 'B-NP'): 0.0476452162444359, ('B-ADVP', 'B-PP'): 0.17054698457223003, ('B-VP', 'I-VP'): 0.3739116149170363, ('I-VP', 'B-NP'): 0.35534993

In [16]:
from collections import Counter, defaultdict
train_data = './data/EN/train'
def wtransitionPara(data):
    train_data_blank=pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
    x, y = split_into_columns(train_data_blank["original"])
#     xy_dic = dict(zip(x, y))
    y1transition_dict = {}
    
    # Get count(y)
    y_count = Counter(y)

    for i in range(len(y)-1):    
        y1 = y[i] #for input in current dataframe index
        y2 = y[i+1] #for input in next dataframe index
        
        if i == 0:
            y_count["START"] +=1
        if pd.isna(y1):
            y_count["START"] +=1
        elif i == len(y)-1 or pd.isna(y2):
            y_count["END"] +=1
    
    # Calculation of transition params
    result = np.empty(len(y)+2)
    yytransition_dict = {}
    
    total_count = sum(y_count.values())
    
    for e in y_count.keys():
        y1transition_dict[e] = y_count[e]/total_count

    return y1transition_dict

wtransition_dict = wtransitionPara(train_data)
print(wtransition_dict)

{'B-NP': 0.23118802445544603, 'I-NP': 0.2667960140164307, 'B-VP': 0.08924478415771905, 'B-ADVP': 0.017422794782447206, 'B-ADJP': 0.008557451238166917, 'I-ADJP': 0.0028052410112551743, 'B-PP': 0.0898605687699458, 'O': 0.1166667481196577, nan: 0.03745045621820279, 'B-SBAR': 0.009280753798560236, 'I-VP': 0.04964885615564689, 'I-ADVP': 0.0017740461447484813, 'B-PRT': 0.002287199988270769, 'I-PP': 0.0010898410200520974, 'B-CONJP': 0.0002394717936437344, 'I-CONJP': 0.0003127794855754898, 'B-INTJ': 0.00012706666601504274, 'I-INTJ': 3.4210256234819196e-05, 'I-SBAR': 0.00023458461418161735, 'B-UCP': 4.887179462117028e-06, 'I-UCP': 1.9548717848468114e-05, 'B-LST': 5.3758974083287313e-05, 'START': 0.03745045621820279, 'END': 0.03745045621820279}


In [18]:
k3_dict={}
def k3_coef(dict):
    for key, val in dict.items():
        k3_dict[key] = (np.log(val+1)+1)/(np.log(val+1)+2)
    return k3_dict

k3 = k3_coef(subseq_count)

k2_dict={}
def k2_coef(dict):
    for key, val in dict.items():
        k2_dict[key] = (np.log(val+1)+1)/(np.log(val+1)+2)
    return k2_dict

k2 = k2_coef(vwtransition_dic)

In [19]:

lamda1=k3

lamda2={}
va=0
for key, val in k3.items():
    
    for key2,val2 in k2.items():
        v = key2[0]
        w = key2[1]
        if key[1].startswith(v) and key[2].startswith(w):
            va+=1-val
            va = va*val2
            if key not in lamda2.keys():
                lamda2[key] = va
                va =0

lamda3={}
va=0
for key, val in k3.items():
    
    for key2,val2 in k2.items():
        v = key2[0]
        w = key2[1]
        if key[1].startswith(v) and key[2].startswith(w):
            va+=1-val
            va = va*(1-val2)
            if key not in lamda3.keys():
                lamda3[key] = va
                va =0


In [None]:
# multitransition_dic

In [20]:
from collections import Counter, defaultdict
def multistateEmissionPara(df):
    x_y_lists = df['x_y'].str.split(" ")
    x_y_tuples = x_y_lists.apply(lambda x: tuple(x)).to_numpy()
    x_y_counter = Counter(x_y_tuples)
    
    y_counter = Counter(x_y_lists.apply(lambda s: s[1]))
    
    multistate_emission_params = {}
    for x_y, x_y_count in x_y_counter.items():

        try:
            yc = x_y[2]
            yf = x_y[1]

    #         y = x_y[1]
            multistate_emission_params[x_y] = x_y_count / ycyf_count[yc,yf]
        except:
            print(x_y)
            print("yc ",yc)
            print("yf " ,yf)
            print(x_y_count)
            break
    return multistate_emission_params

In [None]:
# k =3
# replaceWord = "#UNK#"
# predata, x_counter = MultistatePreprocess(xylist, k, replaceWord)
# multiemission_dict=multistateEmissionPara(predata)

In [None]:
# multiemission_dict

In [21]:
#correct
import numpy as np

def multistateviterbi(unique_word_list):
    #This is for the starting for viterbi
    store=[]   #store = the storage for scores for all the nodes.
    scorelist=[]
    
#     print("sentence: ", unique_word_list)
    
    #This is for the start
    
    # mutlitransition(curr_label,next_label,next_next_label)
    # emission(current_label,current_word,x_dict)
    for i in range(len(nodes)):
        emission_score = emission(nodes[i],unique_word_list[0], x_counter)
        transition_score = multitransition("NONE_S","START",nodes[i])
        if transition_score == 0:
            score_at_start = np.NINF
        else:
            score_at_start = (np.log(emission_score)+np.log(transition_score))
        store.append(score_at_start)    
        
    scorelist.append(store)
    store=[]
    score_per_node=[]
    
    
    # This is for sentence with one word
    if len(unique_word_list) == 1:
        for i in range(len(nodes)):
            transition_score = multitransition("START",nodes[i],"END")
            if transition_score == 0:
                score = np.NINF
            else:
                score = scorelist[0][i]+np.log(transition_score)
            store.append(score)    
        
        scorelist.append(max(store))
        store=[]
        
    # This is for sentence with more than one word
    #This is for the node just right after start
    if len(unique_word_list)>1:
        for a in range(len(nodes)):
            for b in range(len(nodes)):
                prev_prev_node = "START"
                prev_node = node[b]
                curr_node = node[a]
                prev_node_score =scorelist[0][b]
                emission_score = emission(node[a],unique_word_list[1], x_counter)
                transition_score = multitransition(prev_prev_node,prev_node,curr_node)
                if transition_score ==0:
                    score = np.NINF
                else:
                    score =(prev_node_score+np.log(emission_score)+np.log(transition_score))
                score_per_node.append(score)

            store.append(max(score_per_node)) # found max path
            score_per_node=[]

        scorelist.append(store) # store the scores for nodes
        store=[]
    
    #This is for sentence with more than 2 words
    #This is for the middle portion for viterbi
    if len(unique_word_list)>2:
        for i in range(len(unique_word_list)-2):
            for j in range(len(node)):
                for k in range(len(node)):
                    for l in range(len(node)):
                        prev_prev_node = node[k]
                        prev_node = node[l]
                        curr_node = node[j]
                        
                        prev_node_score = scorelist[i+1][l]
                        transition_score = multitransition(prev_prev_node, prev_node, curr_node)
                        emission_score = emission(curr_node,unique_word_list[i+2],x_counter)
                        if transition_score==0:
                            score = np.NINF
                        else:
                            score = (prev_node_score+np.log(emission_score)+np.log(transition_score))
                        score_per_node.append(score)
                store.append(max(score_per_node)) # found max path
                score_per_node=[]
            scorelist.append(store) # store the scores for nodes
            store=[]
            
   
       
        #This is for the stop for viterbi
    
        score_at_stop=[]
        for m in range(len(node)):
            for n in range(len(node)):
                prev_prev_node = node[n]
                prev_node = node[m]
                curr_node = "END"
                transition_score = multitransition(prev_prev_node, prev_node, curr_node)
                if transition_score == 0:
                    score = np.NINF
                else:
                    score = (np.log(transition_score)+ (scorelist[len(unique_word_list)-1][m]))
                score_at_stop.append(score) #at stop.
        scorelist.append(max(score_at_stop))
        
    return scorelist

def emission(node,word,x_dict):
    global emission_dict
    pair = word,node
    # this is used to find if word exist in the dictionary
    if pair not in emission_dict.keys(): #if the combination cannot be found in the dictionary
                                         #Either the word exists, or word is new. 
        if word in x_dict:
            score=0   #this means that this node is not the correct node.
        else:
            replaced_text = ("#UNK#", node)
            if replaced_text in emission_dict.keys():
                score = emission_dict[replaced_text] #if label have #unk#
            else:
                score = 0   #if label does not have #unk#, then set to 0.
    else:
        score = emission_dict[pair]
    return score

def multitransition(x1,x2,x3): 
    global transition_dic
    #will use this to search the transition from x1 to x2 to x3
    pair = x1,x2,x3
    if pair not in multitransition_dic.keys():
        
        score = 0
    else:
        score = lamda1[x1,x2,x3]* multitransition_dic[x1,x2,x3] + lamda2[x1,x2,x3]*vwtransition_dic[x2,x3] + lamda3[x1,x2,x3] * wtransition_dict[x3]
    return score

In [22]:
#correct
def multistateviterbi_backtrack(scorelist):
    ####### back tracking for viterbi
    # node value*transition = array, then find max, then find position. use position for next step.
    #np.argmax returns index of max in the element.
    # The final score on the score list is for end
    scorelist = scorelist[::-1] #reverse the score list so easier to calculate.
    node_holder=[]
    path = []
    max_node_index=0
    length_of_scorelist=len(scorelist)
    length_of_nodes=len(nodes)
#     print(scorelist)

    # for sentence with only one word
    if length_of_scorelist == 1:
        for k in range(length_of_nodes):
            calculate_max_node = ((scorelist[0][k]) +np.log(multitransition("START",nodes[k],"END")))
            node_holder.append(calculate_max_node)
        path.append(nodes[np.argmax(node_holder)])
        node_holder=[]
        return(path[::-1])
    

    # for sentence with only two word
    if length_of_scorelist == 2:
        max_each_label_val = []
        max_each_label_index = []
        temp_node_holder = []
        temp_node_holder_index =[]
        
        for i in range(length_of_nodes):
            for k in range(length_of_nodes):
                calculate_max_node = ((scorelist[1][k]) +np.log(multitransition(nodes[k],nodes[i],"END")))
                temp_node_holder.append(calculate_max_node)
                temp_node_holder_index.append(k)
            max_each_label_val.append(max(temp_node_holder)) #finding the maximum value of the node from the previous n nodes
            max_each_label_index.append(temp_node_holder_index[np.argmax(temp_node_holder)]) #finding the index that give the maximum value of the node from the previous n nodes
            temp_node_holder = []
            temp_node_holder_index =[]
        max_node_index=(np.argmax(max_each_label_val)) # finding the index that give the true maximum value from the n maximum nodes
        path.append(nodes[ max_node_index]) # get the label from the maximum index
        node_holder=[]
        max_each_label_val = []
        max_each_lable_index = []
        
        # this is for the stop node viterbi
        for m in range(length_of_nodes):
            calculate_max_node = ((scorelist[length_of_scorelist-1][m]) +np.log(multitransition("START",nodes[m],nodes[max_node_index])))
            temp_node_holder.append(calculate_max_node)



        max_node_index=(np.argmax(temp_node_holder))
        path.append(nodes[np.argmax(temp_node_holder)])
        node_holder=[]
        
        return(path[::-1])
    
    #for sentence with more than 2 words
    if length_of_scorelist>2:
        max_each_label_val = []
        max_each_label_index = []
        temp_node_holder = []
        temp_node_holder_index =[]
        
        
        for i in range(length_of_nodes): # for length of sentence
        
            for j in range(length_of_nodes):
                calculate_max_node = ((scorelist[1][i]) + np.log(multitransition(nodes[j],nodes[i],"END")))
                temp_node_holder.append(calculate_max_node)
                temp_node_holder_index.append(j)
            max_each_label_index.append(temp_node_holder_index[np.argmax(temp_node_holder)]) #finding the index that give the maximum value of the node from the previous n nodes
            max_each_label_val.append(max(temp_node_holder)) #finding the maximum value of the node from the previous n nodes
            temp_node_holder = []
            temp_node_holder_index =[]
        max_node_index=(np.argmax(max_each_label_val))# finding the index that give the true maximum value from the n maximum nodes
        path.append(nodes[ max_node_index]) # get the label from the maximum index
        node_holder=[]
        max_each_label_val = []
        max_each_lable_index = []
    
    #for sentence with more than 3 words
    if length_of_scorelist>3:
        max_each_label_val = []
        max_each_lable_index = []
        temp_node_holder = []
        temp_node_holder_index =[]
        node_holder = []
        node_holder_index =[]
        for i in range(2,length_of_scorelist-1): # for length of sentence
            for j in range(length_of_nodes):
                for k in range(length_of_nodes):

                    calculate_max_node = ((scorelist[i][k]) +np.log(multitransition(nodes[j],nodes[k],nodes[max_node_index])))

                    temp_node_holder.append(calculate_max_node)
                    temp_node_holder_index.append(k)
                node_holder_index.append(temp_node_holder_index[np.argmax(temp_node_holder)])#finding the index that give the maximum value of the node from the previous n nodes
                node_holder.append(temp_node_holder[np.argmax(temp_node_holder)]) #finding the maximum value of the node from the previous n nodes
                
                temp_node_holder=[]
                temp_node_holder_index=[]
            max_each_label_val.append(max(node_holder))
            max_each_label_index.append(node_holder_index[np.argmax(node_holder)])
            max_node_index=node_holder_index[np.argmax(node_holder)]# finding the index that give the true maximum value from the n maximum nodes
            path.append(nodes[max_node_index])# get the label from the maximum index
            node_holder=[]
            node_holder_index=[]
            max_each_label_val = []
            max_each_lable_index = []
        
        for m in range(length_of_nodes):
            calculate_max_node = ((scorelist[length_of_scorelist-1][m]) +np.log(multitransition("START",nodes[m],nodes[max_node_index])))
            temp_node_holder.append(calculate_max_node)



        max_node_index=(np.argmax(temp_node_holder))
        path.append(nodes[np.argmax(temp_node_holder)])
        node_holder=[]
            
        

    return(path[::-1])

In [23]:
def preprocess_training_blank_row(data):
    start = time.process_time()   
    
    df= pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False,engine="python",skip_blank_lines=False)

    
    # new data frame with split value columns 
    df["x"], df["y"] = split_into_columns(df["original"])
    return df

In [24]:
def sentenceList(data):
    lines=[]
    line=[]
    x= data
    for label in x['x']:
        if pd.isnull(label)==False:
            line.append(label)
        else:
    #         line += ' stop'
            lines.append(line)
            line = []
    return lines


In [25]:
def finalresult(sequence_log,predata_blank):
    dataframe = []
    count=0
    for i in range(len(sequence_log)):
        for text in sequence_log[i]:
            dataframe.append(text)
            count+=1
        dataframe.append("")
    dftest=pd.DataFrame(dataframe)
    final = pd.DataFrame()
    final['result'] = predata_blank['x'] + " " +dftest[0]
    return final

In [26]:
data_folders = ["AL", "EN","CN","SG"]
for x in ["EN"]:
#     print("Performing sentiment analysis for data folder ", x)
    train_data = "./data/{}/train".format(x)
    test_data = "./data/{}/dev.in".format(x)
#     test_result = "./data/{}/dev.out".format(x)
    
# ##############################PART 3########################################################
#     transition_dic, subseq_count, y_count = transitionPara(train_data)
    multitransition_dic, subseq_count, y_count, ycyf_count = MultistateTransitionPara(train_data)
    predata_blank=preprocess_training_blank_row(train_data)
    node = list(y_count.keys())
#     print(testdf_unprocess)
    testdf_unprocess = pd.read_csv(test_data, sep='/r/n', delimiter=None, names=['x'],index_col=False,skip_blank_lines=False, engine="python")
    lines= sentenceList(testdf_unprocess)
    
    nodes = node
    log_array =[]
    sequence_log=[]

#     for i in range(len(lines)):
    print('Start training')
    for i in range(len(lines)):
#         print("lines ", lines[1])
        multiviterbioutput=multistateviterbi(lines[i])
        log_array.append(multiviterbioutput)
        sequence_log.append(multistateviterbi_backtrack(multiviterbioutput))
    print(sequence_log)
    
    result = finalresult(sequence_log,testdf_unprocess)
    print(result)
    
    print("Writing the final result to dev.p5.out...")
#     f = open('./dev.p3.out'.format(x) ,'w')
    f = open('./output/{}/dev.p5.out'.format(x) ,'w')
    for word in result['result']:
        if pd.isnull(word) == False:
            f.write(word + '\n')
        else:
            f.write("" +"\n")
    f.close()

y_count:  Counter({'I-NP': 54591, 'B-NP': 47305, 'O': 23872, 'B-PP': 18387, 'B-VP': 18261, 'I-VP': 10159, nan: 7663, 'B-ADVP': 3565, 'B-SBAR': 1899, 'B-ADJP': 1751, 'I-ADJP': 574, 'B-PRT': 468, 'I-ADVP': 363, 'I-PP': 223, 'I-CONJP': 64, 'B-CONJP': 49, 'I-SBAR': 48, 'B-INTJ': 26, 'B-LST': 11, 'I-INTJ': 7, 'I-UCP': 4, 'B-UCP': 1})
             0       1           ycyf
0        START    B-NP     START B-NP
1         B-NP    I-NP      B-NP I-NP
2         I-NP    B-VP      I-NP B-VP
3         B-VP  B-ADVP    B-VP B-ADVP
4       B-ADVP  B-ADJP  B-ADVP B-ADJP
5       B-ADJP  I-ADJP  B-ADJP I-ADJP
6       I-ADJP  I-ADJP  I-ADJP I-ADJP
7       I-ADJP    B-PP    I-ADJP B-PP
8         B-PP    B-NP      B-PP B-NP
9         B-NP    I-NP      B-NP I-NP
10        I-NP    B-PP      I-NP B-PP
11        B-PP    B-NP      B-PP B-NP
12        B-NP    I-NP      B-NP I-NP
13        I-NP       O         I-NP O
14           O       O            O O
15           O  B-ADJP       O B-ADJP
16      B-ADJP  I-ADJP 



[['B-NP', 'B-VP', 'I-VP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'O', 'B-SBAR', 'B-NP', 'O', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP'], ['B-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'B-NP', 'B-NP'], ['B-NP', 'B-VP', 'I-VP', 'I-VP', 'I-VP', 'B-NP', 'I-NP', 'O', 'B-VP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP'], ['B-NP', 'I-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP', 'B-PP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP'], ['B-NP', 'O', 'B-ADVP', 'O', 'B-VP', 'B-PP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP', 'B-NP'], ['B-NP', 'I-NP', 'O', 'B-NP', 'I-NP