# 01.112 Machine Learning Design Project

## About the Project

We have 4 datasets in the `/data` folder. For each dataset, there is: 
- a labelled training set train, 
- an unlabelled development set `dev.in`
- a labelled development set `dev.out` 

The labelled data has the format of: `token` `\t` `tag`
- one token per line
- token and tag separated by tab 
- single empty lines that separates sentences

For the labels, they are slightly different for different datasets.
- SG, CN (Entity):
    - B-*: Beginning of entity
    - I-*: Inside of entity
    - O: Outside of any entity
- EN, AL (Phrase):
    - B-VP: Beginning of Verb Phrase
    - I-VP: Inside of Verb Phrase
    - *-NP: Noun Phrase
    - *PP: Propositional Phrase
    - O: Outside of any phrase

*Goal*: Build sequence labelling systems from training data (x) and use it to predict tag sequences for new sentences (y).

## Team members 
- Andri Setiawan Susanto
- Eldon Lim 
- Tey Siew Wen

# Part 5
## Second Order HMM - viterbie

In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
def emissionPara(df):
    x_y_lists = df['x_y'].str.split(" ")
    x_y_tuples = x_y_lists.apply(lambda x: tuple(x)).to_numpy()
    x_y_counter = Counter(x_y_tuples)
    
    y_counter = Counter(x_y_lists.apply(lambda s: s[1]))
    
    emission_params = {}
    
    for x_y, x_y_count in x_y_counter.items():
        y = x_y[1]
        emission_params[x_y] = x_y_count / y_counter[y]
    return emission_params

In [3]:
def preprocess(data, k, replaceWord):
    """
    Function to modify train/test data based on the occurence of words. 
    If a word appears <= k times in the data, replace it with the replaceWord.
    Returns a new df.
    """
    df = pd.read_csv(data, sep='/r/n', delimiter=None, names=['x_y'],index_col=False, engine="python", encoding='utf-8')
    x_y_lists = df['x_y'].str.split(" ")
    x = x_y_lists.apply(lambda s: s[0])
    x_counter = Counter(x).keys()
    
    invalid_x = x.value_counts()[x.value_counts() < 3].index.to_list()
    print("There are ", len(x), "observations")
    print("Out of those observations, ", len(invalid_x), "is to be replaced.")
    
    def replace_with_string(s):
        x, y = s
        if x in invalid_x:
            return "{} {}".format(replaceWord, y)
        else:
            return "{} {}".format(x, y)
        
    new_df = pd.DataFrame(x_y_lists.apply(replace_with_string), columns=["x_y"])
    return new_df, x_counter

In [4]:
def split_into_columns(df_column):
    new = df_column.str.split(" ", n=1, expand=True)
    return new[0], new[1]

In [5]:
from collections import Counter, defaultdict

# funtion to find the transition parameter from current label -> next label -> next next label
def MultistateTransitionPara(data):
    train_data_blank=pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
    train_ycyf_data = pd.read_csv(data, sep='/n', delimiter=None, names=['x_y'],index_col=False, engine="python")
    x, y = split_into_columns(train_data_blank["original"])
#     xy_dic = dict(zip(x, y))
    
    # Get unique label count 
    y_count = Counter(y)
    print("y_count: ",y_count)
    
    # yc = current label
    # yf = next label
    # yff = next next label
    
    # Get ycyf count of count(yc,yf)
    ycyf_count =ycyf(train_ycyf_data)
    print("ycyf_count: ", ycyf_count)

    
    
    # subseq_count is a dictionary that store the value of count(yc,yf,yff)
    subseq_count = defaultdict(int)
    for i in range(len(y)-1):  
       
        y1 = y[i] # third node -> yff
        if i == 1:
            y2 = y[i-1] 
        elif i >1:
            y2 = y[i-1] # second node -> yf
            y3 = y[i-2] # first node -> yc
        
        if i == 0:
            subseq_count[("NONE_S","START", y1)] +=1
            ycyf_count["NONE_S","START"] +=1
        elif i == 1:
            subseq_count[("START", y2, y1)] +=1
            ycyf_count["START", y2] +=1
        elif pd.isna(y1) and pd.isna(y3):
            subseq_count[("NONE_S", "START", y2)] +=1
            ycyf_count["NONE_S", "START"] +=1
        elif pd.isna(y1):
            subseq_count[(y3, y2, "END")] +=1
        
        elif i !=0 and i != 1 and pd.isna(y2):
            subseq_count[("NONE_S","START",y1)] +=1
            ycyf_count["NONE_S","START"] +=1
            subseq_count[(y3,"END","NONE_E")] +=1
            ycyf_count[y3,"END"] +=1
        elif i !=0 and pd.isna(y2):
            subseq_count[("NONE_S","START",y1)] +=1
            ycyf_count["NONE_S","START"] +=1
        elif i == len(y)-2:
            subseq_count[(y1, "END", "NONE_E")] +=1
            ycyf_count[y1, "END"] +=1
        elif pd.isna(y1) and pd.isna(y3):
            subseq_count[("START", y2 ,"END")] +=1
            ycyf_count["START", y2] +=1
        elif pd.isna(y3):
            subseq_count["START",y2,y1] += 1
            ycyf_count["START", y2] +=1
        else:
            subseq_count[y3,y2,y1] += 1
    
    # Calculation of transition params
    result = np.empty(len(y)+2)
    multistate_transition_params = {}
    
    for k,v in subseq_count.items():
        y3 = k[0] # first node
        y2 = k[1] # second node
        y1 = k[2] # third node
        multistate_transition_params[y3,y2,y1] = subseq_count[y3,y2,y1] / ycyf_count[y3,y2]
       
    return multistate_transition_params, subseq_count, y_count,ycyf_count

# transition_dic, subseq_count, y_count = transitionPara(test)

In [6]:
def MultistatePreprocess(df, k, replaceWord):
    """
    Function to modify train/test data based on the occurence of words. 
    If a word appears <= k times in the data, replace it with the replaceWord.
    Returns a new df.
    """
    x_y_lists = df['x_y'].str.split(" ")
    x = x_y_lists.apply(lambda s: s[0])
    x_counter = Counter(x).keys()
    
    invalid_x = x.value_counts()[x.value_counts() < 3].index.to_list()
    print("There are ", len(x), "observations")
    print("Out of those observations, ", len(invalid_x), "is to be replaced.")
    
    def replace_with_string(s):
        x, yc, yp = s
        if x in invalid_x:
            return "{} {} {}".format(replaceWord, yc, yp)
        else:
            return "{} {} {}".format(x, yc, yp)
        
    new_df = pd.DataFrame(x_y_lists.apply(replace_with_string), columns=["x_y"])
    return new_df, x_counter

In [7]:
# import pandas as pd
# train_data ="./data/EN/train"
# train_data = pd.read_csv(train_data, sep='/n', delimiter=None, names=['x_y'],index_col=False, engine="python")
# x_y_lists = train_data
# y_df=split_into_columns(x_y_lists['x_y'])[1]

def xyy(data):
    xydf = pd.read_csv(data, sep='/n', delimiter=None, names=['x_y'],index_col=False, engine="python")
    ydf=split_into_columns(xydf['x_y'])[1]
    for i in range(len(xydf)):
        if i == 0:
            xydf['x_y'][i] = xydf['x_y'][i] + " START"
        if i != 0:
            previousLabel = ydf[i-1]
#             print("previousLabel: ",previousLabel)
            xydf['x_y'][i] = xydf['x_y'][i] + " " + previousLabel 

    return xydf

# xylist = xyy(x_y_lists,y_df)

In [8]:
from collections import Counter, defaultdict

def ycyf (data):
    x_y_lists1 = data
    y_df=split_into_columns(x_y_lists1['x_y'])
    y_mod_df = pd.Series(["START"])
    y_df1=pd.Series(y_df[1])
    y2 = pd.concat([y_mod_df,y_df1[:len(y_df1)-1]],axis=0,ignore_index=True)
    xyy = pd.concat([y2,y_df[1]], axis=1)
    xyy['ycyf'] = xyy[0].str.cat(xyy[1],sep=" ")
    print(xyy)
    ycyf_lists = xyy['ycyf'].str.split(" ")
    ycyf_tuples = ycyf_lists.apply(lambda x: tuple(x)).to_numpy()
    ycyf_counter = Counter(ycyf_tuples)
    return ycyf_counter
    

##############################################################################
# Uncomment the below code if want to run the updating emission and transition parameter during training phase before 
# predicting the final label
    
# # ycyp_counter=ycyp(train_data)

# from collections import Counter, defaultdict

# def ycyf (data):
#     x_y_lists1 = data
#     y_df=split_into_columns(x_y_lists1['x_y'])
#     y_mod_df = pd.Series(["START"])
#     y_df1=pd.Series(y_df[1])
#     y2 = pd.concat([y_mod_df,y_df1[:len(y_df1)-1]],axis=0,ignore_index=True)
#     xyy = pd.concat([y2,y_df[1]], axis=1)
#     xyy['ycyf'] = xyy[0].str.cat(xyy[1],sep=" ")
#     print(xyy)
#     ycyf_lists = xyy['ycyf'].str.split(" ")
#     ycyf_tuples = ycyf_lists.apply(lambda x: tuple(x) if type(x)!=float else None ).to_numpy()
#     ycyf_counter = Counter(ycyf_tuples)
#     return ycyf_counter
    

# ycyp_counter=ycyp(train_data)

# Finding P(w|u,v)

In [9]:
from collections import Counter, defaultdict

#v,w
def vwtransitionPara(data):
    train_data_blank=pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
    x, y = split_into_columns(train_data_blank["original"])
    xy_dic = dict(zip(x, y))
    
    # Get count(y)
    y_count = Counter(y)
    
    #subseq_count is a dictionary that store the value of count(current label, next label)
    subseq_count = defaultdict(int)
    for i in range(len(y)-1):    
        y1 = y[i]
        y2 = y[i+1]
        
        if i == 0:
            subseq_count[("START", y1)] +=1
            y_count["START"] +=1
        if pd.isna(y1):
            subseq_count[("START", y2)] +=1
            y_count["START"] +=1
        elif i == len(y)-1 or pd.isna(y2):
            subseq_count[(y1, "END")] +=1
            y_count["END"] +=1
        else:
            subseq_count[y1,y2] += 1
    
    # Calculation of transition params
    result = np.empty(len(y)+2)
    
    # yytransition_dict stores the transition parameter from current label to next label
    yytransition_dict = {}
    
    for k,v in subseq_count.items():
        y1 = k[0]
        y2 = k[1]
        yytransition_dict[y1,y2] = subseq_count[y1,y2] / y_count[y1]
     
    return yytransition_dict, subseq_count, y_count

# vwtransition_dic, vwsubseq_count, y_count = vwtransitionPara(train_data)
# print(vwtransition_dic)

In [10]:
from collections import Counter, defaultdict
# train_data = './data/EN/train'
def wtransitionPara(data):
    train_data_blank=pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
    x, y = split_into_columns(train_data_blank["original"])
#     xy_dic = dict(zip(x, y))
    y1transition_dict = {}
    
    # Get count(y)
    y_count = Counter(y)

    for i in range(len(y)-1):    
        y1 = y[i] #for input in current dataframe index
        y2 = y[i+1] #for input in next dataframe index
        
        if i == 0:
            y_count["START"] +=1
        if pd.isna(y1):
            y_count["START"] +=1
        elif i == len(y)-1 or pd.isna(y2):
            y_count["END"] +=1
    
    # Calculation of transition params
    result = np.empty(len(y)+2)
    yytransition_dict = {}
    
    total_count = sum(y_count.values())
    
    for e in y_count.keys():
        y1transition_dict[e] = y_count[e]/total_count


    return y1transition_dict

# wtransition_dict = wtransitionPara(train_data)
# print(wtransition_dict)

In [11]:

def k3_coef(dict):
    k3_dict={}
    for key, val in dict.items():
        k3_dict[key] = (np.log(val+1)+1)/(np.log(val+1)+2)
    return k3_dict

# k3 = k3_coef(subseq_count)


def k2_coef(dict):
    k2_dict={}
    for key, val in dict.items():
        k2_dict[key] = (np.log(val+1)+1)/(np.log(val+1)+2)
    return k2_dict



In [12]:
# lamda1=k3

def lamda2_coef(k3,k2):
    lamda2={}
    va=0
    for key, val in k3.items():

        for key2,val2 in k2.items():
            v = key2[0]
            w = key2[1]
            if key[1].startswith(v) and key[2].startswith(w):
                va+=1-val
                va = va*val2
                if key not in lamda2.keys():
                    lamda2[key] = va
                    va =0
    return lamda2

def lamda3_coef(k3,k2):
    lamda3={}
    va=0
    for key, val in k3.items():

        for key2,val2 in k2.items():
            v = key2[0]
            w = key2[1]
            if key[1].startswith(v) and key[2].startswith(w):
                va+=1-val
                va = va*(1-val2)
                if key not in lamda3.keys():
                    lamda3[key] = va
                    va =0
    return lamda3
# print(lamda2)

# print(lamda1)


In [13]:
# multitransition_dic

In [14]:
from collections import Counter, defaultdict
def multistateEmissionPara(df):
    x_y_lists = df['x_y'].str.split(" ")
    x_y_tuples = x_y_lists.apply(lambda x: tuple(x)).to_numpy()
    x_y_counter = Counter(x_y_tuples)
    
    y_counter = Counter(x_y_lists.apply(lambda s: s[1]))
    
    multistate_emission_params = {}
    for x_y, x_y_count in x_y_counter.items():
        yc = x_y[2]
        yf = x_y[1]
        multistate_emission_params[x_y] = x_y_count / ycyf_count[yc,yf]

    return multistate_emission_params

In [15]:
# k =3
# replaceWord = "#UNK#"
# predata, x_counter = MultistatePreprocess(xylist, k, replaceWord)
# multiemission_dict=multistateEmissionPara(predata)

In [16]:
# multiemission_dict

In [17]:
#correct
import numpy as np

def multistateviterbi(unique_word_list):
    #This is for the starting for viterbi
    store=[]   #store = the storage for scores for all the nodes.
    scorelist=[]
    
#     print("sentence: ", unique_word_list)
    
    #This is for the start
    # mutlitransition(curr_label,next_label,next_next_label)
    # emission(current_label,current_word,x_dict)
    for i in range(len(nodes)):
        emission_score = emission(nodes[i],unique_word_list[0], x_counter)
        transition_score = multitransition("NONE_S","START",nodes[i])
        if transition_score == 0:
            score_at_start = np.NINF
        else:
            score_at_start = (np.log(emission_score)+np.log(transition_score))
        store.append(score_at_start)    
        
    scorelist.append(store)
    store=[]
    score_per_node=[]
    
    # This is for sentence with one word
    if len(unique_word_list) == 1:
        for i in range(len(nodes)):
            transition_score = multitransition("START",nodes[i],"END")
            if transition_score == 0:
                score = np.NINF
            else:
                score = scorelist[0][i]+np.log(transition_score)
            store.append(score)    
        
        scorelist.append(max(store))
        store=[]
    
    # This is for sentence with more than one word
    # This is for the node just right after start
    if len(unique_word_list)>1:
        for a in range(len(nodes)):
            for b in range(len(nodes)):
                prev_prev_node = "START"
                prev_node = node[b]
                curr_node = node[a]
                prev_node_score =scorelist[0][b]
                emission_score = emission(node[a],unique_word_list[1], x_counter)
                transition_score = multitransition(prev_prev_node,prev_node,curr_node)
                if transition_score ==0:
                    score = np.NINF
                else:
                    score =(prev_node_score+np.log(emission_score)+np.log(transition_score))
                score_per_node.append(score)

            store.append(max(score_per_node)) # found max path
            score_per_node=[]

        scorelist.append(store) # store the scores for nodes
        store=[]
        
    #This is for sentence with more than 2 words
    #This is for the middle portion for viterbi
    if len(unique_word_list)>2:
        for i in range(len(unique_word_list)-2):
            for j in range(len(node)):
                for k in range(len(node)):
                    for l in range(len(node)):
                        prev_prev_node = node[k]
                        prev_node = node[l]
                        curr_node = node[j]
                        
                        prev_node_score = scorelist[i+1][l]
                        transition_score = multitransition(prev_prev_node, prev_node, curr_node)
                        emission_score = emission(curr_node,unique_word_list[i+2],x_counter)
                        if transition_score==0:
                            score = np.NINF
                        else:
                            score = (prev_node_score+np.log(emission_score)+np.log(transition_score))
                        score_per_node.append(score)
                store.append(max(score_per_node)) # found max path
                score_per_node=[]
            scorelist.append(store) # store the scores for nodes
            store=[]
            
   

        #This is for the stop for viterbi
    
        score_at_stop=[]
        for m in range(len(node)):
            for n in range(len(node)):
                prev_prev_node = node[n]
                prev_node = node[m]
                curr_node = "END"
                transition_score = multitransition(prev_prev_node, prev_node, curr_node)
                if transition_score == 0:
                    score = np.NINF
                else:
                    score = (np.log(transition_score)+ (scorelist[len(unique_word_list)-1][m]))
                score_at_stop.append(score) #at stop.
        scorelist.append(max(score_at_stop))
        
    return scorelist

def emission(node,word,x_dict):
    global emission_dict
    pair = word,node
    # this is used to find if word exist in the dictionary
    if pair not in emission_dict.keys(): #if the combination cannot be found in the dictionary
                                         #Either the word exists, or word is new. 
        if word in x_dict:
            score=0   #this means that this node is not the correct node.
        else:
            replaced_text = ("#UNK#", node)
            if replaced_text in emission_dict.keys():
                score = emission_dict[replaced_text] #if label have #unk#
            else:
                score = 0   #if label does not have #unk#, then set to 0.
    else:
        score = emission_dict[pair]
    return score

def multitransition(x1,x2,x3): 
    global transition_dic
    #will use this to search the transition from x1 to x2 to x3
    pair = x1,x2,x3
#     print(pair)
    if pair not in multitransition_dic.keys():
        
        score = 0
    else:
        score = lamda1[x1,x2,x3]* multitransition_dic[x1,x2,x3] + lamda2[x1,x2,x3]*vwtransition_dic[x2,x3] + lamda3[x1,x2,x3] * wtransition_dict[x3]
    return score

In [18]:
#correct
def multistateviterbi_backtrack(scorelist):
    ####### back tracking for viterbi
    # node value*transition = array, then find max, then find position. use position for next step.
    #np.argmax returns index of max in the element.
    # The final score on the score list is for end
    scorelist = scorelist[::-1] #reverse the score list so easier to calculate.
    node_holder=[]
    path = []
    max_node_index=0
    length_of_scorelist=len(scorelist)
    length_of_nodes=len(nodes)
#     print(scorelist)
    
    # for sentence with only one word
    if length_of_scorelist == 1:
        for k in range(length_of_nodes):
            calculate_max_node = ((scorelist[0][k]) +np.log(multitransition("START",nodes[k],"END")))
            node_holder.append(calculate_max_node)
        path.append(nodes[np.argmax(node_holder)])
        node_holder=[]
        return(path[::-1])
    
    # for sentence with only two word
    if length_of_scorelist == 2:
        max_each_label_val = []
        max_each_label_index = []
        temp_node_holder = []
        temp_node_holder_index =[]
        
        for i in range(length_of_nodes):
            for k in range(length_of_nodes):
                calculate_max_node = ((scorelist[1][k]) +np.log(multitransition(nodes[k],nodes[i],"END")))
                temp_node_holder.append(calculate_max_node)
                temp_node_holder_index.append(k)
            max_each_label_val.append(max(temp_node_holder)) #finding the maximum value of the node from the previous n nodes
            max_each_label_index.append(temp_node_holder_index[np.argmax(temp_node_holder)]) #finding the index that give the maximum value of the node from the previous n nodes
            temp_node_holder = []
            temp_node_holder_index =[]
        max_node_index=(np.argmax(max_each_label_val)) # finding the index that give the true maximum value from the n maximum nodes
        path.append(nodes[ max_node_index]) # get the label from the maximum index
        node_holder=[]
        max_each_label_val = []
        max_each_lable_index = []

        # this is for the stop node viterbi
        for m in range(length_of_nodes):
            calculate_max_node = ((scorelist[length_of_scorelist-1][m]) +np.log(multitransition("START",nodes[m],nodes[max_node_index])))
            temp_node_holder.append(calculate_max_node)



        max_node_index=(np.argmax(temp_node_holder))
        path.append(nodes[np.argmax(temp_node_holder)])
        node_holder=[]
        
        return(path[::-1])
    
    #for sentence with more than 2 words
    if length_of_scorelist>2:
        max_each_label_val = []
        max_each_label_index = []
        temp_node_holder = []
        temp_node_holder_index =[]
        
        
        for i in range(length_of_nodes): # for length of sentence
        
            for j in range(length_of_nodes):
                calculate_max_node = ((scorelist[1][i]) + np.log(multitransition(nodes[j],nodes[i],"END")))
                temp_node_holder.append(calculate_max_node)
                temp_node_holder_index.append(j)
            max_each_label_index.append(temp_node_holder_index[np.argmax(temp_node_holder)]) #finding the index that give the maximum value of the node from the previous n nodes
            max_each_label_val.append(max(temp_node_holder)) #finding the maximum value of the node from the previous n nodes
            temp_node_holder = []
            temp_node_holder_index =[]
        max_node_index=(np.argmax(max_each_label_val)) # finding the index that give the true maximum value from the n maximum nodes
        path.append(nodes[ max_node_index]) # get the label from the maximum index
        node_holder=[]
        max_each_label_val = []
        max_each_lable_index = []
                
    #for sentence with more than 3 words
    if length_of_scorelist>3:
        max_each_label_val = []
        max_each_lable_index = []
        temp_node_holder = []
        temp_node_holder_index =[]
        node_holder = []
        node_holder_index =[]
        for i in range(2,length_of_scorelist-1): # for length of sentence
            for j in range(length_of_nodes):
                for k in range(length_of_nodes):

                    calculate_max_node = ((scorelist[i][k]) +np.log(multitransition(nodes[j],nodes[k],nodes[max_node_index])))

                    temp_node_holder.append(calculate_max_node)
                    temp_node_holder_index.append(k)
                node_holder_index.append(temp_node_holder_index[np.argmax(temp_node_holder)]) #finding the index that give the maximum value of the node from the previous n nodes
                node_holder.append(temp_node_holder[np.argmax(temp_node_holder)]) #finding the maximum value of the node from the previous n nodes
                
                temp_node_holder=[]
                temp_node_holder_index=[]
            max_each_label_val.append(max(node_holder))
            max_each_label_index.append(node_holder_index[np.argmax(node_holder)])
            max_node_index=node_holder_index[np.argmax(node_holder)] # finding the index that give the true maximum value from the n maximum nodes
            path.append(nodes[max_node_index]) # get the label from the maximum index
            node_holder=[]
            node_holder_index=[]
            max_each_label_val = []
            max_each_lable_index = []
        
        for m in range(length_of_nodes):
            calculate_max_node = ((scorelist[length_of_scorelist-1][m]) +np.log(multitransition("START",nodes[m],nodes[max_node_index])))
            temp_node_holder.append(calculate_max_node)



        max_node_index=(np.argmax(temp_node_holder))
        path.append(nodes[np.argmax(temp_node_holder)])
        node_holder=[]
            

    return(path[::-1])

In [19]:
def preprocess_training_blank_row(data):
    start = time.process_time()   
    
    df= pd.read_csv(data, sep='/n', delimiter=None, names=['original'],index_col=False,engine="python",skip_blank_lines=False)
    
    # new data frame with split value columns 
    df["x"], df["y"] = split_into_columns(df["original"])
    return df

In [20]:
def sentenceList(data):
    lines=[]
    line=[]
    x= data
    for label in x['x']:
        if pd.isnull(label)==False:
            line.append(label)
        else:
    #         line += ' stop'
            lines.append(line)
            line = []
    return lines


In [21]:
def finalresult(sequence_log,predata_blank):
    dataframe = []
    count=0
    for i in range(len(sequence_log)):
        for text in sequence_log[i]:
            dataframe.append(text)
            count+=1
        dataframe.append("")
    dftest=pd.DataFrame(dataframe)
    final = pd.DataFrame()
    final['result'] = predata_blank['x'] + " " +dftest[0]
    return final

## Normal Second order HMM 

In [25]:
%%time
data_folders = ["AL", "EN","CN","SG"]
for x in ["EN","AL"]:
#     print("Performing sentiment analysis for data folder ", x)
    train_data = "./data/{}/train".format(x)
    test_data = "./data/{}/test.in".format(x)
#     test_result = "./data/{}/dev.out".format(x)
    k =3
    replaceWord = "#UNK#"
# ##############################PART 3########################################################
#     transition_dic, subseq_count, y_count = transitionPara(train_data)

#     print('----------------- xylisT ---------------')
#     xylist = xyy(train_data)
#     print('----------------- MultistatePreprocess ---------------')
#     predata, x_counter = MultistatePreprocess(xylist, k, replaceWord)
    print('----------------- EmissionPara ---------------')
    predata, x_counter = preprocess(train_data, k, replaceWord)
    emission_dict = emissionPara(predata)
    print('----------------- vwtransitionPara ---------------')
    vwtransition_dic, vwsubseq_count, y_count = vwtransitionPara(train_data)
    print('----------------- wtransitionPara ---------------')
    wtransition_dict = wtransitionPara(train_data)
    print('----------------- MultistateTransitionPara ---------------')
    multitransition_dic, subseq_count, y_count, ycyf_count = MultistateTransitionPara(train_data)
    print('----------------- multistateEmissionPara ---------------')
#     multiemission_dict=multistateEmissionPara(predata)
    print('----------------- k3 ---------------')
    k3 = k3_coef(subseq_count)
    print('----------------- k2 ---------------')
    k2 = k2_coef(vwsubseq_count)
    print('----------------- lamda1 ---------------')
    lamda1 = k3
    print('----------------- lamda2 ---------------')
    lamda2 =lamda2_coef(k3,k2)
    print('----------------- lamda3 ---------------')
    lamda3 =lamda3_coef(k3,k2)
    print('----------------- predata_blank ---------------')
    predata_blank=preprocess_training_blank_row(train_data)
    node = list(y_count.keys())
#     print(testdf_unprocess)
    
    testdf_unprocess = pd.read_csv(test_data, sep='/r/n', delimiter=None, names=['x'],index_col=False,skip_blank_lines=False, engine="python", encoding='utf-8')
    lines= sentenceList(testdf_unprocess)
    
    nodes = node
    log_array =[]
    sequence_log=[]

#     for i in range(len(lines)):
    print('Start training')
    for i in range(len(lines)):
#         print("lines ", lines[1])
        multiviterbioutput=multistateviterbi(lines[i])
        log_array.append(multiviterbioutput)
        sequence_log.append(multistateviterbi_backtrack(multiviterbioutput))
    print(sequence_log)
    
    result = finalresult(sequence_log,testdf_unprocess)
    print(result)
    
    print("Writing the final result to dev.p5.out...")
#     f = open('./dev.p3.out'.format(x) ,'w')
    f = open('./output/{}/test.p5.out'.format(x) ,'w', encoding='utf-8')
    for word in result['result']:
        if pd.isnull(word) == False:
            f.write(word + '\n')
        else:
            f.write("" +"\n")
    f.close()

----------------- EmissionPara ---------------
There are  174949 observations
Out of those observations,  2624 is to be replaced.
----------------- vwtransitionPara ---------------
----------------- wtransitionPara ---------------
----------------- MultistateTransitionPara ---------------
y_count:  Counter({'I-POI': 28472, 'I-ROAD': 15178, 'I-DISTRICT': 13364, 'I-TOWN': 10748, nan: 10448, 'I-CITY': 10430, 'I-PROV': 8666, 'B-POI': 7348, 'B-DISTRICT': 6856, 'B-ROAD': 6324, 'B-CITY': 5697, 'B-ROADNO': 5061, 'I-ROADNO': 4666, 'B-TOWN': 4612, 'I-SUBPOI': 4557, 'B-PROV': 4488, 'B-REDUNDANT': 4137, 'I-REDUNDANT': 3668, 'B-HOUSENO': 3455, 'I-COMMUNITY': 3148, 'B-ROOMNO': 3107, 'I-HOUSENO': 3045, 'I-PERSON': 2232, 'I-DEVZONE': 1834, 'B-SUBPOI': 1650, 'I-CELLNO': 1488, 'B-COMMUNITY': 1479, 'B-CELLNO': 1326, 'I-FLOORNO': 1237, 'B-FLOORNO': 1229, 'I-ROOMNO': 1204, 'B-ASSIST': 801, 'I-ASSIST': 741, 'I-SUBROAD': 714, 'B-PERSON': 713, 'B-DEVZONE': 401, 'B-SUBROAD': 387, 'B-SUBROADNO': 202, 'I-SUBROAD

----------------- multistateEmissionPara ---------------
----------------- k3 ---------------
----------------- k2 ---------------
----------------- lamda1 ---------------
----------------- lamda2 ---------------
----------------- lamda3 ---------------
----------------- predata_blank ---------------
Start training




[['B-PROV', 'I-PROV', 'I-PROV', 'B-CITY', 'I-CITY', 'I-CITY', 'B-DISTRICT', 'I-DISTRICT', 'I-DISTRICT', 'B-ROAD', 'I-ROAD', 'I-ROAD', 'B-ROADNO', 'I-ROADNO'], ['B-PROV', 'I-PROV', 'I-PROV', 'B-CITY', 'I-CITY', 'I-CITY', 'B-DISTRICT', 'I-DISTRICT', 'I-DISTRICT', 'B-TOWN', 'I-TOWN', 'B-COMMUNITY', 'I-COMMUNITY'], ['B-PROV', 'I-PROV', 'I-PROV', 'B-REDUNDANT', 'B-CITY', 'I-CITY', 'B-DISTRICT', 'I-DISTRICT', 'I-DISTRICT', 'B-TOWN', 'I-TOWN', 'I-TOWN', 'B-COMMUNITY', 'I-COMMUNITY', 'I-COMMUNITY', 'B-POI', 'I-POI', 'I-POI'], ['B-DISTRICT', 'I-DISTRICT', 'I-DISTRICT', 'B-ROAD', 'I-ROAD', 'I-ROAD', 'B-ROADNO', 'I-ROADNO', 'B-ASSIST', 'I-ASSIST', 'B-REDUNDANT', 'B-SUBPOI', 'I-SUBPOI', 'I-SUBPOI', 'I-SUBPOI', 'I-SUBPOI', 'I-SUBPOI', 'I-SUBPOI'], ['B-ROAD', 'I-ROAD', 'I-ROAD', 'I-ROAD', 'B-ROADNO', 'I-ROADNO', 'B-POI', 'I-POI', 'I-POI', 'I-POI', 'I-POI', 'I-POI', 'I-POI', 'I-POI', 'I-POI', 'I-POI', 'I-POI'], ['B-PROV', 'I-PROV', 'I-PROV', 'B-CITY', 'I-CITY', 'I-CITY', 'B-CITY', 'I-CITY', 'I-CITY',

Wall time: 1h 41min 54s


## Run this code to update the transition and emission parameter during training phase before predicting the label for test phase

In [28]:
def weight(pred_data,train_data, predata):
    traindf= pd.read_csv(train_data, sep='/r/n', delimiter=None, names=['original'],index_col=False,engine="python")
    preddf= pd.read_csv(pred_data, sep='/r/n', delimiter=None, names=['original'],index_col=False,engine="python")
    predata_copy = predata.copy()
#     preddf = pred_data
    preddf.rename(columns={'result':'original'},inplace=True)
    x_train, y_train = split_into_columns(traindf["original"])
#     xy_dic = dict(zip(x, y))
 
    
    ##############################################################################
    train_data_blank=pd.read_csv(train_data, sep='/r/n', delimiter=None, names=['original'],index_col=False, engine="python", skip_blank_lines=False)
    train_ycyf_data = pd.read_csv(train_data, sep='/r/n', delimiter=None, names=['x_y'],index_col=False,engine="python")
    x, y = split_into_columns(train_data_blank["original"])
#     xy_dic = dict(zip(x, y))
    
    # Get unique label count 
    y_count = Counter(y)
#     print("y_count: ",y_count)
    
    # yc = current label
    # yf = next label
    # yff = next next label
    
    # Get ycyf count of count(yc,yf)
    train_ycyf_count =ycyf(train_ycyf_data)
#     print("ycyf_count: ", ycyf_count)

    
    
    # subseq_count is a dictionary that store the value of count(yc,yf,yff)
    train_subseq_count = defaultdict(int)
    for i in range(len(y)-1):  
       
        y1 = y[i] # third node -> yff
        if i == 1:
            y2 = y[i-1] 
        elif i >1:
            y2 = y[i-1] # second node -> yf
            y3 = y[i-2] # first node -> yc
        
        if i == 0:
            train_subseq_count[("NONE_S","START", y1)] +=1
            train_ycyf_count["NONE_S","START"] +=1
        elif i == 1:
            train_subseq_count[("START", y2, y1)] +=1
            train_ycyf_count["START", y2] +=1
        elif pd.isna(y1) and pd.isna(y3):
            train_subseq_count[("NONE_S", "START", y2)] +=1
            train_ycyf_count["NONE_S", "START"] +=1
        elif pd.isna(y1):
            train_subseq_count[(y3, y2, "END")] +=1
        
        elif i !=0 and i != 1 and pd.isna(y2):
            train_subseq_count[("NONE_S","START",y1)] +=1
            train_ycyf_count["NONE_S","START"] +=1
            train_subseq_count[(y3,"END","NONE_E")] +=1
            train_ycyf_count[y3,"END"] +=1
        elif i !=0 and pd.isna(y2):
            train_subseq_count[("NONE_S","START",y1)] +=1
            train_ycyf_count["NONE_S","START"] +=1
        elif i == len(y)-2:
            train_subseq_count[(y1, "END", "NONE_E")] +=1
            train_ycyf_count[y1, "END"] +=1
        elif pd.isna(y1) and pd.isna(y3):
            train_subseq_count[("START", y2 ,"END")] +=1
            train_ycyf_count["START", y2] +=1
        elif pd.isna(y3):
            train_subseq_count["START",y2,y1] += 1
            train_ycyf_count["START", y2] +=1
        else:
            train_subseq_count[y3,y2,y1] += 1
    
    # Calculation of transition params
    
    Ttransition_weight = {}
    
    for k,v in train_subseq_count.items():
        y3 = k[0] # first node
        y2 = k[1] # second node
        y1 = k[2] # third node
#         Ttransition_weight[y3,y2,y1] = np.log(train_subseq_count[y3,y2,y1] / train_ycyf_count[y3,y2]) * train_subseq_count[y3,y2,y1]
        Ttransition_weight[y3,y2,y1] = (train_subseq_count[y3,y2,y1] / train_ycyf_count[y3,y2])
    
    
    #################################################################################
#     # Get unique label count 
    y_train_count = Counter(y_train)
        
     
    train_x_y_lists = predata_copy['x_y'].str.split(" ")
    print(train_x_y_lists)
    train_x_y_tuples = train_x_y_lists.apply(lambda x: tuple(x)).to_numpy()
    train_x_y_counter = Counter(train_x_y_tuples)
    
    
    Temission_weight = {}
    
    for Tx_y, Tx_y_count in train_x_y_counter.items():
        y = Tx_y[1]
        Temission_weight[Tx_y] = (Tx_y_count / y_train_count[y]) 
     
 ############################################################################  
    
    preddf_noblank =preddf.copy()
    
    
    x, y = split_into_columns(preddf_noblank["original"])

    
    print('preddf_noblank', preddf_noblank)
    preddf_noblank.rename(columns={'original':'x_y'},inplace=True)
    print('after preddf_noblank', preddf_noblank)
    # Get ycyf count of count(yc,yf)
    pred_ycyf_count =ycyf(preddf_noblank)
    print("ycyf_count: ", pred_ycyf_count)

    
    
    # subseq_count is a dictionary that store the value of count(yc,yf,yff)
    pred_subseq_count = defaultdict(int)
    for i in range(len(y)-1):  
       
        y1 = y[i] # third node -> yff
        if i == 1:
            y2 = y[i-1] 
        elif i >1:
            y2 = y[i-1] # second node -> yf
            y3 = y[i-2] # first node -> yc
        
        if i == 0:
            pred_subseq_count[("NONE_S","START", y1)] +=1
            pred_ycyf_count["NONE_S","START"] +=1
        elif i == 1:
            pred_subseq_count[("START", y2, y1)] +=1
            pred_ycyf_count["START", y2] +=1
        elif pd.isna(y1) and pd.isna(y3):
            pred_subseq_count[("NONE_S", "START", y2)] +=1
            pred_ycyf_count["NONE_S", "START"] +=1
        elif pd.isna(y1):
            pred_subseq_count[(y3, y2, "END")] +=1
        
        elif i !=0 and i != 1 and pd.isna(y2):
            pred_subseq_count[("NONE_S","START",y1)] +=1
            pred_ycyf_count["NONE_S","START"] +=1
            pred_subseq_count[(y3,"END","NONE_E")] +=1
            pred_ycyf_count[y3,"END"] +=1
        elif i !=0 and pd.isna(y2):
            pred_subseq_count[("NONE_S","START",y1)] +=1
            pred_ycyf_count["NONE_S","START"] +=1
        elif i == len(y)-2:
            pred_subseq_count[(y1, "END", "NONE_E")] +=1
            pred_ycyf_count[y1, "END"] +=1
        elif pd.isna(y1) and pd.isna(y3):
            pred_subseq_count[("START", y2 ,"END")] +=1
            pred_ycyf_count["START", y2] +=1
        elif pd.isna(y3):
            pred_subseq_count["START",y2,y1] += 1
            pred_ycyf_count["START", y2] +=1
        else:
            pred_subseq_count[y3,y2,y1] += 1
    
    # Calculation of transition params
    
    Ptransition_weight = {}
    
    for k,v in pred_subseq_count.items():
        y3 = k[0] # first node
        y2 = k[1] # second node
        y1 = k[2] # third node
        Ptransition_weight[y3,y2,y1] = (pred_subseq_count[y3,y2,y1] / pred_ycyf_count[y3,y2]) 
    
   
    
    
    ################################################################################
        
     
    pred_x_y_lists = predata_copy['x_y'].str.split(" ")
    pred_x_y_tuples = pred_x_y_lists.apply(lambda x: tuple(x)).to_numpy()
    pred_x_y_counter = Counter(pred_x_y_tuples)
    
    
    Pemission_weight = {}
    
    for Px_y, Px_y_count in pred_x_y_counter.items():
        y = Px_y[1]
        Pemission_weight[Px_y] = (Px_y_count / y_train_count[y]) 
    
    rate = 0.2
    for key1, val1 in Temission_weight.items():

        for key2,val2 in Pemission_weight.items():
            
            if key2==key1:
#                 if val1 != val2:
#                     Pemission_weight[key1] = val1 - val2*rate
                Pemission_weight[key1] = val1 + val1 *(pred_subseq_count[key2]-train_subseq_count[key2])
                    
    for key1, val1 in Ttransition_weight.items():

        for key2,val2 in Ptransition_weight.items():
            
            if key2==key1:
#                 if val1 != val2:
#                     Ttransition_weight[key1] = val1 - val2*rate
#                     print(key1,val1,val2)
                Ttransition_weight[key1] = val1 +val1 *(pred_x_y_counter[key2]-train_x_y_counter[key2])
          
    
    return Ttransition_weight, Pemission_weight

In [29]:
%%time
data_folders = ["AL", "EN","CN","SG"]
for x in ["EN","AL""]:
    
#     print("Performing sentiment analysis for data folder ", x)
    train_data = "./data/{}/train".format(x)
    test_data = "./data/{}/dev.in".format(x)
#     test_result = "./data/{}/dev.out".format(x)
    k =3
    replaceWord = "#UNK#"
# ##############################PART 3########################################################
#     transition_dic, subseq_count, y_count = transitionPara(train_data)

#     print('----------------- xylisT ---------------')
#     xylist = xyy(train_data)
#     print('----------------- MultistatePreprocess ---------------')
#     predata, x_counter = MultistatePreprocess(xylist, k, replaceWord)
    print('----------------- EmissionPara ---------------')
    predata, x_counter = preprocess(train_data, k, replaceWord)
    emission_dict = emissionPara(predata)
    print('----------------- vwtransitionPara ---------------')
    vwtransition_dic, vwsubseq_count, y_count = vwtransitionPara(train_data)
    print('----------------- wtransitionPara ---------------')
    wtransition_dict = wtransitionPara(train_data)
    print('----------------- MultistateTransitionPara ---------------')
    multitransition_dic, subseq_count, y_count, ycyf_count = MultistateTransitionPara(train_data)
    print('----------------- multistateEmissionPara ---------------')
#     multiemission_dict=multistateEmissionPara(predata)
    print('----------------- k3 ---------------')
    k3 = k3_coef(subseq_count)
    print('----------------- k2 ---------------')
    k2 = k2_coef(vwsubseq_count)
    print('----------------- lamda1 ---------------')
    lamda1 = k3
    print('----------------- lamda2 ---------------')
    lamda2 =lamda2_coef(k3,k2)
    print('----------------- lamda3 ---------------')
    lamda3 =lamda3_coef(k3,k2)
    print('----------------- predata_blank ---------------')
    predata_blank=preprocess_training_blank_row(train_data)
    node = list(y_count.keys())
#     print(testdf_unprocess)


    traindf_unprocess = pd.read_csv(train_data, sep='/r/n', delimiter=None, names=['original'],index_col=False,skip_blank_lines=False, engine="python", encoding='utf-8')
#     train_x, train_y = split_into_columns(train_data_blank["original"])
    train_x, train_y = split_into_columns(traindf_unprocess["original"])
    train_x = pd.DataFrame(train_x)
    train_x.rename(columns={0:'x'},inplace=True)
    print(train_x)
    train_lines= sentenceList(train_x)

    
    
    ##################################################################################
    
    for i in range(2):
        if i == 0:
            print('epoch: ',i)
            nodes = node
            log_array =[]
            sequence_log=[]

        #     for i in range(len(lines)):
            print('Start training')
            for i in range(len(train_lines)):
        #         print("lines ", lines[1])
                multiviterbioutput=multistateviterbi(train_lines[i])
                log_array.append(multiviterbioutput)
                sequence_log.append(multistateviterbi_backtrack(multiviterbioutput))
    #         print(sequence_log)

            result = finalresult(sequence_log,train_x)
            print(result)

        #         pred_data = "./output/EN/dev.p5.out"
#             multitransition_dic,emission_dict = weight(result,train_data, predata)

            print("Writing the final result to dev.p5.out...")
        #     f = open('./dev.p3.out'.format(x) ,'w')
            f = open('./output/{}/dev.train.p5.out'.format(x) ,'w', encoding='utf-8')
            for word in result['result']:
                if pd.isnull(word) == False:
                    f.write(word + '\r\n')
                else:
                    f.write("" +"\r\n")
            f.close()
        else:
            print('epoch: ',i)
            nodes = node
            log_array =[]
            sequence_log=[]
            pred_data = "./output/EN/dev.train.p5.out"
            multitransition_dic,emission_dict = weight(pred_data,train_data, predata)
            
            print('Start training')
            for i in range(len(train_lines)):
        #         print("lines ", lines[1])
                multiviterbioutput=multistateviterbi(train_lines[i])
                log_array.append(multiviterbioutput)
                sequence_log.append(multistateviterbi_backtrack(multiviterbioutput))
    #         print(sequence_log)

            result = finalresult(sequence_log,train_x)
            print(result)
            

            print("Writing the final result to dev.p5.out...")
        #     f = open('./dev.p3.out'.format(x) ,'w')
            f = open('./output/{}/dev.train.p5.out'.format(x) ,'w', encoding='utf-8')
            for word in result['result']:
                if pd.isnull(word) == False:
                    f.write(word + '\n')
                else:
                    f.write("" +"\n")
            f.close()
    
    print('------------------------- Train finished ------------------------')
    print(multitransition_dic)
    print(emission_dict)
    
    
    #####################################################################################


###################################################################################
    testdf_unprocess = pd.read_csv(test_data, sep='/r/n', delimiter=None, names=['x'],index_col=False,skip_blank_lines=False, engine="python", encoding='utf-8')
    lines= sentenceList(testdf_unprocess)

    nodes = node
    log_array =[]
    sequence_log=[]

#     for i in range(len(lines)):
    print('Start training')
    for i in range(len(lines)):
#         print("lines ", lines[1])
        multiviterbioutput=multistateviterbi(lines[i])
        log_array.append(multiviterbioutput)
        sequence_log.append(multistateviterbi_backtrack(multiviterbioutput))
#         print(sequence_log)

    result = finalresult(sequence_log,testdf_unprocess)
    print(result)

#         pred_data = "./output/EN/dev.p5.out"
#             multitransition_dic,emission_dict = weight(result,train_data, predata)

    print("Writing the final result to dev.p5.out...")
#     f = open('./dev.p3.out'.format(x) ,'w')
    f = open('./output/{}/dev.p5.out'.format(x) ,'w', encoding='utf-8')
    for word in result['result']:
        if pd.isnull(word) == False:
            f.write(word + '\n')
        else:
            f.write("" +"\n")
    f.close()
        

----------------- EmissionPara ---------------
There are  181628 observations
Out of those observations,  12026 is to be replaced.
----------------- vwtransitionPara ---------------
----------------- wtransitionPara ---------------
----------------- MultistateTransitionPara ---------------
y_count:  Counter({'I-NP': 54591, 'B-NP': 47305, 'O': 23872, 'B-PP': 18387, 'B-VP': 18261, 'I-VP': 10159, nan: 7663, 'B-ADVP': 3565, 'B-SBAR': 1899, 'B-ADJP': 1751, 'I-ADJP': 574, 'B-PRT': 468, 'I-ADVP': 363, 'I-PP': 223, 'I-CONJP': 64, 'B-CONJP': 49, 'I-SBAR': 48, 'B-INTJ': 26, 'B-LST': 11, 'I-INTJ': 7, 'I-UCP': 4, 'B-UCP': 1})
             0       1           ycyf
0        START    B-NP     START B-NP
1         B-NP    I-NP      B-NP I-NP
2         I-NP    B-VP      I-NP B-VP
3         B-VP  B-ADVP    B-VP B-ADVP
4       B-ADVP  B-ADJP  B-ADVP B-ADJP
5       B-ADJP  I-ADJP  B-ADJP I-ADJP
6       I-ADJP  I-ADJP  I-ADJP I-ADJP
7       I-ADJP    B-PP    I-ADJP B-PP
8         B-PP    B-NP      B-PP B-N



                   result
0          Municipal B-NP
1              bonds B-NP
2                are B-NP
3          generally B-NP
4                  a B-NP
5                bit B-NP
6              safer B-NP
7               than B-NP
8          corporate B-NP
9              bonds B-NP
10                in B-NP
11                 a B-NP
12         recession B-NP
13                 , B-NP
14               but B-NP
15               not B-NP
16                as B-NP
17              safe B-NP
18                as B-NP
19             bonds B-NP
20            issued B-NP
21                by B-NP
22               the B-NP
23           federal B-NP
24        government B-NP
25                 . B-NP
26                    NaN
27                He B-NP
28             added B-VP
29            that B-SBAR
...                   ...
189269            of I-NP
189270          history O
189271             , B-NP
189272          said B-NP
189273            in B-NP
189274           his B-NP
189275      

             0       1         ycyf
0        START    B-NP   START B-NP
1         B-NP    B-NP    B-NP B-NP
2         B-NP    B-NP    B-NP B-NP
3         B-NP    B-NP    B-NP B-NP
4         B-NP    B-NP    B-NP B-NP
5         B-NP    B-NP    B-NP B-NP
6         B-NP    B-NP    B-NP B-NP
7         B-NP    B-NP    B-NP B-NP
8         B-NP    B-NP    B-NP B-NP
9         B-NP    B-NP    B-NP B-NP
10        B-NP    B-NP    B-NP B-NP
11        B-NP    B-NP    B-NP B-NP
12        B-NP    B-NP    B-NP B-NP
13        B-NP    B-NP    B-NP B-NP
14        B-NP    B-NP    B-NP B-NP
15        B-NP    B-NP    B-NP B-NP
16        B-NP    B-NP    B-NP B-NP
17        B-NP    B-NP    B-NP B-NP
18        B-NP    B-NP    B-NP B-NP
19        B-NP    B-NP    B-NP B-NP
20        B-NP    B-NP    B-NP B-NP
21        B-NP    B-NP    B-NP B-NP
22        B-NP    B-NP    B-NP B-NP
23        B-NP    B-NP    B-NP B-NP
24        B-NP    B-NP    B-NP B-NP
25        B-NP    B-NP    B-NP B-NP
26        B-NP    B-NP    B-

Start training
                 result
0              HBO B-NP
1              has B-VP
2            close I-VP
3               to B-PP
4               24 B-NP
5          million I-NP
6      subscribers I-NP
7               to B-PP
8              its B-NP
9              HBO I-NP
10                and O
11         Cinemax B-NP
12        networks I-NP
13                  , O
14         while B-SBAR
15        Showtime B-NP
16                and O
17             its B-NP
18          sister B-NP
19         service B-NP
20               , B-NP
21             The B-NP
22           Movie B-NP
23         Channel B-NP
24               , B-NP
25            have B-NP
26            only B-NP
27           about B-NP
28              10 B-NP
29         million B-NP
...                 ...
27195           is B-VP
27196        due B-ADJP
27197       only B-ADVP
27198     partly B-ADVP
27199           to B-PP
27200          the B-NP
27201    austerity B-NP
27202      program B-NP
27203     launched B-NP
2