In [7]:
import json
import re
import numpy as np

In [2]:
def StateFileProcessing(State_File,Smooth):
    with open (State_File,'r') as file:
        N = int(file.readline())
        stateSet = {}
        matrixA = np.zeros((N, N))
        pi = [0 for i in range(N)]
        end = [0 for i in range(N)]
        
        ID = 0
        while ID < N:
            stateName = file.readline().strip()
            stateSet[stateName] = ID
            ID += 1
            
        while True:
            line = file.readline()
            if not line:
                break
            items = line.split()
            
            statePrev = int(items[0])
            stateNext = int(items[1])
            frequency = int(items[2])
            
            matrixA[statePrev][stateNext] = frequency

        for i in range(0, N):
            if i == stateSet['END']:
                continue
            total = matrixA[i].sum()
            for j in range(0, N):
                if j == stateSet['BEGIN']:
                    continue
                matrixA[i][j] = (matrixA[i][j] + Smooth) / (total + (N - 1) * Smooth)
                
        #### PI的赋值
        for i in range(N):
            pi[i] = matrixA[stateSet['BEGIN']][i]
            end[i] = matrixA[i][-1] 
        
    file.close()
    return N, stateSet, matrixA, pi, end

In [3]:
def SymbolFileProcessing(Symbol_File, Smooth):
    with open(Symbol_File,'r') as file:
        M = int(file.readline())
        symbolSet = {}
        matrixB = np.zeros((M+2, M+1))

        ID = 0
        while ID < M:
            symbol = file.readline().strip()
            symbolSet[symbol] = ID
            ID += 1
        symbolSet["UNK"] = ID
        
        while True:
            line = file.readline()
            if not line:
                break
            items = line.split()
            
            state = int(items[0])
            symbol = int(items[1])
            frequency = int(items[2])
            
            matrixB[state][symbol] = frequency
            
        for i in range(0, M):
            total = matrixB[i].sum()
            for j in range(0, M+1):
                if j == ID or matrixB[i][j] == 0:
                    matrixB[i][j] = 1 / (total + M + 1)
                else:
                    matrixB[i][j] = (matrixB[i][j] + (1 * Smooth)) / (total + M * Smooth + 1)
        
    file.close()
    return symbolSet, matrixB

In [4]:
def query_to_token(line, symbolSet): 
    tokens = re.findall(r"[A-Za-z0-9.]+|[,|\.|/|;|\'|`|\[|\]|<|>|\?|:|\"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|\-|=|\_|\+]", line)
    Obs = [0 for i in range(len(tokens))]
    for i in range(len(tokens)):
        if tokens[i] in symbolSet.keys():
            Obs[i] = symbolSet[tokens[i]]
        else:
            Obs[i] = symbolSet["UNK"]
    # print(Obs)
    return Obs

In [5]:
def viterbi(N,Obs,PI,END,A,B):
    path = []
    T = len(Obs)
    delta = np.zeros((N, T))
    record = np.zeros((N, T), int)
    psi = [[[]] * T for i in range(N)]

    delta[:, 0] = PI * B[:, Obs[0]]   
    for ts in range(1, T):       #  timeStamp
        for sn in range(N):     #  stateNext
            for sp in range(N):  #  statePrev
                prob = delta[sp][ts-1] * A[sp][sn] * B[sn][Obs[ts]]
                if prob > delta[sn][ts]:
                    delta[sn][ts] = prob
                    record[sn][ts] = sp
    # 最后要乘stateEnd的概率，每个s转移到end的概率都不一样
    # 同理，begin也是，begin到每个s的概率都不一样
    # 最后输出概率应该是结合begin end 的概率的乘积才对
    delta[:, -1] = END * delta[:, -1]

    maxProb = 0
    maxIndex = 0
    for index in range(len(delta)):
        if delta[index][-1] > maxProb:
            maxProb = delta[index][-1]
            maxIndex = index
    
    #  backtracking
    path = [0 for i in range(T+1)]
    path[-2] = maxIndex
    col = -1
    while True:
        if T <= -col:
            break
        maxState = record[maxIndex][col]
        maxIndex = maxState
        col -= 1
        path[col-1] = maxState
    path[-1] = round(np.log(maxProb),6)
    
    return path

In [6]:
def viterbi_algorithm(State_File, Symbol_File, Query_File):
    N, stateSet, A, PI, END = StateFileProcessing(State_File,Smooth=1)
    symbolSet, B = SymbolFileProcessing(Symbol_File, Smooth=1)

    results = []
    with open(Query_File, 'r') as file:
        while True:
            line = file.readline()
            if not line:
                break
            
            Obs = query_to_token(line, symbolSet)
            result = viterbi(N,Obs,PI,END,A,B)
            result.insert(0, stateSet["BEGIN"])
            result.insert(-1, stateSet["END"])
            results.append(result)
    file.close()

    return results

In [7]:
State_File ='./toy_example/State_File'
Symbol_File='./toy_example/Symbol_File'
Query_File ='./toy_example/Query_File'
viterbi_algorithm(State_File, Symbol_File, Query_File)

[[3, 0, 0, 1, 2, 4, -9.843403], [3, 2, 1, 2, 4, -9.397116]]

In [8]:
def top_k(N,Obs,PI,END,A,B,K):
    
    T = len(Obs)
    
    delta = np.zeros((N, K, T), float)
    record = np.zeros((N, K, T), int)
    
    for state in range(N):
        delta[state, 0, 0] = PI[state] * B[state][Obs[0]] 
        record[state, 0, 0] = state
        
        for k in range(1, K):
            delta[state, k, 0] = 0.0
            record[state, k, 0] = state
            
    for ts in range(1, T):
        for sn in range(N):
            prob_state = []
            for sp in range(N):
                for k in range(K):
                    prob = delta[sp, k, ts-1] * A[sp, sn] * B[sn, Obs[ts]]
                    state = sp
                    prob_state.append((prob, state))
            prob_state_sorted = sorted(prob_state, key=lambda x: x[0], reverse=True)
            
            for k in range(K):
                delta[sn, k, ts] = prob_state_sorted[k][0]
                record[sn, k, ts] = prob_state_sorted[k][1]
                    
    prob_state = []
    for state in range(N):
        for k in range(K):
            prob = delta[state, k, T-1]
            prob_state.append((prob, state))
            
    prob_state_sorted = sorted(prob_state, key=lambda x: x[0], reverse=True)
    
    path = [[0 for i in range(T+1)] for j in range(K)]
    for k in range(K):
        maxProb = prob_state_sorted[k][0]
        maxIndex = prob_state_sorted[k][1]
        
        path[k][-1] = maxProb
        path[k][-2] = maxIndex
        col = -1
        while True:
            if T <= -col:
                break
            maxState = record[maxIndex][k][col]
            maxIndex = maxState
            col -= 1
            path[k][col-1] = maxState
        maxProb = np.log(maxProb * END[path[k][-2]])
        path[k][-1] = round(maxProb,6)        
        
    return path

In [9]:
def top_k_viterbi(State_File, Symbol_File, Query_File, k): # do not change the heading of the function
    N, stateSet, A, PI, END = StateFileProcessing(State_File,Smooth=1)
    symbolSet, B = SymbolFileProcessing(Symbol_File, Smooth=1)
    results = [[]for i in range(k)]
    
    with open(Query_File, 'r') as file:
        while True:
            line = file.readline()
            if not line:
                break
            
            Obs = query_to_token(line, symbolSet)
            result = top_k(N,Obs,PI,END,A,B,k)
            for index in range(len(result)):
                result[index].insert(0, stateSet["BEGIN"])
                result[index].insert(-1, stateSet["END"]) 
                results[index].append(result[index])
    file.close()

    return results

In [10]:
State_File ='./toy_example/State_File'
Symbol_File='./toy_example/Symbol_File'
Query_File ='./toy_example/Query_File'
# viterbi_result = viterbi_algorithm(State_File, Symbol_File, Query_File)
viterbi_result1 = viterbi_algorithm(State_File, Symbol_File, Query_File)
viterbi_result2 = top_k_viterbi(State_File, Symbol_File, Query_File, k=2)
# print(viterbi_result1)
for row in viterbi_result2:
    print(row)

[[3, 0, 0, 1, 2, 4, -9.843403], [3, 2, 1, 2, 4, -9.397116]]
[[3, 2, 0, 0, 2, 4, -10.131085], [3, 2, 0, 2, 4, -9.551267]]


In [9]:
def StateFileProcessing(State_File,Smooth):
    with open (State_File,'r') as file:
        N = int(file.readline())
        stateSet = {}
        matrixA = np.zeros((N, N))
        pi = [0 for i in range(N)]
        end = [0 for i in range(N)]
        
        ID = 0
        while ID < N:
            stateName = file.readline().strip()
            stateSet[stateName] = ID
            ID += 1
            
        while True:
            line = file.readline()
            if not line:
                break
            items = line.split()
            
            statePrev = int(items[0])
            stateNext = int(items[1])
            frequency = int(items[2])
            
            matrixA[statePrev][stateNext] = frequency

        for i in range(0, N):
            if i == stateSet['END']:
                continue
            total = matrixA[i].sum()
            for j in range(0, N):
                if j == stateSet['BEGIN']:
                    continue
                matrixA[i][j] = (matrixA[i][j] + Smooth) / (total + (N - 1) * Smooth)
                
        #### PI的赋值
        for i in range(N):
            pi[i] = matrixA[stateSet['BEGIN']][i]
            end[i] = matrixA[i][-1] 
        
    file.close()
    print("state processing finished")
    return N, stateSet

In [16]:
def SymbolFileProcessing(N, Symbol_File, Smooth):
    with open(Symbol_File,'r') as file:
        M = int(file.readline())
        symbolSet = {}
        matrixB = np.zeros((N, M+1))

        ID = 0
        while ID < M:
            symbol = file.readline().strip()
            symbolSet[symbol] = ID
            ID += 1
        symbolSet["UNK"] = ID
#         print("ok")
        
        while True:
            line = file.readline()
            if not line:
                break
            items = line.split(' ')
            state = int(items[0])
            symbol = int(items[1])
            frequency = int(items[2])
            
            matrixB[state][symbol] = frequency
            
        for i in range(0, N):
            total = matrixB[i].sum()
            for j in range(0, M+1):
                if j == ID or matrixB[i][j] == 0:
                    matrixB[i][j] = 1 / (total + M + 1)
                else:
                    matrixB[i][j] = (matrixB[i][j] + (1 * Smooth)) / (total + M * Smooth + 1)
        print("ok")

    file.close()
    return  [] symbolSet, matrixB

In [17]:
State_File = "./dev_set/State_File"
Symbol_File = "./dev_set/Symbol_File"
N, state_set = StateFileProcessing(State_File,Smooth=1)

state processing finished


In [18]:
SymbolFileProcessing(N, Symbol_File, Smooth=1)

ok


[]

In [20]:
def read_state(State_File):
    '''
    :param State_File: file includes state set and state transition matrix
    :return N: number of states
    :return state_set: a dict contains all states' ID and name
    :return transition_prob: a dict contains transition probability 
    :return state_prob: a dict contains states and their probability
    '''
    with open(State_File, 'r') as file:
        N = int(file.readline().strip('\n'))     # read the first line to get N value
        state_set = dict()                       # store the set of state
        transition_prob = dict()                 # store transition probability  
        state_prob = dict()                      # store state initialising probability
        ID = 0                                   # ID of states
        cnt = 0                                  # number of transitions
        
        # Scan descriptive name of the states.
        while ID < N:
            state = file.readline().strip('\n').rstrip()  # one state in each line
            state_set[state] = ID
            ID = ID + 1
        
        # Scan the frequency of transitions.
        while True:
            line = file.readline()
            if not line:
                break
            items = line.split(' ')
            # Add new probability with key + value.
            transition_prob.setdefault(int(items[0]),{})[int(items[1])] = int(items[2])
            cnt = cnt + 1
        
        # Convert frequency into probability.
        for keys,values in transition_prob.items():
            total = 0
            for value in values.values():
                total = total + value
            # Scan each state in state_set.
            for state in state_set.values():
                # Case-I: state is already existing
                if state in values.keys():
#                     transition_prob[keys][state] = round((transition_prob[keys][state]+1)/(total+N-1),1)
                    transition_prob[keys][state] = (transition_prob[keys][state]+1)/(total+N-1)
                # Case-II: state is not existing
                else:
                    if state == state_set['BEGIN']:
                        transition_prob.setdefault(keys,{})[state] = 0.0
                    else:
#                         transition_prob.setdefault(keys,{})[state] = round(1/(total+N-1),1)
                        transition_prob.setdefault(keys,{})[state] = 1/(total+N-1)
            
        # Initialize state probability and Add "END" state with no outing states.
        for state in state_set.values():
            transition_prob.setdefault(state_set['END'],{})[state] = 0.0
#             state_prob[state] = round(1/N,1)
            state_prob[state] = 1/N
            
    return state_set

def read_symbol(Symbol_File, state_set):
    '''
    :param Symbol_File: file includes symbol set and emission probability
    :param state_set: a set of state
    :return M: number of symbol
    :return symbol_set: a dict contains all symbols' ID and name
    :return emission_prob: a dict contains emission probability 
    '''
    with open(Symbol_File, 'r') as file:
        M = int(file.readline().strip('\n'))     # read the first line to get M value
        symbol_set = dict()                      # store the set of symbol
        emission_prob = dict()                   # store emission probability        
        ID = 0                                   # ID of symbols
        
        # Scan descriptive name of the symbols.
        while ID < M:
            symbol = file.readline().strip('\n').rstrip()  # one symbol in each line
#             symbol_set[ID] = symbol
            symbol_set[symbol] = ID
            ID = ID + 1
        
        # Scan the frequency of emissions.
        while True:
            line = file.readline()
            if not line:
                break
            items = line.split(' ')
            # Add new probability with key + value.
            emission_prob.setdefault(int(items[0]),{})[int(items[1])] = int(items[2])
        
        # Convert frequency into probability.
        for keys,values in emission_prob.items():
            total = 0
            for value in values.values():
                total = total + value
            # Scan each symbol in symbol_set.
            for symbol in symbol_set.values():
                # Case-I: symbol is already existing
                if symbol in values.keys():
#                     emission_prob[keys][symbol] = round((emission_prob[keys][symbol]+1)/(total+M+1),1)
                    emission_prob[keys][symbol] = (emission_prob[keys][symbol]+1)/(total+M+1)
                # Case-II: symbol is not existing
                else:
#                     emission_prob.setdefault(keys,{})[symbol] = round(1/(total+M+1),1)
                    emission_prob.setdefault(keys,{})[symbol] = 1/(total+M+1)
            # Add special symbol "UNK".
#             emission_prob.setdefault(keys,{})[M] = round(1/(total+M+1),1)
            emission_prob.setdefault(keys,{})[M] = 1/(total+M+1)
    
    print(emission_prob)
                                      
    return M, symbol_set, emission_prob

In [21]:
State_File ='./toy_example/State_File'
Symbol_File='./toy_example/Symbol_File'
Query_File ='./toy_example/Query_File'

f = read_state(State_File)
read_symbol(Symbol_File, f)

{0: {0: 0.4, 1: 0.3, 2: 0.2, 3: 0.1}, 1: {0: 0.2, 1: 0.4, 2: 0.3, 3: 0.1}, 2: {0: 0.2, 1: 0.2, 2: 0.5, 3: 0.1}}


(3,
 {'Red': 0, 'Green': 1, 'Blue': 2},
 {0: {0: 0.4, 1: 0.3, 2: 0.2, 3: 0.1},
  1: {0: 0.2, 1: 0.4, 2: 0.3, 3: 0.1},
  2: {0: 0.2, 1: 0.2, 2: 0.5, 3: 0.1}})