In [819]:
%%javascript
// Replace the OutputArea `_should_scroll` hook with a stub that ignores `lines`
// and always returns false.
// NOTE(review): presumably this stops long cell outputs from being collapsed
// into a scroll box; confirm against the notebook front-end version in use.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [820]:
import os
import sys
from math import log
import numpy as np
# Additive smoothing constant: the training functions below add it to their
# counts before normalising and taking logs, so no probability is exactly zero.
# At 1e-22 it is far below double-precision resolution next to a count of 1 or
# more, so in practice it only changes counts that were zero.
incrementer = 0.0000000000000000000001

In [821]:
def getFileContents(filename):
    """Return the lines of a text file as a list.

    Every element keeps its trailing newline; an empty file gives an empty
    list. Errors from ``open`` propagate to the caller.
    """
    with open(filename, 'r') as handle:
        return list(handle)

In [822]:
def getFileFromCommandLine():
    """Return the lines of the file named by the first command-line argument.

    Reads ``sys.argv[1]`` and hands it to ``getFileContents``; an
    ``IndexError`` propagates when no argument was supplied.
    """
    return getFileContents(sys.argv[1])

In [823]:
def splitWordTag(word_tag_pair):
    """Split a ``word/TAG`` token at its last slash.

    Everything before the final ``/`` is the word (so the word itself may
    contain slashes) and everything after it is the tag. A token without any
    slash yields an empty word and the whole token as the tag.

    Returns a ``(word, tag)`` tuple.
    """
    word, _separator, tag = word_tag_pair.rpartition('/')
    return word, tag

In [824]:
def getUniqueTags(tagged_data):
    """Count how often each tag occurs in the tagged corpus.

    tagged_data is an iterable of lines made of single-space-separated
    ``word/TAG`` tokens. Returns a dict mapping every tag to the number of
    tokens carrying it.
    """
    tag_counts = {}
    for sentence in tagged_data:
        for token in sentence.strip().split(' '):
            tag = splitWordTag(token)[1]
            tag_counts[tag] = tag_counts.get(tag, 0) + 1
    return tag_counts

In [825]:
def getOpenProbabilities(tagged_data, all_tags_dict):
    """Estimate the smoothed log-probability that each tag opens a sentence.

    Parameters:
        tagged_data: list of lines of single-space-separated ``word/TAG``
            tokens; the tag of the first token of each line is counted.
        all_tags_dict: dict whose keys are all known tags. Tags that never
            open a sentence still receive a (smoothed) entry.

    Returns:
        dict mapping tag -> log((count + incrementer) / (N + incrementer * T)),
        where N is the number of lines and T the number of tags in the result.
    """
    global incrementer
    sentences_count = len(tagged_data)

    open_tag_count_dict = {}
    for line in tagged_data:
        first_word_tag_pair = line.strip().split(' ')[0]
        tag = splitWordTag(first_word_tag_pair)[1]
        open_tag_count_dict[tag] = open_tag_count_dict.get(tag, 0) + 1

    # Tags that never open a sentence start from a zero count.
    for tag in all_tags_dict:
        open_tag_count_dict.setdefault(tag, 0)

    # Additive smoothing adds `incrementer` to every tag's count, so the
    # normaliser must grow by `incrementer` once per tag. The previous code
    # grew it by `sentences_count * incrementer` for the seen tags, which left
    # the distribution un-normalised for any non-negligible `incrementer`.
    smoothed_total = sentences_count + incrementer * len(open_tag_count_dict)

    # `* 1.0` keeps the division in floating point even if `incrementer` is
    # changed to an integer (e.g. 1 for add-one smoothing) under Python 2.
    return dict(
        (tag, log(((occurrences + incrementer) * 1.0) / smoothed_total))
        for tag, occurrences in open_tag_count_dict.items()
    )

In [826]:
def getCloseProbabilities(tagged_data, all_tags_dict):
    """Estimate the smoothed log-probability that each tag closes a sentence.

    Parameters:
        tagged_data: list of lines of single-space-separated ``word/TAG``
            tokens; the tag of the last token of each line is counted.
        all_tags_dict: dict whose keys are all known tags. Tags that never
            close a sentence still receive a (smoothed) entry.

    Returns:
        dict mapping tag -> log((count + incrementer) / (N + incrementer * T)),
        where N is the number of lines and T the number of tags in the result.
    """
    global incrementer
    sentences_count = len(tagged_data)

    close_tag_count_dict = {}
    for line in tagged_data:
        last_word_tag_pair = line.strip().split(' ')[-1]
        tag = splitWordTag(last_word_tag_pair)[1]
        close_tag_count_dict[tag] = close_tag_count_dict.get(tag, 0) + 1

    # Tags that never close a sentence start from a zero count.
    for tag in all_tags_dict:
        close_tag_count_dict.setdefault(tag, 0)

    # Additive smoothing adds `incrementer` to every tag's count, so the
    # normaliser must grow by `incrementer` once per tag. The previous code
    # grew it by `sentences_count * incrementer` for the seen tags, which left
    # the distribution un-normalised for any non-negligible `incrementer`.
    smoothed_total = sentences_count + incrementer * len(close_tag_count_dict)

    # `* 1.0` keeps the division in floating point even if `incrementer` is
    # changed to an integer (e.g. 1 for add-one smoothing) under Python 2.
    return dict(
        (tag, log(((occurrences + incrementer) * 1.0) / smoothed_total))
        for tag, occurrences in close_tag_count_dict.items()
    )

In [827]:
def buildTransitionMatrix(tagged_data, tags_dict):
    global incrementer
    tags = tags_dict.keys()
    tags.sort()
    
    tags_index_dict = {}
    tags_index_dict_reverse = {}
    for index, tag in enumerate(tags):
        tags_index_dict[tag] = index
        tags_index_dict_reverse[index] = tag
    
    tag_count = len(tags)
    
    #Change this line to np.ones for add 1 smoothing
    transition_matrix = np.zeros(shape=(tag_count, tag_count))
    
    for line in tagged_data:
        prev_tag = None
        word_tag_pairs = line.strip().split(' ')
        
        for word_tag_pair in word_tag_pairs:
            word, tag = splitWordTag(word_tag_pair)
            
            if prev_tag is not None:
                transition_matrix[tags_index_dict[prev_tag]][tags_index_dict[tag]] += 1
            
            prev_tag = tag
    
    transition_matrix = transition_matrix + incrementer
    
    probability_transition_matrix = transition_matrix/transition_matrix.sum(axis=1, keepdims=True)
    
    print "Transition Values are NaN : ", np.argwhere(np.isnan(probability_transition_matrix))
    probability_transition_matrix[np.isnan(probability_transition_matrix)] = incrementer
    probability_transition_matrix = np.log(probability_transition_matrix)
    return probability_transition_matrix.tolist(), tags_index_dict, tags_index_dict_reverse
        

In [828]:
def getUniqueWords(tagged_data):
    """Return the distinct words of the tagged corpus as a list.

    The word part of every ``word/TAG`` token is collected and de-duplicated
    through a set, so the order of the returned list is unspecified.
    """
    words = [
        splitWordTag(token)[0]
        for sentence in tagged_data
        for token in sentence.strip().split(' ')
    ]
    return list(set(words))

In [829]:
def computeEmissionProbabilities(tagged_data, tags_dict):
    global incrementer
    tags = tags_dict.keys()
    tags.sort()
    
    words = getUniqueWords(tagged_data)
    words.sort()
    
    tags_index_dict = {}
    for index, tag in enumerate(tags):
        tags_index_dict[tag] = index
        
    words_index_dict = {}
    words_index_dict_reverse = {}
    for index, word in enumerate(words):
        words_index_dict[word] = index
        words_index_dict_reverse[index] = word
    
    tag_count = len(tags)
    word_count = len(words)
    
    # word_count + 1 => Last column for unseen words
    emission_matrix = np.zeros(shape=(tag_count, word_count + 1))
    
    for line in tagged_data:
        prev_tag = None
        word_tag_pairs = line.strip().split(' ')
        
        for word_tag_pair in word_tag_pairs:
            word, tag = splitWordTag(word_tag_pair)
            
            emission_matrix[tags_index_dict[tag]][words_index_dict[word]] += 1
            
            prev_tag = tag
    #increment 1 in all the elements so that the last col for unseen words have non zero values
    emission_matrix = emission_matrix + incrementer
    probability_emission_matrix = emission_matrix/emission_matrix.sum(axis=1, keepdims=True)
    print "Emission Values are NaN : ", np.argwhere(np.isnan(probability_emission_matrix))
    probability_emission_matrix[np.isnan(probability_emission_matrix)] = incrementer
    probability_emission_matrix = np.log(probability_emission_matrix)
    return probability_emission_matrix.tolist(), tags_index_dict, words_index_dict, words_index_dict_reverse

In [830]:
def printEmissionProbabilities(count):
    counter = 0
    global probability_emission_matrix, tags_index_dict, words_index_dict
    word_count = len(words_index_dict.keys())
    tag_count = len(tags_index_dict.keys())
    for word, word_index in words_index_dict.iteritems():
        for tag, tag_index in tags_index_dict.iteritems():
            if probability_emission_matrix[tag_index][word_index] != 0:
                print tag, " => ", word, ' => ', probability_emission_matrix[tag_index][word_index]
                counter += 1
                if counter > count:
                    return

In [None]:
def writeModelToFile(probability_transition_matrix, opening_probabilities, closing_probabilities, probability_emission_matrix, tags_index_dict, words_index_dict):
    """Serialise the trained model to ``hmmmodel.txt`` in the working directory.

    File layout:
        6 header lines: tag count, word count, then one ``name:start:end``
            line per section;
        a title line followed by the transition rows (tab-separated);
        a title line followed by the emission rows (tab-separated);
        a title line followed by ``tag<TAB>open<TAB>close<TAB>index`` rows;
        a title line followed by ``word<TAB>index`` rows.

    In every ``start:end`` pair, start is the 0-based index of the section's
    first data line within the file and end is one past its last data line.

    Raises KeyError if a tag in `opening_probabilities` is missing from
    `closing_probabilities` or `tags_index_dict`.
    """
    def format_number(value):
        # repr() of a float round-trips exactly, whereas Python 2's str()
        # keeps only 12 significant digits and silently loses precision.
        return repr(float(value))

    def format_rows(matrix):
        return ['\t'.join(format_number(cell) for cell in row) for row in matrix]

    transition_rows = format_rows(probability_transition_matrix)
    emission_rows = format_rows(probability_emission_matrix)
    tag_rows = [
        tag + '\t' + format_number(opening_probabilities[tag])
        + '\t' + format_number(closing_probabilities[tag])
        + '\t' + str(tags_index_dict[tag])
        for tag in opening_probabilities
    ]
    word_rows = [word + '\t' + str(words_index_dict[word]) for word in words_index_dict]

    # Six header lines come first; each section is preceded by one title line.
    header_line_count = 6
    tr_start_line_number = header_line_count + 1
    tr_end_line_number = tr_start_line_number + len(transition_rows)
    em_start_line_number = tr_end_line_number + 1
    em_end_line_number = em_start_line_number + len(emission_rows)
    oc_start_line_number = em_end_line_number + 1
    oc_end_line_number = oc_start_line_number + len(tag_rows)
    wi_start_line_number = oc_end_line_number + 1
    wi_end_line_number = wi_start_line_number + len(word_rows)

    output_lines = [
        'total_tags:' + str(len(tags_index_dict)),
        'total_words:' + str(len(words_index_dict)),
        'tranistion_matrix:' + str(tr_start_line_number) + ':' + str(tr_end_line_number),
        'emission_matrix:' + str(em_start_line_number) + ':' + str(em_end_line_number),
        'open_close_probabilities:' + str(oc_start_line_number) + ':' + str(oc_end_line_number),
        'word_indexes:' + str(wi_start_line_number) + ':' + str(wi_end_line_number),
    ]
    output_lines.append('---------------------TransitionMatrix---------------------')
    output_lines.extend(transition_rows)
    output_lines.append('---------------------EmissionMatrix---------------------')
    output_lines.extend(emission_rows)
    output_lines.append('---------------------OpeningClosingProbabilities---------------------')
    output_lines.extend(tag_rows)
    output_lines.append('---------------------Words---------------------')
    output_lines.extend(word_rows)

    # A single join avoids the quadratic cost of repeated string concatenation.
    filename = 'hmmmodel.txt'
    with open(filename, 'w') as output_file:
        output_file.write('\n'.join(output_lines) + '\n')

In [None]:
# Load the tagged training corpus and count how often each tag occurs in it.
tagged_data = getFileContents('data/en_train_tagged.txt')
tags_dict = getUniqueTags(tagged_data)

In [None]:
# Smoothed log-probabilities of each tag starting / ending a sentence.
opening_probabilities = getOpenProbabilities(tagged_data, tags_dict)
closing_probabilities = getCloseProbabilities(tagged_data, tags_dict)

In [None]:
# Tag-to-tag transition log-probabilities plus the tag <-> index lookups.
probability_transition_matrix, tags_index_dict, tags_index_dict_reverse = buildTransitionMatrix(tagged_data, tags_dict)

In [None]:
# Tag-to-word emission log-probabilities and the word <-> index lookups.
# This rebinds tags_index_dict; both builders index the same sorted tag keys.
probability_emission_matrix, tags_index_dict, words_index_dict, words_index_dict_reverse = computeEmissionProbabilities(tagged_data, tags_dict)

In [None]:
# Persist the trained tables to hmmmodel.txt.
writeModelToFile(probability_transition_matrix, opening_probabilities, closing_probabilities, probability_emission_matrix, tags_index_dict, words_index_dict)

In [None]:
# Print a handful of emission entries as a quick sanity check.
printEmissionProbabilities(5)

In [None]:
# Number of tags; getMostProbableTags reads this global to size its score matrix,
# so this cell must have run before any prediction is attempted.
tag_count = len(tags_index_dict.keys())

In [947]:
def readModelFile():
    """Return every line of ``hmmmodel.txt`` from the working directory.

    Lines keep their trailing newlines, exactly as ``readlines()`` yields them.
    """
    with open('hmmmodel.txt', 'r') as model_file:
        return model_file.readlines()

In [952]:
def parseModel(lines):
    """Rebuild the model tables from the lines of a saved model file.

    `lines` is the list of lines of the model file. The first six lines form
    the header: tag count, word count, then one ``name:start:end`` line each
    for the transition matrix, the emission matrix, the opening/closing table
    and the word index. ``start``/``end`` are 0-based indexes into `lines`,
    with ``end`` exclusive.

    Returns a tuple of
        (opening_probabilities, closing_probabilities,
         probability_transition_matrix, probability_emission_matrix,
         tags_index_dict, tags_index_dict_reverse,
         words_index_dict, words_index_dict_reverse)
    where both matrices are lists of rows of floats.
    """
    def header_fields(line_number):
        return lines[line_number].strip().split(':')

    def section_bounds(line_number):
        fields = header_fields(line_number)
        return int(fields[-2]), int(fields[-1])

    total_tags = int(header_fields(0)[-1])
    total_words = int(header_fields(1)[-1])
    tr_start_line_number, tr_end_line_number = section_bounds(2)
    em_start_line_number, em_end_line_number = section_bounds(3)
    oc_start_line_number, oc_end_line_number = section_bounds(4)
    wi_start_line_number, wi_end_line_number = section_bounds(5)

    # Single-argument print: same output as the former print statement, and
    # also valid under Python 3.
    print(' '.join(str(value) for value in (
        total_tags, total_words,
        tr_start_line_number, tr_end_line_number,
        em_start_line_number, em_end_line_number,
        oc_start_line_number, oc_end_line_number,
        wi_start_line_number, wi_end_line_number)))

    def read_matrix(start, end):
        # Convert every cell to float. The previous code built the tuple
        # `(float, [strings])` per row instead of applying float to the cells.
        return [
            [float(cell) for cell in lines[line_number].strip().split('\t')]
            for line_number in range(start, end)
        ]

    probability_transition_matrix = read_matrix(tr_start_line_number, tr_end_line_number)
    probability_emission_matrix = read_matrix(em_start_line_number, em_end_line_number)

    opening_probabilities = {}
    closing_probabilities = {}
    tags_index_dict = {}
    tags_index_dict_reverse = {}

    for line_number in range(oc_start_line_number, oc_end_line_number):
        # Only the line terminator is removed, so an empty tag name (a row
        # that starts with a tab) keeps its first field.
        row_values = lines[line_number].rstrip('\r\n').split('\t')
        tag_name = row_values[0]
        index = int(row_values[3])

        opening_probabilities[tag_name] = float(row_values[1])
        closing_probabilities[tag_name] = float(row_values[2])
        tags_index_dict[tag_name] = index
        tags_index_dict_reverse[index] = tag_name

    words_index_dict = {}
    words_index_dict_reverse = {}

    for line_number in range(wi_start_line_number, wi_end_line_number):
        # Same reasoning as above: keep an empty word intact.
        row_values = lines[line_number].rstrip('\r\n').split('\t')
        word = row_values[0]
        index = int(row_values[1])
        words_index_dict[word] = index
        words_index_dict_reverse[index] = word

    return opening_probabilities, closing_probabilities, probability_transition_matrix, probability_emission_matrix, tags_index_dict, tags_index_dict_reverse, words_index_dict, words_index_dict_reverse

In [None]:
def getMostProbableTags(sentence):
    """Tag one sentence with its best-scoring tag sequence (Viterbi search).

    The sentence is stripped and split on single spaces. For every word and
    every tag a score is built by adding the opening (first word) or
    transition (later words) value, the emission value and the best score of
    the previous word; the closing value is added after the last word. The
    winning path is then traced backwards.

    Reads the module-level model tables and `tag_count`. `unseen_words` is
    declared global but is not used anywhere in this function.

    Returns the sentence as ``word/TAG`` tokens joined by single spaces.
    """
    global opening_probabilities, closing_probabilities, probability_transition_matrix, probability_emission_matrix, tags_index_dict, tags_index_dict_reverse, words_index_dict, words_index_dict_reverse 
    global tag_count, unseen_words
    
    sentence_words = sentence.strip().split(' ')
    
    sentence_len = len(sentence_words)
    
    # viterbi_matrix[tag][position]: best score of any path ending in `tag` there.
    viterbi_matrix = np.zeros(shape=(tag_count, sentence_len))
    
    # tracing_matrix[tag][position]: name of the previous tag on that best path.
    tracing_matrix = [[None for x in range(sentence_len)] for y in range(tag_count)]
    
    for word_index in range(sentence_len):
        word = sentence_words[word_index]
        for model_tag in tags_index_dict:
            model_tag_index = tags_index_dict[model_tag]
            try:
                word_emission_probability = probability_emission_matrix[model_tag_index][words_index_dict[word]]
            except KeyError as e:
                # Word not in the model: the same constant is used for every tag
                # instead of the reserved last emission column (left commented out).
                # NOTE(review): scores are added like log-probabilities, and 1.0 is
                # not a valid log-probability; being identical for all tags it
                # should not change which tag wins -- confirm this is intended.
                word_emission_probability = 1.0  #probability_emission_matrix[model_tag_index][-1]
            
            if word_index == 0:
                try:
                    tag_opening_probability = opening_probabilities[model_tag]
                except KeyError as e:
                    print "tag_opening_probability : Keyerror encountered"
                    # NOTE(review): this fallback looks like a raw probability but is
                    # added to the score below as-is -- confirm whether its log was meant.
                    tag_opening_probability = 1.1754943508222875e-10
                viterbi_matrix[model_tag_index][word_index] = tag_opening_probability + word_emission_probability
            else:
                # Most negative finite float serves as the starting "worst" score.
                max_probability = np.finfo(float).min
                max_tag = None
                for prev_model_tag in tags_index_dict:
                    prev_model_tag_index = tags_index_dict[prev_model_tag]
                    tag_transition_probability = probability_transition_matrix[prev_model_tag_index][model_tag_index]
#                     if tag_transition_probability == 0.0:
#                         print "Transition probability still zero"
#                         tag_transition_probability = 1.1754943508222875e-10
                    temp_probability = viterbi_matrix[prev_model_tag_index][word_index-1] + tag_transition_probability + word_emission_probability  
                    # Strict '>' keeps the first tag reached in dict iteration order on ties.
                    if temp_probability > max_probability:
                        max_probability = temp_probability
                        max_tag = prev_model_tag
                        
                viterbi_matrix[model_tag_index][word_index] = max_probability
                tracing_matrix[model_tag_index][word_index] = max_tag
    
    # Choose the final tag: last-column score plus the tag's closing value.
    max_probability = np.finfo(float).min
    max_probability_tag = None
    for model_tag in tags_index_dict:
        model_tag_index = tags_index_dict[model_tag]
        temp_probability = 0.0
        try:
            tag_closing_probabilities = closing_probabilities[model_tag]
        except KeyError as e:
            print "tag_closing_probabilities : Keyerror encountered", 
            # NOTE(review): same raw-probability-looking fallback as for the opening case.
            tag_closing_probabilities = 1.1754943508222875e-10
        temp_probability =  tag_closing_probabilities + viterbi_matrix[model_tag_index][sentence_len-1]
        if temp_probability > max_probability:
            max_probability = temp_probability
            max_probability_tag = model_tag

    # Walk the back-pointers from the last word to the first, then reverse.
    assigned_tags = [max_probability_tag]
    current_best_tag = max_probability_tag
    for col in range(sentence_len-1, 0, -1):
        current_best_tag = tracing_matrix[tags_index_dict[current_best_tag]][col]
        assigned_tags.append(current_best_tag)
    assigned_tags = assigned_tags[::-1]
    
    anotated_sentence = ''
    for index, assigned_tag in enumerate(assigned_tags):
        anotated_sentence += str(sentence_words[index]) + '/' + str(assigned_tag) + ' '
    
    
    return anotated_sentence.strip()

In [953]:
# Load the saved model file (hmmmodel.txt) back from disk as a list of lines.
lines = readModelFile()

['total_tags:50\n', 'total_words:19672\n', 'tranistion_matrix:7:57\n', 'emission_matrix:58:108\n', 'open_close_probabilities:109:159\n', 'word_indexes:160:19832\n']


In [955]:
# Rebuild the model tables from the file; this rebinds the globals that getMostProbableTags reads.
opening_probabilities, closing_probabilities, probability_transition_matrix, probability_emission_matrix, tags_index_dict, tags_index_dict_reverse, words_index_dict, words_index_dict_reverse  = parseModel(lines)

50 19672 7 57 58 108 109 159 160 19832


In [None]:
def startPredicting():
    """Tag every line of the raw dev file and write the result to ``hmmoutput.txt``.

    Each line of ``data/en_dev_raw.txt`` is passed through
    ``getMostProbableTags``. The tagged lines are newline-terminated,
    concatenated, and the whole text is stripped of leading and trailing
    whitespace before being written.
    """
    raw_sentences = getFileContents('data/en_dev_raw.txt')
    tagged_sentences = [getMostProbableTags(sentence) for sentence in raw_sentences]
    output = ''.join(tagged + '\n' for tagged in tagged_sentences).strip()

    with open('hmmoutput.txt', 'w') as output_file:
        output_file.write(output)


In [960]:
def getFileContents(filename):
    """Read `filename` as text and return its lines, newlines included."""
    with open(filename, 'r') as source:
        return source.readlines()

def computeAccuracy():
    """Print the percentage of predicted tokens that match the gold tokens.

    Compares ``hmmoutput.txt`` with ``data/zh_dev_tagged.txt`` line by line.
    Within a line, each predicted ``word/TAG`` token is compared with the gold
    token at the same position; the total counts predicted tokens. The
    percentage is printed, nothing is returned.
    """
    dev_tagged_data = getFileContents('data/zh_dev_tagged.txt')
    predicted_data = getFileContents('hmmoutput.txt')
    correct = 0
    total = 0
    for line_number in range(len(dev_tagged_data)):
        predicted_tokens = predicted_data[line_number].strip().split(' ')
        expected_tokens = dev_tagged_data[line_number].strip().split(' ')
        for position, predicted_token in enumerate(predicted_tokens):
            if predicted_token == expected_tokens[position]:
                correct += 1
            total += 1
    accuracy = (correct*100.0)/total
    print(accuracy)

# Run the evaluation only when this code executes as the main module.
if __name__ == '__main__':
    computeAccuracy()

85.9985785359


In [889]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [1099]:
from scipy.stats import beta
def f(x):
    """Piecewise function of x.

    (0, 1]  -> the constant 0.5 * beta.pdf(0.5, 8, 5) (independent of x)
    (4, 5]  -> 0.5 * (x - 4), rising
    (5, 6]  -> -0.5 * (x - 6), falling
    elsewhere -> 0
    """
    if 0 < x <= 1:
        return (0.5*(beta.pdf(0.5,8,5)))
    if 4 < x <= 5:
        return 0.5*(x-4)
    if 5 < x <= 6:
        return -0.5*(x-6)
    return 0

In [1103]:
# No visible cell imports `scipy` itself (only `scipy.stats.beta`), so import the
# optimiser here; otherwise this cell raises NameError on a fresh kernel.
import scipy.optimize

# Alternative objective kept for reference:
# def f(x):
#     return -1*x**2 + 8 * x

# Maximise f by minimising -f with the downhill simplex method, starting at x = 0.
max_x = scipy.optimize.fmin(lambda x: -f(x), 0, maxiter=10000, maxfun=10000)
print(max_x)

Optimization terminated successfully.
         Current function value: -0.966797
         Iterations: 3
         Function evaluations: 7
[ 0.00025]


In [1095]:
# Spot-check: x = 5 falls in f's (4, 5] branch, i.e. 0.5 * (5 - 4).
f(5)

0.5