In [1]:
import os
import numpy as np
from collections import defaultdict
import math

from conlleval2 import evaluate

In [2]:
def get_emission_dict(file_path):
    
    e = {}
    y_count = {}
    emission_dict = {}
    
    train_data = open(file_path, 'r', encoding="utf-8")
    lines = train_data.readlines()
    
    all_x = set()
    all_y = set()

    for line in lines:
        line = line.strip()

        if len(line) > 0:
            x, pos, y = line.split(" ")
            all_x.add(x)
            all_y.add(y)
            y_count[y] = y_count.get(y,0) + 1
            e[(x, y)]  = e.get((x,y),0) + 1
            e[(pos, y)] = e.get((pos,y),0) + 1

    for x, y in e.keys():
        key = "emission:" + y + "+" + x
        emission_dict[key] = math.log(e[(x, y)] / y_count[y])
    
    return emission_dict, list(all_y)

train_path = "full/train"
emission_dict, states = get_emission_dict(train_path)

In [3]:
emission_dict

{'emission:O+The': -4.333515843240958,
 'emission:O+DT': -2.2045841023613524,
 'emission:O+official': -7.166729187297174,
 'emission:O+JJ': -2.617071711239342,
 'emission:O+cause': -8.776167099731275,
 'emission:O+NN': -1.8272698774179623,
 'emission:O+of': -3.518671727703493,
 'emission:O+IN': -1.9894501491261936,
 'emission:O+death': -7.166729187297174,
 'emission:O+has': -4.649032714686183,
 'emission:O+VBZ': -3.5639524322366496,
 'emission:O+not': -5.97280671882474,
 'emission:O+RB': -3.8525431826246486,
 'emission:O+been': -5.755742213586912,
 'emission:O+VBN': -3.3294297280649645,
 'emission:O+officially': -8.370701991623111,
 'emission:O+determined': -8.370701991623111,
 'emission:O+,': -2.6427690567346263,
 'emission:O+but': -6.211217742269738,
 'emission:O+CC': -3.7199212943829667,
 'emission:O+investigators': -9.46931428029122,
 'emission:O+NNS': -2.5233002891919925,
 'emission:O+believe': -9.46931428029122,
 'emission:O+VBP': -4.122206749573751,
 'emission:O+the': -2.8532490

In [4]:
def get_resulting_dict(file_path, emission_dict):

    t = {}
    y_count = {}
    
    train_data = open(file_path, 'r', encoding="utf-8")
    lines = train_data.readlines()
    start = "start"
    all_y = set(["start", "stop"])

    for line in lines:
        line = line.strip()
        if len(line) <= 0:
            t[(start, "stop")] = t.get((start,"stop"),0) + 1
            start = "start"
            y_count[start] = y_count.get(start,0) + 1
        else:
            x, pos, y = line.split(" ")
            t[(start, y)] = t.get((start,y),0) + 1
            y_count[y] = y_count.get(y,0) + 1
            start = y
            all_y.add(y)

    for start, end in t.keys():
        key = "transition:" + start + "+" + end
        emission_dict[key] = math.log(t[(start, end)] / y_count[start])

    return emission_dict

resulting_dict = get_resulting_dict(train_path, emission_dict)

In [5]:
resulting_dict

{'emission:O+The': -4.333515843240958,
 'emission:O+DT': -2.2045841023613524,
 'emission:O+official': -7.166729187297174,
 'emission:O+JJ': -2.617071711239342,
 'emission:O+cause': -8.776167099731275,
 'emission:O+NN': -1.8272698774179623,
 'emission:O+of': -3.518671727703493,
 'emission:O+IN': -1.9894501491261936,
 'emission:O+death': -7.166729187297174,
 'emission:O+has': -4.649032714686183,
 'emission:O+VBZ': -3.5639524322366496,
 'emission:O+not': -5.97280671882474,
 'emission:O+RB': -3.8525431826246486,
 'emission:O+been': -5.755742213586912,
 'emission:O+VBN': -3.3294297280649645,
 'emission:O+officially': -8.370701991623111,
 'emission:O+determined': -8.370701991623111,
 'emission:O+,': -2.6427690567346263,
 'emission:O+but': -6.211217742269738,
 'emission:O+CC': -3.7199212943829667,
 'emission:O+investigators': -9.46931428029122,
 'emission:O+NNS': -2.5233002891919925,
 'emission:O+believe': -9.46931428029122,
 'emission:O+VBP': -4.122206749573751,
 'emission:O+the': -2.8532490

In [6]:
def viterbi_algo(x, states, f):
    scores = np.full((len(x), len(states)), -np.inf)
    parents = np.full((len(x), len(states)), 0, dtype=int)
    
    for i in range(len(states)):
        emission_key1 = "emission:" + states[i] + "+" + x[0].split()[0]
        emission_key2 = "emission:" + states[i] + "+" + x[0].split()[1]
        transmission_key = "transition:" + "start" + "+" + states[i]
        scores[0, i] = f.get(emission_key1, -10e8) + f.get(emission_key2, -10e8) + f.get(transmission_key, -10e8)
    
    for i in range(1, len(x)):
        for j in range(len(states)):
            for k in range(len(states)):
                emission_key1 = "emission:" + states[k] + "+" + x[i].split()[0]
                emission_key2 = "emission:" + states[k] + "+" + x[i].split()[1]
                transmission_key = "transition:" + states[j] + "+" + states[k]
                overall_score = scores[i-1, j] + f.get(emission_key1, -10e8) + f.get(emission_key2, -10e8) + f.get(transmission_key, -10e8)

                if overall_score > scores[i, k]:
                    scores[i, k] = overall_score
                    parents[i,k] = j
    
    best_score = -np.inf
    best_parent = None
    
    for i in range(len(states)):
        t_feature = "transition:" + states[i] + "+" + "stop"
        total = scores[len(x)-1, i] + f.get(t_feature, -10**8)
        
        if total > best_score:
            best_score = total
            best_parent = i
    
    best_state = [states[best_parent]]
    prev_parent = best_parent
    for i in range(len(x)-1, 0, -1):
        prev_parent = parents[i, prev_parent]
        output = states[prev_parent]
        best_state = [output] + best_state
    return best_state

In [7]:
def get_prediction(file_dir,resulting_dict):
    train_path = os.path.join(file_dir, "train")
    input_path = os.path.join(file_dir, "dev.in")
    
    test_set = open(input_path, 'r', encoding="utf-8")
    lines = test_set.readlines()
    
    sequences = []
    sequence = []
    for line in lines:
        if line == '\n':
            sequences.append(sequence)
            sequence = []
            continue

        line = line.replace('\n', '')
        sequence.append(line)

    out_path = os.path.join(file_dir, "dev.p5.SP.out")
    out_file = open(out_path, "w", encoding="utf-8")
    
    for x in sequences:
        predicted = viterbi_algo(x, states, resulting_dict)
        for i in range(len(x)):
            out_file.write(x[i] + ' ' + predicted[i] + '\n')
        out_file.write('\n')
    out_file.close()
        
    
    print("Complete prediction for dataset")

file_dir = "full"
get_prediction(file_dir,resulting_dict)

Complete prediction for dataset


In [8]:
def eval(pred,gold):
    f_pred = open(pred,encoding = 'utf-8')
    f_gold = open(gold,encoding = 'utf-8')
    data_pred = f_pred.readlines()
    data_gold = f_gold.readlines()
    gold_tags = list()
    pred_tags = list()
    for sentence in range(len(data_pred)):
        words_pred = data_pred[sentence].strip().split(' ')
        words_gold = data_gold[sentence].strip().split(' ')  
        if len(words_gold)==1:
            continue
        gold_tags.append(words_gold[2])
        pred_tags.append(words_pred[2])
    return gold_tags,pred_tags

processed 2097 tokens with 235 phrases; found: 236 phrases; correct: 163.
accuracy:  72.21%; (non-O)
accuracy:  93.75%; precision:  69.07%; recall:  69.36%; FB1:  69.21
              art: precision:   0.00%; recall:   0.00%; FB1:   0.00  3
              eve: precision:   0.00%; recall:   0.00%; FB1:   0.00  1
              geo: precision:  85.88%; recall:  75.26%; FB1:  80.22  85
              gpe: precision:  68.00%; recall:  80.95%; FB1:  73.91  25
              nat: precision:   0.00%; recall:   0.00%; FB1:   0.00  2
              org: precision:  45.71%; recall:  48.48%; FB1:  47.06  35
              per: precision:  71.88%; recall:  76.67%; FB1:  74.19  32
              tim: precision:  64.15%; recall:  65.38%; FB1:  64.76  53
((69.0677966101695, 69.36170212765957, 69.2144373673036), 0)


In [9]:
def structured_perceptron(file_dir,resulting_dict,epoch = 15, lr= 0.01):
    train_path = os.path.join(file_dir, "train")
    input_path = os.path.join(file_dir, "dev.in")
    test_set = open(train_path, 'r', encoding="utf-8")
    lines = test_set.readlines()
    copy_dict = resulting_dict.copy()
    for n in range(epoch):
        sequences = [] #ls of sequences
        word_sequence = []
        correct_state = []
        for line in lines:
            temp = []
            if line == '\n':
                sequences.append([word_sequence,correct_state])
                word_sequence = []
                correct_state = []
                continue
            line = line.strip().split(" ")
            word_sequence.append(line[0]+" "+line[1])
            correct_state.append(line[2])
        for x in sequences:
            predicted = viterbi_algo(x[0], states, copy_dict)
            x.append(predicted)
        for word_no in range(len(sequences)):
            sentence = sequences[word_no][0]
            word_only = []
            pos_only = []
            for word_pos in sentence:
                word,pos = word_pos.split(" ")
                word_only.append(word)
                pos_only.append(pos)
            correct_states = sequences[word_no][1]
            predicted_states = sequences[word_no][2]
            #for each prediction, check if its correct
            for i in range(1,len(word_only)):
                if correct_states[i] != predicted_states[i]:
#                     print(correct_states[i],predicted_states[i])
                    copy_dict["emission:"+ predicted_states[i] +"+"+ word_only[i]] -= 1* lr
                    copy_dict["emission:"+ predicted_states[i] +"+"+ pos_only[i]] -= 1*lr
                    copy_dict["transition:"+ predicted_states[i-1] +"+" + predicted_states[i]] -= 1*lr
                    copy_dict["emission:"+ correct_states[i] +"+"+ word_only[i]] += 1*lr
                    copy_dict["emission:"+ correct_states[i] +"+"+ pos_only[i]] += 1*lr
                    copy_dict["transition:"+ correct_states[i-1] +"+"+ correct_states[i]] += 1*lr

    return copy_dict

In [10]:
# Finding the best dictionary based on the number of epoch
temp = emission_dict
temp_res = resulting_dict
best_acc = -999999999
best_dict = {}
for i in range(5,20):
    print("starting")
    print(i) 
    resulting_dict = get_resulting_dict(train_path, emission_dict)
    if temp!= emission_dict:
        print('changed emission dict')
    if temp_res != resulting_dict:
        print('changed resulting dict original')
    resulting_dict = structured_perceptron(file_dir, resulting_dict,epoch = i)

    get_prediction(file_dir, resulting_dict)
    true_path = "full/dev.out"
    pred_path = "full/dev.p5.SP.out"

    g_tags, p_tags = eval(true_path, pred_path)
    res, acc = evaluate(g_tags,p_tags,verbose=True,acc = True)
    print(res)
    if acc > best_acc:
        print("new best found")
        best_acc = acc
        best_dict = resulting_dict

starting
5
Complete prediction for dataset
processed 2097 tokens with 193 phrases; found: 236 phrases; correct: 143.
accuracy:  76.73%; (non-O)
accuracy:  92.94%; precision:  60.59%; recall:  74.09%; FB1:  66.67
              art: precision:   0.00%; recall:   0.00%; FB1:   0.00  3
              eve: precision:   0.00%; recall:   0.00%; FB1:   0.00  1
              geo: precision:  65.88%; recall:  83.58%; FB1:  73.68  85
              gpe: precision:  68.00%; recall:  70.83%; FB1:  69.39  25
              nat: precision:   0.00%; recall:   0.00%; FB1:   0.00  2
              org: precision:  42.86%; recall:  57.69%; FB1:  49.18  35
              per: precision:  62.50%; recall:  80.00%; FB1:  70.18  32
              tim: precision:  66.04%; recall:  71.43%; FB1:  68.63  53
(60.59322033898306, 74.09326424870466, 66.66666666666667)
new best found
starting
6
Complete prediction for dataset
processed 2097 tokens with 195 phrases; found: 236 phrases; correct: 143.
accuracy:  76.00%; (non-O

Complete prediction for dataset
processed 2097 tokens with 184 phrases; found: 236 phrases; correct: 138.
accuracy:  79.12%; (non-O)
accuracy:  92.56%; precision:  58.47%; recall:  75.00%; FB1:  65.71
              art: precision:   0.00%; recall:   0.00%; FB1:   0.00  3
              eve: precision:   0.00%; recall:   0.00%; FB1:   0.00  1
              geo: precision:  68.24%; recall:  80.56%; FB1:  73.89  85
              gpe: precision:  68.00%; recall:  80.95%; FB1:  73.91  25
              nat: precision:   0.00%; recall:   0.00%; FB1:   0.00  2
              org: precision:  40.00%; recall:  58.33%; FB1:  47.46  35
              per: precision:  46.88%; recall:  71.43%; FB1:  56.60  32
              tim: precision:  64.15%; recall:  75.56%; FB1:  69.39  53
(58.47457627118644, 75.0, 65.71428571428571)
starting
16
Complete prediction for dataset
processed 2097 tokens with 189 phrases; found: 236 phrases; correct: 140.
accuracy:  78.43%; (non-O)
accuracy:  92.47%; precision:  59.32

In [12]:
# Using the best dictionary to evaluate the dataset
get_prediction(file_dir,best_dict)
true_path = "full/dev.out"
pred_path = "full/dev.p5.SP.out"

g_tags, p_tags = eval(true_path, pred_path)
print(evaluate(g_tags,p_tags,verbose=True))

Complete prediction for dataset
processed 2097 tokens with 183 phrases; found: 236 phrases; correct: 138.
accuracy:  80.33%; (non-O)
accuracy:  92.66%; precision:  58.47%; recall:  75.41%; FB1:  65.87
              art: precision:   0.00%; recall:   0.00%; FB1:   0.00  3
              eve: precision:   0.00%; recall:   0.00%; FB1:   0.00  1
              geo: precision:  65.88%; recall:  82.35%; FB1:  73.20  85
              gpe: precision:  72.00%; recall:  75.00%; FB1:  73.47  25
              nat: precision:   0.00%; recall:   0.00%; FB1:   0.00  2
              org: precision:  40.00%; recall:  58.33%; FB1:  47.46  35
              per: precision:  50.00%; recall:  72.73%; FB1:  59.26  32
              tim: precision:  64.15%; recall:  77.27%; FB1:  70.10  53
((58.47457627118644, 75.40983606557377, 65.8711217183771), 0)
