# 50.007 Machine Learning
## Group Project

## Part 1
Report the precision, recall and F scores of such a baseline system for each dataset:
- EN dataset
  - Entity scores:
    - Entity  precision: 0.5348
    - Entity  recall: 0.7656
    - Entity  F: 0.6297
  - Sentiment scores:
    - Sentiment  precision: 0.3902
    - Sentiment  recall: 0.5586
    - Sentiment  F: 0.4595
- FR dataset
  - Entity scores:
    - Entity  precision: 0.1670
    - Entity  recall: 0.7815
    - Entity  F: 0.2751
  - Sentiment scores:
    - Sentiment  precision: 0.0709
    - Sentiment  recall: 0.3319
    - Sentiment  F: 0.1169

In [1]:
# import statements
import numpy as np
import math
import copy

In [2]:
# function that takes in the filename for the training data
# returns word_sequences, tag_sequences
# word_sequences is a list in the form: [ [x_1_1, x_1_2, ...], [x_2_1, x_2_2, ...], ... [x_m_1, x_m_2, ... ] ]
# tag_sequences is a list in the form: [ [y_1_1, y_1_2, ...], [y_2_1, y_2_2, ...], ... [y_m_1, y_m_2, ... ] ]
def read_training_data(training_filename):
    training_file = open(training_filename, "r", encoding="utf-8")
    
    word_sequences = []
    tag_sequences = []
    
    current_word_sequence = []
    current_tag_sequence = []
    
    for line in training_file:
        training_word_and_tag = line.strip().split(" ")
        
        # add the current word and tag to the current word sequence and current tag sequence
        if (len(training_word_and_tag) == 2):
            current_word_sequence += [training_word_and_tag[0]]
            current_tag_sequence += [training_word_and_tag[1]]
        
        # if the sentence ended (empty line), add the previous word sequence and tag sequence to the lists of
        # word sequences and tag sequences respectively.
        else:
            word_sequences += [copy.deepcopy(current_word_sequence)]
            tag_sequences += [copy.deepcopy(current_tag_sequence)]
            
            current_word_sequence = []
            current_tag_sequence = []
            
    # account for the last word sequence
    if (len(current_word_sequence) != 0):
        word_sequences += [copy.deepcopy(current_word_sequence)]
        tag_sequences += [copy.deepcopy(current_tag_sequence)]
        
    training_file.close()

    return word_sequences, tag_sequences

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in the filename for the test data
# returns the test data as a list in the form: [ [x1_1, x1_2, ...], [x2_1, x2_2, ...] ]
def read_test_data(test_filename):
    test_file = open(test_filename, "r", encoding="utf-8")
    
    test_word_sequences = []
    
    current_test_word_sequence = []

    for line in test_file:
        test_word = line.strip()
        
        # add current word to the current word sequence
        if (len(test_word) != 0):
            current_test_word_sequence += [test_word]
            
        # if sentence ended (len(test_word) == 0)
        else:
            test_word_sequences += [copy.deepcopy(current_test_word_sequence)]
            current_test_word_sequence = []
        
    # account for the last word sequence
    if (len(current_test_word_sequence) != 0):
        test_word_sequences += [copy.deepcopy(current_test_word_sequence)]

    test_file.close()

    return test_word_sequences

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in the filename of the training data and optional k value
# returns emission_dict, tags_list
# emission_dict[x][y] gives the value e(x|y)
def create_emission_dict_tags_list(training_filename, k=1):
    # emission_dict[x][y] gives the value e(x|y)
    emission_dict = {}
    
    # count_y_dict[y] gives the total number of words tagged as y
    count_y_dict = {} 

    # count_x_tagged_as_y_dict[x_i][y_j] gives the number of times each observed variable x_i
    # was tagged as state y_j in the training data
    count_x_tagged_as_y_dict = {}

    # read training data
    word_sequences, tags_sequences = read_training_data(training_filename)

    # fill up count_y_dict and count_x_tagged_as_y_dict
    for sequence_index in range(0, len(word_sequences)):
        for word_index in range(0, len(word_sequences[sequence_index])):
            x = word_sequences[sequence_index][word_index]
            y = tags_sequences[sequence_index][word_index]

            # account for creating dictionary entry for the first time
            if not(y in count_y_dict.keys()):
                count_y_dict[y] = 0

            count_y_dict[y] += 1

            # account for creating dictionary entry for the first time
            if not(x in count_x_tagged_as_y_dict.keys()): 
                count_x_tagged_as_y_dict[x] = {}
                
            if not(y in count_x_tagged_as_y_dict[x].keys()):
                count_x_tagged_as_y_dict[x][y] = 0

            count_x_tagged_as_y_dict[x][y] += 1
        
    tags_list = count_y_dict.keys()

    # fill up emission_dict
    for sequence_index in range(0, len(word_sequences)):
        for word_index in range(0, len(word_sequences[sequence_index])):
            x = word_sequences[sequence_index][word_index]
            y = tags_sequences[sequence_index][word_index]
        
            # account for creating dictionary entry for the first time
            if not(x in emission_dict.keys()):
                emission_dict[x] = {}

            emission_dict[x][y] = count_x_tagged_as_y_dict[x][y] / (count_y_dict[y] + k)
            
            # add entry for "START" and "END"
            emission_dict[x]["START"] = 0
            emission_dict[x]["END"] = 0

    # add entry for #UNK#
    emission_dict["#UNK#"] = {}
    
    for tag in tags_list: # iterate over all the tags used in training
        emission_dict["#UNK#"][tag] = k / (count_y_dict[tag] + k)

    return emission_dict, tags_list

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in observed variable x and hidden state y
# returns emission parameter e(x|y)
def emission(emission_dict, tags_list, x, y, k=1):
    # if tag was not in training data
    if (not(y in tags_list)):
        print("This tag was not in the training data")
        result = 0

    # else if word was not in training data
    elif (not(x in emission_dict.keys())): # treat x as "#UNK#"
        result = emission_dict["#UNK#"][y] # result = k / (count_y_dict[y] + k)
        
    # else if word is was in training data
    else:
        # if x was never tagged as y before during training, the probability is 0
        if not(y in emission_dict[x].keys()):
            emission_dict[x][y] = 0

        result = emission_dict[x][y] 

    return result

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in a filename and a list of results in the form: [ [x1, tag1], [x2, tag2], ...]
# writes the results to a file specified by the filename
def write_result(result_filename, word_sequences, tag_sequences):
    result_file = open(result_filename, "w" ,encoding="utf-8")
    
    for sequence_index in range(0, len(word_sequences)):
        for word_index in range(0, len(word_sequences[sequence_index])):
            result_file.write(word_sequences[sequence_index][word_index] + " " + tag_sequences[sequence_index][word_index] + "\n")
            
        result_file.write("\n")

    result_file.close()

# -----------------------------------------------------------------------------------------------------------------------------
    
# function that takes in the filenames for the training data and test data
# produces the tag y* = arg_max_y e(x|y) for each word in the test data
# writes the results to a file specified by the filename
# returns the results as a list in the form: [ [x1, y*1], [x2, y*2], ... ]
def simple_sentiment_analysis(training_filename, test_filename, result_filename, k=1):    
    # initialise emission_dict
    emission_dict, tags_list = create_emission_dict_tags_list(training_filename, k)
    
    test_word_sequences = read_test_data(test_filename)
    
    prediction_tag_sequences = []

    for test_word_sequence in test_word_sequences:
        current_prediction_tag_sequence = []
        
        for test_word in test_word_sequence:
            # find the tag that gives the highest value for e(test_variable | tag)
            predicted_tag = ""
            highest_emission_value = 0

            for tag in tags_list:
                current_emission_value = emission(emission_dict, tags_list, test_word, tag)

                if current_emission_value > highest_emission_value:
                    highest_emission_value = current_emission_value
                    predicted_tag = tag

            current_prediction_tag_sequence += [predicted_tag]
            
        # at the end of the sentence, add the current prediction tag sequence to the lise prediction_tag_sequences
        prediction_tag_sequences += [copy.deepcopy(current_prediction_tag_sequence)]
        current_prediction_tag_sequence = []

    write_result(result_filename, test_word_sequences, prediction_tag_sequences)

    return test_word_sequences, prediction_tag_sequences, emission_dict

In [3]:
# perform prediction for the EN dataset
en_results = simple_sentiment_analysis("EN/train", "EN/dev.in", "EN/dev.p1.out")

# evaluate prediction for the EN dataset
!python "evalResult.py" "EN/dev.out" "EN/dev.p1.out"


#Entity in gold data: 802
#Entity in prediction: 1148

#Correct Entity : 614
Entity  precision: 0.5348
Entity  recall: 0.7656
Entity  F: 0.6297

#Correct Sentiment : 448
Sentiment  precision: 0.3902
Sentiment  recall: 0.5586
Sentiment  F: 0.4595


In [4]:
# perform prediction for the FR dataset
fr_results = simple_sentiment_analysis("FR/train", "FR/dev.in", "FR/dev.p1.out")

# evaluate prediction for the FR dataset
!python "evalResult.py" "FR/dev.out" "FR/dev.p1.out"


#Entity in gold data: 238
#Entity in prediction: 1114

#Correct Entity : 186
Entity  precision: 0.1670
Entity  recall: 0.7815
Entity  F: 0.2751

#Correct Sentiment : 79
Sentiment  precision: 0.0709
Sentiment  recall: 0.3319
Sentiment  F: 0.1169


In [5]:
# part 1 test case
# create training data for test case
test_case_train_file = open("p1_test_train", "w")
test_case_train_file.write("word1 tag1\n")
test_case_train_file.write("word1 tag1\n")
test_case_train_file.write("word1 tag1\n")
test_case_train_file.write("word1 tag2\n")
test_case_train_file.write("\n")
test_case_train_file.write("word2 tag2\n")
test_case_train_file.write("word2 tag2\n")
test_case_train_file.write("word2 tag2\n")
test_case_train_file.write("\n")
test_case_train_file.write("word3 tag3")
test_case_train_file.close()

# create test data for test case
test_case_test_file = open("p1_test_in", "w")
test_case_test_file.write("word1\n")
test_case_test_file.write("word2\n")
test_case_test_file.write("word3\n")
test_case_test_file.write("unknown_word")
test_case_test_file.close()

# create expected output for test case
test_case_expected_file = open("p1_test_out", "w")
test_case_expected_file.write("word1 tag1\n")
test_case_expected_file.write("word2 tag2\n")
test_case_expected_file.write("word3 tag3\n")
test_case_expected_file.write("unknown_word tag3")
test_case_expected_file.close()

# perform the test
test_word_sequences, prediction_tag_sequences, emission_dict = simple_sentiment_analysis("p1_test_train", "p1_test_in", "p1_test_prediction")
test_word_sequences, expected_tag_sequences= read_training_data("p1_test_out")

# show results for the test
print("\nTest case emission_dict:")
print(emission_dict)
print("")

test_case_passed = True

for sequence_index in range(0, len(test_word_sequences)):
    for tag_index in range(0, len(test_word_sequences[sequence_index])):
        if prediction_tag_sequences[sequence_index][tag_index] != expected_tag_sequences[sequence_index][tag_index]:
            test_case_passed = False
            
            print("Test case failed.")
            print(f"Word: {test_case_prediction[i][0]}")
            print(f"Tag: {test_case_prediction[i][1]}")
            print(f"Expected tag: {test_case_expected[i][1]}\n")

print(f"Test case passed: {test_case_passed}")


Test case emission_dict:
{'word1': {'tag1': 0.75, 'START': 0, 'END': 0, 'tag2': 0.2, 'tag3': 0}, 'word2': {'tag2': 0.6, 'START': 0, 'END': 0, 'tag1': 0, 'tag3': 0}, 'word3': {'tag3': 0.5, 'START': 0, 'END': 0, 'tag1': 0, 'tag2': 0}, '#UNK#': {'tag1': 0.25, 'tag2': 0.2, 'tag3': 0.5}}

Test case passed: True


## Part 2

In [29]:
# takes in list of tag sequences in the form [ [y1_1, y1_2, ... ], [y2_1, y2_2, ...], ... ]
# outputs transition_dict, start_stop_transition_dict
def create_transition_dict(input_list):
    # Create transition dict
    # ========================
    transition_dict = {}
    
    # Get list of tags
    tags = set([tag for sentence in input_list for tag in sentence])

    # Update probability for each instance of tag1 > tag2 inside transition_dict
    for tag1 in tags:
        for tag2 in tags:
            count = 0
            total = 0
            for sentence in input_list:
                total += len(sentence) - 1
                for i in range(len(sentence) - 1):
                    if sentence[i] == tag1 and sentence[i+1] == tag2:
                        count += 1
            if count > 0:
                transition_dict[(tag1, tag2)] = count / total
    
    # Create initial probability dict
    # ========================
    start_tag_count_dict = {}
    stop_tag_count_dict = {}
    
    start_stop_transition_dict = {}
    
    # Get num of starting tags that appear
    for sentence in input_list:
        if sentence[0] not in start_tag_count_dict:
            start_tag_count_dict[sentence[0]] = 1
        else:
            start_tag_count_dict[sentence[0]] += 1
        
        if sentence[-1] not in stop_tag_count_dict:
            stop_tag_count_dict[sentence[-1]] = 1
        else:
            stop_tag_count_dict[sentence[-1]] += 1
    
    # Fill in initial prob dict with num of starting/ending tags divided by total sentence num
    for tag in start_tag_count_dict:
        start_stop_transition_dict[('START',tag)] = start_tag_count_dict[tag] / len(input_list)
    for tag in stop_tag_count_dict:
        start_stop_transition_dict[(tag,'STOP')] = stop_tag_count_dict[tag] / len(input_list)
    
    return transition_dict, start_stop_transition_dict

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in observed variable x and hidden state y
# returns transition parameter q(yi|yi-1)
def transition(transition_dict, yi_minus_1, yi):
    if (yi_minus_1, yi) not in transition_dict.keys():
        transition_dict[(yi_minus_1, yi)] = 0

    result = transition_dict[(yi_minus_1, yi)]

    return result
    
# -----------------------------------------------------------------------------------------------------------------------------
    
# emission_dict, tags_list = create_emission_dict_tags_list(training_filename, k)
# transition_dict, start_stop_transition_dict = create_transition_dict(training_filename)

# creates the table of pi values
def viterby_first_order(word_sequence, tags_list, emission_dict, transition_dict, start_stop_transition_dict):
    pi_dict = {}
    
    # =============================================================================================================================
    # initialization
    pi_dict[0] = {}
    
    for tag in tags_list:
        pi_dict[0][tag] = float('-inf')
        
    pi_dict[0]["START"] = math.log(1)
    
    for word_index in range(1, len(word_sequence)+1): # index for each word in each sentence
        for tag in tags_list: # each tag
            if word_index not in pi_dict.keys():
                pi_dict[word_index] = {}

            pi_dict[word_index][tag] = float('-inf') # initialize pi(j, u) = 0 for all j and u

    x_1 = word_sequence[0]   
    for v in tags_list:
        max_pi_val = float('-inf')

        for u in tags_list:
            pi = pi_dict[0][u]
            trans = transition(start_stop_transition_dict, u, v)
            emi = emission(emission_dict, tags_list, x_1, v)

            if trans != 0:
                log_trans = math.log(trans)
            else:
                log_trans = float('-inf')

            if emi != 0:
                log_emi = math.log(emi)
            else:
                log_emi = float('-inf')

            current_pi_val = pi + log_trans + log_emi

            # save the value that maximises
            if (current_pi_val > max_pi_val):
                max_pi_val = current_pi_val

        pi_dict[1][v] = max_pi_val
        
    # =============================================================================================================================
    # intermediate steps
    # for each observed variable
    for j in range(1, len(word_sequence)):
        x_j_plus_1 = word_sequence[j] # refers to the jth word (to calculate emission)
        
        # for each hidden state v
        for v in tags_list:
            
            # pi(j+1, v) = max over all u { pi(j,u) * transition(u, v) * emissision(x_j_plus_1, v) }
            max_pi_val = float('-inf')
            
            for u in tags_list:
                pi = pi_dict[j][u]
                trans = transition(transition_dict, u, v)
                emi = emission(emission_dict, tags_list, x_j_plus_1, v)

                if trans != 0:
                    log_trans = math.log(trans)
                else:
                    log_trans = float('-inf')
                
                if emi != 0:
                    log_emi = math.log(emi)
                else:
                    log_emi = float('-inf')
                
                current_pi_val = pi + log_trans + log_emi

                # save the value that maximises
                if (current_pi_val > max_pi_val):
                    max_pi_val = current_pi_val
        
            pi_dict[j+1][v] = max_pi_val
    
    # =============================================================================================================================
    # final step
    max_pi_val = float('-inf')
    
    # for each hidden state u
    for u in tags_list:
        pi = pi_dict[len(word_sequence)][u]
        trans = transition(transition_dict, u, "STOP")
        
        if trans != 0:
            log_trans = math.log(trans)
        else:
            log_trans = float('-inf')

        current_pi_val = pi + log_trans
        
        # save the value that maximises
        if (current_pi_val > max_pi_val):
            max_pi_val = current_pi_val
        
    pi_dict[len(word_sequence) + 1] = {}
    pi_dict[len(word_sequence) + 1]["STOP"]  = max_pi_val
    
    # =============================================================================================================================
    # BACKTRACKING
    decoding_list = []
    # check final layer argmax
    argmax = float('-inf')
    currentmax = 0
    argmax_index = 0
    
    for u in tags_list:
        pi = pi_dict[len(pi_dict)-2][u]
        trans = transition(transition_dict, u, "STOP")
        
        currentmax = pi + log_trans
        
        if currentmax > argmax:
            argmax = currentmax
            argmax_index = u
        
    decoding_list.append(argmax_index)
    
    
    # Backtrack rest of pi_dict
    for j in range(len(pi_dict)-3, 0, -1):
        
        argmax = float('-inf')
        currentmax = 0
        argmax_index = 0
    
        for u in tags_list:
    
            pi = pi_dict[j][u]
            trans = transition(transition_dict, u, j+1)

            currentmax = pi + log_trans

            if currentmax > argmax:
                argmax = currentmax
                argmax_index = u
        
        decoding_list.append(argmax_index)
        
    decoding_list = decoding_list[::-1]
            
    return pi_dict,decoding_list


# print(transition_dict)
# print(emission_dict)
word_sequences = read_test_data("EN/dev.in")
word_sequence = word_sequences[0]

word_seq, tag_seq = read_training_data("EN/train")

training_filename = "EN/train"
emission_dict, tags_list = create_emission_dict_tags_list(training_filename, 1)
transition_dict, start_stop_transition_dict = create_transition_dict(tag_seq)

viterby_first_order(word_sequences[0], tags_list, emission_dict, transition_dict, start_stop_transition_dict)

({0: {'O': -inf,
   'B-INTJ': -inf,
   'B-PP': -inf,
   'B-NP': -inf,
   'I-NP': -inf,
   'B-VP': -inf,
   'B-PRT': -inf,
   'I-VP': -inf,
   'B-ADJP': -inf,
   'B-SBAR': -inf,
   'B-ADVP': -inf,
   'I-INTJ': -inf,
   'B-CONJP': -inf,
   'I-CONJP': -inf,
   'I-ADVP': -inf,
   'I-ADJP': -inf,
   'I-SBAR': -inf,
   'I-PP': -inf,
   'START': 0.0},
  1: {'O': -inf,
   'B-INTJ': -inf,
   'B-PP': -inf,
   'B-NP': -inf,
   'I-NP': -inf,
   'B-VP': -inf,
   'B-PRT': -inf,
   'I-VP': -inf,
   'B-ADJP': -inf,
   'B-SBAR': -inf,
   'B-ADVP': -inf,
   'I-INTJ': -inf,
   'B-CONJP': -inf,
   'I-CONJP': -inf,
   'I-ADVP': -inf,
   'I-ADJP': -inf,
   'I-SBAR': -inf,
   'I-PP': -inf},
  2: {'O': -inf,
   'B-INTJ': -inf,
   'B-PP': -inf,
   'B-NP': -inf,
   'I-NP': -inf,
   'B-VP': -inf,
   'B-PRT': -inf,
   'I-VP': -inf,
   'B-ADJP': -inf,
   'B-SBAR': -inf,
   'B-ADVP': -inf,
   'I-INTJ': -inf,
   'B-CONJP': -inf,
   'I-CONJP': -inf,
   'I-ADVP': -inf,
   'I-ADJP': -inf,
   'I-SBAR': -inf,
   'I-PP': 

In [30]:
start_stop_transition_dict

{('START', 'O'): 0.42105263157894735,
 ('START', 'B-VP'): 0.11070780399274047,
 ('START', 'B-NP'): 0.3466424682395644,
 ('START', 'B-INTJ'): 0.0544464609800363,
 ('START', 'B-ADVP'): 0.038112522686025406,
 ('START', 'B-PP'): 0.014519056261343012,
 ('START', 'B-ADJP'): 0.009074410163339383,
 ('START', 'B-SBAR'): 0.003629764065335753,
 ('START', 'B-CONJP'): 0.0018148820326678765,
 ('O', 'STOP'): 0.6969147005444646,
 ('B-INTJ', 'STOP'): 0.08166969147005444,
 ('I-NP', 'STOP'): 0.06896551724137931,
 ('I-INTJ', 'STOP'): 0.043557168784029036,
 ('B-NP', 'STOP'): 0.06170598911070781,
 ('B-ADVP', 'STOP'): 0.007259528130671506,
 ('I-ADVP', 'STOP'): 0.003629764065335753,
 ('I-VP', 'STOP'): 0.010889292196007259,
 ('B-ADJP', 'STOP'): 0.007259528130671506,
 ('B-VP', 'STOP'): 0.0054446460980036296,
 ('I-ADJP', 'STOP'): 0.0054446460980036296,
 ('B-PRT', 'STOP'): 0.003629764065335753,
 ('B-PP', 'STOP'): 0.003629764065335753,
 ('O', 'O'): 0,
 ('B-INTJ', 'O'): 0,
 ('B-PP', 'O'): 0,
 ('B-NP', 'O'): 0,
 ('I

In [13]:
tag_sequences = []

for i in range(len(word_sequences)):
    most_likely_tags = viterby_first_order(word_sequences[i], tags_list, emission_dict, transition_dict, start_stop_transition_dict)
    tag_sequences.append(most_likely_tags)

print(tag_sequences)    
    
write_result("EN/dev.p2.out",word_sequences, tag_sequences)

# evaluate prediction for the FR dataset
!python "evalResult.py" "EN/dev.out" "EN/dev.p2.out"


#Entity in gold data: 802
#Entity in prediction: 940

#Correct Entity : 343
Entity  precision: 0.3649
Entity  recall: 0.4277
Entity  F: 0.3938

#Correct Sentiment : 27
Sentiment  precision: 0.0287
Sentiment  recall: 0.0337
Sentiment  F: 0.0310


In [8]:
def write_result_viterby(word_sequences, tag_sequences, result_filename):
    
    test_data = read_test_data(test_filename)
    
    with open(result_filename, "w" ,encoding="utf-8") as fp:
        
        for word,tag in zip(test_data, results):
            # account for empty lines
            if(len(word) == 0):
                fp.write("\n")
            else:
                fp.write(word[0] + " " + tag + "\n")
    fp.close()
    
# -----------------------------------------------------------------------------------------------------------------------------


In [9]:
def viterby_backtracking(test_filename, result_filename):
    global transition_dict
    global pi_dict
    global decoding_list
    
    

    tags_list_w_start_stop = list(tags_list)
    
    # check final layer argmax
    argmax = float('-inf')
    currentmax = 0
    argmax_index = 0
    
    for u in tags_list_w_start_stop:
        pi = pi_dict[len(pi_dict)-1][u]
        trans = transition(u, "STOP")
        
        if trans != 0:
            trans = math.log(trans)
        
        if pi == 0:
            pi = float('-inf')
        
        currentmax = pi + trans
        
        if currentmax > argmax:
            argmax = currentmax
            argmax_index = u
        
    decoding_list.append(argmax_index)
    
    
    # Backtrack rest of pi_dict
    for j in range(len(pi_dict)-2, 0, -1):
        
        argmax = float('-inf')
        currentmax = 1
        argmax_index = 0
    
        for u in tags_list_w_start_stop:
    
            pi = pi_dict[j][u]
            trans = transition(u, decoding_list[-1])
            
            if trans != 0:
                trans = math.log(trans)
            if pi == 0:
                pi = float('-inf')

            currentmax = pi + trans

            if currentmax > argmax:
                argmax = currentmax
                argmax_index = u
        
        decoding_list.append(argmax_index)
        
    decoding_list = decoding_list[::-1]
    
    write_result_viterby(test_filename, result_filename, decoding_list)
            
    return decoding_list   

In [10]:
# global variables
emission_dict = {} # emission_dict[x][y] gives e(x|y)
tags_list = []
transition_dict = {}
pi_dict = {}
decoding_list = []

In [11]:
# perform prediction for the EN dataset
en_results = simple_sentiment_analysis("EN/train", "EN/dev.in", "EN/dev.p1.out")

# evaluate prediction for the EN dataset
!python "evalResult.py" "EN/dev.out" "EN/dev.p1.out"


#Entity in gold data: 802
#Entity in prediction: 1148

#Correct Entity : 614
Entity  precision: 0.5348
Entity  recall: 0.7656
Entity  F: 0.6297

#Correct Sentiment : 448
Sentiment  precision: 0.3902
Sentiment  recall: 0.5586
Sentiment  F: 0.4595


In [12]:
viterby_first_order("EN/train", "EN/dev.in")
viterby_backtracking("EN/dev.in", "EN/dev.p2.out")

TypeError: viterby_first_order() missing 3 required positional arguments: 'emission_dict', 'transition_dict', and 'start_stop_transition_dict'

In [None]:
!python "evalResult.py" "EN/dev.out" "EN/dev.p2.out"

===================================================================

In [None]:
# global variables
emission_dict = {} # emission_dict[x][y] gives e(x|y)
tags_list = []
transition_dict = {}
pi_dict = {}
decoding_list = []

In [None]:
# # perform prediction for the FR dataset
fr_results = simple_sentiment_analysis("FR/train", "FR/dev.in", "FR/dev.p1.out")

# # evaluate prediction for the FR dataset
!python "evalResult.py" "FR/dev.out" "FR/dev.p1.out"

In [None]:
viterby_first_order("FR/train", "FR/dev.in")
viterby_backtracking("FR/dev.in", "FR/dev.p2.out")

In [None]:
!python "evalResult.py" "FR/dev.out" "FR/dev.p2.out"