## Setting up the data

In [1]:
# Import libraries
import os
import numpy as np
from collections import Counter

# Set OS-independent paths, relative to current directory
en_train_path = os.path.join("dataset", "EN", "train")
en_dev_in_path = os.path.join("dataset", "EN", "dev.in")
en_dev_out_path = os.path.join("dataset", "EN", "dev.out")
en_dev_p1_out_path = os.path.join("dataset", "EN", "dev.p1.out")
en_dev_p2_out_path = os.path.join("dataset", "EN", "dev.p2.out")
en_dev_p3_out_path = os.path.join("dataset", "EN", "dev.p3.out")
en_dev_p4_out_path = os.path.join("dataset", "EN", "dev.p4.out")

fr_train_path = os.path.join("dataset", "FR", "train")
fr_dev_in_path = os.path.join("dataset", "FR", "dev.in")
fr_dev_out_path = os.path.join("dataset", "FR", "dev.out")
fr_dev_p1_out_path = os.path.join("dataset", "FR", "dev.p1.out")
fr_dev_p2_out_path = os.path.join("dataset", "FR", "dev.p2.out")
fr_dev_p3_out_path = os.path.join("dataset", "FR", "dev.p3.out")
fr_dev_p4_out_path = os.path.join("dataset", "FR", "dev.p4.out")

# Define constant variables
N = 7
labels = {"START": 0,
          "O": 1,
          "B-positive": 2,
          "I-positive": 3,
          "B-neutral": 4,
          "I-neutral": 5,
          "B-negative": 6,
          "I-negative": 7,
          "END": 8}
labels_list = ["START", "O", "B-positive", "I-positive", "B-neutral", "I-neutral", "B-negative", "I-negative", "END"]



In [2]:
# Create labels dictionary and list (for EN)
def create_labels_array_dict(filepath):

    output_labels_dict = {"START": 0}
    
    output_labels_array = ["START"]

    counter = 1

    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            if len(line.strip().rsplit(" ", 1)) == 2: 
                _, label = line.strip().rsplit(" ", 1)
                
                # Reference to check if key exist in dictionary: https://www.geeksforgeeks.org/python-check-whether-given-key-already-exists-in-a-dictionary/
                if label not in output_labels_dict.keys():
                    output_labels_dict[label] = counter
                    output_labels_array.append(label)
                    counter += 1
            else:
                continue

    output_labels_dict["END"] = counter
    output_labels_array.append("END")

    # counter - 1 to replace global variable N. 
    return output_labels_array, output_labels_dict, counter - 1


print(create_labels_array_dict(en_train_path))
print(create_labels_array_dict(fr_train_path))


(['START', 'O', 'B-INTJ', 'B-PP', 'B-NP', 'I-NP', 'B-VP', 'B-PRT', 'I-VP', 'B-ADJP', 'B-SBAR', 'B-ADVP', 'I-INTJ', 'B-CONJP', 'I-CONJP', 'I-ADVP', 'I-ADJP', 'I-SBAR', 'I-PP', 'END'], {'START': 0, 'O': 1, 'B-INTJ': 2, 'B-PP': 3, 'B-NP': 4, 'I-NP': 5, 'B-VP': 6, 'B-PRT': 7, 'I-VP': 8, 'B-ADJP': 9, 'B-SBAR': 10, 'B-ADVP': 11, 'I-INTJ': 12, 'B-CONJP': 13, 'I-CONJP': 14, 'I-ADVP': 15, 'I-ADJP': 16, 'I-SBAR': 17, 'I-PP': 18, 'END': 19}, 18)
(['START', 'O', 'B-positive', 'I-positive', 'B-negative', 'B-neutral', 'I-negative', 'I-neutral', 'END'], {'START': 0, 'O': 1, 'B-positive': 2, 'I-positive': 3, 'B-negative': 4, 'B-neutral': 5, 'I-negative': 6, 'I-neutral': 7, 'END': 8}, 7)


In [3]:
# Read training data
print(labels)
def read_training_data(filepath):
    ''' NOTE: This returns results in tuple form, but without start and end. Not sure if anyone needs this function to be in this form, 
                so I wrote another function called generate_data_tuple_list_with_start_end() which is the same as read_training_data but with
                start and end. - Jonah'''

    results = []

    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            if len(line.strip().rsplit(" ", 1)) == 2: # Make sure the line has two elements: word and label
                word, label = line.strip().rsplit(" ", 1)
                results.append((word, labels[label]))
                
            else:
                continue
    return results

print(read_training_data(fr_train_path))

# Read dev.in data
# There are no labels, just list of words
def read_dev_in_data(filepath):
    results = []
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            results.append(line.strip())
    return results
# print(read_dev_in_data(fr_dev_in_path))


# Generate data in tuple form but with (" ", START_index) and (" ", END_index)
def generate_data_tuple_list_with_start_end(filepath):
    results = []

    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            if len(line.strip().rsplit(" ", 1)) == 2: # Like in read_training_data, to make sure that this line has two elements: word and label
                word, label = line.strip().rsplit(" ", 1)
                results.append((word, labels[label]))
                
            else:
                # Enters here if it is the end of the sequence.
                results.append((" ", labels["END"]))
                results.append((" ", labels["START"]))

                
    return results

print(generate_data_tuple_list_with_start_end(fr_train_path))

{'START': 0, 'O': 1, 'B-positive': 2, 'I-positive': 3, 'B-neutral': 4, 'I-neutral': 5, 'B-negative': 6, 'I-negative': 7, 'END': 8}
[('Nous', 1), ('avons', 1), ('tout', 1), ('aimé', 1), ('.', 1), ('Le', 1), ('foi', 2), ('gras', 3), ('est', 1), ('le', 1), ('meilleur', 1), ('de', 1), ("l'île", 1), ('.', 1), ('Une', 1), ('perle', 1), ('.', 1), ('C', 1), ('est', 1), ('l', 1), ('endroit', 6), ('parfait', 1), ('si', 1), ('on', 1), ('a', 1), ('envie', 1), ('de', 1), ('se', 1), ('faire', 1), ('oublier', 1), ('et', 1), ('ignorer', 1), ('!', 1), ('La', 1), ('glace', 4), ("d'accompagnement", 1), ('était', 1), ('correcte', 1), ('mais', 1), ('une', 1), ('petite', 1), ('boule', 1), ('seulement', 1), ('.', 1), ('Bonne', 1), ('soirée', 1), ('passé', 1), ('dans', 1), ('ce', 1), ('lieu', 2), ('L', 1), ("'", 1), ('ambiance', 2), ('est', 1), ('bruyante', 1), ('mais', 1), ('relativement', 1), ('agréable', 1), ('.', 1), ('Restaurant', 1), ('de', 1), ('village', 1), ('.', 1), ('Addition', 1), (':', 1), ('160'

In [4]:
# Calculate number of each labels, with the keys being the index of the label in labels_list
def calculate_number_of_labels(input_data):
    return Counter(elem[1] for elem in input_data)
print(calculate_number_of_labels(read_training_data(fr_train_path)))

# Print out all the words that are unique
def get_all_unique_words(input_data):
    print(len(set(item[0] for item in input_data)))
    return list(set(item[0] for item in input_data))
print(get_all_unique_words(read_training_data(fr_train_path)))




##################################
###### Part 1 Point 1 and 2 ######

# For the return value, we follow the matrix format defined in the slides accordingly
def calculate_emission_parameters(input_data, all_unique_tokens, k=1.0):
    
    ''' NOTE: input_data is a list of tuples. '''

    # Initialisation for emission_counts
    # Final index is for #UNK# tokens
    emission_counts = np.zeros((N, len(all_unique_tokens) + 1), dtype=np.longdouble)

    # Calculate number of each labels and store in a list
    label_counts = np.array(list(val[1] for val in sorted(calculate_number_of_labels(input_data).items())))
    print(label_counts)

    for token, labels_list_index in input_data:
        emission_counts[labels_list_index - 1][all_unique_tokens.index(token)] += 1

    # This is for the other case of #UNK# tokens
    emission_counts[:, -1] = np.full((1, N), k)[0]


    # Initialisation for emission_parameters
    emission_parameters = np.empty((N, len(all_unique_tokens) + 1), dtype=np.longdouble)

    for index, _ in enumerate(emission_counts):
        emission_parameters[index] = emission_counts[index] / (label_counts[index] + k)
    
    return emission_parameters
 


###### Part 1 Point 1 and 2 ######
##################################

Counter({1: 24512, 2: 810, 6: 675, 7: 233, 3: 181, 4: 113, 5: 43})
4315
['Couscous', 'jambes', 'réserver', 'garni', 'Plats', 'piscines', 'oublis', 'sens', 'charcuterie', 'Continuez', 'claquer', 'centre', 'accueilli', 'Honnêtement', 'conseillé', 'bo', 'avariée', 'offre', 'agreable', 'sent', 'révélant', 'volonté', 'cantine', 'revenu', 'réservez', 'justifieraient', 'sert', 'prétention', 'plage', 'placés', 'rassuré', "l'autre", 'choco', 'sorte', "l'écoute", '300', 'table', 'annoncé', 'suzette', 'agencé', 'escapades', 'conseille', 'Gault', 'vite', 'adulte', "l'endroit", 'desserts', 'zero', 'chant', 'nourriture', 'perfectible', 'Amoureux', 'perdue', 'chers', 'expérience', 'Lannion', 'balade', 'tamisée', "m'as", '26', "qu'est", 'connu', 'samedi', 'chance', 'conscient', 'parler', "j'aurais", 'raison', 'tournée', 'idée', 'sois', 'dépassé', 'barre', "l'expérience", 'règne', 'Merci', 'tocino', 'organisé', 'exception', 'spécial', 'baignent', 'législation', 'lasagnes', 'mousseline', 'demandant', 'M

In [5]:
# Get tag from word
def get_label_from_word(input_word, all_unique_tokens, emission_parameters):
    if input_word not in all_unique_tokens:
        column_to_consider = emission_parameters[:, -1]
    else:
        column_to_consider = emission_parameters[:, all_unique_tokens.index(input_word)]

    # Randomly choose the index if there is more than one argmax value
    x = np.random.choice(np.argwhere(np.isclose(column_to_consider, column_to_consider.max())).flatten()) + 1
    return labels_list[x]
  
# print(get_label_from_word("nourriture", get_all_unique_words(read_training_data(fr_train_path)), calculate_emission_parameters(read_training_data(fr_train_path), get_all_unique_words(read_training_data(fr_train_path)))))

In [6]:
def write_prediction_output_to_file(language):
    if language == "EN":
        # Conduct training/supervised learning (M-Step)
        train_data = read_training_data(en_train_path)
        all_unique_tokens = get_all_unique_words(train_data)
        emission_parameters = calculate_emission_parameters(train_data, all_unique_tokens)

        # Execute testing/decoding (E-Step)
        predicted_results = []
        test_data = read_dev_in_data(en_dev_in_path)
        for token in test_data:
            if token:
                predicted_results.append(token + " " + get_label_from_word(token, all_unique_tokens, emission_parameters))
            else:
                predicted_results.append("")
        with open(en_dev_p1_out_path, "w+", encoding="utf-8") as file:
            for line in predicted_results:
                file.write(line + "\n")

    elif language == "FR":
        # Conduct training/supervised learning (M-Step)
        train_data = read_training_data(fr_train_path)
        all_unique_tokens = get_all_unique_words(train_data)
        emission_parameters = calculate_emission_parameters(train_data, all_unique_tokens)

        # Execute testing/decoding (E-Step)
        predicted_results = []
        test_data = read_dev_in_data(fr_dev_in_path)
        for token in test_data:
            if token:
                predicted_results.append(token + " " + get_label_from_word(token, all_unique_tokens, emission_parameters))
            else:
                predicted_results.append("")
        with open(fr_dev_p1_out_path, "w+", encoding="utf-8") as file:
            for line in predicted_results:
                file.write(line + "\n")
                
write_prediction_output_to_file("FR")

# Temporary store the FR labels, labels_list and N 
FR_labels_list, FR_labels, FR_N = labels_list, labels, N


labels_list, labels, N = create_labels_array_dict(en_train_path)
#print(labels_list, labels)
write_prediction_output_to_file("EN")



4315
[24512   810   181   113    43   675   233]
3651
[2191  257  716 2768 1904 1327   85  607  159   71  386   88    2    2
   51   61    4    6]


In [7]:
#############################
####### Part 1 Point 3 ######

def read_dev_out_data(filepath):
    results = []
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            results.append(line.strip())
    return results



####### Part 1 Point 3 ######
#############################









''' NOTE: Use the powershell command in the cell below this cell instead. '''

def count_number_of_entities(input_tuple_data, labels_list):
    # This is the function for counting entities, that is, when transit from START to B OR I and from O to B OR I and from I to B
    # Also need to consider whether there are cases of, for example, going from I-positive to I-neutral, or B-positive to B-neutral, etc.
    # By counting the number of transition from O to an entity, we should get the number of entities.

    total_count_of_entities = 0; 


    #############################
    # skip = {}
    #############################


    

    for i in range(0, len(input_tuple_data) - 1):
        # (len(input_tuple_data) - 1) is due to the accessing of index (i + 1) below. 
        # So subtract by one in the for loop range to prevent iteration from going out of the index range of input_tuple_data. 

        # To store the labels in one transition first 
        prev_label_index = input_tuple_data[i][1]
        next_label_index = input_tuple_data[i + 1][1]

        # Reference to check if a string contains a substring: https://stackoverflow.com/questions/3437059/does-python-have-a-string-contains-substring-method
        if((labels_list[prev_label_index] == "START" or labels_list[prev_label_index] == "O") 
           and ("B-" in labels_list[next_label_index] or "I-" in labels_list[next_label_index])):
            # Considers cases where labels transit from START or O to any B- OR I- labels. 
            total_count_of_entities += 1
            # print("Added (", labels_list[prev_label_index], ", ", labels_list[next_label_index], ")")

        elif("B-" in labels_list[prev_label_index] or "I-" in labels_list[prev_label_index]):
            if("B-" in labels_list[next_label_index]):
                # Considers cases like transition from B-positive to B-positive, or B-positive to B-neutral, etc.
                # Also considers cases like transition from I-positive to B-positive or B-neutral or B-negative.
                total_count_of_entities += 1
                # print("Added (", labels_list[prev_label_index], ", ", labels_list[next_label_index], ")")

            elif("I-" in labels_list[next_label_index] and labels_list[prev_label_index][2:] != labels_list[next_label_index][2:]):
                # Considers cases like transition from B-positive to I-neutral and I-negative, BUT NOT B-positive to I-positive.
                # Also considers cases like transition from I-positive to I-neutral and I-negative, BUT NOT I-positve to I-positive.

                # "labels_list[prev_label_index][2:] != labels_list[next_label_index][2:]" part of the if conditions is 
                #   to check that both labels are not both positive and positive, or neutral and neutral, or negative and negative. 

                total_count_of_entities += 1
                # print("Added (", labels_list[prev_label_index], ", ", labels_list[next_label_index], ")")




    #############################
    #         else:
    #             stringSkip = "(" + labels_list[prev_label_index] + ", " + labels_list[next_label_index] + ")"
    #             if stringSkip not in skip:
    #                 skip[stringSkip] = 1
    #             else:
    #                 skip[stringSkip] += 1

    #     else:
    #         stringSkip = "(" + labels_list[prev_label_index] + ", " + labels_list[next_label_index] + ")"
    #         if stringSkip not in skip:
    #             skip[stringSkip] = 1
    #         else:
    #             skip[stringSkip] += 1

    # print(skip)
    #############################

    return total_count_of_entities
    

def compare_data(dev_out_tuple_data, dev_p1_out_tuple_data):
    
    total_count_of_correct_entities = 0
    whole_entity_correct = False

    for i in range(1, len(dev_out_tuple_data) - 1):
        # Use dev_out range as anything more in dev_p1_out means that the rest in dev_p1_out are wrong?


        # To store the labels in transitions 
        prev_gold_label_index = dev_out_tuple_data[i - 1][1]
        current_gold_label_index = dev_out_tuple_data[i][1]
        next_gold_label_index = dev_out_tuple_data[i + 1][1]


        # ###############################
        # printout = "(" + dev_out_tuple_data[i][0] + ", " + labels_list[dev_out_tuple_data[i][1]] + ") == (" + dev_p1_out_tuple_data[i][0] + ", " + labels_list[dev_p1_out_tuple_data[i][1]] + ")"
        # ###############################
        # print(current_gold_label_index)
        # print(labels_list[current_gold_label_index])
        # print("=========================================")
        if("B-" in labels_list[current_gold_label_index]):
            # Below if and elif are all checking if the end of the entity has been reached. As long as current label is B-xxxx , it is the start of the entity.

            if dev_out_tuple_data[i] == dev_p1_out_tuple_data[i]:
               # Boolean turns false when one part of gold entity does not match predicted
               whole_entity_correct = True


            if("B-" in labels_list[next_gold_label_index]):
                # Considers cases like transition from B-positive to B-positive, or B-positive to B-neutral, etc.
                

                if(whole_entity_correct):
                    # Have reached the end of one entity before proceeding to a subsequent entity.
                    # Only considering the previous entity (whose end we have reached) without considering the subsequent entity yet.
                    # If the previous gold entity had no part that is different from the predicted data, then add to the count of correctly predicted entities.
                    
                    total_count_of_correct_entities += 1

                # ###############################
                # printout += " : " + str(whole_entity_correct)

                # print(printout)
                # printout = ""
                # ###############################

                
                # Reset boolean to false, which would be turned to true later if the start of the next gold entity is the same as the predicted 
                whole_entity_correct = False

                # ###############################

                # print("Finished Entity.")
                # print()
                # ###############################
                

            elif("I-" in labels_list[next_gold_label_index] and labels_list[current_gold_label_index][2:] != labels_list[next_gold_label_index][2:]):
                # Considers cases like transition from B-positive to I-neutral and I-negative, BUT NOT B-positive to I-positive.
  
                # "labels_list[current_gold_label_index][2:] != labels_list[next_gold_label_index][2:]" part of the if conditions is 
                #   to check that both labels are not both positive and positive, or neutral and neutral, or negative and negative. 

                if(whole_entity_correct):
                    # Have reached the end of one entity before proceeding to a subsequent entity.
                    # Only considering the previous entity (whose end we have reached) without considering the subsequent entity yet.
                    # If the previous gold entity had no part that is different from the predicted data, then add to the count of correctly predicted entities.
                    
                    total_count_of_correct_entities += 1

                # ###############################
                # printout += " : " + str(whole_entity_correct)

                # print(printout)
                # printout = ""
                # ###############################

                
                # Reset boolean to false, which would be turned to true later if the start of the next gold entity is the same as the predicted 
                whole_entity_correct = False

                # ###############################

                # print("Finished Entity.")
                # print()
                # ###############################


            elif(labels_list[next_gold_label_index] == "O" or labels_list[next_gold_label_index] == "END"):
                if(whole_entity_correct):
                    # Have reached the end of one entity before continuing or ending current sequence.
                    # If the previous gold entity had no part that is different from the predicted data, then add to the count of correctly predicted entities.
                    # Set boolean to false, which would be turned to true later if the start of the next gold entity is the same as the predicted 

                    total_count_of_correct_entities += 1

                # ###############################
                # printout += " : " + str(whole_entity_correct)

                # print(printout)
                # printout = ""
                # ###############################

                
                # Reset boolean to false, which would be turned to true later if the start of the next gold entity is the same as the predicted 
                whole_entity_correct = False

                # ###############################
                
                # print("Finished Entity.")
                # print()
                # ###############################

        elif("I-" in labels_list[current_gold_label_index]):
            
            # Checking if it is still in the same entity
            if(("I-" in labels_list[prev_gold_label_index] or "B-" in labels_list[prev_gold_label_index]) 
               and labels_list[prev_gold_label_index][2:] == labels_list[current_gold_label_index][2:]):
                # Considers cases of transition from B-positive to I-positive, B-neutral to I-neutral and B-negative to I-negative.
                # Also considers cases of transition from I-positive to I-positive, I-neutral to  I-neutral, and I-negative to I-negative.

                # "labels_list[prev_gold_label_index][2:] == labels_list[current_gold_label_index][2:]" part of the if conditions is 
                #   to check that both labels are both positive and positive, or neutral and neutral, or negative and negative. 

                # In the middle of an entity.
                # If this part of the gold entity is not the same as the predicted, the whole predicted entity is wrongly predicted.
                whole_entity_correct = dev_out_tuple_data[i] == dev_p1_out_tuple_data[i]

                # ###############################
                # printout += " : " + str(whole_entity_correct)

                # print(printout)
                # printout = ""
                # ###############################


            # Checking if the start of entity has been reached and the start label of the entity is I-xxxxx .
            elif(("I-" in labels_list[prev_gold_label_index] or "B-" in labels_list[prev_gold_label_index]) and labels_list[prev_gold_label_index][2:] != labels_list[current_gold_label_index][2:]):
                # Considers cases like transition from B-positive to I-neutral and I-negative, BUT NOT B-positive to I-positive.
                # Also considers cases like transition from I-positive to I-neutral and I-negative, BUT NOT I-positve to I-positive.
                # if dev_out_tuple_data[i] == dev_p1_out_tuple_data[i], whole_entity_correct becomes True
                whole_entity_correct = dev_out_tuple_data[i] == dev_p1_out_tuple_data[i]

            elif labels_list[prev_gold_label_index] == "O":
                whole_entity_correct = dev_out_tuple_data[i] == dev_p1_out_tuple_data[i]
                
            
            # Checking if end of entity. Use if and not elif as this block must still be checked even if the above if and elif were true.
            if("I-" in labels_list[next_gold_label_index] and labels_list[current_gold_label_index][2:] != labels_list[next_gold_label_index][2:]):
                # Considers cases like transition from I-positive to I-neutral and I-negative, BUT NOT I-positve to I-positive.

                # "labels_list[current_gold_label_index][2:] != labels_list[next_gold_label_index][2:]" part of the if conditions is 
                #   to check that both labels are not both positive and positive, or neutral and neutral, or negative and negative. 

                if whole_entity_correct:
                    # Have reached the end of one entity before proceeding to a subsequent entity.
                    # Only considering the previous entity (whose end we have reached) without considering the subsequent entity yet.
                    # If the previous gold entity had no part that is different from the predicted data, then add to the count of correctly predicted entities.
                    
                    total_count_of_correct_entities += 1

                ###############################
                printout += " : " + str(whole_entity_correct)


                print(printout)
                printout = ""
                print("Next is I-")
                ###############################

                
                # Reset boolean to false, which would be turned to true later if the start of the next gold entity is the same as the predicted 
                whole_entity_correct = False

                ###############################
                print("Finished Entity.")
                print()
                ###############################

            elif("B-" in labels_list[next_gold_label_index] or labels_list[next_gold_label_index] == "O" or labels_list[next_gold_label_index] == "END"):
                if whole_entity_correct:
                    # Have reached the end of one entity before continuing or ending current sequence.
                    # If the previous gold entity had no part that is different from the predicted data, then add to the count of correctly predicted entities.
                    # Set boolean to false, which would be turned to true later if the start of the next gold entity is the same as the predicted 

                    total_count_of_correct_entities += 1


                # ###############################
                # printout += " : " + str(whole_entity_correct)


                # print(printout)
                # printout = ""
                # print("Next is 'B-', 'O' or 'END'")
                # ###############################

                
                # Reset boolean to false, which would be turned to true later if the start of the next gold entity is the same as the predicted 
                whole_entity_correct = False

                # ###############################
                
                # print("Finished Entity.")
                # print()
                # ###############################

        # else:
        #     ###############################
            
        #     printout += " : " + "Unchecked"

        #     print(printout)
        #     printout = ""
        #     print()
        #     ###############################

        # print("Current Total Count: ", total_count_of_correct_entities)
        # print()





                
    return total_count_of_correct_entities

def precision_or_recall_calculation(correct_count, total_count):
    return correct_count / total_count

def f_score_calculation(precision, recall):
    return 2 / ( (1/precision) + (1/recall) )







#######################
####### FR Data #######

labels_list, labels, N = FR_labels_list, FR_labels, FR_N
fr_dev_out_storage = generate_data_tuple_list_with_start_end(fr_dev_out_path)
fr_dev_p1_out_storage = generate_data_tuple_list_with_start_end(fr_dev_p1_out_path)




'''COUNTING PREDICTED'''
total_count_of_predicted_entities = count_number_of_entities(fr_dev_p1_out_storage, labels_list)

'''COUNTING GOLD'''
total_count_of_gold_entities = count_number_of_entities(fr_dev_out_storage, labels_list)

correct_count = compare_data(fr_dev_out_storage, fr_dev_p1_out_storage)
precision = precision_or_recall_calculation(correct_count, total_count_of_predicted_entities)
recall = precision_or_recall_calculation(correct_count, total_count_of_gold_entities)
f_score = f_score_calculation(precision, recall)


print("Total count of FR predicted entities: ", total_count_of_predicted_entities)
print("Total count of FR gold entities: ", total_count_of_gold_entities)
print("Total number of correctly predicted entities: ", correct_count)
print()
print("FR Values:")
print("Precision: ", precision)
print("Recall: ", recall)
print("FR F_Score: ", f_score)



print("\n\n\n\n")



#######################
####### EN Data #######

labels_list, labels, N = create_labels_array_dict(en_train_path)
en_dev_out_storage = generate_data_tuple_list_with_start_end(en_dev_out_path)
en_dev_p1_out_storage = generate_data_tuple_list_with_start_end(en_dev_p1_out_path)

'''COUNTING PREDICTED'''
total_count_of_predicted_entities = count_number_of_entities(en_dev_p1_out_storage, labels_list)

'''COUNTING GOLD'''
total_count_of_gold_entities = count_number_of_entities(en_dev_out_storage, labels_list)

correct_count = compare_data(fr_dev_out_storage, fr_dev_p1_out_storage)
precision = precision_or_recall_calculation(correct_count, total_count_of_predicted_entities)
recall = precision_or_recall_calculation(correct_count, total_count_of_gold_entities)
f_score = f_score_calculation(precision, recall)

print("Total count of EN predicted entities: ", total_count_of_predicted_entities)
print("Total count of EN gold entities: ", total_count_of_gold_entities)
print("Total number of correctly predicted entities: ", correct_count)
print()
print("EN Values:")
print("Precision: ", precision)
print("Recall: ", recall)
print("EN F_Score: ", f_score)




''' NOTE: Use the powershell command in the cell below this cell instead. '''


Total count of FR predicted entities:  1114
Total count of FR gold entities:  238
Total number of correctly predicted entities:  79

FR Values:
Precision:  0.07091561938958707
Recall:  0.3319327731092437
FR F_Score:  0.1168639053254438





Total count of EN predicted entities:  1095
Total count of EN gold entities:  801
Total number of correctly predicted entities:  201

EN Values:
Precision:  0.18356164383561643
Recall:  0.250936329588015
EN F_Score:  0.2120253164556962


' NOTE: Use the powershell command in the cell below this cell instead. '

In [8]:
!python .\dataset\FR\evalResult.py .\dataset\FR\dev.out .\dataset\FR\dev.p1.out

!python .\dataset\EN\evalResult.py .\dataset\EN\dev.out .\dataset\EN\dev.p1.out


#Entity in gold data: 238
#Entity in prediction: 1114

#Correct Entity : 186
Entity  precision: 0.1670
Entity  recall: 0.7815
Entity  F: 0.2751

#Correct Sentiment : 79
Sentiment  precision: 0.0709
Sentiment  recall: 0.3319
Sentiment  F: 0.1169

#Entity in gold data: 802
#Entity in prediction: 1096

#Correct Entity : 589
Entity  precision: 0.5374
Entity  recall: 0.7344
Entity  F: 0.6207

#Correct Sentiment : 448
Sentiment  precision: 0.4088
Sentiment  recall: 0.5586
Sentiment  F: 0.4721


In [72]:
#############################
####### Part 2 Point 1 ######

def count_transition_from_u_to_v(input_tuple_data, index_of_u_label, index_of_v_label):

    # This is the function for Count(y_(i-1), y_i), that is, Count(u, v).

    total_count_of_such_transition = 0; 

    for i in range(0, len(input_tuple_data) - 1):
        # (len(input_tuple_data) - 1) is due to the accessing of index (i + 1) below. 
        # So subtract by one in the for loop range to prevent iteration from going out of the index range of input_tuple_data. 

        # To store the labels in one transition first 
        prev_label_index = input_tuple_data[i][1]
        next_label_index = input_tuple_data[i + 1][1]

        if(prev_label_index == index_of_u_label and next_label_index == index_of_v_label):
            total_count_of_such_transition += 1

    return total_count_of_such_transition



def maximum_likelihood_estimation(transition_from_u_to_v_count, total_count_of_u):
    # This is the MLE function, which is q(y_i | y_(i-1)), that is, q(v | u).

    return transition_from_u_to_v_count / total_count_of_u

def get_emission_from_words(word, unique_words, emission_params):
    if word not in unique_words:
        e = emission_params[:, -1]
    else:
        e = emission_params[:, unique_words.index(word)]
    return np.reshape(e, (len(e), 1))

def get_parents(y):
    return [np.random.choice(labels_list, p=transition_params[:, labels_list.index(y)])]

def viterbi_log(transition, emission, all_unique_tokens, input_data):
    # Initial step
    log_transition = np.log(transition)
    pi = [np.log(np.zeros([transition.shape[0], 1]))]
    pi[0][0] = 0
    parents = []
    j = 0
    predicted_results = []
    
    for x in input_data:
        if x != '':  # Propagate with Viterbi
            b = np.log(get_emission_from_words(x, all_unique_tokens, emission))
            res = np.matmul(pi[j], np.transpose(np.ones(b.shape))) + np.matmul(np.ones(b.shape), np.transpose(b)) + log_transition
            # print(res.max(axis=0))
            # print(np.matmul(pi[j], np.transpose(np.zeros(b.shape))))
            pi.append(np.reshape(res.max(axis=0), (-1, 1)))
            print(get_parents(res))
            print("*******************************")
            parents.append(get_parents(res))
            # parents.append(np.argmax(res, axis = 0))
            j += 1
        else:  # Final step
            # print("new sentence")
            res = pi[j] + log_transition[:,-1:]
            
            output = get_parents(res)
            # debug = [pi[j][output[0]]]

            # Output, trace back for the sequence
            while j > 1:  # Trace until second (first is START)
                j -= 1
                # debug.insert(0, pi[j][output[0]])
                output.insert(0, parents[j][output[0]])
            for i in output:
                predicted_results.append(labels_list[i])
            # for i in range(len(output)):
            #     predicted_results.append(labels_list[output[i]] + ' ' + str(debug[i]))
            predicted_results.append('')

            # Reset
            pi = [np.log(np.zeros([transition.shape[0], 1]))]
            pi[0][0] = 0
            parents = []
            j = 0
            # break
    print(predicted_results)

    return predicted_results

def calculate_transition_parameters(input_data):
    transition_counts = np.zeros([N+2, N+2])
    priori_counts = np.zeros([N+2, 1])
    print(priori_counts.shape)
    prev = 0
    for pair in input_data:
        curr = pair[1]
        priori_counts[prev] += 1
        transition_counts[prev, curr] += 1
        prev = curr
    transition = transition_counts / priori_counts
    transition[-1, 0] = 0  # Ignore transition from END to START
    # print(transition)
    return transition


''' NOTE: Using generate_data_tuple_list_with_start_end() to generate tuple data with start and end included.'''
#######################
####### FR Data #######
print("FR")
print()

labels_list, labels, N = FR_labels_list, FR_labels, FR_N
input_tuple_data = generate_data_tuple_list_with_start_end(fr_train_path) # Note the function used.
labels_count = calculate_number_of_labels(input_tuple_data)



for u in range(0, len(labels_list) - 1):
    # From START to y_n. This is the index for u.

    for v in range(1, len(labels_list)):
        # From y_1 to END. This is the index for v.

        val_of_count_transition_from_u_to_v = count_transition_from_u_to_v(input_tuple_data, u, v)

        print("Count(" + labels_list[u] + ", " + labels_list[v] + "): ", val_of_count_transition_from_u_to_v)
        print("Maximum likelihood estimation is:", maximum_likelihood_estimation(val_of_count_transition_from_u_to_v, labels_count[u]))
        print()
       

#######################
####### EN Data #######
print()
print()
print("EN")
print()

labels_list, labels, N = create_labels_array_dict(en_train_path)
input_tuple_data = generate_data_tuple_list_with_start_end(en_train_path) # Note the function used.
labels_count = calculate_number_of_labels(input_tuple_data)


for u in range(0, len(labels_list) - 1):
    # From START to y_n. This is the index for u.

    for v in range(1, len(labels_list)):
        # From y_1 to END. This is the index for v.

        val_of_count_transition_from_u_to_v = count_transition_from_u_to_v(input_tuple_data, u, v)

        print("Count(" + labels_list[u] + ", " + labels_list[v] + "): ", val_of_count_transition_from_u_to_v)
        print("Maximum likelihood estimation is:", maximum_likelihood_estimation(val_of_count_transition_from_u_to_v, labels_count[u]))
        print()





FR

Count(START, O):  1473
Maximum likelihood estimation is: 0.9025735294117647

Count(START, B-positive):  67
Maximum likelihood estimation is: 0.04105392156862745

Count(START, I-positive):  0
Maximum likelihood estimation is: 0.0

Count(START, B-negative):  76
Maximum likelihood estimation is: 0.04656862745098039

Count(START, B-neutral):  15
Maximum likelihood estimation is: 0.009191176470588236

Count(START, I-negative):  0
Maximum likelihood estimation is: 0.0

Count(START, I-neutral):  0
Maximum likelihood estimation is: 0.0

Count(START, END):  0
Maximum likelihood estimation is: 0.0

Count(O, O):  21452
Maximum likelihood estimation is: 0.8751631853785901

Count(O, B-positive):  742
Maximum likelihood estimation is: 0.03027088772845953

Count(O, I-positive):  0
Maximum likelihood estimation is: 0.0

Count(O, B-negative):  599
Maximum likelihood estimation is: 0.02443701044386423

Count(O, B-neutral):  98
Maximum likelihood estimation is: 0.003998041775456919

Count(O, I-negati

In [102]:
def get_emission_from_words(word, unique_words, emission_params):
    if word not in unique_words:
        b = emission_params[:, -1]
    else:
        b = emission_params[:, unique_words.index(word)]
    return np.reshape(np.pad(b, 1), (-1, 1))


def get_parents(res):
    # choice = [np.random.choice(np.argwhere(res[1:-1, i] == res[1:-1, i].max(axis=0)).flatten()) + 1 for i in range(res.shape[1])]
    # return choice
    print(res)
    
    choice_2 = []
    for i in range(res.shape[1]):
       choice_2.append(np.random.choice(np.argwhere(res[1:-1, i] == res[1:-1, i].max(axis=0)).flatten()) + 1)
            
    
    print(choice_2)
    print("=================")
    return choice_2

def viterbi_log(transition, emission, all_unique_tokens, input_data):
    # Initial step
    log_transition = np.log(transition)
    pi = [np.log(np.zeros([transition.shape[0], 1]))]
    pi[0][0] = 0
    parents = []
    j = 0
    predicted_results = []
    for x in input_data:
        if x != '':  # Propagate with Viterbi
            b = np.log(get_emission_from_words(x, all_unique_tokens, emission))
            res = np.matmul(pi[j], np.transpose(np.ones(b.shape))) + np.matmul(np.ones(b.shape), np.transpose(b)) + log_transition
            # print(res.max(axis=0))
            # print(np.matmul(pi[j], np.transpose(np.zeros(b.shape))))
            
            pi.append(np.reshape(res.max(axis=0), (-1, 1)))
            parents.append(get_parents(res))
            # parents.append(np.argmax(res, axis = 0))

            
            j += 1
        else:  # Final step
            # print("new sentence")
            res = pi[j] + log_transition[:,-1:]
            # print(pi[j].shape)
            
            
            output = get_parents(res)
            # debug = [pi[j][output[0]]]

            # Output, trace back for the sequence
            while j > 1:  # Trace until second (first is START)
                j -= 1
                # debug.insert(0, pi[j][output[0]])
                
                # print(parents[j])
                # print(output[0])
                # print("???")
                output.insert(0, parents[j][output[0]])
                
            # print(len(output))
            # print("----------------")
            for i in output:                
                predicted_results.append(labels_list[i])
            # for i in range(len(output)):
            #     predicted_results.append(labels_list[output[i]] + ' ' + str(debug[i]))
            predicted_results.append('')

            # Reset
            pi = [np.log(np.zeros([transition.shape[0], 1]))]
            pi[0][0] = 0
            parents = []
            j = 0
            # break
    return predicted_results

def calculate_transition_parameters(input_data):
    transition_counts = np.zeros([N+2, N+2])
    priori_counts = np.zeros([N+2, 1])
    prev = 0
    for pair in input_data:
        curr = pair[1]
        priori_counts[prev] += 1
        transition_counts[prev, curr] += 1
        prev = curr
    transition = transition_counts / priori_counts
    transition[-1, 0] = 0  # Ignore transition from END to START
    # print(transition)
    return transition

labels_list, labels, N = create_labels_array_dict(fr_train_path)
input_data_start_end = generate_data_tuple_list_with_start_end(fr_train_path)
input_data = read_training_data(fr_train_path)
all_unique_words = get_all_unique_words(input_data)
transition_params = calculate_transition_parameters(input_data_start_end)
emission_params = calculate_emission_parameters(input_data, all_unique_words)
test_data = read_dev_in_data(fr_dev_in_path)

predicted_results = viterbi_log(transition_params, emission_params, all_unique_words, test_data)

4315
[24512   810   181   675   113   233    43]


  log_transition = np.log(transition)
  pi = [np.log(np.zeros([transition.shape[0], 1]))]
  b = np.log(get_emission_from_words(x, all_unique_tokens, emission))
  pi = [np.log(np.zeros([transition.shape[0], 1]))]


[[       -inf -8.82249097        -inf        -inf        -inf        -inf
         -inf        -inf        -inf]
 [       -inf        -inf        -inf        -inf        -inf        -inf
         -inf        -inf        -inf]
 [       -inf        -inf        -inf        -inf        -inf        -inf
         -inf        -inf        -inf]
 [       -inf        -inf        -inf        -inf        -inf        -inf
         -inf        -inf        -inf]
 [       -inf        -inf        -inf        -inf        -inf        -inf
         -inf        -inf        -inf]
 [       -inf        -inf        -inf        -inf        -inf        -inf
         -inf        -inf        -inf]
 [       -inf        -inf        -inf        -inf        -inf        -inf
         -inf        -inf        -inf]
 [       -inf        -inf        -inf        -inf        -inf        -inf
         -inf        -inf        -inf]
 [       -inf        -inf        -inf        -inf        -inf        -inf
         -inf        -

In [77]:
def write_viterbi_output_to_file(language):
    if language == "EN":
        labels_list, labels, N = create_labels_array_dict(en_train_path)
        input_data = read_training_data(en_train_path)
        train_data_w_start_end = generate_data_tuple_list_with_start_end(en_train_path)
        test_data = read_dev_in_data(en_dev_in_path)
        output_path = en_dev_p2_out_path

    elif language == "FR":
        labels_list, labels, N = create_labels_array_dict(fr_train_path)
        input_data = read_training_data(fr_train_path)
        train_data_w_start_end = generate_data_tuple_list_with_start_end(fr_train_path)
        test_data = read_dev_in_data(fr_dev_in_path)
        output_path = fr_dev_p2_out_path

    # Conduct training/supervised learning (M-Step)
    all_unique_words = get_all_unique_words(input_data)
    emission_parameters = calculate_emission_parameters(input_data, all_unique_words)
    transition_parameters = calculate_transition_parameters(train_data_w_start_end)
    
    
    # Execute testing/decoding with Viterbi Algorithm (E-Step)
    predicted_results = viterbi_log(transition_parameters, emission_parameters, all_unique_words, test_data)
    with open(output_path, "w+", encoding="utf-8") as file:
        for i in range(len(test_data)):
            if test_data[i] and predicted_results[i]:
                file.write("{} {}\n".format(test_data[i], predicted_results[i]))
            else:
                file.write("\n")

In [12]:
labels_list, labels, N = create_labels_array_dict(fr_train_path)

write_viterbi_output_to_file("FR")

# Temporary store the FR labels, labels_list and N 
FR_labels_list, FR_labels, FR_N = labels_list, labels, N


labels_list, labels, N = create_labels_array_dict(en_train_path)
#print(labels_list, labels)
write_viterbi_output_to_file("EN")

# Temporary store the FR labels, labels_list and N 
EN_labels_list, EN_labels, EN_N = labels_list, labels, N

4315
[24512   810   181   675   113   233    43]


  log_transition = np.log(transition)
  pi = [np.log(np.zeros([transition.shape[0], 1]))]
  b = np.log(get_emission_from_words(x, all_unique_tokens, emission))
  pi = [np.log(np.zeros([transition.shape[0], 1]))]


3651
[2191  257  716 2768 1904 1327   85  607  159   71  386   88    2    2
   51   61    4    6]


In [13]:
''' NOTE: Use the powershell command in the cell below this cell instead. '''

#######################
####### FR Data #######

labels_list, labels, N = create_labels_array_dict(fr_train_path)
fr_dev_out_storage = generate_data_tuple_list_with_start_end(fr_dev_out_path)
fr_dev_p2_out_storage = generate_data_tuple_list_with_start_end(fr_dev_p2_out_path)

correct_count = compare_data(fr_dev_out_storage, fr_dev_p2_out_storage)
precision = precision_or_recall_calculation(correct_count, len(fr_dev_p2_out_storage))
recall = precision_or_recall_calculation(correct_count, len(fr_dev_out_storage))
f_score = f_score_calculation(precision, recall)
print("FR F_Score: ", f_score)



#######################
####### EN Data #######

labels_list, labels, N = create_labels_array_dict(en_train_path)
en_dev_out_storage = generate_data_tuple_list_with_start_end(en_dev_out_path)
en_dev_p2_out_storage = generate_data_tuple_list_with_start_end(en_dev_p2_out_path)

correct_count = compare_data(en_dev_out_storage, en_dev_p2_out_storage)
precision = precision_or_recall_calculation(correct_count, len(en_dev_p2_out_storage))
recall = precision_or_recall_calculation(correct_count, len(en_dev_out_storage))
f_score = f_score_calculation(precision, recall)
print("EN F_Score: ", f_score)


FR F_Score:  0.022380467955239063
EN F_Score:  0.31038798498122655


In [14]:
!python .\dataset\FR\evalResult.py .\dataset\FR\dev.out .\dataset\FR\dev.p2.out

!python .\dataset\EN\evalResult.py .\dataset\EN\dev.out .\dataset\EN\dev.p2.out


#Entity in gold data: 238
#Entity in prediction: 452

#Correct Entity : 136
Entity  precision: 0.3009
Entity  recall: 0.5714
Entity  F: 0.3942

#Correct Sentiment : 76
Sentiment  precision: 0.1681
Sentiment  recall: 0.3193
Sentiment  F: 0.2203

#Entity in gold data: 802
#Entity in prediction: 855

#Correct Entity : 542
Entity  precision: 0.6339
Entity  recall: 0.6758
Entity  F: 0.6542

#Correct Sentiment : 448
Sentiment  precision: 0.5240
Sentiment  recall: 0.5586
Sentiment  F: 0.5407


----

# Part 3

In [104]:
def calculate_double_transition_parameters(input_data):
    transition_counts = np.zeros([N+2, N+2, N+2])
    priori_counts = np.zeros([N+2, N+2])

    prev2 = 0
    prev = 1
    
    for i in range(len(input_data)):
        # input_data[i] -> ('écoute', 1)
        sticky = input_data[i]
        curr = sticky[1]
        priori_counts[prev2, prev] += 1
        transition_counts[prev2, prev, curr] += 1
        
        prev2 = prev
        prev = curr

    for state1 in transition_counts:
        for state2 in state1:
            state2[0] = 0
            state2[-1] = 0

    # print(priori_counts)
    transition = transition_counts / priori_counts
    transition[np.isnan(transition)] =0
    return transition

In [105]:
labels_list, labels, N = create_labels_array_dict(fr_train_path)

input_data_start_end = generate_data_tuple_list_with_start_end(fr_train_path)
input_data = read_training_data(fr_train_path)
all_unique_words = get_all_unique_words(input_data)
transition_params = calculate_double_transition_parameters(input_data)
print(transition_params.shape)

4315
(9, 9, 9)


  transition = transition_counts / priori_counts


In [129]:
def get_parents_two(res):
    

    
    
    # choice = [np.random.choice(np.argwhere(res[1:-1, i,j] == res[1:-1, i,j].max(axis=0)).flatten()) + 1 for i in range(res.shape[1]) for j in range(res.shape[0]) ]
    
    
    # print(choice)
    # choice_2 = []
    # for i in range(res.shape[1]):
    #    choice_2.append(np.random.choice(np.argwhere(res[1:-1, i,:] == res[1:-1, i,:].max(axis=0)).flatten()) + 1)
    
    choice_2 = [np.random.choice(np.argwhere(res[1:-1, i,j] == res[1:-1, i,j].max(axis=0)).flatten()) + 1 for i in range(res.shape[1]) for j in range(res.shape[0]) ]
    
    split_arr = np.array_split(choice_2, res.shape[1])

    result_lst = []
    for sub_arr in split_arr:
        unique, counts = np.unique(sub_arr, return_counts=True)
        max_freq_idx = np.argmax(counts)
        most_common = unique[max_freq_idx]
        result_lst.append(most_common)

    # print the resulting list

    
    
    return result_lst

def get_max_digit_list(choice):
    print(choice)
    return [np.bincount(choice).argmax()]


def secondorder_viterbi_log(transition, emission, all_unique_tokens, input_data):
    log_transition = np.log(transition)

    pi = [np.log(np.zeros([transition.shape[0], transition.shape[1], 1]))]
    pi[0][0][0] = 0
    parents = []

    j = 0
    predicted_results = []

    for x in input_data:
        if x != '': 
            b = np.log(get_emission_from_words(x, all_unique_tokens, emission))

            k = np.arange(pi[j].shape[0]).reshape(pi[j].shape[1], 1)

            res = np.matmul(k, np.transpose(np.ones(b.shape))) + np.matmul(np.ones(b.shape), np.transpose(b)) + log_transition
            # print(res.shape)
            pi.append(np.reshape(res.max(axis=0), (transition.shape[0],transition.shape[1])))
            
            parents.append(get_parents_two(res))
            j += 1
        else: 
            res = pi[j] + log_transition[:,-1:,-1:]

            
            output = get_parents_two(res)

            output = get_max_digit_list(output)

            while j > 1: 
                j -= 1
                output.insert(0, parents[j][output[0]])

            for i in output:
                predicted_results.append(labels_list[i])

            predicted_results.append('')

            # Reset
            pi = [np.log(np.zeros([transition.shape[0], transition.shape[1], 1]))]
            pi[0][0][0] = 0
            parents = []
            j = 0
    return predicted_results

labels_list, labels, N = create_labels_array_dict(fr_train_path)
input_data_start_end = generate_data_tuple_list_with_start_end(fr_train_path)
input_data = read_training_data(fr_train_path)
all_unique_words = get_all_unique_words(input_data)
transition_params = calculate_double_transition_parameters(input_data_start_end)
emission_params = calculate_emission_parameters(input_data, all_unique_words)
test_data = read_dev_in_data(fr_dev_in_path)

predicted_results = secondorder_viterbi_log(transition_params, emission_params, all_unique_words, test_data)
print(predicted_results)

4315
[24512   810   181   675   113   233    43]


  transition = transition_counts / priori_counts
  log_transition = np.log(transition)
  pi = [np.log(np.zeros([transition.shape[0], transition.shape[1], 1]))]
  b = np.log(get_emission_from_words(x, all_unique_tokens, emission))
  pi = [np.log(np.zeros([transition.shape[0], transition.shape[1], 1]))]


[1, 1, 2, 3, 1, 2, 5, 5, 4]
[2, 7, 5, 1, 5, 3, 2, 1, 6]
[7, 1, 5, 3, 2, 2, 4, 4, 1]
[1, 2, 1, 1, 2, 5, 5, 4, 5]
[2, 5, 4, 4, 2, 2, 3, 3, 2]
[2, 7, 2, 1, 1, 7, 5, 6, 4]
[4, 4, 4, 3, 3, 6, 1, 3, 2]
[5, 1, 6, 5, 4, 4, 3, 7, 5]
[4, 7, 1, 3, 7, 3, 6, 1, 7]
[3, 1, 1, 4, 1, 1, 4, 7, 1]
[5, 1, 1, 6, 4, 1, 2, 4, 1]
[1, 1, 4, 2, 7, 6, 7, 2, 2]
[5, 2, 2, 4, 1, 1, 4, 6, 3]
[4, 1, 1, 3, 5, 2, 3, 2, 5]
[1, 6, 3, 6, 4, 6, 1, 6, 4]
[1, 1, 3, 1, 6, 4, 5, 7, 1]
[1, 1, 6, 3, 2, 4, 6, 4, 7]
[2, 5, 1, 7, 2, 7, 1, 1, 2]
[1, 7, 4, 2, 2, 2, 5, 4, 2]
[6, 2, 1, 1, 4, 2, 7, 3, 1]
[1, 7, 2, 2, 3, 7, 5, 7, 2]
[6, 5, 1, 4, 1, 3, 7, 3, 4]
[2, 4, 2, 5, 1, 3, 7, 6, 4]
[4, 1, 6, 2, 4, 4, 6, 1, 1]
[7, 5, 3, 3, 5, 7, 4, 5, 1]
[1, 4, 3, 7, 1, 7, 4, 1, 2]
[6, 6, 6, 3, 6, 1, 7, 4, 2]
[3, 1, 1, 4, 1, 7, 1, 5, 4]
[2, 1, 6, 7, 4, 4, 1, 3, 7]
[3, 1, 6, 1, 4, 4, 1, 4, 2]
[1, 2, 7, 4, 2, 1, 2, 3, 7]
[6, 5, 2, 7, 1, 6, 1, 3, 7]
[2, 1, 1, 1, 5, 1, 3, 7, 4]
[1, 3, 1, 4, 1, 4, 6, 2, 1]
[1, 4, 3, 5, 3, 3, 4, 3, 4]
[1, 4, 5, 7, 1, 1, 1

In [125]:
def write_viterbi_output_to_file3(language):
    if language == "EN":
        labels_list, labels, N = create_labels_array_dict(en_train_path)
        input_data = read_training_data(en_train_path)
        train_data_w_start_end = generate_data_tuple_list_with_start_end(en_train_path)
        test_data = read_dev_in_data(en_dev_in_path)
        output_path = en_dev_p3_out_path

    elif language == "FR":
        labels_list, labels, N = create_labels_array_dict(fr_train_path)
        input_data = read_training_data(fr_train_path)
        train_data_w_start_end = generate_data_tuple_list_with_start_end(fr_train_path)
        test_data = read_dev_in_data(fr_dev_in_path)
        output_path = fr_dev_p3_out_path

    # Conduct training/supervised learning (M-Step)
    all_unique_words = get_all_unique_words(input_data)
    emission_parameters = calculate_emission_parameters(input_data, all_unique_words)
    transition_parameters = calculate_double_transition_parameters(train_data_w_start_end)

    # Execute testing/decoding with Viterbi Algorithm (E-Step)
    predicted_results = secondorder_viterbi_log(transition_parameters, emission_parameters, all_unique_words, test_data)
    with open(output_path, "w+", encoding="utf-8") as file:
        for i in range(len(predicted_results)):
            if test_data[i] and predicted_results[i]:
                file.write("{} {}\n".format(test_data[i], predicted_results[i]))
            else:
                file.write("\n")

In [126]:
# Temporary store the FR labels, labels_list and N 
labels_list, labels, N = create_labels_array_dict(fr_train_path)
FR_labels_list, FR_labels, FR_N = labels_list, labels, N
write_viterbi_output_to_file3("FR")

# Temporary store the FR labels, labels_list and N 
labels_list, labels, N = create_labels_array_dict(en_train_path)
EN_labels_list, EN_labels, EN_N = labels_list, labels, N
#print(labels_list, labels)
write_viterbi_output_to_file3("EN")

4315
[24512   810   181   675   113   233    43]


  transition = transition_counts / priori_counts
  log_transition = np.log(transition)
  pi = [np.log(np.zeros([transition.shape[0], transition.shape[1], 1]))]
  b = np.log(get_emission_from_words(x, all_unique_tokens, emission))
  pi = [np.log(np.zeros([transition.shape[0], transition.shape[1], 1]))]


O
===
O
===
B-positive
===
B-neutral
===
B-neutral
===
O
===
O
===
O
===
B-positive
===
B-negative
===
B-positive
===
O
===
O
===
O
===
I-positive
===
O
===
O
===
O
===
I-neutral
===
O
===
B-neutral
===
O
===
O
===
O
===
O
===
B-positive
===
B-negative
===
I-positive
===
I-positive
===
I-negative
===
B-negative
===
I-negative
===
I-negative
===
I-positive
===
I-positive
===
B-positive
===
I-positive
===
I-positive
===
O
===
O
===
B-negative
===
I-negative
===
B-positive
===
I-neutral
===
O
===
O
===
O
===
O
===
O
===
B-negative
===
B-positive
===
I-neutral
===
I-neutral
===
O
===
O
===
O
===
B-positive
===
O
===
O
===
O
===
O
===
O
===
B-neutral
===
O
===
O
===
B-negative
===
B-neutral
===
I-positive
===
O
===
I-negative
===
I-negative
===
I-negative
===
B-positive
===
B-negative
===
O
===
B-positive
===
I-positive
===
B-positive
===
I-negative
===
B-positive
===
B-negative
===
I-positive
===
O
===
B-neutral
===
O
===
O
===
O
===
O
===
B-negative
===
O
===
O
===
B-positive
===
B-negati

In [127]:
''' NOTE: Use the powershell command in the cell below this cell instead. '''

#######################
####### FR Data #######

labels_list, labels, N = create_labels_array_dict(fr_train_path)
fr_dev_out_storage = generate_data_tuple_list_with_start_end(fr_dev_out_path)
fr_dev_p3_out_storage = generate_data_tuple_list_with_start_end(fr_dev_p3_out_path)

correct_count = compare_data(fr_dev_out_storage, fr_dev_p3_out_storage)
precision = precision_or_recall_calculation(correct_count, len(fr_dev_p3_out_storage))
recall = precision_or_recall_calculation(correct_count, len(fr_dev_out_storage))
f_score = f_score_calculation(precision, recall)
print("FR F_Score: ", f_score)



#######################
####### EN Data #######

labels_list, labels, N = create_labels_array_dict(en_train_path)
en_dev_out_storage = generate_data_tuple_list_with_start_end(en_dev_out_path)
en_dev_p3_out_storage = generate_data_tuple_list_with_start_end(en_dev_p3_out_path)

correct_count = compare_data(en_dev_out_storage, en_dev_p3_out_storage)
precision = precision_or_recall_calculation(correct_count, len(en_dev_p3_out_storage))
recall = precision_or_recall_calculation(correct_count, len(en_dev_out_storage))
f_score = f_score_calculation(precision, recall)
print("EN F_Score: ", f_score)










FR F_Score:  0.005340793489318413
EN F_Score:  0.058823529411764705


In [128]:
!python .\dataset\FR\evalResult.py .\dataset\FR\dev.out .\dataset\FR\dev.p3.out

!python .\dataset\EN\evalResult.py .\dataset\EN\dev.out .\dataset\EN\dev.p3.out


#Entity in gold data: 238
#Entity in prediction: 1389

#Correct Entity : 67
Entity  precision: 0.0482
Entity  recall: 0.2815
Entity  F: 0.0824

#Correct Sentiment : 19
Sentiment  precision: 0.0137
Sentiment  recall: 0.0798
Sentiment  F: 0.0234

#Entity in gold data: 802
#Entity in prediction: 1058

#Correct Entity : 381
Entity  precision: 0.3601
Entity  recall: 0.4751
Entity  F: 0.4097

#Correct Sentiment : 77
Sentiment  precision: 0.0728
Sentiment  recall: 0.0960
Sentiment  F: 0.0828


----
# Part 4

In [23]:
'''
NOTE:   THIS CELL CODES FAILED AND THUS ARE NOT USED, BUT JUST KEPT IN CASE SOME PARTS ARE NEEDED FOR REFERENCE
        THE CURRENT WORKING CELL IS BELOW THIS CELL.



def get_unique_words_with_all_possible_labels(input_tuple_data):
    dict_of_unique_words_with_all_labels = {}

    for tuple in input_tuple_data:
        # Sets default as empty list if key (the unique word) is not found.
        current_word_label_array = dict_of_unique_words_with_all_labels.get(tuple[0], [ ]) 
        if tuple[0] in dict_of_unique_words_with_all_labels and tuple[1] not in dict_of_unique_words_with_all_labels[tuple[0]]:
            # If label not already inserted in the array value of that word key, then add it in.
            dict_of_unique_words_with_all_labels[tuple[0]].append(tuple[1])
        elif tuple[0] not in dict_of_unique_words_with_all_labels:
            # If unique word is not a key in the dict, make it a key, and add the label into the array key value
            dict_of_unique_words_with_all_labels[tuple[0]] = []
            dict_of_unique_words_with_all_labels[tuple[0]].append(tuple[1])

    return dict_of_unique_words_with_all_labels

def count_alphabet_transitions(dict_of_unique_words_with_all_labels):
    # Counts all transition between characters for every unique word

    dict_of_all_transition_of_alphabets = {}

    for current_unique_word_key in dict_of_unique_words_with_all_labels:
        # Dictionaries for each label key value would have the keys as tuples of (alphabet1, alphabet2), and their values being their count.

        # Using "d[key] = d.get(key, 0) + 1" to set a default value to 0 if key not found in the first place. Reference: https://stackoverflow.com/questions/1602934/check-if-a-given-key-already-exists-in-a-dictionary
        # This settles the very first character only. So it counts the number of times this character appears as the first letter of the word that is for some label.
        dict_of_all_transition_of_alphabets[(' ', current_unique_word_key[0])] = dict_of_all_transition_of_alphabets.get((' ', current_unique_word_key[0]), 0) + 1

        if(len(current_unique_word_key) > 1): 
            # If check above is to check if the word is like "a" or not. If so, the bottom for loop would go out of index range.
            for i in range(0, len(current_unique_word_key) - 1):
                # Make a tuple of each letter to the next letter, and get the key that is equivalent to that tuple if it exist (or set it as a key with the default value as 0 if it does not exist). Then add 1. 
                dict_of_all_transition_of_alphabets[(current_unique_word_key[i], current_unique_word_key[i + 1])] = dict_of_all_transition_of_alphabets.get((current_unique_word_key[i], current_unique_word_key[i + 1]), 0) + 1

        # This settles the very last character only. So it counts the number of times this character appears as the last letter of the word that is for some label.
        dict_of_all_transition_of_alphabets[(current_unique_word_key[-1], ' ')] = dict_of_all_transition_of_alphabets.get((current_unique_word_key[-1], ' '), 0) + 1

    return dict_of_all_transition_of_alphabets


def counting_of_each_transition_in_each_label(dict_of_unique_words_with_all_labels, labels_list):

    dict_of_all_transitions_in_each_label = {}

    # Add all labels as keys in the dictionary and set their value as dictionaries as well. Their dictionary values would hold the alphabet transitions as keys, whose values are their counts.
    for i in range(1, len(labels_list) - 1):
        # if labels_list[i] != "START" and labels_list[i] != "END":
        dict_of_all_transitions_in_each_label[i] = {}

    for current_unique_word_key in dict_of_unique_words_with_all_labels:
        for each_label_index in dict_of_unique_words_with_all_labels[current_unique_word_key]:
            # This settles the very first character only. So it counts the number of times this character appears as the first letter of the word that is for some label.
            dict_of_all_transitions_in_each_label[each_label_index][(' ', current_unique_word_key[0])] = dict_of_all_transitions_in_each_label[each_label_index].get((' ', current_unique_word_key[0]), 0) + 1

            if(len(current_unique_word_key) > 1): 
            # If check above is to check if the word is like "a" or not. If so, the bottom for loop would go out of index range.

                for i in range(0, len(current_unique_word_key) - 1):
                    # Make a tuple of each letter to the next letter, and get the key that is equivalent to that tuple if it exist (or set it as a key with the default value as 0 if it does not exist). Then add 1. 
                    dict_of_all_transitions_in_each_label[each_label_index][(current_unique_word_key[i], current_unique_word_key[i + 1])] = dict_of_all_transitions_in_each_label[each_label_index].get((current_unique_word_key[i], current_unique_word_key[i + 1]), 0) + 1

            # This settles the very last character only. So it counts the number of times this character appears as the last letter of the word that is for some label.
            dict_of_all_transitions_in_each_label[each_label_index][(current_unique_word_key[-1], ' ')] = dict_of_all_transitions_in_each_label[each_label_index].get((current_unique_word_key[-1], ' '), 0) + 1

    return dict_of_all_transitions_in_each_label


def probabilities_of_each_transition_in_each_label(dict_of_all_transitions_in_each_label, dict_of_all_transition_of_alphabets,  labels_list):
    dict_of_transition_prob_for_each_label = {}

    # Initialisation
    for label_index_key in range(1, len(labels_list) - 1):
        dict_of_transition_prob_for_each_label[label_index_key] = {}

    # Calculate probabilities
    for label_index_key in range(1, len(labels_list) - 1):
        # Exclude START and END.

        if label_index_key == 1:
            scale_factor = 10
        else:
            scale_factor = 1

        for current_transition_key in dict_of_all_transitions_in_each_label[label_index_key]:
            dict_of_transition_prob_for_each_label[label_index_key][current_transition_key] = dict_of_all_transitions_in_each_label[label_index_key][current_transition_key] / (dict_of_all_transition_of_alphabets[current_transition_key] * scale_factor) 

    return dict_of_transition_prob_for_each_label
'''

'''
def count_alphabet_transitions(input_tuple_data, labels_list):
    dict_of_all_transition_of_alphabets = {}

    # Add all labels as keys in the dictionary of dictionaries.
    for i in range(1, len(labels_list) - 1):
        # if labels_list[i] != "START" and labels_list[i] != "END":
        dict_of_all_transition_of_alphabets[i] = {}

    for currentTuple in input_tuple_data:
        # Dictionaries for each label key value would have the keys as tuples of (alphabet1, alphabet2), and their values being their count.

        # Using "d[key] = d.get(key, 0) + 1" to set a default value to 0 if key not found in the first place. Reference: https://stackoverflow.com/questions/1602934/check-if-a-given-key-already-exists-in-a-dictionary
        
        # This settles the very first character only. So it counts the number of times this character appears as the first letter of the word that is for some label.
        dict_of_all_transition_of_alphabets[currentTuple[1]][(' ', currentTuple[0][0])] = dict_of_all_transition_of_alphabets[currentTuple[1]].get((' ', currentTuple[0][0]), 0) + 1

        if(len(currentTuple[0]) > 1): 
            # If check above is to check if the word is like "a" or not. If so, the bottom for loop would go out of index range.
            for i in range(1, len(currentTuple[0]) - 1):
                dict_of_all_transition_of_alphabets[currentTuple[1]][(currentTuple[0][i], currentTuple[0][i + 1])] = dict_of_all_transition_of_alphabets[currentTuple[1]].get((currentTuple[0][i], currentTuple[0][i + 1]), 0) + 1

        # This settles the very last character only. So it counts the number of times this character appears as the last letter of the word that is for some label.
        dict_of_all_transition_of_alphabets[currentTuple[1]][(currentTuple[0][-1], ' ')] = dict_of_all_transition_of_alphabets[currentTuple[1]].get((currentTuple[0][-1], ' '), 0) + 1

    return dict_of_all_transition_of_alphabets



def probabilities_of_each_labels(labels_count, labels_list):
    sum_of_all_labels = 0
    for i in range(0, len(labels_list)):
        sum_of_all_labels += labels_count[i]

    dict_of_label_probabilities = {}

    for i in range(0, len(labels_list)):
        dict_of_label_probabilities[i] = labels_count[i] / sum_of_all_labels

    return dict_of_label_probabilities



def probabilities_of_each_transition_in_each_label(dict_of_all_transition_of_alphabets, labels_list):

    total_transitions_in_each_label = {}

    for i in range(1, len(labels_list) -1):
        # Excludes labels that is "START" and "END"

        for j in dict_of_all_transition_of_alphabets[i]: 
            # j iterates through all alphabet transition keys for a label key dict value of the outer dict
            total_transitions_in_each_label[i] = total_transitions_in_each_label.get(i, 0) + dict_of_all_transition_of_alphabets[i][j]
            # print(dict_of_all_transition_of_alphabets[i], " ",  dict_of_all_transition_of_alphabets[i][j], " , i: ", i, " , j: ", j)

    print("total_transitions_in_each_label: ", total_transitions_in_each_label)

    dict_of_transition_prob_for_each_label = {}

    # Initialisation
    for i in dict_of_all_transition_of_alphabets:
        dict_of_transition_prob_for_each_label[i] = {}

    # Calculate probabilities
    for i in range(1, len(labels_list) - 1):
        for j in dict_of_all_transition_of_alphabets[i]: 
            dict_of_transition_prob_for_each_label[i][j] = dict_of_all_transition_of_alphabets[i][j] / total_transitions_in_each_label[i]

    return dict_of_transition_prob_for_each_label
'''


'''
def predict_word_sentiment(word, dict_of_transition_prob_for_each_label, dict_of_label_probabilities, labels_list, regularization_factor):

    current_max_probability = 0
    current_most_probable_label_index = 0
    

    for i in range(1, len(labels_list) - 1):
        # Exclude START and END labels

        current_calculated_probability = 1

        # First multiply with p(Label)
        print("dict_of_label_probabilities: ", dict_of_label_probabilities)

        # if(labels_list[i] != "O"):
        #     current_calculated_probability *= dict_of_label_probabilities[i]

        # else:
        #     current_calculated_probability *= pow(dict_of_label_probabilities[i], regularization_factor)

        # Next multiply with p( (' ', first_alphabet) | label )
        # If key not found, set default to be 0.000001, rather than 0.
        current_calculated_probability *= dict_of_transition_prob_for_each_label.get((' ', word[0]), 0.000001)

        for j in range(len(word) - 1):
            current_calculated_probability *= dict_of_transition_prob_for_each_label.get((word[j], word[j + 1]), 0.000001)
        
        current_calculated_probability *= dict_of_transition_prob_for_each_label.get((word[-1], ' '), 0.000001)

        if(current_calculated_probability > current_max_probability):
            current_max_probability = current_calculated_probability
            current_most_probable_label_index = i

    return labels_list[current_most_probable_label_index]

def write_prediction_output_to_file_part4(language):

    regularization_factor = 1

    if language == "EN":
        # Conduct training/supervised learning (M-Step)
        labels_list, labels, N = FR_labels_list, FR_labels, FR_N
        input_tuple_data = read_training_data(fr_train_path)
        labels_count = calculate_number_of_labels(input_tuple_data)


        dict_of_label_probabilities = probabilities_of_each_labels(labels_count, labels_list)
        dict_of_all_transition_of_alphabets = count_alphabet_transitions(input_tuple_data, labels_list)
        dict_of_transition_prob_for_each_label = probabilities_of_each_transition_in_each_label(dict_of_all_transition_of_alphabets, labels_list)


        # Execute testing/decoding (E-Step)
        predicted_results = []
        test_data = read_dev_in_data(en_dev_in_path)

        for token in test_data:
            if token:
                predicted_results.append(token + " " + predict_word_sentiment(token, dict_of_transition_prob_for_each_label, dict_of_label_probabilities, labels_list, regularization_factor))
            else:
                predicted_results.append("")
        with open(en_dev_p4_out_path, "w+", encoding="utf-8") as file:
            for line in predicted_results:
                file.write(line + "\n")

    elif language == "FR":
        # Conduct training/supervised learning (M-Step)
        labels_list, labels, N = FR_labels_list, FR_labels, FR_N
        input_tuple_data = read_training_data(fr_train_path)
        labels_count = calculate_number_of_labels(input_tuple_data)


        dict_of_label_probabilities = probabilities_of_each_labels(labels_count, labels_list)
        dict_of_all_transition_of_alphabets = count_alphabet_transitions(input_tuple_data, labels_list)
        dict_of_transition_prob_for_each_label = probabilities_of_each_transition_in_each_label(dict_of_all_transition_of_alphabets, labels_list)



        # Execute testing/decoding (E-Step)
        predicted_results = []
        test_data = read_dev_in_data(fr_dev_in_path)
        for token in test_data:
            if token:
                predicted_results.append(token + " " + predict_word_sentiment(token, dict_of_transition_prob_for_each_label, dict_of_label_probabilities, labels_list, regularization_factor))
            else:
                predicted_results.append("")
        with open(fr_dev_p4_out_path, "w+", encoding="utf-8") as file:
            for line in predicted_results:
                file.write(line + "\n")


labels_list, labels, N = FR_labels_list, FR_labels, FR_N
input_tuple_data = read_training_data(fr_train_path)
labels_count = calculate_number_of_labels(input_tuple_data)

dict_of_unique_words_with_all_labels = get_unique_words_with_all_possible_labels(input_tuple_data)
print(dict_of_unique_words_with_all_labels)

# dict_of_label_probabilities = probabilities_of_each_labels(labels_count, labels_list)
dict_of_all_transition_of_alphabets = count_alphabet_transitions(dict_of_unique_words_with_all_labels)
dict_of_all_transitions_in_each_label = counting_of_each_transition_in_each_label(dict_of_unique_words_with_all_labels, labels_list)
dict_of_transition_prob_for_each_label = probabilities_of_each_transition_in_each_label(dict_of_all_transitions_in_each_label, dict_of_all_transition_of_alphabets,  labels_list)

# dict_of_transition_prob_for_each_label = probabilities_of_each_transition_in_each_label(dict_of_all_transition_of_alphabets, labels_list)

# print(dict_of_label_probabilities)
print("dict_of_all_transition_of_alphabets: ", dict_of_all_transition_of_alphabets)
print("dict_of_all_transitions_in_each_label: ", dict_of_all_transitions_in_each_label)
print("dict_of_transition_prob_for_each_label: ", dict_of_transition_prob_for_each_label)

# write_prediction_output_to_file_part4("EN")
# write_prediction_output_to_file_part4("FR")

# labels_list, labels, N = create_labels_array_dict(en_train_path)
# en_dev_out_storage = generate_data_tuple_list_with_start_end(en_dev_out_path)
# en_dev_p2_out_storage = generate_data_tuple_list_with_start_end(en_dev_p2_out_path)
'''

'\ndef predict_word_sentiment(word, dict_of_transition_prob_for_each_label, dict_of_label_probabilities, labels_list, regularization_factor):\n\n    current_max_probability = 0\n    current_most_probable_label_index = 0\n    \n\n    for i in range(1, len(labels_list) - 1):\n        # Exclude START and END labels\n\n        current_calculated_probability = 1\n\n        # First multiply with p(Label)\n        print("dict_of_label_probabilities: ", dict_of_label_probabilities)\n\n        # if(labels_list[i] != "O"):\n        #     current_calculated_probability *= dict_of_label_probabilities[i]\n\n        # else:\n        #     current_calculated_probability *= pow(dict_of_label_probabilities[i], regularization_factor)\n\n        # Next multiply with p( (\' \', first_alphabet) | label )\n        # If key not found, set default to be 0.000001, rather than 0.\n        current_calculated_probability *= dict_of_transition_prob_for_each_label.get((\' \', word[0]), 0.000001)\n\n        for j in

In [24]:
def count_unique_words_for_each_label(input_tuple_data, labels_list):
    dict_of_count_of_words_in_each_label = {}
    dict_of_total_count_unique_words = {}

    # Initialisation by setting labels as keys of dictionary, and their values are also dictionaries.
    for i in range(1, len(labels_list) - 1):
        # Exclude START and END
        dict_of_count_of_words_in_each_label[i] = {}


    for tuple in input_tuple_data:

        # Using "d[key] = d.get(key, 0) + 1" to set a default value to 0 if key not found in the first place. Reference: https://stackoverflow.com/questions/1602934/check-if-a-given-key-already-exists-in-a-dictionary
        dict_of_count_of_words_in_each_label[tuple[1]][tuple[0]] = dict_of_count_of_words_in_each_label[tuple[1]].get(tuple[0], 0) + 1

        dict_of_total_count_unique_words[tuple[0]] = dict_of_total_count_unique_words.get(tuple[0], 0) + 1


    return dict_of_count_of_words_in_each_label, dict_of_total_count_unique_words



def probabilities_of_words_in_each_label(dict_of_count_of_words_in_each_label, dict_of_total_count_unique_words):
    dict_of_words_prob_in_each_label = {}

    # Initialisation by setting labels as keys of dictionary, and their values are also dictionaries.
    for i in range(1, len(labels_list) - 1):
        # Exclude START and END
        dict_of_words_prob_in_each_label[i] = {}

    for label_index_key in dict_of_count_of_words_in_each_label:
        for unique_word_key in dict_of_count_of_words_in_each_label[label_index_key]:
            dict_of_words_prob_in_each_label[label_index_key][unique_word_key] = dict_of_count_of_words_in_each_label[label_index_key][unique_word_key] / (dict_of_total_count_unique_words[unique_word_key]) 

    return dict_of_words_prob_in_each_label




def predict_word_sentiment(word, dict_of_words_prob_in_each_label, labels_list):

    current_max_probability = 0
    current_most_probable_label_index = 1
    

    for label_index_key in dict_of_words_prob_in_each_label:
        # Exclude START and END labels

        if word in dict_of_words_prob_in_each_label[label_index_key]:

            if dict_of_words_prob_in_each_label[label_index_key][word] > current_max_probability:
                current_max_probability = dict_of_words_prob_in_each_label[label_index_key][word]
                current_most_probable_label_index = label_index_key
        

    return labels_list[current_most_probable_label_index]

def write_prediction_output_to_file_part4(language):

    if language == "EN":
        # Conduct training/supervised learning (M-Step)
        labels_list, labels, N = create_labels_array_dict(en_train_path)
        input_tuple_data = read_training_data(en_train_path)
        labels_count = calculate_number_of_labels(input_tuple_data)

        dict_of_count_of_words_in_each_label, dict_of_total_count_unique_words = count_unique_words_for_each_label(input_tuple_data, labels_list)
        dict_of_words_prob_in_each_label = probabilities_of_words_in_each_label(dict_of_count_of_words_in_each_label, dict_of_total_count_unique_words)
        

        # Execute testing/decoding (E-Step)
        predicted_results = []
        test_data = read_dev_in_data(en_dev_in_path)

        for token in test_data:
            if token:
                predicted_results.append(token + " " + predict_word_sentiment(token, dict_of_words_prob_in_each_label, labels_list))
            else:
                predicted_results.append("")
        with open(en_dev_p4_out_path, "w+", encoding="utf-8") as file:
            for line in predicted_results:
                file.write(line + "\n")

    elif language == "FR":
        # Conduct training/supervised learning (M-Step)
        labels_list, labels, N = FR_labels_list, FR_labels, FR_N
        input_tuple_data = read_training_data(fr_train_path)
        labels_count = calculate_number_of_labels(input_tuple_data)


        dict_of_count_of_words_in_each_label, dict_of_total_count_unique_words = count_unique_words_for_each_label(input_tuple_data, labels_list)
        dict_of_words_prob_in_each_label = probabilities_of_words_in_each_label(dict_of_count_of_words_in_each_label, dict_of_total_count_unique_words)
        

        # Execute testing/decoding (E-Step)
        predicted_results = []
        test_data = read_dev_in_data(fr_dev_in_path)

        for token in test_data:
            if token:
                predicted_results.append(token + " " + predict_word_sentiment(token, dict_of_words_prob_in_each_label, labels_list))
            else:
                predicted_results.append("")
        with open(fr_dev_p4_out_path, "w+", encoding="utf-8") as file:
            for line in predicted_results:
                file.write(line + "\n")


labels_list, labels, N = FR_labels_list, FR_labels, FR_N
input_tuple_data = read_training_data(fr_train_path)
labels_count = calculate_number_of_labels(input_tuple_data)


write_prediction_output_to_file_part4("FR")

labels_list, labels, N = create_labels_array_dict(en_train_path)
input_tuple_data = generate_data_tuple_list_with_start_end(en_train_path) # Note the function used.
labels_count = calculate_number_of_labels(input_tuple_data)
# labels_list, labels, N = create_labels_array_dict(en_train_path)
# en_dev_out_storage = generate_data_tuple_list_with_start_end(en_dev_out_path)
# en_dev_p2_out_storage = generate_data_tuple_list_with_start_end(en_dev_p2_out_path)

write_prediction_output_to_file_part4("EN")

In [25]:
!python .\dataset\FR\evalResult.py .\dataset\FR\dev.out .\dataset\FR\dev.p4.out

!python .\dataset\EN\evalResult.py .\dataset\EN\dev.out .\dataset\EN\dev.p4.out


#Entity in gold data: 238
#Entity in prediction: 163

#Correct Entity : 115
Entity  precision: 0.7055
Entity  recall: 0.4832
Entity  F: 0.5736

#Correct Sentiment : 77
Sentiment  precision: 0.4724
Sentiment  recall: 0.3235
Sentiment  F: 0.3840

#Entity in gold data: 802
#Entity in prediction: 741

#Correct Entity : 484
Entity  precision: 0.6532
Entity  recall: 0.6035
Entity  F: 0.6273

#Correct Sentiment : 449
Sentiment  precision: 0.6059
Sentiment  recall: 0.5599
Sentiment  F: 0.5820
