## Setting up the data

In [21]:
# Import libraries
import os
import numpy as np
from collections import Counter

# Set OS-independent paths, relative to current directory
en_train_path = os.path.join("dataset", "EN", "train")
en_dev_in_path = os.path.join("dataset", "EN", "dev.in")
en_dev_out_path = os.path.join("dataset", "EN", "dev.out")
en_dev_p1_out_path = os.path.join("dataset", "EN", "dev.p1.out")
# es_dev_p2_out_path = os.path.join("dataset", "ES", "dev.p2.out")
# es_dev_p3_out_path = os.path.join("dataset", "ES", "dev.p3.out")
fr_train_path = os.path.join("dataset", "FR", "train")
fr_dev_in_path = os.path.join("dataset", "FR", "dev.in")
fr_dev_out_path = os.path.join("dataset", "FR", "dev.out")
fr_dev_p1_out_path = os.path.join("dataset", "FR", "dev.p1.out")
# ru_dev_p2_out_path = os.path.join("dataset", "RU", "dev.p2.out")
# ru_dev_p3_out_path = os.path.join("dataset", "RU", "dev.p3.out")

# Define constant variables
N = 7
labels = {"START": 0,
          "O": 1,
          "B-positive": 2,
          "I-positive": 3,
          "B-neutral": 4,
          "I-neutral": 5,
          "B-negative": 6,
          "I-negative": 7,
          "END": 8}
labels_list = ["START", "O", "B-positive", "I-positive", "B-neutral", "I-neutral", "B-negative", "I-negative", "END"]



In [22]:
# Create labels dictionary and list (for EN)
def create_labels_array_dict(filepath):

    output_labels_dict = {"START": 0}
    
    output_labels_array = ["START"]

    counter = 1

    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            if len(line.strip().rsplit(" ", 1)) == 2: 
                _, label = line.strip().rsplit(" ", 1)
                
                # Reference to check if key exist in dictionary: https://www.geeksforgeeks.org/python-check-whether-given-key-already-exists-in-a-dictionary/
                if label not in output_labels_dict.keys():
                    output_labels_dict[label] = counter
                    output_labels_array.append(label)
                    counter += 1
            else:
                continue

    output_labels_dict["END"] = counter
    output_labels_array.append("END")

    return output_labels_array, output_labels_dict


print(create_labels_array_dict(en_train_path))
print(create_labels_array_dict(fr_train_path))


(['START', 'O', 'B-INTJ', 'B-PP', 'B-NP', 'I-NP', 'B-VP', 'B-PRT', 'I-VP', 'B-ADJP', 'B-SBAR', 'B-ADVP', 'I-INTJ', 'B-CONJP', 'I-CONJP', 'I-ADVP', 'I-ADJP', 'I-SBAR', 'I-PP', 'END'], {'START': 0, 'O': 1, 'B-INTJ': 2, 'B-PP': 3, 'B-NP': 4, 'I-NP': 5, 'B-VP': 6, 'B-PRT': 7, 'I-VP': 8, 'B-ADJP': 9, 'B-SBAR': 10, 'B-ADVP': 11, 'I-INTJ': 12, 'B-CONJP': 13, 'I-CONJP': 14, 'I-ADVP': 15, 'I-ADJP': 16, 'I-SBAR': 17, 'I-PP': 18, 'END': 19})
(['START', 'O', 'B-positive', 'I-positive', 'B-negative', 'B-neutral', 'I-negative', 'I-neutral', 'END'], {'START': 0, 'O': 1, 'B-positive': 2, 'I-positive': 3, 'B-negative': 4, 'B-neutral': 5, 'I-negative': 6, 'I-neutral': 7, 'END': 8})


In [23]:
# Read training data
def read_training_data(filepath):
    results = []

    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            if len(line.strip().rsplit(" ", 1)) == 2: # Make sure the line has two elements: word and label
                word, label = line.strip().rsplit(" ", 1)
                results.append((word, labels[label]))
                
            else:
                continue
    return results

print(read_training_data(fr_train_path))

# Read dev.in data
# There are no labels, just list of words
def read_dev_in_data(filepath):
    results = []
    with open(filepath, "r", encoding="utf-8") as file:
        lines = file.readlines()
        for line in lines:
            results.append(line.strip())
    return results
print(read_dev_in_data(fr_dev_in_path))

[('Nous', 1), ('avons', 1), ('tout', 1), ('aimé', 1), ('.', 1), ('Le', 1), ('foi', 2), ('gras', 3), ('est', 1), ('le', 1), ('meilleur', 1), ('de', 1), ("l'île", 1), ('.', 1), ('Une', 1), ('perle', 1), ('.', 1), ('C', 1), ('est', 1), ('l', 1), ('endroit', 6), ('parfait', 1), ('si', 1), ('on', 1), ('a', 1), ('envie', 1), ('de', 1), ('se', 1), ('faire', 1), ('oublier', 1), ('et', 1), ('ignorer', 1), ('!', 1), ('La', 1), ('glace', 4), ("d'accompagnement", 1), ('était', 1), ('correcte', 1), ('mais', 1), ('une', 1), ('petite', 1), ('boule', 1), ('seulement', 1), ('.', 1), ('Bonne', 1), ('soirée', 1), ('passé', 1), ('dans', 1), ('ce', 1), ('lieu', 2), ('L', 1), ("'", 1), ('ambiance', 2), ('est', 1), ('bruyante', 1), ('mais', 1), ('relativement', 1), ('agréable', 1), ('.', 1), ('Restaurant', 1), ('de', 1), ('village', 1), ('.', 1), ('Addition', 1), (':', 1), ('160', 1), ('euros', 1), ('à', 1), ('2', 1), ('avec', 1), ('2', 1), ('verres', 1), ('chacun', 1), ('et', 1), ('en', 1), ('partageant', 1

In [24]:
# Calculate number of each labels
def calculate_number_of_labels(input_data):
    return Counter(labels_list[elem[1]] for elem in input_data)
print(calculate_number_of_labels(read_training_data(fr_train_path)))

# Print out all the words that are unique
def get_all_unique_words(input_data):
    return list(set(item[0] for item in input_data))
print(get_all_unique_words(read_training_data(fr_train_path)))




##################################
###### Part 1 Point 1 and 2 ######

# For the return value, we follow the matrix format defined in the slides accordingly
def calculate_emission_parameters(input_data, all_unique_tokens, k=1.0):
    # Final index is for #UNK# tokens
    emission_counts = np.zeros((N, len(all_unique_tokens) + 1), dtype=np.longdouble)

    # Calculate number of each labels and store in a list
    label_counts = np.array(list(val[1] for val in sorted(calculate_number_of_labels(input_data).items())))
    print(label_counts)

    for token, label in input_data:
        emission_counts[label - 1][all_unique_tokens.index(token)] += 1

    # This is for the other case of #UNK# tokens
    emission_counts[:, -1] = np.full((1, N), k)[0]

    emission_parameters = np.empty((N, len(all_unique_tokens) + 1), dtype=np.longdouble)

    for index, _ in enumerate(emission_counts):
        emission_parameters[index] = emission_counts[index] / (label_counts[index] + k)

    return emission_parameters
 
print(calculate_emission_parameters(read_training_data(fr_train_path), get_all_unique_words(read_training_data(fr_train_path))))


###### Part 1 Point 1 and 2 ######
##################################

Counter({'O': 24512, 'B-positive': 810, 'B-negative': 675, 'I-negative': 233, 'I-positive': 181, 'B-neutral': 113, 'I-neutral': 43})
['pdj', 'après', 'enthousiasme', 'arrière', 'empêche', "L'esprit", 'persillade', 'bianca', 'riz', 'serait-ce', 'charlotte', 'thaï', 'mayonnaise', 'SNCF', 'intéréssantes', 'Marseillais', 'réclamer', 'summum', 'prostitués', 'conquis', 'cuisson', 'Leur', 'boulettes', 'Idéal', 'Total', 'réchauffées', '?', 'Rédibitoire', 'gâteau', 'interminable', 'Compter', '2eme', 'Or', 'mer', 'extraordinaire', 'délavées', "l'intérieur", 'heure', 'vicitime', 'Lille', 'Viande', 'Heureusement', 'agréablement', 'venais', 'éviter', 'digeste', 'prendrez', "j'entends", '1er', 'vraiment', '15-20', 'Service', 'Quelques', 'remettre', "j'y", 'apres', "d'aubergine", 'meilleures', 'Super', 'bondée', 'monsieur', 'pub', 'reconnaîtra', 'accueillir', 'nems', 'Ravioles', 'haute', 'accueillis', 'j', 'arrivées', 'maître', 'choisie', 'cauchemars', 'conviviale', 'notamment', 'évidement', 'peste',

In [25]:
# Get tag from word
def get_label_from_token(input_word, all_unique_tokens, emission_parameters):
    if input_word not in all_unique_tokens:
        column_to_consider = emission_parameters[:, -1]
    else:
        column_to_consider = emission_parameters[:, all_unique_tokens.index(input_word)]

    # Randomly choose the index if there is more than one argmax value
    x = np.random.choice(np.argwhere(np.isclose(column_to_consider, column_to_consider.max())).flatten()) + 1
    return labels_list[x]
  
print(get_label_from_token("nourriture", get_all_unique_words(read_training_data(fr_train_path)), calculate_emission_parameters(read_training_data(fr_train_path), get_all_unique_words(read_training_data(fr_train_path)))))

[  675   113   810   233    43   181 24512]
B-positive


In [26]:
def write_prediction_output_to_file(language):
    if language == "EN":
        # Conduct training/supervised learning (M-Step)
        train_data = read_training_data(en_train_path)
        all_unique_tokens = get_all_unique_words(train_data)
        emission_parameters = calculate_emission_parameters(train_data, all_unique_tokens)

        # Execute testing/decoding (E-Step)
        predicted_results = []
        test_data = read_dev_in_data(en_dev_in_path)
        for token in test_data:
            if token:
                predicted_results.append(token + " " + get_label_from_token(token, all_unique_tokens, emission_parameters))
            else:
                predicted_results.append("")
        with open(en_dev_p1_out_path, "w+", encoding="utf-8") as file:
            for line in predicted_results:
                file.write(line + "\n")

    elif language == "FR":
        # Conduct training/supervised learning (M-Step)
        train_data = read_training_data(fr_train_path)
        all_unique_tokens = get_all_unique_words(train_data)
        emission_parameters = calculate_emission_parameters(train_data, all_unique_tokens)

        # Execute testing/decoding (E-Step)
        predicted_results = []
        test_data = read_dev_in_data(fr_dev_in_path)
        for token in test_data:
            if token:
                predicted_results.append(token + " " + get_label_from_token(token, all_unique_tokens, emission_parameters))
            else:
                predicted_results.append("")
        with open(fr_dev_p1_out_path, "w+", encoding="utf-8") as file:
            for line in predicted_results:
                file.write(line + "\n")
                
write_prediction_output_to_file("FR")


labels_list, labels = create_labels_array_dict(en_dev_in_path)
write_prediction_output_to_file("EN")

[  675   113   810   233    43   181 24512]


KeyError: 'O'