# 50.007 Machine Learning
## Group Project

# Part 1
Report the precision, recall and F scores of such a baseline system for each dataset:
- EN dataset
  - Entity scores:
    - Entity  precision: 0.5348
    - Entity  recall: 0.7656
    - Entity  F: 0.6297
  - Sentiment scores:
    - Sentiment  precision: 0.3902
    - Sentiment  recall: 0.5586
    - Sentiment  F: 0.4595
- FR dataset
  - Entity scores:
    - Entity  precision: 0.1670
    - Entity  recall: 0.7815
    - Entity  F: 0.2751
  - Sentiment scores:
    - Sentiment  precision: 0.0709
    - Sentiment  recall: 0.3319
    - Sentiment  F: 0.1169

In [15]:
# import statements
import numpy as np
import math
import copy

In [16]:
# function that takes in the filename for the training data
# returns word_sequences, tag_sequences
# word_sequences is a list in the form: [ [x_1_1, x_1_2, ...], [x_2_1, x_2_2, ...], ... [x_m_1, x_m_2, ... ] ]
# tag_sequences is a list in the form: [ [y_1_1, y_1_2, ...], [y_2_1, y_2_2, ...], ... [y_m_1, y_m_2, ... ] ]
def read_training_data(training_filename):
    training_file = open(training_filename, "r", encoding="utf-8")
    
    word_sequences = []
    tag_sequences = []
    
    current_word_sequence = []
    current_tag_sequence = []
    
    for line in training_file:
        training_word_and_tag = line.strip().split(" ")
        
        # add the current word and tag to the current word sequence and current tag sequence
        if (len(training_word_and_tag) == 2):
            current_word_sequence += [training_word_and_tag[0]]
            current_tag_sequence += [training_word_and_tag[1]]
        
        # if the sentence ended (empty line), add the previous word sequence and tag sequence to the lists of
        # word sequences and tag sequences respectively.
        else:
            word_sequences += [copy.deepcopy(current_word_sequence)]
            tag_sequences += [copy.deepcopy(current_tag_sequence)]
            
            current_word_sequence = []
            current_tag_sequence = []
            
    # account for the last word sequence
    if (len(current_word_sequence) != 0):
        word_sequences += [copy.deepcopy(current_word_sequence)]
        tag_sequences += [copy.deepcopy(current_tag_sequence)]
        
    training_file.close()

    return word_sequences, tag_sequences

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in the filename for the test data
# returns the test data as a list in the form: [ [x1_1, x1_2, ...], [x2_1, x2_2, ...] ]
def read_test_data(test_filename):
    test_file = open(test_filename, "r", encoding="utf-8")
    
    test_word_sequences = []
    
    current_test_word_sequence = []

    for line in test_file:
        test_word = line.strip()
        
        # add current word to the current word sequence
        if (len(test_word) != 0):
            current_test_word_sequence += [test_word]
            
        # if sentence ended (len(test_word) == 0)
        else:
            test_word_sequences += [copy.deepcopy(current_test_word_sequence)]
            current_test_word_sequence = []
        
    # account for the last word sequence
    if (len(current_test_word_sequence) != 0):
        test_word_sequences += [copy.deepcopy(current_test_word_sequence)]

    test_file.close()

    return test_word_sequences

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in the filename of the training data and optional k value
# returns emission_dict, tags_list
# emission_dict[x][y] gives the value e(x|y)
def create_emission_dict_tags_list(training_filename, k=1):
    # emission_dict[x][y] gives the value e(x|y)
    emission_dict = {}
    
    # count_y_dict[y] gives the total number of words tagged as y
    count_y_dict = {} 

    # count_x_tagged_as_y_dict[x_i][y_j] gives the number of times each observed variable x_i
    # was tagged as state y_j in the training data
    count_x_tagged_as_y_dict = {}

    # read training data
    word_sequences, tags_sequences = read_training_data(training_filename)

    # fill up count_y_dict and count_x_tagged_as_y_dict
    for sequence_index in range(0, len(word_sequences)):
        for word_index in range(0, len(word_sequences[sequence_index])):
            x = word_sequences[sequence_index][word_index]
            y = tags_sequences[sequence_index][word_index]

            # account for creating dictionary entry for the first time
            if not(y in count_y_dict.keys()):
                count_y_dict[y] = 0

            count_y_dict[y] += 1

            # account for creating dictionary entry for the first time
            if not(x in count_x_tagged_as_y_dict.keys()): 
                count_x_tagged_as_y_dict[x] = {}
                
            if not(y in count_x_tagged_as_y_dict[x].keys()):
                count_x_tagged_as_y_dict[x][y] = 0

            count_x_tagged_as_y_dict[x][y] += 1
        
    tags_list = count_y_dict.keys()

    # fill up emission_dict
    for sequence_index in range(0, len(word_sequences)):
        for word_index in range(0, len(word_sequences[sequence_index])):
            x = word_sequences[sequence_index][word_index]
            y = tags_sequences[sequence_index][word_index]
        
            # account for creating dictionary entry for the first time
            if not(x in emission_dict.keys()):
                emission_dict[x] = {}

            emission_dict[x][y] = count_x_tagged_as_y_dict[x][y] / (count_y_dict[y] + k)
            
            # add entry for "START" and "END"
            emission_dict[x]["START"] = 0
            emission_dict[x]["END"] = 0

    # add entry for #UNK#
    emission_dict["#UNK#"] = {}
    
    for tag in tags_list: # iterate over all the tags used in training
        emission_dict["#UNK#"][tag] = k / (count_y_dict[tag] + k)

    return emission_dict, tags_list

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in observed variable x and hidden state y
# returns emission parameter e(x|y)
def emission(emission_dict, tags_list, x, y):
    # if tag was not in training data
    if (not(y in tags_list)):
        print("This tag was not in the training data")
        result = 0

    # else if word was not in training data
    elif (not(x in emission_dict.keys())): # treat x as "#UNK#"
        result = emission_dict["#UNK#"][y] # result = k / (count_y_dict[y] + k)
        
    # else if word is was in training data
    else:
        # if x was never tagged as y before during training, the probability is 0
        if not(y in emission_dict[x].keys()):
            emission_dict[x][y] = 0

        result = emission_dict[x][y] 

    return result

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in a filename and a list of results in the form: [ [x1, tag1], [x2, tag2], ...]
# writes the results to a file specified by the filename
def write_result(result_filename, word_sequences, tag_sequences):
    result_file = open(result_filename, "w" ,encoding="utf-8")
    
    for sequence_index in range(0, len(word_sequences)):
        for word_index in range(0, len(word_sequences[sequence_index])):
            result_file.write(word_sequences[sequence_index][word_index] + " " + tag_sequences[sequence_index][word_index] + "\n")
        
        result_file.write("\n")

    result_file.close()

# -----------------------------------------------------------------------------------------------------------------------------
    
# function that takes in the filenames for the training data and test data
# produces the tag y* = arg_max_y e(x|y) for each word in the test data
# writes the results to a file specified by the filename
# returns the results as a list in the form: [ [x1, y*1], [x2, y*2], ... ]
def simple_sentiment_analysis(training_filename, test_filename, result_filename, k=1):    
    # initialise emission_dict
    emission_dict, tags_list = create_emission_dict_tags_list(training_filename, k)
    
    test_word_sequences = read_test_data(test_filename)
    
    prediction_tag_sequences = []

    for test_word_sequence in test_word_sequences:
        current_prediction_tag_sequence = []
        
        for test_word in test_word_sequence:
            # find the tag that gives the highest value for e(test_variable | tag)
            predicted_tag = ""
            highest_emission_value = 0

            for tag in tags_list:
                current_emission_value = emission(emission_dict, tags_list, test_word, tag)

                if current_emission_value > highest_emission_value:
                    highest_emission_value = current_emission_value
                    predicted_tag = tag

            current_prediction_tag_sequence += [predicted_tag]
            
        # at the end of the sentence, add the current prediction tag sequence to the lise prediction_tag_sequences
        prediction_tag_sequences += [copy.deepcopy(current_prediction_tag_sequence)]
        current_prediction_tag_sequence = []

    write_result(result_filename, test_word_sequences, prediction_tag_sequences)

    return test_word_sequences, prediction_tag_sequences, emission_dict

In [17]:
# perform prediction for the EN dataset
en_results = simple_sentiment_analysis("EN/train", "EN/dev.in", "EN/dev.p1.out")

# evaluate prediction for the EN dataset
!python "evalResult.py" "EN/dev.out" "EN/dev.p1.out"


#Entity in gold data: 802
#Entity in prediction: 1148

#Correct Entity : 614
Entity  precision: 0.5348
Entity  recall: 0.7656
Entity  F: 0.6297

#Correct Sentiment : 448
Sentiment  precision: 0.3902
Sentiment  recall: 0.5586
Sentiment  F: 0.4595


In [18]:
# perform prediction for the FR dataset
fr_results = simple_sentiment_analysis("FR/train", "FR/dev.in", "FR/dev.p1.out")

# evaluate prediction for the FR dataset
!python "evalResult.py" "FR/dev.out" "FR/dev.p1.out"


#Entity in gold data: 238
#Entity in prediction: 1114

#Correct Entity : 186
Entity  precision: 0.1670
Entity  recall: 0.7815
Entity  F: 0.2751

#Correct Sentiment : 79
Sentiment  precision: 0.0709
Sentiment  recall: 0.3319
Sentiment  F: 0.1169


In [19]:
# part 1 test case
# create training data for test case
test_case_train_file = open("p1_test_train", "w")
test_case_train_file.write("word1 tag1\n")
test_case_train_file.write("word1 tag1\n")
test_case_train_file.write("word1 tag1\n")
test_case_train_file.write("word1 tag2\n")
test_case_train_file.write("\n")
test_case_train_file.write("word2 tag2\n")
test_case_train_file.write("word2 tag2\n")
test_case_train_file.write("word2 tag2\n")
test_case_train_file.write("\n")
test_case_train_file.write("word3 tag3")
test_case_train_file.close()

# create test data for test case
test_case_test_file = open("p1_test_in", "w")
test_case_test_file.write("word1\n")
test_case_test_file.write("word2\n")
test_case_test_file.write("word3\n")
test_case_test_file.write("unknown_word")
test_case_test_file.close()

# create expected output for test case
test_case_expected_file = open("p1_test_out", "w")
test_case_expected_file.write("word1 tag1\n")
test_case_expected_file.write("word2 tag2\n")
test_case_expected_file.write("word3 tag3\n")
test_case_expected_file.write("unknown_word tag3")
test_case_expected_file.close()

# perform the test
test_word_sequences, prediction_tag_sequences, emission_dict = simple_sentiment_analysis("p1_test_train", "p1_test_in", "p1_test_prediction")
test_word_sequences, expected_tag_sequences= read_training_data("p1_test_out")

# show results for the test
print("\nTest case emission_dict:")
print(emission_dict)
print("")

test_case_passed = True

for sequence_index in range(0, len(test_word_sequences)):
    for tag_index in range(0, len(test_word_sequences[sequence_index])):
        if prediction_tag_sequences[sequence_index][tag_index] != expected_tag_sequences[sequence_index][tag_index]:
            test_case_passed = False
            
            print("Test case failed.")
            print(f"Word: {test_case_prediction[i][0]}")
            print(f"Tag: {test_case_prediction[i][1]}")
            print(f"Expected tag: {test_case_expected[i][1]}\n")

print(f"Test case passed: {test_case_passed}")


Test case emission_dict:
{'word1': {'tag1': 0.75, 'START': 0, 'END': 0, 'tag2': 0.2, 'tag3': 0}, 'word2': {'tag2': 0.6, 'START': 0, 'END': 0, 'tag1': 0, 'tag3': 0}, 'word3': {'tag3': 0.5, 'START': 0, 'END': 0, 'tag1': 0, 'tag2': 0}, '#UNK#': {'tag1': 0.25, 'tag2': 0.2, 'tag3': 0.5}}

Test case passed: True


# Part 2
Report the precision, recall and F scores of such a baseline system for each dataset:
- EN dataset
  - Entity scores:
    - #Entity in gold data: 802
    - #Entity in prediction: 793
    - #Correct Entity : 554
    - Entity  precision: 0.6986
    - Entity  recall: 0.6908
    - Entity  F: 0.6947
  - Sentiment scores:
    - #Correct Sentiment: 500
    - Sentiment  precision: 0.6305
    - Sentiment  recall: 0.6234
    - Sentiment  F: 0.6270
- FR dataset
  - Entity scores:
    - #Correct Entity : 37
    - Entity  precision: 0.6066
    - Entity  recall: 0.1555
    - Entity  F: 0.2475
  - Sentiment scores:
    - #Correct Sentiment : 20
    - Sentiment  precision: 0.3279
    - Sentiment  recall: 0.0840
    - Sentiment  F: 0.1338

In [20]:
# takes in list of tag sequences in the form [ [y1_1, y1_2, ... ], [y2_1, y2_2, ...], ... ]
# outputs transition_dict
def create_transition_dict(input_list):
    # Create transition dict
    # ========================
    transition_dict = {}
    
    # Get list of tags
    tags = set([tag for sentence in input_list for tag in sentence])

    # Update probability for each instance of tag1 > tag2 inside transition_dict
    for tag1 in tags:
        for tag2 in tags:
            count = 0
            total = 0
            for sentence in input_list:
                total += len(sentence) - 1
                for i in range(len(sentence) - 1):
                    if sentence[i] == tag1 and sentence[i+1] == tag2:
                        count += 1
            if count > 0:
                transition_dict[(tag1, tag2)] = count / total
    
    # Create initial probability dict
    # ========================
    start_tag_count_dict = {}
    stop_tag_count_dict = {}
    
    # Get num of starting tags that appear
    for sentence in input_list:
        if sentence[0] not in start_tag_count_dict:
            start_tag_count_dict[sentence[0]] = 1
        else:
            start_tag_count_dict[sentence[0]] += 1
        
        if sentence[-1] not in stop_tag_count_dict:
            stop_tag_count_dict[sentence[-1]] = 1
        else:
            stop_tag_count_dict[sentence[-1]] += 1
    
    # Fill in initial prob dict with num of starting/ending tags divided by total sentence num
    for tag in start_tag_count_dict:
        transition_dict[('START',tag)] = start_tag_count_dict[tag] / len(input_list)
    for tag in stop_tag_count_dict:
        transition_dict[(tag,'STOP')] = stop_tag_count_dict[tag] / len(input_list)
    
    return transition_dict

# -----------------------------------------------------------------------------------------------------------------------------

# function that takes in observed variable x and hidden state y
# returns transition parameter q(yi|yi-1)
def transition(transition_dict, yi_minus_1, yi):
    if (yi_minus_1, yi) not in transition_dict.keys():
        transition_dict[(yi_minus_1, yi)] = 0

    result = transition_dict[(yi_minus_1, yi)]

    return result
    
# -----------------------------------------------------------------------------------------------------------------------------

# creates the table of pi values
def viterby_first_order(word_sequence, tags_list, emission_dict, transition_dict):
    pi_dict = {}
    
    # =============================================================================================================================
    # initialization
    pi_dict[0] = {}
        
    pi_dict[0]["START"] = math.log(1)
    
    for word_index in range(1, len(word_sequence)+1): # index for each word in each sentence
        for tag in tags_list: # each tag
            if word_index not in pi_dict.keys():
                pi_dict[word_index] = {}

            pi_dict[word_index][tag] = float('-inf') # initialize pi(j, u) = 0 for all j and u

    # for the first word, transition from "START" to v
    x_1 = word_sequence[0]
    
    for v in tags_list:
        pi = pi_dict[0]["START"]
        trans = transition(transition_dict, "START", v)
        emi = emission(emission_dict, tags_list, x_1, v)

        if trans != 0:
            log_trans = math.log(trans)
        else:
            log_trans = float('-inf')

        if emi != 0:
            log_emi = math.log(emi)
        else:
            log_emi = float('-inf')

        current_pi_val = pi + log_trans + log_emi

        pi_dict[1][v] = current_pi_val
        
    # =============================================================================================================================
    # intermediate steps
    # for each observed variable
    for j in range(1, len(word_sequence)):
        x_j_plus_1 = word_sequence[j] # refers to the jth word (to calculate emission)
        
        # for each hidden state v
        for v in tags_list:
            
            # pi(j+1, v) = max over all u { pi(j,u) * transition(u, v) * emissision(x_j_plus_1, v) }
            max_pi_val = float('-inf')
            
            for u in tags_list:
                pi = pi_dict[j][u]
                trans = transition(transition_dict, u, v)
                emi = emission(emission_dict, tags_list, x_j_plus_1, v)

                if trans != 0:
                    log_trans = math.log(trans)
                else:
                    log_trans = float('-inf')
                
                if emi != 0:
                    log_emi = math.log(emi)
                else:
                    log_emi = float('-inf')
                
                current_pi_val = pi + log_trans + log_emi

                # save the value that maximises
                if (current_pi_val > max_pi_val):
                    max_pi_val = current_pi_val
        
            pi_dict[j+1][v] = max_pi_val
    
    # =============================================================================================================================
    # final step
    max_pi_val = float('-inf')
    
    # for each hidden state u
    for u in tags_list:
        pi = pi_dict[len(word_sequence)][u]
        trans = transition(transition_dict, u, "STOP")
        
        if trans != 0:
            log_trans = math.log(trans)
        else:
            log_trans = float('-inf')

        current_pi_val = pi + log_trans
        
        # save the value that maximises
        if (current_pi_val > max_pi_val):
            max_pi_val = current_pi_val
        
    pi_dict[len(word_sequence) + 1] = {}
    pi_dict[len(word_sequence) + 1]["STOP"]  = max_pi_val
    
    # =============================================================================================================================
    # BACKTRACKING
    decoding_list = []
    # check final layer argmax
    argmax = float('-inf')
    currentmax = float('-inf')
    argmax_tag = "NO_TAG_FOUND"
    
    # start from last hidden variable
    for u in tags_list:
        pi = pi_dict[len(pi_dict)-2][u]
        trans = transition(transition_dict, u, "STOP")
        
        if trans != 0:
            log_trans = math.log(trans)
        else:
            log_trans = float("-inf")
        
        currentmax = pi + log_trans
        
        if currentmax > argmax:
            argmax = currentmax
            argmax_tag = u
        
    decoding_list.append(argmax_tag)
    
    
    # Backtrack rest of pi_dict
    for j in range(len(pi_dict)-3, 0, -1):
        
        argmax = float('-inf')
        currentmax = 0
        argmax_index = 0
    
        for u in tags_list:
    
            pi = pi_dict[j][u]
            trans = transition(transition_dict, u, decoding_list[-1])
            
            if trans != 0:
                log_trans = math.log(trans)
            else:
                log_trans = float("-inf")

            currentmax = pi + log_trans

            if currentmax > argmax:
                argmax = currentmax
                argmax_tag = u
        
        decoding_list.append(argmax_tag)
        
    decoding_list = decoding_list[::-1]
    
    return decoding_list

In [41]:
def viterbi_prediction(training_filename, test_filename, result_filename, k=1):
    training_word_sequences, training_tag_sequences = read_training_data(training_filename)
    test_word_sequences = read_test_data(test_filename)
    
    emission_dict, tags_list = create_emission_dict_tags_list(training_filename, k)
    transition_dict = create_transition_dict(training_tag_sequences)

    tag_sequences = []

    for i in range(len(test_word_sequences)):
        most_likely_tags = viterby_first_order(test_word_sequences[i], tags_list, emission_dict, transition_dict)
        tag_sequences.append(most_likely_tags)
    
    # some words may still be tagged "NO_TAG_FOUND". For those words, select the tag that 
    # gives the highest emission value
    for sentence_index in range(0, len(test_word_sequences)):
        for word_index in range(0, len(test_word_sequences[sentence_index])):
            # check the tag that was predicted for each word
            checking_word = test_word_sequences[sentence_index][word_index]
            checking_tag = tag_sequences[sentence_index][word_index]

            if (checking_tag == "NO_TAG_FOUND"):
                new_predicted_tag = ""
                
                # if the tag is "NO_TAG_FOUND", select the tag that gives the highest emission values
                highest_emission_value = 0
                
                for tag in tags_list:
                    current_emission_value = emission(emission_dict, tags_list, checking_word, tag)

                    if current_emission_value > highest_emission_value:
                        highest_emission_value = current_emission_value
                        new_predicted_tag = tag
                
                tag_sequences[sentence_index][word_index] = new_predicted_tag

        
    write_result(result_filename, test_word_sequences, tag_sequences)
    
    return

In [42]:
# perform prediction for the EN dataset
viterbi_prediction("EN/train", "EN/dev.in", "EN/dev.p2.out")

# evaluate prediction for the EN dataset
!python "evalResult.py" "EN/dev.out" "EN/dev.p2.out"


#Entity in gold data: 802
#Entity in prediction: 793

#Correct Entity : 554
Entity  precision: 0.6986
Entity  recall: 0.6908
Entity  F: 0.6947

#Correct Sentiment : 500
Sentiment  precision: 0.6305
Sentiment  recall: 0.6234
Sentiment  F: 0.6270


In [43]:
# perform prediction for the FR dataset
viterbi_prediction("FR/train", "FR/dev.in", "FR/dev.p2.out")

# evaluate prediction for the FR dataset
!python "evalResult.py" "FR/dev.out" "FR/dev.p2.out"


#Entity in gold data: 238
#Entity in prediction: 61

#Correct Entity : 37
Entity  precision: 0.6066
Entity  recall: 0.1555
Entity  F: 0.2475

#Correct Sentiment : 20
Sentiment  precision: 0.3279
Sentiment  recall: 0.0840
Sentiment  F: 0.1338


# Part 3

In [60]:
def convert_tuple_dict_to_dict_of_dicts(tuple_dict):
    dict_of_dicts = {}
    
    for key in tuple_dict.keys():
        if key[0] not in dict_of_dicts:
            dict_of_dicts[key[0]] = {}
            
        dict_of_dicts[key[0]][key[1]] = tuple_dict[key]
        
    return dict_of_dicts

In [63]:
def create_transition_dict_second_order(input_list):
    
    # Create transition dict
    # ========================
    transition_dict = {}
    
    for sentence in input_list:
        
        # ONLY RUN IF SENTENCE HAS MORE THAN 1 WORD
        if len(sentence) > 1:
            
            for i in range(len(sentence) - 2):

                current_state = sentence[i]
                next_state = sentence[i + 1]
                next_next_state = sentence[i + 2]

                if current_state not in transition_dict:
                    transition_dict[current_state] = {}

                if next_state not in transition_dict[current_state]:
                    transition_dict[current_state][next_state] = {}

                if next_next_state not in transition_dict[current_state][next_state]:       
                    transition_dict[current_state][next_state][next_next_state] = 0

                transition_dict[current_state][next_state][next_next_state] += 1

                
    for current_state in transition_dict:
        
        for next_state in transition_dict[current_state]:
            
            total_count = sum(transition_dict[current_state][next_state].values())
            
            for next_next_state in transition_dict[current_state][next_state]:
                transition_dict[current_state][next_state][next_next_state] /= total_count
           
        
    # Create initial probability dict
    # ========================
    start_tag_count_dict = {}
    stop_tag_count_dict = {}
    
    sentence_w_one_word = {}
    
    # Get num of starting tags that appear
    for sentence in input_list:
        
        # ONLY RUN IF SENTENCE HAS MORE THAN 1 WORD
        if len(sentence) > 1:
            
            if sentence[0] not in start_tag_count_dict:
                start_tag_count_dict[sentence[0]] = {}

            if sentence[1] not in start_tag_count_dict[sentence[0]]:
                start_tag_count_dict[sentence[0]][sentence[1]] = 1
            else:
                start_tag_count_dict[sentence[0]][sentence[1]] += 1


            if sentence[-2] not in stop_tag_count_dict:
                stop_tag_count_dict[sentence[-2]] = {}

            if sentence[-1] not in stop_tag_count_dict[sentence[-2]]:
                stop_tag_count_dict[sentence[-2]][sentence[-1]] = 1
            else:
                stop_tag_count_dict[sentence[-2]][sentence[-1]] += 1
        
        else:
            # FOR SENTENCES WITH ONLY ONE WORD
            word = sentence[0]
            
            if word not in sentence_w_one_word:
                sentence_w_one_word[word] = 0
            sentence_w_one_word[word] += 1



    
    # Fill in initial prob dict with num of starting/ending tags divided by total sentence num
    for tag in start_tag_count_dict:
        for tag2 in start_tag_count_dict[tag]:
            
            if 'START' not in transition_dict:
                transition_dict['START'] = {}
            if tag not in transition_dict['START']:
                transition_dict['START'][tag] = {}
                
            transition_dict['START'][tag][tag2] = start_tag_count_dict[tag][tag2] / len(input_list)
        
        for tag in stop_tag_count_dict:
             for tag2 in stop_tag_count_dict[tag]:
                    
                if tag not in transition_dict:
                    transition_dict[tag] = {}
                if tag2 not in transition_dict[tag]:
                    transition_dict[tag][tag2] = {}
            
                transition_dict[tag][tag2]['STOP'] = stop_tag_count_dict[tag][tag2] / len(input_list)
                
    # Fill in initial prob dict with sentences with only ONE WORD
    for tag in sentence_w_one_word:
        if 'START' not in transition_dict:
            transition_dict['START'] = {}
        if 'START' not in transition_dict['START']:
            transition_dict['START']['START'] = {}
        if tag not in transition_dict['START']['START']:
            transition_dict['START']['START'][tag] = sentence_w_one_word[tag] / len(input_list)
        
        if tag not in transition_dict['START']:
            transition_dict['START'][tag] = {}
        if 'STOP' not in transition_dict['START'][tag]:
            transition_dict['START'][tag]['STOP'] = sentence_w_one_word[tag] / len(input_list)
            
    return transition_dict

In [64]:
training_word_sequences, training_tag_sequences = read_training_data("EN/train")

test = [['a','c','a','c'],['a','c','a','d'],['c','b','a','d','d'],['a','c','b'],['c','a','c'],['c','b','c'],['a'],['a']]

create_transition_dict_second_order(test)['START']

{'a': {'c': 0.375, 'STOP': 0.25},
 'c': {'b': 0.25, 'a': 0.125},
 'START': {'a': 0.25}}

In [65]:
training_word_sequences, training_tag_sequences = read_training_data("EN/train")
create_transition_dict_second_order(training_tag_sequences)

{'O': {'O': {'O': 0.5,
   'B-INTJ': 0.03289473684210526,
   'B-PP': 0.019736842105263157,
   'B-SBAR': 0.015350877192982455,
   'B-NP': 0.2916666666666667,
   'B-VP': 0.08771929824561403,
   'B-ADVP': 0.041666666666666664,
   'B-ADJP': 0.010964912280701754,
   'STOP': 0.23049001814882034},
  'B-INTJ': {'O': 0.4,
   'I-INTJ': 0.31666666666666665,
   'B-VP': 0.05,
   'B-NP': 0.16666666666666666,
   'B-ADVP': 0.016666666666666666,
   'B-PP': 0.03333333333333333,
   'B-INTJ': 0.016666666666666666,
   'STOP': 0.029038112522686024},
  'B-PP': {'B-NP': 0.6976744186046512,
   'B-VP': 0.11627906976744186,
   'O': 0.18604651162790697},
  'B-NP': {'O': 0.09618320610687023,
   'I-NP': 0.37251908396946565,
   'B-VP': 0.4259541984732824,
   'B-PP': 0.035114503816793895,
   'B-SBAR': 0.0015267175572519084,
   'B-NP': 0.02900763358778626,
   'B-ADJP': 0.013740458015267175,
   'B-ADVP': 0.024427480916030534,
   'B-INTJ': 0.0015267175572519084,
   'STOP': 0.007259528130671506},
  'B-VP': {'B-PP': 0.1315