In [321]:
def preprocess_initial_data(input_path):
    """Read a training file and turn every line into a marked token list.

    Each line is split on whitespace. If the final token ends with '.',
    '?' or '!', that single trailing character is removed (a token that
    consists only of that character becomes the empty string). Non-empty
    lines are wrapped in '<s>' ... '</s>' markers; a blank line yields an
    empty list.

    Args:
        input_path: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, each a list of tokens.
    """
    sentence_end = ('.', '?', '!')
    sentences = []
    with open(input_path, "r") as corpus:
        for raw_line in corpus:
            tokens = raw_line.split()  # whitespace split, newline is discarded too
            if not tokens:
                # blank line: keep an empty entry so there is one result per line
                sentences.append([])
                continue
            if tokens[-1].endswith(sentence_end):
                tokens[-1] = tokens[-1][:-1]  # drop one trailing punctuation mark
            sentences.append(['<s>', *tokens, '</s>'])  # sentence boundary markers
    return sentences

# training files, one per language
english = "../Data/Input/LangId.train.English"
french = "../Data/Input/LangId.train.French"
italian = "../Data/Input/LangId.train.Italian"

# tokenize the three training files (English, French, Italian, in that order)
english_pre, french_pre, italian_pre = (
    preprocess_initial_data(corpus_path) for corpus_path in (english, french, italian)
)

def preprocess_test_data(input_path):
    """Read a test file and turn every line into a marked token list.

    Each line is split on whitespace; if the final token ends with '.',
    '?' or '!', that single trailing character is removed. Every line,
    including a blank one, is wrapped in '<s>' ... '</s>' markers, so a
    blank line yields ['<s>', '</s>'].

    Args:
        input_path: Path of the text file to read.

    Returns:
        A list with one token list per line of the file.
    """
    def mark_sentence(raw_line):
        # tokenize one line and add the sentence boundary markers
        tokens = raw_line.split()
        if tokens and tokens[-1].endswith(('.', '?', '!')):
            tokens[-1] = tokens[-1][:-1]  # drop one trailing punctuation mark
        return ['<s>', *tokens, '</s>']

    with open(input_path, "r") as test_file:
        return [mark_sentence(raw_line) for raw_line in test_file]

# location of the test data file
test_path = "../Data/Validation/LangId.test"

# tokenize the test file: every line becomes a '<s>' ... '</s>' token list
test_preprocessed = preprocess_test_data(test_path)

def create_bigram(data):
    """Count adjacent token pairs over a collection of tokenized sentences.

    Args:
        data: Iterable of sentences, each a list of tokens.

    Returns:
        A nested dict ``{first_word: {second_word: count}}`` where ``count``
        is how many times ``second_word`` directly follows ``first_word``.
    """
    pair_counts = {}
    for tokens in data:
        # pair every token with its successor; fewer than two tokens yields no pairs
        for first, second in zip(tokens, tokens[1:]):
            followers = pair_counts.setdefault(first, {})
            followers[second] = followers.get(second, 0) + 1
    return pair_counts

# bigram count tables ({first_word: {second_word: count}}), one per training language
english_bigram = create_bigram(english_pre)
french_bigram = create_bigram(french_pre)
italian_bigram = create_bigram(italian_pre)


In [322]:
import numpy as np
from math import log, exp
from sklearn.linear_model import LinearRegression

def count_frequencies(data):
    """Build the frequency-of-frequencies table of a bigram count model.

    Args:
        data: Nested dict ``{first_word: {second_word: count}}``.

    Returns:
        A tuple ``(count_dict, counts, frequencies)`` where
        ``count_dict[c]`` is the number of bigrams whose count is ``c``,
        ``counts`` lists the distinct counts in first-seen order, and
        ``frequencies[i]`` equals ``count_dict[counts[i]]``.
    """
    count_dict = {}
    for followers in data.values():
        for count in followers.values():
            count_dict[count] = count_dict.get(count, 0) + 1

    # dicts preserve insertion order, so the two parallel lists are read
    # straight off the table instead of being kept in sync with a linear
    # `in` / `.index()` scan for every single bigram
    counts = list(count_dict)
    frequencies = list(count_dict.values())

    return count_dict, counts, frequencies

def transform_data(counts, frequencies):
    """Shape (count, frequency) pairs for a regression on log-frequencies.

    Args:
        counts: Sequence of counts.
        frequencies: Sequence of positive numbers parallel to ``counts``.

    Returns:
        A tuple ``(counts, log_frequencies)``: ``counts`` as a NumPy array
        reshaped to a single column, and ``log_frequencies`` as a NumPy
        array holding the natural logarithm of each frequency.
    """
    count_column = np.array(counts)
    count_column = count_column.reshape(-1, 1)  # one row per sample, one feature
    log_frequencies = np.array(list(map(log, frequencies)))
    return count_column, log_frequencies

def smooth_model_data(data, count_dict, counts, frequencies):
    """Replace raw bigram counts with Good-Turing adjusted counts, in place.

    For a bigram seen ``c`` times the adjusted count is
    ``(c + 1) * N[c + 1] / N[c]`` with ``N[c] = count_dict[c]``. When
    ``c + 1`` is not a key of ``count_dict``, ``N[c + 1]`` is estimated as
    ``exp(prediction)`` from a linear regression fitted on
    ``(counts, frequencies)``.

    Args:
        data: Nested dict ``{first_word: {second_word: count}}``; its
            values are overwritten with the adjusted counts.
        count_dict: Maps each count occurring in ``data`` to the number of
            bigrams that have it.
        counts: Regression inputs, 2-D array-like with one column.
        frequencies: Regression targets parallel to ``counts``. Because
            the prediction is passed through ``exp``, these are expected
            to be log-frequencies.

    Returns:
        The same ``data`` object that was passed in.

    Raises:
        KeyError: If a count found in ``data`` is missing from ``count_dict``.
    """
    # fit linear regression model
    regression_model = LinearRegression().fit(counts, frequencies)
    estimated = {}  # regression estimate of N[c + 1], cached per c + 1

    for followers in data.values():
        for word, count in followers.items():
            next_count = count + 1
            if next_count in count_dict and count in count_dict:
                next_frequency = count_dict[next_count]
            else:
                if next_count not in estimated:
                    # predict() returns an array holding one value; .item()
                    # extracts it as a scalar, since handing the array itself
                    # to math.exp relies on a deprecated NumPy conversion
                    prediction = regression_model.predict([[next_count]]).item()
                    estimated[next_count] = exp(prediction)
                next_frequency = estimated[next_count]
            # only values are replaced, so mutating while iterating is safe
            followers[word] = next_count * next_frequency / count_dict[count]

    return data

def estimate_unseen_frequency(count_dict, model_size):
    """Estimate the Good-Turing count given to each unseen bigram.

    Computes ``N1 / N0`` where ``N1 = count_dict[1]`` is the number of
    bigrams seen exactly once and
    ``N0 = model_size ** 2 - (number of distinct seen bigrams)``.

    Args:
        count_dict: Maps a count to the number of bigrams having that count.
        model_size: Number of distinct words; ``model_size ** 2`` is used as
            the number of possible bigrams.

    Returns:
        The estimate as a float; 0.0 when no bigram is left unseen.
    """
    # total number of distinct seen bigrams, taken from the table itself
    # rather than from a `frequencies` name that is not defined in this scope
    seen_bigrams = sum(count_dict.values())
    unseen_bigrams = model_size * model_size - seen_bigrams
    if unseen_bigrams <= 0:
        return 0.0  # nothing unseen: avoid dividing by zero
    return count_dict.get(1, 0) / unseen_bigrams

def smooth_gt_model(data):
    """Apply Good-Turing smoothing to a bigram count model, in place.

    The counts in ``data`` are replaced by their adjusted values and an
    extra top-level key ``"unseen_frequency"`` is added next to the word
    keys. Because of that extra non-dict entry and the overwritten counts,
    the function is meant to be called once per freshly built model.

    Args:
        data: Nested dict ``{first_word: {second_word: count}}``.

    Returns:
        The same ``data`` object, smoothed and extended with
        ``"unseen_frequency"``.
    """
    count_dict, counts, frequencies = count_frequencies(data)
    count_column, log_frequencies = transform_data(counts, frequencies)
    # the regression must be fitted on the log-frequencies: smooth_model_data
    # applies exp() to whatever the regression predicts
    data = smooth_model_data(data, count_dict, count_column, log_frequencies)
    # len(data) is evaluated before the new key is inserted, so it counts
    # only the first-word entries
    data["unseen_frequency"] = estimate_unseen_frequency(count_dict, len(data))
    return data


In [323]:
result_file_path = "../Data/Output/wordLangId2.out"
answer_file_path = "../Data/Validation/labels.sol"


def _read_labels(path):
    """Return the second whitespace-separated field of every non-blank line.

    Blank lines (for example a trailing empty line) are skipped; a line
    holding a single field still raises IndexError.
    """
    with open(path, "r") as label_file:
        return [fields[1] for fields in map(str.split, label_file) if fields]


# labels read from the output file and from the solution file
result_list = _read_labels(result_file_path)
answer_list = _read_labels(answer_file_path)


In [324]:
import pandas as pd
data = {"actual": result_list, "expected": answer_list}
df = pd.DataFrame(data)


def check_similarity(actual, expected):
    """Return 1 when the two labels are equal, otherwise 0."""
    return 1 if actual == expected else 0


# vectorized column comparison: 1 where the labels match, 0 otherwise
# (same scores as calling check_similarity on every row)
df["similarity_score"] = (df["actual"] == df["expected"]).astype(int)

# fraction of rows whose two labels agree
accuracy = df["similarity_score"].mean()
accuracy

0.9833333333333333