# In [323]:
def preprocess_sentence(text):
    """Convert one raw line into a list of characters framed by sentence markers.

    Processing steps, in order:

    1. Strip leading and trailing whitespace.
    2. If the line starts with "(", drop that character and the one after it.
    3. Drop one sentence-final punctuation mark ('.', '?', '!' or ';') and,
       if present, the single space that preceded it.
    4. Wrap the remaining characters in '<s>' and '</s>' so that bigrams
       can capture sentence-initial and sentence-final characters.

    Args:
        text: A raw line of text, possibly ending in a newline.

    Returns:
        [] for a blank line; otherwise ['<s>', <characters...>, '</s>'].
        A line that consists only of removable characters (for example
        "." or "(") yields ['<s>', '</s>'].
    """
    letters = list(text.strip())
    if not letters:
        return []

    if letters[0] == "(":
        letters = letters[2:]

    # Every access to letters[-1] is guarded: the slice above and the pops
    # below can leave the list empty for very short lines.
    if letters and letters[-1] in ('.', '?', '!', ';'):
        letters.pop()
        # remove the whitespace that separated the punctuation from the text
        if letters and letters[-1] == ' ':
            letters.pop()

    # add start and end of the sentence markers for bigrams
    return ['<s>'] + letters + ['</s>']

def load_and_preprocess_data(file_path):
    """Read a training file and preprocess it line by line.

    Each line is passed through preprocess_sentence; lines whose
    preprocessed form is empty are left out of the result.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one preprocessed sentence (itself a list) per kept line.
    """
    with open(file_path, "r") as handle:
        sentences = map(preprocess_sentence, handle)
        # keep only lines that still have content after preprocessing
        return [tokens for tokens in sentences if len(tokens) > 0]

# Locations of the per-language training corpora.
eng_train_path = "../Data/Input/LangId.train.English"
fre_train_path = "../Data/Input/LangId.train.French"
it_train_path = "../Data/Input/LangId.train.Italian"

# Preprocessed training sentences, one list per language.
eng_preprocessed = load_and_preprocess_data(eng_train_path)
fre_preprocessed = load_and_preprocess_data(fre_train_path)
it_preprocessed = load_and_preprocess_data(it_train_path)

def preprocess_test_sentence(text):
    """Convert one raw test line into a list of characters framed by markers.

    Processing steps, in order:

    1. Strip leading and trailing whitespace.
    2. If the line starts with "(", drop that character and the one after it.
    3. Drop one sentence-final punctuation mark ('.', '?', '!' or ';') and,
       if present, the single space that preceded it.
    4. Wrap the remaining characters in '<s>' and '</s>' so that bigrams
       can capture sentence-initial and sentence-final characters.

    Args:
        text: A raw line of text, possibly ending in a newline.

    Returns:
        [] for a blank line; otherwise ['<s>', <characters...>, '</s>'].
        A line that consists only of removable characters (for example
        "." or "(") yields ['<s>', '</s>'].
    """
    letters = list(text.strip())
    if not letters:
        return []

    if letters[0] == "(":
        letters = letters[2:]

    # Every access to letters[-1] is guarded: the slice above and the pops
    # below can leave the list empty for very short lines.
    if letters and letters[-1] in ('.', '?', '!', ';'):
        letters.pop()
        # remove the whitespace that separated the punctuation from the text
        if letters and letters[-1] == ' ':
            letters.pop()

    # add start and end of the sentence markers for bigrams
    return ['<s>'] + letters + ['</s>']

def load_and_preprocess_test_data(file_path):
    """Read a test file and preprocess it line by line.

    Each line is passed through preprocess_test_sentence; lines whose
    preprocessed form is empty are left out of the result.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one preprocessed sentence (itself a list) per kept line.
    """
    with open(file_path, "r") as handle:
        sentences = map(preprocess_test_sentence, handle)
        # keep only lines that still have content after preprocessing
        return [tokens for tokens in sentences if len(tokens) > 0]

# Load the sentences to classify and apply the character-level preprocessing.
# Blank lines are skipped by the loader, so the position of a sentence in
# test_preprocessed can differ from its line number in the file.
test_data_path = "../Data/Validation/LangId.test"
test_preprocessed = load_and_preprocess_test_data(test_data_path)

def generate_bigram(data):
    """Count adjacent token pairs over a collection of sentences.

    Args:
        data: An iterable of sentences, each a sequence of tokens.

    Returns:
        A nested dict where bigram[current][following] is the number of
        times `following` came directly after `current`. Tokens that never
        have a successor do not appear as outer keys.
    """
    bigram = {}

    for sentence in data:
        # pair every token with the one that directly follows it
        for current_token, following_token in zip(sentence, sentence[1:]):
            followers = bigram.setdefault(current_token, {})
            followers[following_token] = followers.get(following_token, 0) + 1

    return bigram

# Build one bigram count table per training language.
eng_bigram, fre_bigram, it_bigram = map(
    generate_bigram,
    (eng_preprocessed, fre_preprocessed, it_preprocessed),
)

def calculate_probability(model, word_prev, word_n):
    """Return the unsmoothed estimate of P(word_n | word_prev).

    Args:
        model: Nested count dict, model[previous][following] -> count.
        word_prev: The conditioning (preceding) token.
        word_n: The token whose probability is requested.

    Returns:
        count(word_prev, word_n) / count(word_prev, *) as a float, or 0.0
        when word_prev is not in the model or has no recorded followers.
        A pair that was never observed also yields 0.0.
    """
    if word_prev not in model:
        return 0.0

    followers = model[word_prev]
    total = sum(followers.values())
    if total > 0:
        return followers.get(word_n, 0) / total

    # an entry with no counts would otherwise divide by zero
    return 0.0

output_file_path = "../Data/Output/letterLangId.out"

# Candidate languages and their bigram tables. The insertion order matters:
# max() below returns the first language that reaches the highest score.
language_models = {
    "English": eng_bigram,
    "French": fre_bigram,
    "Italian": it_bigram,
}

with open(output_file_path, "w+") as output_file:
    # classify the test sentences one at a time
    for idx, sentence in enumerate(test_preprocessed):
        # A language's score is the SUM of the conditional bigram
        # probabilities over the sentence (not a product or log-likelihood).
        prob_dict = dict.fromkeys(language_models, 0)

        for prev_symbol, next_symbol in zip(sentence, sentence[1:]):
            for language, model in language_models.items():
                prob_dict[language] += calculate_probability(model, prev_symbol, next_symbol)

        # write "<1-based sentence number> <best scoring language>"
        lang = max(prob_dict, key=prob_dict.get)
        output_file.write(f"{idx + 1} {lang}\n")

def read_file(file_path):
    """Collect the second whitespace-separated field of every line in a file.

    Lines with fewer than two fields (including blank lines) are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of strings, one per line that had at least two fields.
    """
    with open(file_path, "r") as handle:
        rows = (raw_line.split() for raw_line in handle)
        # index 1 is the second field; skip rows that do not have one
        return [fields[1] for fields in rows if len(fields) > 1]

actual_path = "../Data/Output/letterLangId.out"
expected_path = "../Data/Validation/labels.sol"

# Read the label column of the predictions first, then of the reference file.
actual_list, expected_list = map(read_file, (actual_path, expected_path))


# In [324]:
import pandas as pd

# Put predicted and expected labels side by side, one row per sentence.
# NOTE(review): the DataFrame constructor needs both lists to have the same
# length — confirm the prediction file and the label file always line up.
data = {"actual": actual_list, "expected": expected_list}
df = pd.DataFrame(data)

def check_similarity(actual, expected):
    """Return 1 when the two values compare equal, otherwise 0."""
    if actual == expected:
        return 1
    return 0

def _row_score(row):
    """Score one DataFrame row: 1 if prediction and label match, else 0."""
    return check_similarity(row["actual"], row["expected"])


df["similarity_score"] = df.apply(_row_score, axis=1)

# accuracy = number of matching rows / number of rows
correct_predictions = sum(df["similarity_score"])
accuracy = correct_predictions / len(df)
accuracy

0.9833333333333333