# In [593]:
def preprocess_sentence(text):
    """Tokenize one sentence and wrap it in bigram boundary markers.

    The text is split on whitespace. A single trailing '.', '?' or '!' is
    removed from the last token. If the last token consisted of nothing but
    that punctuation mark (e.g. "hello world ."), the token is dropped
    entirely rather than being kept as an empty string.

    Args:
        text: The raw sentence (for example one line of a corpus file).

    Returns:
        ``['<s>'] + tokens + ['</s>']``, or ``[]`` when no tokens remain.
    """
    words = text.split()
    if words and words[-1].endswith(('.', '?', '!')):
        stripped = words[-1][:-1]
        if stripped:
            words[-1] = stripped
        else:
            # Stand-alone punctuation token: remove it instead of leaving ''
            # behind, which would end up as a bogus word in the bigram counts.
            words.pop()
    if not words:
        return []
    return ['<s>'] + words + ['</s>']  # start and end markers for bigrams

def load_and_preprocess_data(file_path):
    """Read a corpus file and return its lines as preprocessed token lists.

    Every line of ``file_path`` is passed through ``preprocess_sentence``;
    lines for which that returns an empty result are left out.

    Args:
        file_path: Path of the text file to read, one sentence per line.

    Returns:
        List of the non-empty ``preprocess_sentence`` results, in file order.
    """
    with open(file_path, "r") as corpus:
        candidates = [preprocess_sentence(raw_line) for raw_line in corpus.readlines()]
    # Keep only the sentences that still contain something after preprocessing.
    return [tokens for tokens in candidates if tokens]

# Training corpora, one file per language (English, French, Italian).
eng_train_path, fre_train_path, it_train_path = (
    "../Data/Input/LangId.train.English",
    "../Data/Input/LangId.train.French",
    "../Data/Input/LangId.train.Italian",
)

# Load the corpora in the same order: English, then French, then Italian.
eng_preprocessed, fre_preprocessed, it_preprocessed = (
    load_and_preprocess_data(corpus_path)
    for corpus_path in (eng_train_path, fre_train_path, it_train_path)
)

test_data_path = "../Data/Validation/LangId.test"
# Tokenize the test sentences. Every line of the file yields exactly one entry
# in test_preprocessed, so entry i always corresponds to line i of the file.
with open(test_data_path, "r") as test_content:
    test_preprocessed = []
    for line in test_content:
        words = line.split()
        # Remove a stand-alone punctuation token at the end of the sentence.
        # The `words and` guard keeps a blank line from raising IndexError;
        # such a line becomes just the two boundary markers.
        if words and words[-1] in ('.', '?', '!'):
            words = words[:-1]
        # Add start and end markers for bigram
        words = ['<s>'] + words + ['</s>']
        test_preprocessed.append(words)

def generate_bigram(data):
    """Count adjacent word pairs across a collection of tokenized sentences.

    Args:
        data: Iterable of sentences, each a list of tokens.

    Returns:
        A nested dict in which ``result[w1][w2]`` is the number of times
        ``w2`` immediately followed ``w1``.
    """
    counts = {}
    for tokens in data:
        # Pair every token with its successor; the final token has none.
        for left, right in zip(tokens, tokens[1:]):
            followers = counts.setdefault(left, {})
            followers[right] = followers.get(right, 0) + 1
    return counts

# Build one bigram count table per training language.
eng_bigram, fre_bigram, it_bigram = map(
    generate_bigram,
    (eng_preprocessed, fre_preprocessed, it_preprocessed),
)

def calculate_probability(model, word_prev, word_n):
    """Return the unsmoothed bigram estimate P(word_n | word_prev).

    Args:
        model: Nested count dict, ``model[prev][next] -> count``.
        word_prev: The conditioning (preceding) word.
        word_n: The word whose conditional probability is requested.

    Returns:
        The count of ``word_n`` after ``word_prev`` divided by the total count
        of everything seen after ``word_prev``. Returns 0.0 when
        ``word_prev`` is not in the model or its total count is not positive.
    """
    if word_prev not in model:
        return 0.0

    followers = model[word_prev]
    denominator = sum(followers.values())
    if denominator > 0:
        # An unseen successor contributes a count of 0, giving 0.0.
        return followers.get(word_n, 0) / denominator
    return 0.0  # nothing recorded after word_prev; avoid dividing by zero

# Prediction file to score; the second whitespace-separated field of each
# line is read as the predicted language.
result_file_path = "../Data/Output/wordLangId.out"

def read_language_predictions(file_path):
    """Read the predicted language from every line of a prediction file.

    Each non-blank line must have at least two whitespace-separated fields;
    the second one is taken as the language. Blank lines (for example a
    trailing empty line) are skipped instead of raising IndexError.

    Args:
        file_path: Path of the prediction file.

    Returns:
        List of language strings in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    result_list = []
    with open(file_path, "r") as result_file:
        for line in result_file:
            fields = line.split()
            if not fields:
                continue  # blank line: nothing to read
            result_list.append(fields[1])
    return result_list

# Predicted language for each line of the prediction file, in file order.
result_list = read_language_predictions(result_file_path)

# File of reference labels; the second whitespace-separated field of each
# line is read as the label.
actual_file_path = "../Data/Validation/labels.sol"

def read_actual_labels(file_path):
    """Read the reference label from every line of a labels file.

    Each non-blank line must have at least two whitespace-separated fields;
    the second one is taken as the label. Blank lines (for example a trailing
    empty line) are skipped instead of raising IndexError.

    Args:
        file_path: Path of the labels file.

    Returns:
        List of label strings in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    actual_list = []
    with open(file_path, "r") as ans_file:
        for line in ans_file:
            fields = line.split()
            if not fields:
                continue  # blank line: nothing to read
            actual_list.append(fields[1])
    return actual_list

# Reference label for each line of the labels file, in file order.
actual_list = read_actual_labels(actual_file_path)


# In [594]:
import pandas as pd

def create_dataframe(model_list, actual_list):
    """Build a two-column DataFrame from two parallel lists.

    Note the column naming: ``model_list`` is stored in the "actual" column
    and ``actual_list`` in the "expected" column.

    Args:
        model_list: Values for the "actual" column.
        actual_list: Values for the "expected" column.

    Returns:
        A ``pandas.DataFrame`` with columns "actual" and "expected".
    """
    return pd.DataFrame({"actual": model_list, "expected": actual_list})

# One row per entry: column "actual" holds the values read from the
# prediction file (result_list) and column "expected" the values read from
# the labels file (actual_list).
df = create_dataframe(result_list, actual_list)

def check_similarity(actual, expected):
    """Return 1 when the two values compare equal, otherwise 0."""
    return int(bool(actual == expected))

# Score each row: 1 when the "actual" and "expected" values match, else 0.
df["similarity_score"] = [
    check_similarity(predicted, reference)
    for predicted, reference in zip(df["actual"], df["expected"])
]
# Accuracy is the fraction of rows whose score is 1.
accuracy = sum(df["similarity_score"]) / len(df)
accuracy

# Out[594]: 0.9433333333333334