In [2]:
from fileReader import readTrainingFile, readTestFile
import math
import re

def trainClassifier():

    italian = readTrainingFile('../Data/Input/LangId.train.Italian', "ital")
    french = readTrainingFile('../Data/Input/LangId.train.French', "fra")
    english = readTrainingFile('../Data/Input/LangId.train.English', "eng")

    bigrams = {}

    englishWords = english.split()
    for word in range(0, len(englishWords) - 1):
        bigram = (englishWords[word], englishWords[word + 1])
        if bigram in bigrams:
            bigrams[bigram][0] += 1
        else:
            bigrams[bigram] = [1, 0, 0]

    italWords = italian.split()
    for word in range(0, len(italWords) - 1):
        bigram = (italWords[word], italWords[word + 1])
        if bigram in bigrams:
            bigrams[bigram][1] += 1
        else:
            bigrams[bigram] = [0, 1, 0]

    fraWords = french.split()
    for word in range(0, len(fraWords) - 1):
        bigram = (fraWords[word], fraWords[word + 1])
        if bigram in bigrams:
            bigrams[bigram][2] += 1
        else:
            bigrams[bigram] = [0, 0, 1]

    probBigramGivenClass = {}
    uniqueBigrams = len(bigrams)
    laplace = 1.0

    for bigram in bigrams:
        probBigramGivenClass[(bigram, 0)] = math.log((bigrams[bigram][0] + laplace) / (len(english) + laplace * uniqueBigrams))
        probBigramGivenClass[(bigram, 1)] = math.log((bigrams[bigram][1] + laplace) / (len(italian) + laplace * uniqueBigrams))
        probBigramGivenClass[(bigram, 2)] = math.log((bigrams[bigram][2] + laplace) / (len(french) + laplace * uniqueBigrams))

    numBigramsPerClass = (len(english), len(italian), len(french))

    return probBigramGivenClass, uniqueBigrams, numBigramsPerClass

In [5]:
def naiveBayesClassifer():
    laplace = 1.0
    probBigramGivenClass, uniqueBigrams, numBigramsPerClass = trainClassifier()
    count = 0

    classifiedLabels = []

    for line in readTestFile():
        probEngGivenWord = math.log(1.0/3.0)
        probItalGivenWord = math.log(1.0/3.0)
        probFraGivenWord = math.log(1.0/3.0)
        lineSplit = line.split()
        for word in range(0, len(lineSplit) - 1):
            bigram = (lineSplit[word], lineSplit[word + 1])
            if (bigram, 0) not in probBigramGivenClass:
                continue
            if probBigramGivenClass[(bigram, 0)] == 0:
                probEngGivenWord += math.log(laplace / (numBigramsPerClass[0] + uniqueBigrams * laplace))
            else:
                probEngGivenWord += probBigramGivenClass[(bigram, 0)]
            if probBigramGivenClass[(bigram, 1)] == 0:
                probItalGivenWord += math.log(laplace / (numBigramsPerClass[1] + uniqueBigrams * laplace))
            else:
                probItalGivenWord += probBigramGivenClass[(bigram, 1)]
            if probBigramGivenClass[(bigram, 2)] == 0:
                probFraGivenWord += math.log(laplace / (numBigramsPerClass[2] + uniqueBigrams * laplace))
            else:
                probFraGivenWord += probBigramGivenClass[(bigram, 2)]

        if probEngGivenWord >= probItalGivenWord and probEngGivenWord >= probFraGivenWord:
            classifiedLabels.append("English")
        elif probItalGivenWord >= probEngGivenWord and probItalGivenWord >= probFraGivenWord:
            classifiedLabels.append("Italian")
        else:
            classifiedLabels.append("French")

    return classifiedLabels


In [8]:
def test_accuracy():

    predicted_list = naiveBayesClassifer()
    actual_list = []

    with open('../Data/Validation/labels.sol', 'r') as file:
        for line in file.readlines():
            line = re.sub('[0-9]+ (.*)\n', '\\1', line)
            actual_list.append(line)

    correct = 0.0

    with open('../Data/Output/wordLangId.out', 'w') as file:
        for i in range(0, len(predicted_list)):
            file.write(predicted_list[i] + '\n')
            if predicted_list[i] == actual_list[i]:
                correct += 1.0
    

    return correct / len(predicted_list)


test_accuracy()

0.9833333333333333