# In [56]:
from fileReader import readTrainingFile, readTestFile
import math 
import re

def goodTuring(bigrams):
    """Replace raw bigram counts with Good-Turing style adjusted counts.

    The table is modified in place and also returned. In addition to the
    adjusted counts, the reserved key ``0`` is set to the probability that
    is assigned to any bigram that was never observed.

    Args:
        bigrams: dict mapping each observed bigram to its raw count.

    Returns:
        The same dict object, with every count replaced by its adjusted
        value and ``bigrams[0]`` holding the unseen-bigram probability.

    Raises:
        ValueError: if no bigram has a count of exactly 1 (this includes
            an empty table), because the unseen probability is derived
            from the number of singletons.
        ZeroDivisionError: if the table holds exactly one distinct bigram,
            because the unseen-probability denominator is then zero.
    """
    # Snapshot the observed entries before the reserved key 0 is written, so
    # the unseen probability can never be mistaken for a count (1.0 == 1 in
    # Python) and pushed through the smoothing loop below.
    observed = list(bigrams.items())

    # Frequency of frequencies: N_c = number of distinct bigrams seen c times.
    # A dict gives constant-time lookups instead of scanning parallel lists.
    freqOfFreq = {}
    for _, count in observed:
        freqOfFreq[count] = freqOfFreq.get(count, 0) + 1

    if 1 not in freqOfFreq:
        raise ValueError("goodTuring needs at least one bigram with a count of 1")

    # Pstar(unseen) = N_1 spread over (distinct bigrams)^2 - (distinct bigrams).
    numTypes = len(observed)
    bigrams[0] = freqOfFreq[1] / (numTypes * numTypes - sum(freqOfFreq.values()))

    # Least-squares line through the points (c, log N_c), written out by hand
    # because no regression package is used. It supplies a smoothed N_c for
    # counts whose successor c + 1 was never observed.
    distinctCounts = list(freqOfFreq)
    logFreqs = [math.log(freqOfFreq[c]) for c in distinctCounts]
    xmean = sum(distinctCounts) * 1.0 / len(distinctCounts)
    ymean = sum(logFreqs) * 1.0 / len(logFreqs)
    covariance = sum((c - xmean) * (y - ymean) for c, y in zip(distinctCounts, logFreqs))
    variance = sum((c - xmean) ** 2 for c in distinctCounts)

    # b_1 (slope) and b_0 (intercept) of the fitted line. When every bigram
    # shares one count the variance is zero; fall back to a flat line rather
    # than dividing by zero.
    b_1 = covariance / variance if variance else 0.0
    b_0 = ymean - b_1 * xmean

    for bigram, count in observed:
        seenThisOften = freqOfFreq[count]
        seenOnceMore = freqOfFreq.get(count + 1)
        if seenOnceMore is not None:
            # Classic Good-Turing estimate: c* = (c + 1) * N_{c+1} / N_c
            bigrams[bigram] = seenOnceMore * (count + 1) / seenThisOften
        else:
            # N_{c+1} is missing, so use the fitted line for both N_{c+1} and N_c.
            # NOTE(review): this adds the raw count and divides by the observed
            # N_c, which is not the textbook (c + 1) * S(c + 1) / S(c); the
            # formula is kept as-is so existing results do not change.
            bigrams[bigram] = count + math.exp(b_0 + b_1 * (count + 1)) * (count + 1) / math.exp((b_0 + b_1 * count)) / seenThisOften

    return bigrams


def _buildLanguageModel(text):
    """Count the words and adjacent-word bigrams of `text`, then smooth the bigrams.

    Returns a pair ``(bigram_table, word_counts)`` where ``bigram_table`` is
    the result of ``goodTuring`` applied to the raw bigram counts and
    ``word_counts`` maps each whitespace-separated token to its raw count.
    """
    tokens = text.split()

    # Every pair of neighbouring tokens, in order of appearance.
    bigramCounts = {}
    for pair in zip(tokens, tokens[1:]):
        bigramCounts[pair] = bigramCounts.get(pair, 0) + 1

    wordCounts = {}
    for token in tokens:
        wordCounts[token] = wordCounts.get(token, 0) + 1

    return goodTuring(bigramCounts), wordCounts


def trainClassifier():
    """Build smoothed bigram tables and word counts for the three languages.

    Reads the Italian, French and English training files through
    ``readTrainingFile`` and builds one model per language.

    Returns:
        Tuple ``(eng_bigrams, ital_bigrams, fra_bigrams, eng_words,
        ital_words, fra_words)``: the three smoothed bigram tables followed
        by the three raw word-count tables.
    """
    # Files are read in the same order as before: Italian, French, English.
    italian = readTrainingFile('../Data/Input/LangId.train.Italian', "ital")
    french = readTrainingFile('../Data/Input/LangId.train.French', "fra")
    english = readTrainingFile('../Data/Input/LangId.train.English', "eng")

    eng_bigrams, eng_words = _buildLanguageModel(english)
    ital_bigrams, ital_words = _buildLanguageModel(italian)
    fra_bigrams, fra_words = _buildLanguageModel(french)

    return (eng_bigrams, ital_bigrams, fra_bigrams, eng_words, ital_words, fra_words)
    

# In [57]:

def naiveBayesClassifier():
    """Label every line of the test file as English, Italian or French.

    Trains the three language models with ``trainClassifier`` and scores each
    line from ``readTestFile`` under every model. A line's score starts at
    the log of a uniform prior (1/3) and adds one log term per adjacent-word
    bigram: the smoothed count divided by the first word's count when the
    bigram was seen in training, otherwise the model's unseen probability
    stored under key ``0``.

    Returns:
        List of label strings ("English", "Italian" or "French"), one per
        test line, in the order the lines were read.
    """
    eng_bigrams, ital_bigrams, fra_bigrams, eng_words, ital_words, fra_words = trainClassifier()

    # Listed in tie-breaking order: on equal scores the earlier entry wins.
    models = [
        ("English", eng_bigrams, eng_words),
        ("Italian", ital_bigrams, ital_words),
        ("French", fra_bigrams, fra_words),
    ]
    logPrior = math.log(1.0 / 3.0)

    classifiedLabels = []

    for line in readTestFile():
        tokens = line.split()
        linePairs = list(zip(tokens, tokens[1:]))

        bestLabel = None
        bestScore = None
        for label, bigramTable, wordTable in models:
            score = logPrior
            for pair in linePairs:
                if pair in bigramTable:
                    score += math.log(bigramTable[pair] / wordTable[pair[0]])
                else:
                    score += math.log(bigramTable[0])
            # Strict comparison keeps the first of several equal best scores.
            if bestScore is None or score > bestScore:
                bestLabel = label
                bestScore = score

        classifiedLabels.append(bestLabel)

    return classifiedLabels



# In [59]:
def test_accuracy():
    """Classify the test set, write the predictions out, and return the accuracy.

    Predictions come from ``naiveBayesClassifier``. The expected labels are
    read from '../Data/Validation/labels.sol', where a leading number and a
    space are removed from each line. Every prediction is written on its own
    line to '../Data/Output/wordLangId2.out'.

    Returns:
        Fraction of predictions equal to the expected label at the same
        position, as a float; 0.0 when there are no predictions.

    Raises:
        IndexError: if the label file has fewer lines than there are
            predictions.
    """
    predicted_list = naiveBayesClassifier()

    # The newline is optional so that a final line without a trailing
    # newline still has its leading number removed.
    labelPattern = re.compile(r'[0-9]+ (.*)\n?')

    actual_list = []
    with open('../Data/Validation/labels.sol', 'r') as file:
        for line in file:
            actual_list.append(labelPattern.sub(r'\1', line))

    correct = 0.0

    with open('../Data/Output/wordLangId2.out', 'w') as file:
        for i, predicted in enumerate(predicted_list):
            file.write(predicted + '\n')
            if predicted == actual_list[i]:
                correct += 1.0

    # Avoid dividing by zero when the test file produced no predictions.
    if not predicted_list:
        return 0.0

    return correct / len(predicted_list)


# Run the full pipeline: classify the test set, write the predictions file,
# and evaluate against the validation labels.
test_accuracy()

# NOTE(review): the bare float below appears to be the notebook's recorded
# output for the call above, not a statement with any effect; confirm.
0.9833333333333333