In [1]:
import nltk
import numpy
from nltk.util import ngrams
from nltk.corpus import udhr  
import string

In [3]:
#loading the data
eng = 0
fre = 1
ita = 2
spa = 3

nltk.download('udhr')

lang = [eng, fre, ita, spa]
languages = ["English", "French", "Italian", "Spanish"]

english = udhr.raw('English-Latin1') 
french = udhr.raw('French_Francais-Latin1') 
italian = udhr.raw('Italian_Italiano-Latin1') 
spanish = udhr.raw('Spanish_Espanol-Latin1')  

[nltk_data] Downloading package udhr to /home/dhruvdh/nltk_data...
[nltk_data]   Unzipping corpora/udhr.zip.


In [4]:
#calulate probabilty distribution from frequency distribution
def pdf(fdist):
    total = 0
    for i in fdist:
        total = total + fdist[i]
    pdf = {}
    for i in fdist:
        pdf[i] = fdist[i]/total
    return pdf

#remove punctuations
def remove_punct(text):
    corpus = []
    for character in text:
        if character not in string.punctuation:
            corpus.append(character)
    return ''.join(corpus)
    
# remove punctuations and convert to lower case 
def preprocess(text):
    text = text.strip()
    text = remove_punct(text)
    text = str.lower(text)
    return text

In [5]:
data = [preprocess(lang) for lang in [english, french, italian, spanish]]

english = data[eng]
french = data[fre]
italian = data[ita]
spanish = data[spa]

In [6]:
#train and test data for all languages
train = [english[0:1000],
         french[0:1000],
         italian[0:1000],
         spanish[0:1000]]

test = [udhr.words('English-Latin1')[0:1000],
        udhr.words('French_Francais-Latin1')[0:1000],
        udhr.words('Italian_Italiano-Latin1')[0:1000], 
        udhr.words('Spanish_Espanol-Latin1')[0:1000]]

In [7]:
#preprocess the test data
test = [[preprocess(w) for w in test_set] for test_set in test]

In [8]:
#ngram models computed 
Unigrams = [ngrams(train[language], 1) for language in lang]
Bigrams = [ngrams(train[language], 2, ) for language in lang]
Trigrams = [ngrams(train[language], 3 ) for language in lang] 

In [9]:
#Calculating frequency distribution
fdist_Uni = [nltk.FreqDist(unigram) for unigram in Unigrams]
fdist_Bi = [nltk.FreqDist(bigram) for bigram in Bigrams]
fdist_Tri = [nltk.FreqDist(trigram) for trigram in Trigrams]

In [10]:
#Calculating probability distribution
pdist_Uni = [pdf(fdist) for fdist in fdist_Uni]
pdist_Bi = [pdf(fdist) for fdist in fdist_Bi]
pdist_Tri = [pdf(fdist) for fdist in fdist_Tri]
# print("*******\nUnigram probability:\n",pdist_Uni)
# print("*******\nUnigram probability:\n",pdist_Bi)
# print("*******\nUnigram probability:\n",pdist_Tri)

In [11]:
#calulating the unigram model probabilty for a given language 
def unigram_probability(w, language):
    p = 1
    df = pdist_Uni[language]
    for character in w:
        key = (character,)
        p_key = df[key] if key in df else 0
        if p_key == 0:
            return 0
        p = p * p_key
    return p

In [12]:
#calulating the bigram model probabilty for a given language 
def bigram_probabilty(w, language):
    length = len(w)
    if length < 2:
             return unigram_probability(w, language)
    p = 1
    df = pdist_Bi[language]
    for i in range(length+1):
        if i == 0:
            key = (' ',w[i])
        elif i == length:
            key == (w[i-1], ' ')
        else:
            key = (w[i-1],w[i])
        p_key = df[key] if key in df else 0
        if p_key == 0:
            return 0
        prev = ' ' if i == 0 else w[i-1]
        p = p * p_key / unigram_probability(str(prev), language)
    
    return p;

In [13]:
#calulating the trigram model probabilty for a given language 
def trigram_probability(w, language):
    length = len(w)
    if length < 3:
        return bigram_probabilty(w, language)
    p = 1
    df = pdist_Tri[language]
    for i in range(1, length+1):
        if i == 1:
            key = (' ', w[i-1], w[i])
        elif i == length:
            key = (w[i-2], w[i-1], ' ')
        else:
            key = (w[i-2], w[i-1], w[i])
        
        p_key = df[key] if key in df else 0
        
        if p_key == 0:
            return 0
        key2 = (' ', w[i-1]) if i == 1 else (w[i-2], w[i-1])
        p_key2 = pdist_Bi[language][key2]
        p = p * p_key / p_key2
    
    return p

In [14]:
#Calculating accuracy
def predict(test_set, l1, l2):
    n = len(test_set)
    filename = languages[l1] + "_" + languages[l2] +"_" + str(n) + ".txt"
    umatches = 0
    bmatches = 0
    tmatches = 0
    with open(filename, 'w') as f:
        for word in test_set:
            p1 = unigram_probability(word, l1)
            p2 = unigram_probability(word, l2)
        
            p3 = bigram_probabilty(word, l1)
            p4 = bigram_probabilty(word, l2)
        
            p5 = trigram_probability(word, l1)
            p6 = trigram_probability(word, l2)
    
            if p1 >= p2:
                umatches = umatches + 1
        
            if p3 >= p4:
                bmatches = bmatches + 1
        
            if p5 >= p6:
                tmatches = tmatches + 1

    uni_probability = umatches * 100 / n
    bi_probability =  bmatches * 100 / n
    tri_probability = tmatches * 100 / n
    
    print("\n\nAccuracy: " + languages[l1] + " vs " + languages[l2] + "\n")
    print("Unigram: = {0:.5f}".format(uni_probability))
    print("Bigram: = {0:.5f}".format(bi_probability))
    print("Trigram: = {0:.5f}".format(tri_probability))   

In [15]:
print("\n*******TEST SETS*********")
#Question 1
predict(test[eng], eng, fre);
#Question 2
predict(test[spa], spa, ita);


*******TEST SETS*********


Accuracy: English vs French

Unigram: = 78.20000
Bigram: = 91.80000
Trigram: = 99.00000


Accuracy: Spanish vs Italian

Unigram: = 67.00000
Bigram: = 84.80000
Trigram: = 96.30000


As observed from the accuracies above, it is clear that the language Spanish vs Italian is harder to distinguish. This is because the characters of both the language are significantly different from each other.