In [1]:
from nltk.util import ngrams
def generate_ngrams(tokens):

    generated_ngrams = []

    for token in tokens:
        
        for x in range(1, 6): # generate N-grams, for N=1 to 5
            xngrams = ngrams(token, x)

            for xngram in xngrams:
                # convert ('E', 'X', 'T', ' ') to 'EXT '
                ngram = ''.join(xngram)
                generated_ngrams.append(ngram)

    return generated_ngrams

In [2]:
from nltk.tokenize import RegexpTokenizer
import operator
def calculate_ngram_occurrences(text):

    tokenizer = RegexpTokenizer("[a-zA-Z'`éèî]+")
    tokens = tokenizer.tokenize(text)
    ngrams_list = generate_ngrams(tokens)

    ngrams_statistics = {}

    for ngram in ngrams_list:
        if ngram not in ngrams_statistics:
            ngrams_statistics.update({ngram:1})
        else:
            ngram_occurrences = ngrams_statistics[ngram]
            ngrams_statistics.update({ngram:ngram_occurrences+1})

    ngrams_statistics_sorted = sorted(ngrams_statistics.items(),\
                                      key=operator.itemgetter(1),\
                                      reverse=True)[0:300]

    return ngrams_statistics_sorted

In [3]:
text_eng = open('data/eng.txt', mode='r').read()
text_frn = open('data/frn.txt', mode='r').read()
text_eng_train = text_eng[:int(0.8*len(text_eng))]
text_eng_test = text_eng[int(0.8*len(text_eng)):]
text_frn_train = text_frn[:int(0.8*len(text_frn))]
text_frn_test = text_frn[int(0.8*len(text_frn)):]

In [4]:
# Create English profile
profile_ngrams_sorted = calculate_ngram_occurrences(text_eng_train)
fd = open('data/ngrams-eng.dat', mode='w')
for ngram in profile_ngrams_sorted:
    fd.write('%s\t%s\n' % (ngram[0], ngram[1]))
fd.close()



In [5]:
# Create French profile
profile_ngrams_sorted = calculate_ngram_occurrences(text_frn_train)
fd = open('data/ngrams-frn.dat', mode='w')
for ngram in profile_ngrams_sorted:
    fd.write('%s\t%s\n' % (ngram[0], ngram[1]))
fd.close()



In [6]:
def compare_ngram_frequency_profiles(category_profile, document_profile):

    document_distance = 0

    # convert [['eas ', 487], ['going', 437], ...] to ['eas', 'going', ...]
    category_ngrams_sorted = [ngram[0] for ngram in category_profile]
    document_ngrams_sorted = [ngram[0] for ngram in document_profile]

    maximum_out_of_place_value = len(document_ngrams_sorted)

    for ngram in document_ngrams_sorted:
        # pick up index position of ngram
        document_index = document_ngrams_sorted.index(ngram)
        try:
            # check if analyzed ngram exists in pre-computed category
            category_profile_index = category_ngrams_sorted.index(ngram)
        except ValueError:
            #If an N-gram is not in the category profile,
            #it takes some maximum out-of-place value.
            category_profile_index = maximum_out_of_place_value

        distance = abs(category_profile_index-document_index)
        document_distance+=distance

    return document_distance

In [7]:
def guess_language(raw_text):
    
    languages_ratios = {}
    languages_statistics = {}

    ngram_statistics = open('data/ngrams-eng.dat', mode='r').readlines()
    ngram_statistics = map(str.rstrip, ngram_statistics) # remove edge trailing
    languages_statistics.update({'1':ngram_statistics})

    ngram_statistics = open('data/ngrams-frn.dat', mode='r').readlines()
    ngram_statistics = map(str.rstrip, ngram_statistics) # remove edge trailing
    languages_statistics.update({'0':ngram_statistics})

    for language, ngrams_statistics in languages_statistics.items():
        language_ngram_statistics = calculate_ngram_occurrences(raw_text)
        distance = compare_ngram_frequency_profiles(ngrams_statistics, language_ngram_statistics)

        languages_ratios.update({language:distance})

    nearest_language = min(languages_ratios, key=languages_ratios.get)

    return nearest_language

In [8]:
import nltk.data

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentences_eng_test = tokenizer.tokenize(text_eng_test)
sentences_eng_test

tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
sentences_frn_test = tokenizer.tokenize(text_frn_test)
sentences_frn_test

print(sentences_eng_test)
print(sentences_frn_test)

['n or out of wedlock, shall enjoy the same social protection.', 'Article 26 1.', 'Everyone has the right to education.', 'Education shall be free, at least in the elementary and fundamental stages.', 'Elementary education shall be compulsory.', 'Technical and professional education shall be made generally available and higher education shall be equally accessible to all on the basis of merit.', '2.', 'Education shall be directed to the full development of the human personality and to the strengthening of respect for human rights and fundamental freedoms.', 'It shall promote understanding, tolerance and friendship among all nations, racial or religious groups, and shall further the activities of the United Nations for the maintenance of peace.', '3.', 'Parents have a prior right to choose the kind of education that shall be given to their children.', 'Article 27 1.', 'Everyone has the right freely to participate in the cultural life of the community, to enjoy the arts and to share in s

In [9]:
#removing sentences like '1.', '2.' etc.
sentences_eng_test = [x for x in sentences_eng_test if len(x) > 2]
sentences_frn_test = [x for x in sentences_frn_test if len(x) > 2]

X_test = sentences_eng_test + sentences_frn_test
y_true = ([1] * len(sentences_eng_test)) + ([0] * len(sentences_frn_test)) #1 for English, 0 for French
print(len(X_test))
print(len(y_true))

36
36


In [10]:
y_pred = []
for line in X_test:
    y_pred.append(int(guess_language(line)))



In [11]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_true, y_pred))

0.888888888889
