In [1]:
import os
import re
import json
from langdetect import detect

In [2]:
# Training corpora, one file per language; each is turned into a
# letter-frequency profile by train_model.
ENGLISH_FILE = '../languageIdentificationData/training/English'
FRENCH_FILE = '../languageIdentificationData/training/French'
ITALIAN_FILE = '../languageIdentificationData/training/Italian'
# File whose lines are classified one by one.
TEST_FILE = '../languageIdentificationData/test'
# Expected labels; count_matches parses each line as "<line number> <language>".
SOLUTION_FILE = '../languageIdentificationData/solution'

In [3]:
def dist_freq(content):
    """Return the percentage share of each ASCII letter found in ``content``.

    The text is lower-cased first and only the characters 'a' to 'z' are
    counted; digits, punctuation, whitespace and accented letters are ignored.

    Args:
        content: Text to analyse.

    Returns:
        dict mapping every letter that occurs at least once to its percentage
        (0-100) of all counted letters, with keys in alphabetical order.
        Empty when the text contains no ASCII letters.
    """
    counts = {}
    for char in content.lower():
        # Iteration yields single characters, so this range test selects
        # exactly the letters a-z.
        if 'a' <= char <= 'z':
            counts[char] = counts.get(char, 0) + 1

    total = sum(counts.values())
    frequencies = {}
    for char in sorted(counts):
        frequencies[char] = counts[char] / total * 100
    return frequencies

In [4]:
def train_model(file):
    """Build the letter-frequency profile of a training corpus.

    Args:
        file: Path to the corpus. It is resolved to an absolute path and the
            file is read with the 'iso8859' codec.

    Returns:
        The dict returned by ``dist_freq`` for the whole file content.
    """
    corpus_path = os.path.abspath(file)
    with open(corpus_path, 'r', encoding='iso8859') as corpus:
        text = corpus.read()
    return dist_freq(text)


In [5]:
def identify_lang(file, lang_freq):
    """Guess the language of every line of a file from its letter frequencies.

    For each line, the letter-frequency profile is computed with ``dist_freq``
    and compared with every profile in ``lang_freq``. The language whose
    profile has the smallest sum of absolute per-letter differences wins; on
    a tie the language that comes first in ``lang_freq`` is kept.

    Args:
        file: Path of the text file to classify, read with the 'iso8859' codec.
        lang_freq: Mapping of language name to its letter-frequency profile.

    Returns:
        dict mapping 1-based line numbers to the chosen language name
        (``None`` for every line when ``lang_freq`` is empty).
    """
    def profile_distance(first, second):
        # Letters absent from one profile count as frequency 0 there.
        return sum(abs(first.get(letter, 0) - second.get(letter, 0)) for letter in set(first) | set(second))

    predictions = {}
    with open(os.path.abspath(file), 'r', encoding='iso8859') as handle:
        for line_number, line in enumerate(handle, start=1):
            line_freq = dist_freq(line.lower())
            # min() returns the first minimal element, matching a strict
            # "smaller than the best so far" scan.
            predictions[line_number] = min(
                lang_freq,
                key=lambda lang: profile_distance(line_freq, lang_freq[lang]),
                default=None,
            )
    return predictions


In [6]:
def count_matches(dictionary, file_name):
    """Count how many predicted labels agree with a solution file.

    Each non-blank line of the solution file is read as
    ``"<line number> <language>"``, split at the first space.

    Blank lines (for example a trailing empty line) are skipped instead of
    raising ``ValueError``. A line number missing from ``dictionary`` and a
    line with no language after the number both count as a mismatch.

    Args:
        dictionary: Mapping of 1-based line number (int) to predicted language.
        file_name: Path of the solution file.

    Returns:
        Number of lines whose language equals the prediction for that line.

    Raises:
        ValueError: If the part before the first space is not an integer.
    """
    matches = 0
    with open(file_name, 'r') as file:
        for line in file:
            entry = line.strip()
            if not entry:
                # Blank lines carry no label.
                continue
            number, _, language = entry.partition(' ')
            # .get() keeps a missing prediction from aborting the whole count.
            if dictionary.get(int(number)) == language:
                matches += 1
    return matches

In [7]:
def identify_lang_langdetect(file):
    """Classify every line of a file with the ``langdetect`` library.

    Codes 'en', 'fr' and 'it' are mapped to 'English', 'French' and
    'Italian'; any other detected code is reported as 'other'. Lines on which
    ``detect`` raises ``LangDetectException`` are also labelled 'other'
    instead of aborting the whole run.

    NOTE(review): the langdetect README describes detection as
    non-deterministic unless ``DetectorFactory.seed`` is set before the first
    call, so results may vary between runs; confirm whether a fixed seed is
    wanted.

    Args:
        file: Path of the text file to classify, read with the 'iso8859' codec.

    Returns:
        dict mapping 1-based line numbers to 'English', 'French', 'Italian'
        or 'other'.
    """
    # Imported here so the function does not depend on edits to the imports cell.
    from langdetect import LangDetectException

    map_lang = {'en': 'English', 'fr': 'French', 'it': 'Italian'}
    dict_line_lang = {}
    with open(os.path.abspath(file), 'r', encoding='iso8859') as f:
        for idx, line in enumerate(f, start=1):
            try:
                lang_detected = detect(line.lower())
            except LangDetectException:
                # Raised when the text has no detectable features,
                # e.g. blank or digits-only lines.
                lang_detected = None
            dict_line_lang[idx] = map_lang.get(lang_detected, 'other')
    return dict_line_lang

In [8]:
# Build one letter-frequency profile per language; the English, French,
# Italian order is kept because identify_lang resolves ties by order.
lang_freq = {
    language: train_model(corpus_path)
    for language, corpus_path in (
        ('English', ENGLISH_FILE),
        ('French', FRENCH_FILE),
        ('Italian', ITALIAN_FILE),
    )
}

# Classify the test file with both approaches.
dict_lines_lang = identify_lang(TEST_FILE, lang_freq)
dict_lines_lang_langdetect = identify_lang_langdetect(TEST_FILE)

# Score each set of predictions against the solution file.
own_matches = count_matches(dict_lines_lang, SOLUTION_FILE)
print(f'Matches: {own_matches}')
langdetect_matches = count_matches(dict_lines_lang_langdetect, SOLUTION_FILE)
print(f'Matches Langdetect: {langdetect_matches}')


Matches: 262
Matches Langdetect: 298


El algoritmo propio clasificó correctamente el 87,3 % de las líneas, mientras que la librería langdetect acertó en el 98,6 %.