# La détection de langue

Nous utilisons ici la librairie langid:
    
https://pypi.org/project/langid/

## Imports

In [3]:
import os
from collections import defaultdict

import langid
import pycountry

## Forcer l'algorithme à ne détecter que du Français et du Néerlandais

In [4]:
langid.set_languages(['fr', 'nl'])

## Lister tous les documents

In [5]:
root = "../data/txt/"
txts = os.listdir(root)
print(f"{len(txts)} TXT files found")

2827 TXT files found


## Détecter la langue pour tous les documents

Nous allons lire chaque fichier, détecter la langue, et incrémenter `lang_dict` lorsqu'une langue est détectée.

**Important** : pour détecter les langues sur tous les documents, mettez `limit = None` ci-dessous.

In [6]:
limit = 2000
# limit = None

In [7]:
lang_dict = defaultdict(int)
txts = txts[:limit] if limit else texts

In [8]:
for i, txt in enumerate(sorted(txts)):
    if txt.endswith("txt"):
        if i % 50 == 0:
            print(f'{i} document(s) processed...')
        text = open(os.path.join(root, txt)).read()
        text_length = len(text)
        if text_length > 20:
            lang, conf = langid.classify(text)
            lang_dict[lang] += 1
        else:
            print(f"{txt} contains only {text_length} characters, treating as unknown")
            lang_dict['n/a'] += 1
print("Done")

50 document(s) processed...
100 document(s) processed...
150 document(s) processed...
200 document(s) processed...
Bxl_1869_Tome_I1_Part_4.txt contains only 4 characters, treating as unknown
250 document(s) processed...
300 document(s) processed...
350 document(s) processed...
400 document(s) processed...
450 document(s) processed...
500 document(s) processed...
550 document(s) processed...
600 document(s) processed...
650 document(s) processed...
700 document(s) processed...
750 document(s) processed...
Bxl_1903_Tome_I2_2_Part_12.txt contains only 19 characters, treating as unknown
800 document(s) processed...
850 document(s) processed...
900 document(s) processed...
950 document(s) processed...
1000 document(s) processed...
1050 document(s) processed...
1100 document(s) processed...
1150 document(s) processed...
Bxl_1925_Tome_II1_2_Part_8.txt contains only 9 characters, treating as unknown
1200 document(s) processed...
1250 document(s) processed...
Bxl_1929_Tome_I_Part_10.txt contain

## Afficher le nombre de documents par langue

In [9]:
for lang_code, nb_docs in lang_dict.items():
    language = pycountry.languages.get(alpha_2=lang_code)
    try:
        lang_name = language.name
    except AttributeError:
        lang_name = language
    print(f"{lang_name}\t{nb_docs}")

French	1989
None	9
Dutch	1
