In [1]:
import io
import os
import re
import sys
import unicodedata

import nltk

In [6]:
def parse_dictionnaries(dictionnaries_folder, min_word_length=3):
    """Build a per-language vocabulary from the text files in a folder.

    Every regular, non-hidden file directly inside ``dictionnaries_folder``
    is read as the corpus of one language. The language name is the part of
    the file name before its first ``.``. The text is decoded as UTF-8,
    NFKD-normalised and reduced to ASCII (which strips accents and drops any
    character with no ASCII decomposition), then split into runs of word
    characters. Tokens made only of letters are lower-cased, de-duplicated
    and sorted.

    Parameters
    ----------
    dictionnaries_folder : str
        Path of the folder holding one text file per language.
    min_word_length : int, optional
        Shortest word kept in the vocabulary. The default of 3 drops words
        of one or two letters.

    Returns
    -------
    tuple of (dict, dict)
        ``languages_index`` maps a position (0, 1, ...) to a language name,
        following the alphabetical order of the file names.
        ``languages_dict`` maps a language name to its sorted list of unique
        lower-case ASCII words.

    Raises
    ------
    OSError
        If the folder cannot be listed or a file cannot be read.
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    # os.listdir() gives no ordering guarantee, so sort to keep the index
    # reproducible across machines and runs. Sub-directories (for example
    # .ipynb_checkpoints) and dot-files are skipped: the former cannot be
    # opened as text and the latter would yield an empty language name.
    filenames = sorted(
        name for name in os.listdir(dictionnaries_folder)
        if not name.startswith('.')
        and os.path.isfile(os.path.join(dictionnaries_folder, name))
    )

    # Compiled once instead of rebuilding a tokenizer for every file. The
    # text is pure ASCII by the time it is matched, so this is equivalent to
    # tokenizing with the same pattern through NLTK's RegexpTokenizer.
    word_pattern = re.compile(r"\w+")

    languages_index = dict()
    languages_dict = dict()
    for i, filename in enumerate(filenames):
        language = filename.split('.')[0]
        languages_index[i] = language

        # io.open decodes explicitly as UTF-8 on both Python 2 and Python 3,
        # independently of the platform's default encoding.
        path = os.path.join(dictionnaries_folder, filename)
        with io.open(path, encoding='utf-8') as f:
            data = f.read()

        # NFKD splits accented letters into base letter + combining mark;
        # encoding to ASCII with 'ignore' then drops the marks. Decoding back
        # keeps the value a text string so the regex works on Python 3.
        ascii_data = (
            unicodedata.normalize('NFKD', data)
            .strip()
            .encode('ascii', 'ignore')
            .decode('ascii')
        )

        # isalpha() rejects tokens containing digits or underscores.
        unique_words = set(
            token.lower()
            for token in word_pattern.findall(ascii_data)
            if token.isalpha()
        )

        languages_dict[language] = sorted(
            word for word in unique_words if len(word) >= min_word_length
        )

    return languages_index, languages_dict

In [7]:
# Build the position -> language index and the language -> word-list dict from the files under "books/".
languages_index, languages_dict = parse_dictionnaries("books/")

In [9]:
# Display the position -> language-name mapping.
languages_index

{0: 'afrikaans',
 1: 'arapaho',
 2: 'breton',
 3: 'calo',
 4: 'catalan',
 5: 'cebuano',
 6: 'czech',
 7: 'danish',
 8: 'dutch',
 9: 'english',
 10: 'esperanto',
 11: 'estonian',
 12: 'finnish',
 13: 'french',
 14: 'frisian',
 15: 'friulian',
 16: 'gaelic',
 17: 'galician',
 18: 'german',
 19: 'hugarian',
 20: 'icelandic',
 21: 'interlingua',
 22: 'inuktitut',
 23: 'irish',
 24: 'italian',
 25: 'kashubian',
 26: 'latin',
 27: 'lloko',
 28: 'maori',
 29: 'mayan',
 30: 'north-american-indian',
 31: 'norwegian',
 32: 'occitan',
 33: 'polish',
 34: 'portugese',
 35: 'romanian',
 36: 'sanskrit',
 37: 'slovenian',
 38: 'spanish',
 39: 'swedish',
 40: 'tagalog',
 41: 'welsh'}

In [8]:
# Display the language -> word-list mapping (this prints the whole dict, not a sample).
languages_dict

{'afrikaans': ['aakligheid',
  'aambeeldklanke',
  'aambeibossie',
  'aan',
  'aanbeveel',
  'aanbidder',
  'aanbidders',
  'aanbidding',
  'aanbied',
  'aanblik',
  'aanbod',
  'aanbreek',
  'aanbrengen',
  'aanbrenging',
  'aanbrengt',
  'aanbring',
  'aand',
  'aandacht',
  'aandag',
  'aande',
  'aandeel',
  'aandete',
  'aandoening',
  'aandoeningloos',
  'aandoenlik',
  'aandra',
  'aandrang',
  'aandrif',
  'aandring',
  'aandskemering',
  'aandskoonheid',
  'aandui',
  'aanduiding',
  'aanduidings',
  'aaneenryging',
  'aaneenskakeling',
  'aaneenstrengeling',
  'aangaan',
  'aangaande',
  'aangaat',
  'aangebied',
  'aangeblaas',
  'aangebore',
  'aangeboren',
  'aangedaan',
  'aangedik',
  'aangedring',
  'aangedroom',
  'aangedui',
  'aangeer',
  'aangegee',
  'aangegooi',
  'aangegryp',
  'aangehaal',
  'aangehaalde',
  'aangehits',
  'aangeklede',
  'aangeklop',
  'aangekomen',
  'aangekondig',
  'aangekweek',
  'aangele',
  'aangeleerde',
  'aangeleg',
  'aangelegde',
  '

In [10]:
#store dict as json
import json

In [11]:
# Write the per-language word lists to 'dictionnaries.json', pretty-printed with keys in sorted order.
with open('dictionnaries.json', 'w') as fp:
    json.dump(languages_dict, fp, sort_keys=True, indent=4)

In [12]:
# Read 'dictionnaries.json' back and parse it into `data`.
with open('dictionnaries.json', 'r') as fp:
    data = json.load(fp)