# Dump lemmas from Wiktionary ES

This notebook parses the XML dump downloaded from Wiktionary in order to generate lemmatization rules for the Spanish language in spaCy.

In [1]:
from lxml import etree
import re
import json

Regular expressions which will be used:

In [2]:
# Section header whose title is a single template, e.g. "== {{adjetivo|es}} ==".
# Group 1 is the template content; it is used as the category (noun, verb, adverb...) of the entry.
category_regex = re.compile(r"== ?\{\{([^}]+)\}\} ?==")
# Language tags such as {{ES||1}} or {{ES-EN}}. Group 1 is the two-letter code before the "||" or "-".
multiple_lang_regex = re.compile(r"\{\{([a-zA-Z]{2})(?:\|\||-)[a-zA-Z0-9]{1,2}\}\}")
# Language section headers such as "== {{lengua|es}} ==". Group 1 is everything between
# "lengua|" and the closing braces.
lengua_regex = re.compile(r"== ?{{lengua\|([^}]+)}} ?==")
# A colon, an optional space and a template, e.g. ": {{plural|casa}}". Group 1 is the template
# content; these templates describe variations from a base form (plurals, verb conjugations...).
acception_regex = re.compile(r"\: ?{{([^}]+)}}")

Parse the [XML dump](https://dumps.wikimedia.org/eswiktionary/latest/) with the Wiktionary content:

In [3]:
# Parse the complete dump into an in-memory element tree (this can need a lot of RAM for a
# full dump). The path is relative, so the file is expected in the current working directory.
tree = etree.parse('eswiktionary-latest-pages-articles.xml')
# Root element of the dump; the page loop further down iterates over its children.
root = tree.getroot()

All tags will be accessible using the following namespace:

In [4]:
# Prefix, in "{namespace-uri}" form, that has to be prepended to every tag name when searching
# the tree. The default namespace is the one registered under the None key of the root's nsmap.
default_namespace = root.nsmap.get(None)
# Without a default namespace the old expression produced the literal prefix "{None}", which
# silently matches no element at all; fall back to an empty prefix (plain tag names) instead.
NS = ("{%s}" % default_namespace) if default_namespace else ""
NS

'{http://www.mediawiki.org/xml/export-0.10/}'

This function will extract basic forms and variations from text within a language:

In [5]:
def process_lang(text):
    """Collect the templates of interest found in the text of one language section.

    Returns a new list holding every ``category_regex`` capture followed by every
    ``acception_regex`` capture, each group in order of appearance in ``text``.
    """
    return category_regex.findall(text) + acception_regex.findall(text)

This function will parse text content looking for different langs and extracting only the targeted lang:

In [6]:
def iterate_languages(text, regex, target_lang):
    """Find the section written in ``target_lang`` and extract its terms.

    ``regex`` recognises a language header and captures the language code in group 1.
    A section runs from the end of its header up to the start of the next header, or to
    the end of the text. Only the first section whose lower-cased code equals
    ``target_lang`` is handed to ``process_lang``; if there is none, ``[]`` is returned.
    """
    remaining = text
    header = regex.search(remaining)
    while header is not None:
        # Everything after the current header; the next header (if any) closes the section.
        body = remaining[header.end(0):]
        following = regex.search(body)
        section_end = len(body) if following is None else following.start(0)
        if header[1].lower() == target_lang:
            return process_lang(body[:section_end])
        # Continue from the next header onwards.
        remaining = body[section_end:]
        header = regex.search(remaining)
    return []

This function checks which kind of language tag the current document is using and processes its content:

In [7]:
def process_text(text, target_lang):
    """Extract the terms of ``target_lang`` from the wikitext of one page.

    Pages mark their language sections in one of two ways: "== {{lengua|xx}} ==" headers
    (``lengua_regex``) or tags like {{ES||1}} (``multiple_lang_regex``). The first style
    found in ``text`` decides how the page is split, with the "lengua" style taking
    precedence.

    Args:
        text: Wikitext of the page. May be ``None`` or empty (an empty <text/> element has
            no text), in which case nothing is extracted.
        target_lang: Lower-case language code to look for, e.g. ``'es'``.

    Returns:
        The list produced by ``iterate_languages``, or ``[]`` when the page is empty or
        contains no language marker.
    """
    # The regexes cannot search None, so bail out early on empty pages.
    if not text:
        return []
    for header_regex in (lengua_regex, multiple_lang_regex):
        if header_regex.search(text):
            return iterate_languages(text, header_regex, target_lang)
    return []
    

This loop walks through the entire XML dump and extracts the useful terms of every page:

In [8]:
# Maps page title -> list of templates extracted from the Spanish section of that page.
results = {}

# Title prefixes of pages that are not dictionary entries; these page entries are useless here.
skipped_prefixes = (
    'MediaWiki:',
    'Plantilla:',
    'Wikcionario:',
    'Archivo:',
    'Categoría:',
    'Módulo:',
    'Apéndice:',
)

# iterfind yields the pages lazily instead of building a list of every page first.
for page in root.iterfind(NS+'page'):
    titulo = page.find(NS+'title').text
    if titulo.startswith(skipped_prefixes):
        continue

    revision = page.find(NS+'revision')
    contenido = revision.find(NS+'text') if revision is not None else None
    # A missing or empty <text/> element has no text to search (its .text is None), and
    # passing None to the regex helpers would raise a TypeError; skip such pages.
    if contenido is None or not contenido.text:
        continue

    result = process_text(contenido.text, 'es')
    # Only keep pages where something was actually extracted.
    if result:
        results[titulo] = result
    
    

Get the different kinds of terms extracted from the XML:

In [9]:
# Distinct template names found in the results: the part of each extracted
# template that precedes its first "|".
types = {
    template.split('|')[0]
    for templates in results.values()
    for template in templates
}
display(types)


{'AFI',
 'Aeronáutica',
 'América',
 'Andalucía',
 'Argentina',
 'Arqueología',
 'Arquitectura',
 'Arte',
 'Cantabria',
 'Centroamérica',
 'Chile',
 'Costa Rica',
 'DLE',
 'DRAE',
 'Ecuador',
 'El Salvador',
 'España',
 'Geometría',
 'Honduras',
 'Matemáticas',
 'Milicia',
 'Mitología',
 'México',
 'Náutica',
 'Ocupaciones',
 'Perspective slope',
 'Perú',
 'Química',
 'Tauromaquia',
 'Telecomunicaciones',
 'Venezuela',
 'Zoología',
 'abreviatura',
 'adjetivo',
 'adjetivo cardinal',
 'adjetivo de padecimiento',
 'adjetivo de sustantivo',
 'adjetivo de verbo',
 'adjetivo demostrativo',
 'adjetivo indefinido',
 'adjetivo ordinal',
 'adjetivo posesivo',
 'adjetivo_de_sustantivo',
 'adverbio',
 'adverbio comparativo',
 'adverbio de adjetivo',
 'adverbio de afirmación',
 'adverbio de cantidad',
 'adverbio de cantidad ',
 'adverbio de duda',
 'adverbio de lugar',
 'adverbio de modo',
 'adverbio de negación',
 'adverbio de orden',
 'adverbio de sustantivo',
 'adverbio de tiempo',
 'adverbio in

Here we group the term categories that mark a derived form by the POS tag they belong to:

In [10]:
# Template names that mark an entry as a derived form, grouped by part of speech.
# They are compared against the raw, unstripped first field of each extracted template,
# so spelling, underscores and trailing spaces are significant.

# Adjective forms (including superlatives and participles).
derivative_adjective_tags = [
    'f.adj2', 'forma adjetivo', 'forma adjetivo 2',
    'superlativo', 'forma participio',
]

# Pronoun forms.
derivative_pron_tags = ['forma pronombre']

# Noun forms. The participle and adjective-form tags are listed here as well, so
# those forms also end up in the noun exceptions.
derivative_noun_tags = [
    'f.s.p', 'forma diminutivo',
    'forma sustantivo', 'forma sustantivo plural', 'forma_sustantivo plural ',
    'plural',
    'forma participio', 'forma adjetivo', 'forma adjetivo 2', 'f.adj2',
]

# Verb forms.
derivative_verb_tags = ['gerundio', 'participio', 'forma verbo', 'f.v']

# Adverbs derived from nouns or adjectives.
derivative_adverb_tags = ['adverbio de sustantivo', 'adverbio de adjetivo']

Some associations _key_ : _value_ are transitive, so the entry _value_ : _other_ may exist. This function assigns to the entry _key_ the final value _other_:

In [11]:
def resolve_derived_terms(dictionary):
    """Collapse chains of derivations so that every key maps to its final term.

    If ``key -> value`` and ``value -> other`` are both present, ``key`` is re-pointed to
    ``other`` (and so on along the chain). The dictionary is modified in place; nothing
    is returned.

    A chain ends when the current term is not a key, maps to a falsy value, or maps to a
    term already seen while resolving this key. The last condition covers both entries
    that map to themselves and longer cycles (``a -> b``, ``b -> a``), which previously
    made this function loop forever.

    Args:
        dictionary: Mapping of derived term -> base term. Values must be hashable.
    """
    for key, value in dictionary.items():
        # Terms already walked through for this key, used to detect cycles.
        visited = {key, value}
        next_value = dictionary.get(value)
        while next_value and next_value not in visited:
            visited.add(next_value)
            value = next_value
            next_value = dictionary.get(value)
        # Only existing keys are reassigned, so the dict size is stable during iteration.
        dictionary[key] = value

Build list of exceptions for adjectives, pronouns, nouns, verbs and adverbs:

In [12]:
# Exception tables, one per part of speech: derived form (page title) -> base term.
adjectives_irreg = {}
pronouns_irreg = {}
nouns_irreg = {}
verbs_irreg = {}
adverbs_irreg = {}

# Accept only terms made of word characters (letters, digits, underscore) and dashes.
# The pattern is a raw string because '\w' inside a plain literal is an invalid escape sequence.
is_valid_word_regex = re.compile(r'^[\w-]+$')

for key, values in results.items():
    # Titles containing a space are skipped.
    if " " in key:
        continue

    is_noun = False
    is_adjective_form = False
    for value in values:
        # Each extracted template looks like "name|arg1|arg2|...".
        tokens = value.split('|')
        # Check if the considered term is a pure noun
        if tokens[0].startswith('sustantivo'):
            is_noun = True

        # A template without arguments cannot name a base term.
        if len(tokens) <= 1:
            continue

        category = None
        term = None
        # The generic "forma" template states its part of speech in a "tipo=" argument.
        form = tokens[0] == 'forma'

        for token in tokens[1:]:
            token = token.strip()  # Remove leading and trailing whitespace
            if 'tipo=' in token:
                category = token.split('=')[1]
            elif is_valid_word_regex.match(token):
                # The first argument that looks like a word is taken as the base term;
                # arguments after it (including a later "tipo=") are not inspected.
                term = token
                break

        if not term:
            continue

        if (form and category == 'adjetivo') or tokens[0] in derivative_adjective_tags:
            is_adjective_form = True
            adjectives_irreg[key] = term
        if (form and category == 'pronombre') or tokens[0] in derivative_pron_tags:
            pronouns_irreg[key] = term
        if (form and category == 'sustantivo') or tokens[0] in derivative_noun_tags:
            nouns_irreg[key] = term
        if (form and category == 'verbo') or tokens[0] in derivative_verb_tags:
            verbs_irreg[key] = term
        if (form and category == 'adverbio') or tokens[0] in derivative_adverb_tags:
            adverbs_irreg[key] = term

    # A word that is a noun in its own right and also an adjective form must not be
    # lemmatized to the adjective when tagged as a noun: drop it from the noun exceptions.
    if is_noun and is_adjective_form and key in nouns_irreg:
        del nouns_irreg[key]
            

In Wiktionary, some adjective forms can also act as nouns, but this is not reported. 

For example in: *Los pobres vivían en la calle.* 
    
The word "pobres" is an adjective form which is acting as a noun. This is why adjective forms are included in noun categories.

However, some adjective forms are also valid nouns. Here are some examples:

* "colina" can be a noun (hill in English) and also the feminine of the adjective "colín", said of an animal whose tail has been cut.
* "trucha" can be a noun (fish) and also the feminine of the adjective "trucho", which means false or fraudulent in Argentina and Uruguay.

In these cases we delete the adjective form from our list of noun exceptions, giving precedence to the noun.

Manual fixing of wrong terms:

In [13]:
# Wiktionary gives 'kk' as the singular form of 'soldados', which is not correct
nouns_irreg['soldados'] = 'soldado'
# 'último' is not a derived form of 'ulterior', so drop that mapping (no error if it is absent).
# pop() is the last expression of the cell, so the removed value is what the cell displays.
adjectives_irreg.pop('último', None)

'ulterior'

Resolve transitive derivations:

In [14]:
# Collapse derivation chains in every exception table (each one is updated in place).
for exceptions in (adjectives_irreg, pronouns_irreg, verbs_irreg, nouns_irreg, adverbs_irreg):
    resolve_derived_terms(exceptions)

Cast dictionary values from string to list:

In [15]:
def _wrap_values(exceptions):
    """Return a copy of ``exceptions`` in which every value is a list.

    Values that are already lists are kept as they are, so running this cell more than
    once does not nest them as ``[[value]]``.
    """
    return {
        key: value if isinstance(value, list) else [value]
        for key, value in exceptions.items()
    }


adjectives_irreg = _wrap_values(adjectives_irreg)
pronouns_irreg = _wrap_values(pronouns_irreg)
verbs_irreg = _wrap_values(verbs_irreg)
nouns_irreg = _wrap_values(nouns_irreg)
adverbs_irreg = _wrap_values(adverbs_irreg)

Generate the JSON file with the exceptions:

In [16]:
# Output path, relative to the current working directory.
file = "lemma_exc.json"

# Determiner exceptions are written by hand; they are not extracted from the dump.
determiner_exceptions = {
    "los": ["el"],
    "unos": ["un"],
    "estos": ["este"],
    "esos": ["ese"],
    "aquellos": ["aquel"],
    "alguno": ["algún"],
    "varios": ["varios"],
    "muchísimo": ["mucho"],
    "muchísima": ["mucha"],
    "muchísimos": ["mucho"],
    "muchísimas": ["mucha"],
}

# One table per part of speech.
# NOTE(review): the adverb table is stored under 'adverb'; confirm that this is the key the
# consumer of the file looks up.
content = {
    'adj': adjectives_irreg,
    'adverb': adverbs_irreg,
    'noun': nouns_irreg,
    'verb': verbs_irreg,
    'pron': pronouns_irreg,
    'det': determiner_exceptions,
}

# ensure_ascii=False keeps accented characters readable in the UTF-8 encoded file;
# indent=1 gives one space of indentation per nesting level.
with open(file, 'w', encoding='utf-8') as destination:
    json.dump(content, destination, sort_keys=True, indent=1, ensure_ascii=False)

Compare the loaded lemmas against the LOOKUP table in spaCy:

In [17]:
import json
import pkg_resources

# Load the lookup table that ships inside the installed spaCy package.
with pkg_resources.resource_stream('spacy.lang.es.lemmatizer', 'lemma_lookup.json') as infile:
    LOOKUP = json.load(infile)

# Compare the keys of both tables.
set_results = set(results)
set_look = set(LOOKUP)
only_in_lookup = set_look - set_results
only_in_results = set_results - set_look

print("Lemmas in LOOKUP table which have not been loaded:", len(only_in_lookup))
print("Lemmas loaded which are not in LOOKUP table:", len(only_in_results))
print("Number of lemmas in LOOKUP:", len(set_look))
print("Number of lemmas loaded:", len(set_results))
# To inspect the entries themselves rather than the counts, display only_in_lookup or only_in_results.

Lemmas in LOOKUP table which have not been loaded: 56674
Lemmas loaded which are not in LOOKUP table: 377509
Number of lemmas in LOOKUP: 491547
Number of lemmas loaded: 812382
