In [1]:
import warnings
# Silence every warning for the rest of the session so the cell outputs stay clean.
# NOTE(review): this also hides deprecation notices raised by the libraries imported later.
warnings.filterwarnings('ignore')

In [2]:
import os
import requests
from pathlib import Path
from collections import defaultdict
import json

In [3]:
import treetaggerwrapper as ttpw

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

In [5]:
import lerobert.scraping as lrs
import lerobert.processing as lrp

# Terminology

If we look up French words "pêche" and "péché", they will be on the same page and have the same address "https://dictionnaire.lerobert.com/definition/peche". The last part of the address "peche" is going to be without any diacritics. To give the last part a separate term, we are going to call it a `word_path`. We are going to use the `word_path` as the `filename`, when saving the HTML of a page.

# Discovering Definition Pages

`./lerobert/scraping.py` provides functions needed to discover valid `word_path`s when scraping the dictionary:
1. The dictionary has "Explorer le dictionnaire" section, where it lists valid (and also not valid) links to the definitions. To find all the links in that section, you can use `get_explored_links()` function. 
2. You can also find definition pages via the API used by the built-in search. For this purpose, call `get_suggested_word_paths()` function with a search term as an argument.
3. When you have saved some of the definition pages as HTML files, you can go over these files and extract all definition links via `find_word_paths_html_file()` function.
4. The last option is to request a page and check its status code using `word_path` from a wordlist. Do not forget to remove diacritics and replace white spaces and `'` with `-` before that.

All four approaches were used to compile the list of valid definition pages, `./assets/word_paths.txt`.

# Scraping the Content

Let's use `word_paths.txt` to download definition pages from this list.

In [6]:
# Load the word paths (one per line) that the download cells work through.
with open('./assets/word_paths.txt', 'r', encoding='utf-8') as f:
    # Skip blank lines so an empty word path is never handed to the downloader.
    word_paths = [line.strip() for line in f if line.strip()]

## HTML

In [7]:
# Only the first 8 entries of word_paths are handed to the downloader here.
# NOTE(review): presumably a demo-sized subset; drop the slice to process the whole list.
results = lrp.execute_async(lrs.download_html, word_paths[:8])

100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 305.48it/s]


The `execute_async()` function is based on the `concurrent.futures` module and will come in handy when downloading 51000+ pages.

## Audio & Images

In [8]:
# Run lrs.download_media once for every file returned by lrp.list_html_files().
# NOTE(review): presumably this fetches the audio and images referenced by the saved pages; confirm in lerobert/scraping.py.
results = lrp.execute_async(lrs.download_media, lrp.list_html_files())

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.76it/s]


## CSS

In [9]:
# Download the dictionary's stylesheets into ./assets/css/.
stylesheets = ['aside.css', 'commons.css']
css_path = Path('./assets/css/')
os.makedirs(css_path, exist_ok=True)
for filename in stylesheets:
    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(f'https://dictionnaire.lerobert.com/statics/css/{filename}', timeout=30)
    # Fail loudly on a 4xx/5xx answer instead of saving an error page as a stylesheet.
    response.raise_for_status()
    with open(css_path / filename, 'w', encoding='utf-8') as f:
        f.write(response.text)

## JavaScript

In [10]:
# Download the dictionary's audio player script into ./assets/js/.
filename = 'audioplayer.js'
js_path = Path('./assets/js/')
os.makedirs(js_path, exist_ok=True)
# A timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(f'https://dictionnaire.lerobert.com/statics/js/{filename}', timeout=30)
# Fail loudly on a 4xx/5xx answer instead of saving an error page as a script.
response.raise_for_status()
with open(js_path / filename, 'w', encoding='utf-8') as f:
    f.write(response.text)

# Analysing the Structure of HTML (Optional)

Analysing the structure of HTML can be useful for figuring out what tags are found inside example tags (class="d_xpl") and how often. We are going to take just the text part of example tags to get embeddings. To get the structure, we can use `find_definitions()` function as shown below.

In [11]:
# Use the first file returned by list_html_files() as the sample page.
html_file = lrp.list_html_files()[0]
# Read that page and collect the definition tags found in it.
definition_tags = lrp.find_definitions(lrp.read_html_file(html_file))
# Show the sample's name together with the number of definition tags found.
html_file, len(definition_tags)

('a', 4)

Now, we can find all strings in a definition tag and index them by their parents.

In [12]:
# Index the strings of the first 10 HTML files by their chain of parent tags.
results = lrp.execute_async(lrp.index_strings_by_parents, lrp.list_html_files()[:10], processes=True)
# Regroup the per-file results as {parent chain: {file name: value}}.
string_parents = {}
for page_name, parents_index in results:
    for parent_chain, locations in parents_index.items():
        string_parents.setdefault(parent_chain, {})[page_name] = locations

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  3.06it/s]


In [13]:
# Preview the first five parent-chain keys of the index.
for key in list(string_parents)[:5]:
    print(key)

(('h3', None), ('span', 'notBold'))
(('h3', None),)
(('h3', None), ('span', 'd_cat'))
(('div', 'd_ptme'), ('span', 'd_dfn'))
(('div', 'd_ptme'), ('span', 'd_dfn'), ('span', 'd_im'))


Each key is a tuple of tags, which are themselves represented as tuples. After indexing, finding a page and its content with a specific HTML structure becomes easy.

In [14]:
# Look up one parent chain; the result is keyed by page name.
# NOTE(review): judging by the output, the inner keys look like definition indices and the tuples like positions inside a tag; confirm.
string_parents[(('h3', None), ('span', 'notBold'))]

{'a': {0: [(1, 0, 0)], 1: [(1, 0, 0)], 2: [(1, 0, 0)], 3: [(1, 0, 0)]},
 'a-b-c': {0: [(1, 0, 0)]},
 'a-maxima': {0: [(1, 0, 0)]},
 'a-contrario': {0: [(1, 0, 0)]},
 'a-fortiori': {0: [(1, 0, 0)]},
 'a-cappella': {0: [(1, 0, 0)]},
 'a-coup': {0: [(1, 0, 0)]},
 'a-cote': {0: [(1, 0, 0)]}}

In [15]:
# Fetch the content at position (1, 0, 0) of the first definition tag of the sample page.
lrp.get_content(definition_tags[0], (1, 0, 0))

'Définition de '

The dictionary contains plenty of examples for which it uses tags with `class="d_xpl"`. Let's find all possible combinations with no more than one parent and one child.

In [16]:
# Collect every (parent, d_xpl, child) window that occurs in the indexed parent chains.
tag_classes = set()
for key in string_parents.keys():
    for ind, tag in enumerate(key):
        if tag[1] == 'd_xpl':
            # Clamp the start at 0: with ind == 0 the slice would begin at -1,
            # i.e. at the last element of the chain, and return the wrong window.
            tag_neighbors = key[max(ind - 1, 0):ind + 2]
            tag_classes.add(tag_neighbors)
tag_classes

{(('div', 'd_dvl'), ('span', 'd_xpl')),
 (('div', 'd_dvl'), ('span', 'd_xpl'), ('span', 'd_lca')),
 (('div', 'd_dvl'), ('span', 'd_xpl'), ('span', 'd_mtb')),
 (('div', 'd_dvn'), ('span', 'd_xpl')),
 (('div', 'd_dvn'), ('span', 'd_xpl'), ('span', 'd_gls')),
 (('div', 'd_dvn'), ('span', 'd_xpl'), ('span', 'd_lca')),
 (('div', 'd_ptma'), ('span', 'd_xpl')),
 (('div', 'd_ptma'), ('span', 'd_xpl'), ('span', 'd_gls')),
 (('span', 'd_dvt'), ('span', 'd_xpl')),
 (('span', 'd_dvt'), ('span', 'd_xpl'), ('span', 'd_gls')),
 (('span', 'd_dvt'), ('span', 'd_xpl'), ('span', 'd_lca')),
 (('span', 'd_dvt'), ('span', 'd_xpl'), ('span', 'd_mtb')),
 (('span', 'd_dvt'), ('span', 'd_xpl'), ('span', 'd_rm'))}

The parent of an example tag (`class="d_xpl"`) is a certain meaning in which the word is used.

# Preparing HTML files

## Wrapping words in example sentences in "span" tags with the class="word"

Here, we are wrapping the word for which an example sentence is given into a tag with `class="word"` which is used for pooling.

In [17]:
# Create a TreeTagger wrapper configured for French (TAGLANG='fr').
# You may also need to specify the TreeTagger directory, e.g. TAGDIR='/home/user/treetagger'.
tagger = ttpw.TreeTagger(TAGLANG='fr')

In [18]:
# Run lrp.process_html over every file returned by lrp.list_html_files().
# The tagger is handed to execute_async as a keyword argument (presumably forwarded to process_html).
# NOTE(review): processes=False presumably selects threads instead of processes; confirm in lerobert/processing.py.
results = lrp.execute_async(lrp.process_html, lrp.list_html_files(), processes=False, tagger=tagger)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.76it/s]


## Mapping words in a definition header to that definition 

We will use the words found in the headers of definitions as keys to get those definitions during the search.

In [19]:
# Merge the per-file word maps into one word -> {word_path: value} lookup and save it as JSON.
results = lrp.execute_async(lrp.map_words, lrp.list_html_files('./assets/html/processed/'), processes=False)
full_word_map = defaultdict(lambda: defaultdict(list))
for word_map in results:
    # Do not name this loop variable `word_paths`: that would overwrite the list
    # loaded from word_paths.txt and break re-running the download cell.
    for word, paths_for_word in word_map.items():
        for word_path in paths_for_word:
            full_word_map[word][word_path] = paths_for_word[word_path]
with open('./assets/word_map.json', 'w', encoding='utf-8') as f:
    json.dump(full_word_map, f, indent=4)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 12.44it/s]


# Computing Embeddings

Here, we are computing contextual embeddings for processed HTML files. These embeddings are used later to display how similar they are to the ones computed for a custom text.

In [20]:
# Model identifiers; each one is loaded with AutoTokenizer/AutoModel.from_pretrained in the next cell.
model_names = ["sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
               "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"]

In [21]:
# For every model: load its tokenizer and weights, then run the embedding step over the processed pages.
for model_name in model_names:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    # model_name contains a '/', so this is a nested directory; makedirs creates the intermediate level too.
    embedding_path = f'./assets/embeddings/{model_name}'
    os.makedirs(embedding_path, exist_ok=True)
    # `results` is rebound on every iteration, so only the last model's return value is kept.
    # NOTE(review): presumably compute_embeddings_html_file writes its output under embedding_path; confirm.
    results = lrp.execute_async(lrp.compute_embeddings_html_file,
                                lrp.list_html_files(html_path='./assets/html/processed'),
                                processes=False,
                                tokenizer=tokenizer,
                                model=model,
                                embedding_path=embedding_path)

100%|█████████████████████████████████████████████| 1/1 [00:03<00:00,  3.17s/it]
100%|█████████████████████████████████████████████| 1/1 [00:01<00:00,  1.44s/it]


# Launching the App

A simple interface of the app is provided in `./assets/index.html`. API calls made by the app are processed in `main.py`. To start the app: `uvicorn main:app --reload`.