In [None]:
import os
import requests
from datetime import datetime, timedelta
from xml.dom import minidom
from xml.etree import ElementTree as ET
import sys
import threading
import pprint

# Base URL of the BOE site; document paths read from the summary XML are appended to it.
BOE_URL = 'https://boe.es'
# Prefix of the daily summary endpoint; the date formatted as %Y%m%d is appended to it.
BOE_API_SUMARIO = 'https://boe.es/diario_boe/xml.php?id=BOE-S-'
# First day processed by the download cells.
START_DATE = datetime.strptime('20231025', '%Y%m%d')
# Root folder under which the year/month/day sub-folders are created.
PATH_DATA = os.path.join('boe', 'dias')
# Step used to advance the date one calendar day at a time.
diff_1_day = timedelta(days=1)

# Scraping BOE

In [None]:
def make_dirs(path):
    """Create directory ``path`` together with any missing parent folders.

    An already existing directory is not an error, so the function can be
    called repeatedly for the same path.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory (for
            example a regular file); hiding that would only postpone the
            failure to the first write inside the folder.
        OSError: Any other failure reported by ``os.makedirs``.
    """
    os.makedirs(path, exist_ok=True)

In [None]:
def traer_documento(origen, destino, max_intentos=5, espera=5, timeout=30):
    """Download ``origen`` and store the response body in ``destino``.

    The request is retried when the server answers with a status other than
    200 or when the connection fails. The body is first written to
    ``<destino>.part`` and only renamed to ``destino`` once it has been
    received completely, so an interrupted transfer never leaves a truncated
    file under the final name.

    Args:
        origen: URL to fetch.
        destino: Path of the file to write.
        max_intentos: Maximum number of requests before giving up.
        espera: Seconds to sleep before each retry.
        timeout: Seconds passed to ``requests.get`` as connect/read timeout.

    Returns:
        bool: ``True`` when the file was saved, ``False`` when every attempt
        failed (in that case ``destino`` is not created).
    """
    import time  # kept local so the cell does not depend on another cell's imports

    parcial = f'{destino}.part'

    for intento in range(max_intentos):
        if intento != 0:
            time.sleep(espera)
            print(f'Intento {intento}')

        try:
            # The context manager returns the connection to the pool even on errors.
            with requests.get(origen, stream=True, timeout=timeout) as response:
                if response.status_code == 200:
                    print(f'Guardando en: {destino}')
                    with open(parcial, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
                    os.replace(parcial, destino)
                    return True
        except requests.RequestException as error:
            # Network problems are treated like a bad status: report and retry.
            print(f'Error descargando {origen}: {error}')
            if os.path.exists(parcial):
                os.remove(parcial)

    return False

In [None]:
def get_pdfs(current_date):
    """Download the BOE summary for ``current_date`` and every PDF it lists.

    Files are stored under ``PATH_DATA/<year>/<month>/<day>/pdfs``. A PDF that
    already exists with the size announced in the summary (``szBytes``) is
    not downloaded again. When the summary reports that there is no bulletin
    for the day, the folders created for it are removed again.

    Args:
        current_date: ``datetime`` of the day to download.

    Raises:
        SystemExit: If the summary cannot be obtained or a PDF never reaches
            the size announced in the summary.
    """
    import time  # kept local so the cell does not depend on another cell's imports

    def tamano_de(ruta):
        # traer_documento may give up without creating the file.
        return os.path.getsize(ruta) if os.path.exists(ruta) else None

    print('current_date:', current_date.strftime('%Y-%m-%d'))

    fecha_anno, fecha_mes, fecha_dia = current_date.strftime('%Y %m %d').split()
    dir_dia = os.path.join(PATH_DATA, fecha_anno, fecha_mes, fecha_dia)
    dir_pdfs = os.path.join(dir_dia, 'pdfs')

    for dir_path in (dir_dia, dir_pdfs):
        make_dirs(dir_path)

    fichero_sumario_xml = os.path.join(dir_dia, 'index.xml')

    # Always work with a fresh copy of the summary.
    if os.path.exists(fichero_sumario_xml):
        os.remove(fichero_sumario_xml)

    url_sumario = f'{BOE_API_SUMARIO}{current_date.strftime("%Y%m%d")}'
    print('Solicitando', f'{url_sumario} --> {fichero_sumario_xml}')
    traer_documento(url_sumario, fichero_sumario_xml)

    tamano_sumario_xml = tamano_de(fichero_sumario_xml) or 0
    print('Recibidos:', tamano_sumario_xml, 'bytes')

    if tamano_sumario_xml < 10:
        print('ERROR: Sumario XML erróneo o incompleto')
        sys.exit(1)

    xml_sumario = minidom.parse(fichero_sumario_xml)

    if xml_sumario.documentElement.nodeName == 'error':
        os.remove(fichero_sumario_xml)
        for dir_path in (dir_pdfs, dir_dia):
            try:
                os.rmdir(dir_path)
            except OSError:
                # The folder still holds other content (e.g. 'xmls'); keep it.
                pass
        print('AVISO: No existen boletines para la current_date', current_date.strftime('%Y-%m-%d'))
        return

    pdfs = xml_sumario.getElementsByTagName('urlPdf')
    print(f"{len(pdfs)} encontrados")
    max_intentos = 5
    for pdf in pdfs:
        url_pdf = pdf.firstChild.nodeValue
        # Only the file name of the URL is used, so the local path no longer
        # depends on Windows separators or on slicing off a leading character.
        # NOTE(review): assumes every urlPdf of a day belongs in that day's
        # 'pdfs' folder, which is where the original expression put them.
        nombre_pdf = os.path.basename(url_pdf).replace(' ', '_')
        fichero_pdf = os.path.join(dir_pdfs, nombre_pdf)
        tamano_esperado = int(pdf.getAttribute('szBytes'))

        if os.path.exists(fichero_pdf):
            print("fichero encontrado")
            if os.path.getsize(fichero_pdf) == tamano_esperado:
                continue
            try:
                os.remove(fichero_pdf)
            except OSError:
                # Best effort: the download below overwrites the file anyway.
                pass

        print('Solicitando', f'{BOE_URL}{url_pdf} --> {fichero_pdf}')
        tamano_descargado = None
        for intento in range(max_intentos):
            if intento != 0:
                time.sleep(5)
                print(f'Intento {intento}')

            traer_documento(f'{BOE_URL}{url_pdf}', fichero_pdf)
            tamano_descargado = tamano_de(fichero_pdf)
            if tamano_descargado == tamano_esperado:
                break

        if tamano_descargado != tamano_esperado:
            print('ERROR: El tamaño del fichero PDF descargado no coincide con el del XML del Sumario '
                  f'(Descargado: {tamano_descargado} <> XML: {tamano_esperado})')
            sys.exit(1)

In [None]:
def get_xmls(current_date, semaphore=None):
    """Download the BOE summary for ``current_date`` and every XML it lists.

    Files are stored under ``PATH_DATA/<year>/<month>/<day>/xmls`` and named
    after the value that follows the last ``=`` of each ``urlXml`` entry. A
    non-empty file that already exists is not downloaded again. When the
    summary reports that there is no bulletin for the day, the folders
    created for it are removed again.

    Args:
        current_date: ``datetime`` of the day to download.
        semaphore: Optional semaphore acquired by the caller before starting
            this function. It is released exactly once when the function
            ends, including when it ends through an exception or
            ``sys.exit``, so a failing day cannot block the caller.

    Raises:
        SystemExit: If the summary cannot be obtained or an XML document is
            still missing/empty after all retries.
    """
    import time  # kept local so the cell does not depend on another cell's imports

    def tamano_de(ruta):
        # traer_documento may give up without creating the file.
        return os.path.getsize(ruta) if os.path.exists(ruta) else 0

    try:
        print('current_date:', current_date.strftime('%Y-%m-%d'))

        fecha_anno, fecha_mes, fecha_dia = current_date.strftime('%Y %m %d').split()
        dir_dia = os.path.join(PATH_DATA, fecha_anno, fecha_mes, fecha_dia)
        dir_xmls = os.path.join(dir_dia, 'xmls')

        for dir_path in (dir_dia, dir_xmls):
            make_dirs(dir_path)

        fichero_sumario_xml = os.path.join(dir_dia, 'index.xml')

        # Always work with a fresh copy of the summary.
        if os.path.exists(fichero_sumario_xml):
            os.remove(fichero_sumario_xml)

        url_sumario = f'{BOE_API_SUMARIO}{current_date.strftime("%Y%m%d")}'
        print('Solicitando', f'{url_sumario} --> {fichero_sumario_xml}')
        traer_documento(url_sumario, fichero_sumario_xml)

        tamano_sumario_xml = tamano_de(fichero_sumario_xml)
        print('Recibidos:', tamano_sumario_xml, 'bytes')

        if tamano_sumario_xml < 10:
            print('ERROR: Sumario XML erróneo o incompleto')
            sys.exit(1)

        xml_sumario = minidom.parse(fichero_sumario_xml)

        if xml_sumario.documentElement.nodeName == 'error':
            os.remove(fichero_sumario_xml)
            for dir_path in (dir_xmls, dir_dia):
                try:
                    os.rmdir(dir_path)
                except OSError:
                    # The folder still holds other content (e.g. 'pdfs'); keep it.
                    pass
            print('AVISO: No existen boletines para la fecha ', current_date.strftime('%Y-%m-%d'))
            return

        max_intentos = 5
        for xml in xml_sumario.getElementsByTagName('urlXml'):
            url_xml = xml.firstChild.nodeValue
            # os.path.join keeps the path valid on every platform.
            fichero_xml = os.path.join(dir_xmls, url_xml.split('=')[-1] + '.xml')

            # An empty leftover file does not count as downloaded.
            if tamano_de(fichero_xml):
                print("fichero encontrado")
                continue

            print('Solicitando', f'{BOE_URL}{url_xml} --> {fichero_xml}')
            for intento in range(max_intentos):
                if intento != 0:
                    time.sleep(5)
                    print(f'Intento {intento}')

                traer_documento(f'{BOE_URL}{url_xml}', fichero_xml)
                if tamano_de(fichero_xml):
                    break

            if not tamano_de(fichero_xml):
                # The summary carries no size for XML documents, so the only
                # check possible here is that something was received.
                print(f'ERROR: El fichero xml descargado está vacío o no se pudo descargar: {fichero_xml}')
                sys.exit(1)
    finally:
        if semaphore is not None:
            semaphore.release()

In [None]:
# Sequential download of XMLs and PDFs, one day at a time from START_DATE to now.
# The trailing `and 0` makes the loop condition always false, so the body never
# runs: this variant is left disabled.
current_date = START_DATE
hoy = datetime.now()
while current_date <= hoy and 0:
    get_xmls(current_date)
    get_pdfs(current_date)
    current_date += diff_1_day

In [None]:
# Threaded download of the XML documents: one thread per day.
# The semaphore starts with n_hilos permits; one is taken before each thread is
# started and the semaphore is handed to get_xmls, which is expected to give the
# permit back when it finishes. That caps the number of days in flight.
n_hilos = 6
semaphore = threading.Semaphore(n_hilos)
# One datetime per day from START_DATE up to today (inclusive).
dates =  [START_DATE + timedelta(days=i) for i in range((datetime.now() - START_DATE).days + 1)]
hilos = []

print(dates)

for date in dates:
    # Blocks here while n_hilos downloads are still running.
    semaphore.acquire()

    print('current_date:', date.strftime('%Y-%m-%d'))
    
    hilo = threading.Thread(target=get_xmls, args=(date, semaphore))
    #threading.Thread(target=get_pdfs, args=(date,)).start()
    hilos.append(hilo)
    hilo.start()

# Wait until every started thread has finished.
for hilo in hilos:
    hilo.join()


# Almacenamiento de datos

In [None]:
def proceso_sumario(path_xml, keys_to_save=None):
    """Flatten one BOE XML document into a ``{key: text}`` dictionary.

    Every element below the root contributes one entry keyed by its tag name;
    when a tag occurs several times the last occurrence wins. ``<p>``
    elements are keyed ``<class>_<n>`` instead, where ``n`` counts the
    ``<p>`` elements in document order. Only elements whose first child is a
    text node have text to export; non-breaking spaces (``\\xa0``) and em
    spaces (``\\u2003``) in it are replaced by plain spaces.

    Args:
        path_xml: Path of the XML file. It is stored unchanged under the
            ``'index'`` key of the result.
        keys_to_save: Optional list of keys to export. Keys starting with
            ``'parrafo'`` or ``'articulo'`` are always exported in addition.
            The list itself is not modified. ``None`` exports every key.

    Returns:
        dict: Requested keys that exist in the document and carry text.
    """
    xml_sumario = minidom.parse(path_xml)

    # The path is the only index information kept; nothing is derived from
    # the folder layout, so the function works with any path separator.
    data = {'index': path_xml}

    contador_p = 0
    for element in xml_sumario.documentElement.getElementsByTagName('*'):
        clave = element.tagName
        if clave == 'p':
            # Build the key locally instead of renaming the DOM node.
            clave = element.getAttribute('class') + '_' + str(contador_p)
            contador_p += 1

        campos = dict(element.attributes.items())
        primer_hijo = element.firstChild
        if primer_hijo is not None and primer_hijo.nodeType == element.TEXT_NODE:
            campos['Contenido'] = primer_hijo.data
        data[clave] = campos

    if keys_to_save is None:
        claves = list(data)
    else:
        # Work on a copy: the caller reuses the same list for every file.
        claves = list(keys_to_save)
        claves.extend(k for k in data if k.startswith(('parrafo', 'articulo')))

    data_to_save = {}
    for key in claves:
        if key == 'index':
            data_to_save[key] = data[key]
            continue
        campos = data.get(key)
        # Keys that are absent or whose element has no leading text are skipped.
        if isinstance(campos, dict) and 'Contenido' in campos:
            data_to_save[key] = campos['Contenido'].replace('\xa0', ' ').replace('\u2003', ' ')

    return data_to_save

In [None]:
def process_data_old(keys_to_save=None):
    """Run ``proceso_sumario`` over every downloaded XML document.

    Walks ``PATH_DATA/<year>/<month>/<day>/xmls`` and parses each ``.xml``
    file found there. Entries that are not folders, and days without an
    ``xmls`` folder (for example days for which only PDFs were downloaded),
    are skipped. Folders are visited in sorted order so the result is
    reproducible.

    Args:
        keys_to_save: Optional list of keys forwarded to ``proceso_sumario``.
            A fresh copy is handed over for every file so the caller's list
            is never changed.

    Returns:
        dict: Result of ``proceso_sumario`` for each file, keyed by its
        ``'identificador'`` value.

    Raises:
        KeyError: If a document yields no ``'identificador'`` entry.
    """
    def subcarpetas(ruta):
        # Only real folders: stray files in the tree must not abort the walk.
        return sorted(n for n in os.listdir(ruta) if os.path.isdir(os.path.join(ruta, n)))

    data = {}
    for anio in subcarpetas(PATH_DATA):
        for mes in subcarpetas(os.path.join(PATH_DATA, anio)):
            for dia in subcarpetas(os.path.join(PATH_DATA, anio, mes)):
                path_xmls = os.path.join(PATH_DATA, anio, mes, dia, 'xmls')
                if not os.path.isdir(path_xmls):
                    continue
                print(path_xmls)
                for nombre in sorted(os.listdir(path_xmls)):
                    if not nombre.endswith('.xml'):
                        continue
                    path_xml = os.path.join(path_xmls, nombre)
                    claves = None if keys_to_save is None else list(keys_to_save)
                    file_data = proceso_sumario(path_xml, claves)
                    data[file_data['identificador']] = file_data

    return data

In [None]:
# (tag, parser) pairs read from <metadatos>; a parser of None keeps the raw text.
METADATA_FIELDS = [
    ("identificador", None),
    ("titulo", None),
    ("departamento", None),
    ("fecha_publicacion", lambda x: datetime.strptime(x, "%Y%m%d")),
    ("origen_legislativo", None),
    ("rango", None),
]


def _boe_references(xml, tag):
    """Collect the ``referencia`` attribute and ``<texto>`` of every ``tag`` element."""
    references = []
    for ref in xml.findall(f".//{tag}"):
        texto = ref.find("texto")
        references.append({
            "identificador": ref.get("referencia"),
            "texto": texto.text if texto is not None else None,
        })
    return references


def _inner_html(element):
    """Serialise the content of ``element`` as HTML without its own tags."""
    serialized = ET.tostring(element, encoding="unicode", method="html")
    # tostring() also emits the text that follows the element (its tail), so
    # the content is cut out between the end of the opening tag and the last
    # closing tag instead of dropping the first and last line.
    start = serialized.find(">") + 1
    end = serialized.rfind(f"</{element.tag}")
    return serialized[start:end].strip()


def jsonify_boe_entry(xml):
    """Convert a parsed BOE document into a plain dictionary.

    Args:
        xml: ``ElementTree`` or root ``Element`` of one BOE XML document.

    Returns:
        dict with:
          * one key per tag of ``METADATA_FIELDS`` present under
            ``<metadatos>`` (``fecha_publicacion`` as ``datetime``; an empty
            element yields ``None``),
          * ``materias``: text of every ``<materia>``,
          * ``anteriores`` / ``posteriores``: one
            ``{"identificador", "texto"}`` dict per ``<anterior>`` /
            ``<posterior>`` (``texto`` is ``None`` when the child is absent),
          * ``texto``: HTML serialisation of the content of ``<texto>``
            (``""`` when the element is missing),
          * ``parrafos``: full text of every ``<p>`` inside ``<texto>``,
            including text nested in inline tags.
    """
    entry_json = {}

    # Metadata: absent tags are skipped; empty ones are stored as None.
    metadata = xml.find("metadatos")
    if metadata is not None:
        for tag, parser in METADATA_FIELDS:
            element = metadata.find(tag)
            if element is None:
                continue
            text = element.text
            entry_json[tag] = parser(text) if parser is not None and text is not None else text

    # Topics
    entry_json["materias"] = [topic.text for topic in xml.findall(".//materia")]

    # References to earlier and later documents
    entry_json["anteriores"] = _boe_references(xml, "anterior")
    entry_json["posteriores"] = _boe_references(xml, "posterior")

    # Body of the document
    xml_text = xml.find("texto")
    if xml_text is None:
        entry_json["texto"] = ""
        entry_json["parrafos"] = []
        return entry_json

    entry_json["texto"] = _inner_html(xml_text)

    # itertext() keeps the text after inline tags such as <em>, which
    # `.text` alone would cut off; it also never yields None.
    entry_json["parrafos"] = ["".join(paragraph.itertext()) for paragraph in xml_text.findall(".//p")]

    return entry_json

In [None]:
def read_xml_files(path):
    """Lazily yield the path of every BOE XML file found below ``path``.

    The folder tree is walked recursively. A file qualifies when its name
    contains both ``.xml`` and ``BOE``.
    """
    for carpeta, _subcarpetas, nombres in os.walk(path):
        yield from (
            os.path.join(carpeta, nombre)
            for nombre in nombres
            if '.xml' in nombre and "BOE" in nombre
        )

In [None]:
def process_data():
    """Parse every downloaded XML document with ``jsonify_boe_entry``.

    Walks ``PATH_DATA/<year>/<month>/<day>/xmls`` and parses each ``.xml``
    file found there. Entries that are not folders, and days without an
    ``xmls`` folder (for example days for which only PDFs were downloaded),
    are skipped. Folders are visited in sorted order so the result is
    reproducible.

    Returns:
        dict: Result of ``jsonify_boe_entry`` for each file, keyed by its
        ``'identificador'`` value.

    Raises:
        KeyError: If a document yields no ``'identificador'`` entry.
    """
    def subcarpetas(ruta):
        # Only real folders: stray files in the tree must not abort the walk.
        return sorted(n for n in os.listdir(ruta) if os.path.isdir(os.path.join(ruta, n)))

    data = {}
    for anio in subcarpetas(PATH_DATA):
        for mes in subcarpetas(os.path.join(PATH_DATA, anio)):
            for dia in subcarpetas(os.path.join(PATH_DATA, anio, mes)):
                path_xmls = os.path.join(PATH_DATA, anio, mes, dia, 'xmls')
                if not os.path.isdir(path_xmls):
                    continue
                print(path_xmls)
                for nombre in sorted(os.listdir(path_xmls)):
                    if not nombre.endswith('.xml'):
                        continue
                    # Separate names for the file name and the parsed tree.
                    arbol = ET.parse(os.path.join(path_xmls, nombre))
                    entry_json = jsonify_boe_entry(arbol)
                    data[entry_json['identificador']] = entry_json

    return data

In [None]:
def get_embeddings(input_text, model, tokenizer, max_length=512):
    """Return the encoder hidden state of the first token of ``input_text``.

    Args:
        input_text: Text to embed.
        model: Object exposing an ``encoder(input_ids=..., attention_mask=...,
            return_dict=True)`` call whose result has ``last_hidden_state``.
        tokenizer: Callable producing ``input_ids`` and ``attention_mask``
            tensors for ``input_text``.
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        list[float]: Hidden state of the first token of the first (only)
        sequence in the batch.
    """
    import torch  # kept local: this cell is defined before the notebook imports torch

    enc = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=max_length)
    # Pure inference: no autograd graph is needed, which saves memory per call.
    with torch.no_grad():
        output = model.encoder(
            input_ids=enc['input_ids'],
            attention_mask=enc['attention_mask'],
            return_dict=True
        )
    # Select the vector first so only that slice is converted to a Python list.
    return output.last_hidden_state[0, 0].tolist()

## ELASTICSEARCH

In [None]:
#!pip install elasticsearch sentencepiece

In [None]:
from elasticsearch import Elasticsearch
import json

# Elasticsearch client pointing at a node on localhost; the commented line is
# the alternative URL that uses the host name `elasticsearch` instead.
#client = Elasticsearch("http://elasticsearch:9200")
client = Elasticsearch("http://localhost:9200")

### Crear index

In [None]:
# Mappings and settings used to create the index.
config = {
    "mappings": {
        "properties": {
            # Paragraph text, searched lexically.
            "texto": {
                "type": "text"
            },
            # Paragraph vector. The field name has to be the key under which
            # the documents carry their vector when they are indexed
            # ("embeddings"); otherwise this dense_vector mapping is never
            # applied and the vectors are stored as plain numbers.
            "embeddings": {
                "type": "dense_vector",
                "dims": 512,
                "index": True,
                "similarity": "cosine"
            },
            #"titulo": {"type": "text"},
            "doc_id": {"type": "text"},
            #"fecha_publicacion": {"type": "date"}
        }
    },
    "settings": {
        "analysis": {
            "analyzer": {
                # NOTE(review): defined but not referenced by any field above.
                "my_analyzer": {
                "tokenizer": "keyword",
                "char_filter": [
                    "html_strip"
                ]
                }
            }
        },
        "number_of_shards": 2,
        "number_of_replicas": 1
    }
}

In [None]:
import torch
from transformers import T5Model, T5Tokenizer

# Pretrained 't5-small' model and tokenizer; both are passed to get_embeddings.
model = T5Model.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [None]:
# Recreate the index from scratch: drop it if possible, then create it with
# the settings and mappings defined in `config`.
client = Elasticsearch("http://localhost:9200")
my_index = "boe"
try:
    client.indices.delete(index=my_index)
    print("Index deleted")
except Exception as error:
    # Report the real cause: a missing index and an unreachable server are
    # different problems and must not look the same.
    print("Index not deleted:", error)
try:
    client.indices.create(
        index = my_index,
        settings = config["settings"],
        mappings = config["mappings"]
    )
except Exception as error:
    print("Error:", error)

### Poblar índices

In [None]:
# Parse every downloaded XML document into `data`.
data = process_data()
# Show the fifth entry counting from the end as a sample.
data[list(data.keys())[-5]]

In [None]:
# Index the paragraphs of the first document only ([:1]): one Elasticsearch
# document per paragraph, carrying the paragraph text and its vector.
for key in list(data.keys())[:1]:
    print(key)
    # .get(): these metadata tags are optional in the parsed entries.
    fecha = data[key].get('fecha_publicacion')
    doc_id = key
    full_text = data[key]['texto']
    titulo = data[key].get('titulo')

    for text in data[key]['parrafos']:
        if not text:
            # Paragraphs without text cannot be tokenised; nothing to index.
            continue
        semantic_embeddigs = get_embeddings(text, model, tokenizer)
        document = {
                "doc_id": doc_id,
                "embeddings": semantic_embeddigs,
                "texto": text,
                #"fecha_publicacion": fecha,
                #"full_text": full_text,
                #"titulo": titulo
            }

        try:
            client.index(
                index = my_index,
                document = document
            )
        except Exception as e:
            # Dump the context of the failing document and stop with this entry.
            print(e)
            print(f"len: {len(document)}")
            print(fecha)
            print(doc_id)
            print(document)
            break

In [None]:
# Sanity check: fetch up to 100 indexed documents and inspect the first one.
my_query = {
    "match_all": {}
}

# `query=` is the keyword the other search cells of this notebook use.
res = client.search(index=my_index, query=my_query, size=100)
hits = res['hits']['hits']
print(f"Numero de entradas: {len(hits)}")
if hits:
    primera = hits[0]['_source']
    print(f"Entradas encontradas: {primera.keys()}")
    print(f"Nombre documento: {primera['doc_id']}")
    print(f"Tamaño embeddings: {len(primera['embeddings'])}")
    print(f"Texto: {primera['texto']}")
else:
    # Nothing indexed (or not yet visible to search): there is no first hit to show.
    print("No hay documentos en el índice")

### Búsqueda léxica

In [None]:
# Lexical search: look for the words of `string_to_search` in the "texto" field.
string_to_search = "seguridad jurídica"
# NOTE(review): `lexic_query` is not used below; the search is sent with `my_query`.
lexic_query = {
    "fields": ["texto"],
    "query" : string_to_search,
}
my_query = {
    "simple_query_string":{
        "fields": ["texto"],
        "query": string_to_search
    }
}
res = client.search(index=my_index, query=my_query)
# Print the source document id and the matching paragraph of every hit.
for resp in res['hits']['hits']:
    print(resp['_source']['doc_id'])
    print(resp['_source']['texto'])


### Búsqueda semántica

In [None]:
# Semantic search: embed the query text and ask for its nearest paragraph vectors.
string_to_search = "Declaración  de las enfermedades de los animales"
parametrs = {
    # Must be the key under which the paragraph vectors are stored in the
    # indexed documents ("embeddings"); kNN additionally requires that field
    # to be mapped as dense_vector in the index.
    "field": "embeddings",
    "query_vector" : get_embeddings(string_to_search, model, tokenizer),
    "k": 2,
    "num_candidates": 5
}
res = client.search(index=my_index, knn=parametrs)
print("Respuestas búsqueda semántica:")
for resp in res['hits']['hits']:
    print(resp['_source']['doc_id'])
    print(resp['_source']['texto'])

## MongoDB

In [None]:
#!pip install pymongo

In [None]:
from pymongo import MongoClient

In [None]:
# Connection string comes from the MONGO_URI environment variable when it is
# set, so real credentials do not have to live in the notebook.
# NOTE(review): the fallback keeps the original URI with its embedded user and
# password; drop it once every environment defines MONGO_URI.
client = MongoClient(os.environ.get("MONGO_URI", "mongodb://root:example@mongo:27017"))
db = client.BOE

In [None]:
# Parse every downloaded XML document into `data`.
data = process_data()
# Show the fifth entry counting from the end as a sample.
data[list(data.keys())[-5]]

In [None]:
# Upsert every parsed document into MongoDB.
# NOTE(review): the collection name 'Wikipedia_NER' looks inherited from
# another project — confirm it is the intended target for BOE documents.
for key, entry in data.items():
    # Entries built by process_data() carry no 'index' (file path) field, so
    # fall back to the BOE identifier; entries that do have it keep being
    # matched on 'path' as before.
    filtro = {'path': entry['index']} if 'index' in entry else {'identificador': key}
    db.Wikipedia_NER.update_one(filtro, {"$set": entry}, upsert=True)