# In [None]:
from IPython.display import HTML
from elasticsearch import Elasticsearch, ConnectionError
from sentence_transformers import SentenceTransformer
import re

# Highlight several search terms in a text with HTML, ignoring case.
def highlight_text(text, terms, color="yellow"):
    """Wrap every occurrence of any term in a coloured ``<span>``.

    All terms are matched in a single pass over the original text, so the
    markup inserted for one term is never re-scanned for another (a term such
    as "span" or "color" would otherwise corrupt earlier highlights).

    NOTE: neither ``text`` nor ``color`` is HTML-escaped here; callers must
    pass values that are safe to embed in HTML.

    Args:
        text: Text to highlight.
        terms: Iterable of literal search terms; empty terms are ignored.
        color: CSS colour used for the span background.

    Returns:
        ``text`` with each case-insensitive match wrapped in
        ``<span style="background-color: {color};">...</span>``. Matched text
        keeps its original casing. ``text`` is returned unchanged when there
        are no non-empty terms.
    """
    # An empty pattern matches at every position and would litter the text
    # with empty spans, so drop empty terms up front.
    literals = [term for term in terms if term]
    if not literals:
        return text

    # Longest first: in an alternation the first alternative that matches
    # wins, so "función pública" must be tried before "función".
    literals.sort(key=len, reverse=True)
    pattern = re.compile(
        "|".join(re.escape(term) for term in literals),
        re.IGNORECASE,
    )
    return pattern.sub(
        lambda match: f'<span style="background-color: {color};">{match.group(0)}</span>',
        text,
    )

# Load the sentence-embedding model once at import time; generate_embedding()
# below reads this module-level instance.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

def generate_embedding(text):
    """Encode ``text`` with the module-level model and return the vector as a plain list."""
    vector = model.encode(text)
    return vector.tolist()

# Elasticsearch client for the node at http://localhost:9200; used by every search below.
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])

def _build_date_filter(start_date, end_date):
    """Return a ``range`` clause on ``d_date`` for the given bounds, or None if both are empty."""
    bounds = {}
    if start_date:
        bounds["gte"] = start_date
    if end_date:
        bounds["lte"] = end_date
    return {"range": {"d_date": bounds}} if bounds else None


def _search_hits(body):
    """Run ``body`` against ``boja_index`` and return its hits ([] on a connection error)."""
    try:
        response = es.search(index="boja_index", body=body)
    except ConnectionError as e:
        print(f"Error de conexión: {e}")
        return []
    return response['hits']['hits']


def realizar_busqueda(user_query, start_date, end_date):
    """Search ``boja_index`` by text and by embedding similarity.

    Four text searches (summary, body and the two entity fields) and one
    cosine-similarity search over the ``embedding`` field are executed; their
    hits are merged in that order, de-duplicated by ``_id`` and capped at 30.
    Hits with neither summary nor body text are dropped afterwards.

    Args:
        user_query: Free-text query. Every term must match in the text
            searches (``operator: and``); the whole string is embedded for
            the similarity search.
        start_date: Lower bound for ``d_date`` (inclusive), passed verbatim
            to Elasticsearch. Falsy means "no lower bound".
        end_date: Upper bound for ``d_date`` (inclusive), passed verbatim to
            Elasticsearch. Falsy means "no upper bound".

    Returns:
        A list of dicts with the keys ``ID``, ``URL``, ``Fecha``, ``Sumario``
        and ``Cuerpo``. ``Sumario`` and ``Cuerpo`` are always strings; ``URL``
        and ``Fecha`` default to ``''`` when the document lacks the field.
        A search that fails with a connection error is reported on stdout and
        contributes no hits.
    """
    # Embed the query first, exactly once; it is reused by the vector search.
    query_embedding = generate_embedding(user_query)

    # A single bound is honoured too, instead of being silently ignored.
    date_filter = _build_date_filter(start_date, end_date)

    # (field, maximum number of hits) for each text search, in merge order.
    text_searches = (
        ("t_asumarioNoHtml", 10),
        ("t_bodyNoHtml", 10),
        ("entities_asumario", 5),
        ("entities_body", 5),
    )

    hits = []
    for field, size in text_searches:
        must_conditions = [
            {"multi_match": {"query": user_query, "fields": [field], "operator": "and"}}
        ]
        if date_filter:
            must_conditions.append(date_filter)
        hits.extend(_search_hits({
            "query": {"bool": {"must": must_conditions}},
            "size": size,
        }))

    # Vector search: cosine similarity is shifted by +1.0 so the script never
    # yields a negative score.
    similarity_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_embedding},
            },
        }
    }
    if date_filter:
        embedding_query = {"bool": {"must": similarity_query, "filter": date_filter}}
    else:
        embedding_query = similarity_query
    hits.extend(_search_hits({"query": embedding_query, "size": 10}))

    # De-duplicate by document id (a dict keeps first-seen order), then keep
    # the first 30 documents.
    unique_hits = {hit['_id']: hit for hit in hits}
    top_hits = list(unique_hits.values())[:30]

    # Highlighting through highlight_text() is currently not applied here.
    results = []
    for hit in top_hits:
        source = hit['_source']
        # ``or ''`` also covers fields stored as an explicit null.
        sumario = source.get('t_asumarioNoHtml') or ''
        cuerpo = source.get('t_bodyNoHtml') or ''

        # Skip hits that have neither summary nor body text.
        if not sumario and not cuerpo:
            continue

        results.append({
            "ID": hit['_id'],
            "URL": source.get('t_publicUrl', ''),
            "Fecha": source.get('d_date', ''),
            "Sumario": sumario,
            "Cuerpo": cuerpo,
        })

    return results

# Example usage of realizar_busqueda (the call itself is currently commented out).
user_query = "ley de función pública"
start_date = "2021-01-01"
end_date = "2024-05-26"

#resultados = realizar_busqueda(user_query, start_date, end_date)

# Render the results as HTML (only the heading is built; the loop is commented out).
html_output = "<h2>Resultados de la búsqueda:</h2>"
#for result in resultados:
#    html_output += f"<p><strong>ID:</strong> {result['ID']}</p>"
#    html_output += f"<p><strong>URL:</strong> <a href='{result['URL']}'>{result['URL']}</a></p>"
#    html_output += f"<p><strong>Fecha:</strong> {result['Fecha']}</p>"
#    html_output += f"<p><strong>Sumario:</strong> {result['Sumario']}</p>"
#    html_output += f"<p><strong>Fragmento del cuerpo:</strong> {result['Cuerpo']}</p>"
#    html_output += "<hr>"

#display(HTML(html_output))

from flask import Flask, request, jsonify, render_template
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import re

# Flask application object; the /search and / routes below are registered on it.
app = Flask(__name__)


def _truncate(text, max_length):
    """Return ``text`` cut to ``max_length`` characters plus '...' when it is longer."""
    text = text or ''
    if len(text) > max_length:
        return text[:max_length] + '...'
    return text


@app.route('/search', methods=['POST'])
def search():
    """Handle ``POST /search``: run the search and return the results as JSON.

    The JSON body is expected to be an object with the optional keys
    ``query``, ``startDate`` and ``endDate``; missing or null values are
    treated as empty strings.

    Returns:
        ``{"result": [...]}`` where each item comes from ``realizar_busqueda``
        with ``Sumario`` and ``Cuerpo`` truncated to 500 characters, or a 400
        response with an ``error`` message when the body is not a JSON object.
    """
    data = request.json  # JSON payload sent in the request body
    if not isinstance(data, dict):
        # A ``null`` or non-object payload would otherwise fail on ``data.get``.
        return jsonify(error="El cuerpo de la solicitud debe ser un objeto JSON"), 400
    app.logger.debug("Search request: %s", data)

    # ``or ''`` also normalises explicit JSON nulls.
    keyword = data.get('query') or ''
    start_date = data.get('startDate') or ''
    end_date = data.get('endDate') or ''
    # NOTE: a 'sortBy' key may be sent by the client but is not applied here.

    results = realizar_busqueda(keyword, start_date, end_date)
    max_length = 500  # Adjust to change how much text is returned per field
    for result in results:
        result['Sumario'] = _truncate(result['Sumario'], max_length)
        result['Cuerpo'] = _truncate(result['Cuerpo'], max_length)

    app.logger.debug("Returning %d results", len(results))
    return jsonify(result=results)  # Send the results back as JSON

@app.route('/', methods=['GET'])
def index():
    """Render the 'BOJA.ia.html' template for ``GET /``."""
    template_name = 'BOJA.ia.html'
    return render_template(template_name)

# Start Flask's built-in server (debug mode off) only when run as a script.
if __name__ == '__main__':
    app.run(debug=False)

