# In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from flask import Flask, request, jsonify, render_template
from elasticsearch import Elasticsearch, ConnectionError
from sentence_transformers import SentenceTransformer
import re

# Flask application object; the route decorators further down register
# their handlers on it.
app = Flask(__name__)

# Elasticsearch client pointed at a node on localhost:9200 over plain HTTP.
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme': 'http'}])

# Sentence-embedding model, instantiated once at import time and reused by
# generate_embedding() for every query.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

def generate_embedding(text):
    """Encode *text* with the module-level model and return it as a list."""
    vector = model.encode(text)
    return vector.tolist()

def _date_range_clauses(start_date, end_date):
    """Build the optional ``d_date`` range clause shared by every search.

    Returns a list holding a single ``range`` clause, or an empty list when
    neither bound is set, so the result can be used directly as a bool
    ``filter``.
    """
    bounds = {}
    if start_date:
        bounds["gte"] = start_date
    if end_date:
        bounds["lte"] = end_date
    return [{"range": {"d_date": bounds}}] if bounds else []


def _format_hit(hit):
    """Map a raw hit to the result dict, or ``None`` if a required field is empty."""
    source = hit.get('_source', {})
    sumario = source.get('t_asumarioNoHtml', '')
    cuerpo = source.get('t_bodyNoHtml', '')
    url = source.get('t_publicUrl', '')
    fecha = source.get('d_date', '')

    if not (sumario and cuerpo and url and fecha):
        print(f"Missing field in hit: {hit}")
        return None

    return {
        "ID": hit.get('_id', ''),
        "URL": url,
        "Fecha": fecha,
        "Sumario": sumario,
        "Cuerpo": cuerpo
    }


def realizar_busqueda(user_query, start_date, end_date, search_type, max_results=100):
    """Run a hybrid keyword + embedding search and return formatted results.

    The query is first matched (all terms required) against four text fields,
    one search per field, each limited to ``max_results // 5`` hits. If fewer
    than ``max_results`` distinct documents were found, the remainder is
    filled from a cosine-similarity search over the ``embedding`` field.
    Documents are de-duplicated by ``_id`` while they are collected, keeping
    the first occurrence, so a document matching several fields no longer
    reduces the number of results returned.

    Args:
        user_query: Free-text query string.
        start_date: Inclusive lower bound for ``d_date``; falsy to omit.
        end_date: Inclusive upper bound for ``d_date``; falsy to omit.
        search_type: ``"BOJA"`` selects ``boja_index``; anything else
            selects ``boe_index``.
        max_results: Maximum number of distinct documents to collect.

    Returns:
        A list of dicts with keys ``ID``, ``URL``, ``Fecha``, ``Sumario`` and
        ``Cuerpo``. Hits with any of those source fields empty are dropped.
        Elasticsearch failures are logged and treated as "no hits"; any other
        error while merging or formatting yields an empty list.
    """
    # Computed before the try block on purpose: an embedding failure
    # propagates to the caller instead of being turned into an empty result.
    query_embedding = generate_embedding(user_query)
    index_name = "boja_index" if search_type == "BOJA" else "boe_index"
    batch_size = max_results // 5
    date_filter = _date_range_clauses(start_date, end_date)

    def execute(query_body, label, error_context):
        """Run one search; log and return [] on any failure (best effort)."""
        try:
            response = es.search(index=index_name, body=query_body)
            hits = response.get('hits', {}).get('hits', [])
            print(f"Results from {label} search: {len(hits)}")  # Logging the number of hits
            return hits
        except Exception as e:
            print(f"Error in {error_context}: {e}")
            return []

    def search_in_field(field, size):
        """Keyword search restricted to a single field."""
        query_body = {
            "query": {
                "bool": {
                    "must": [{"multi_match": {"query": user_query, "fields": [field], "operator": "and"}}],
                    # The date range only restricts the result set, so it
                    # lives in filter context like in the embeddings search.
                    "filter": list(date_filter)
                }
            },
            "size": size
        }
        return execute(query_body, field, f"search_in_field for {field}")

    def search_with_embeddings(size):
        """Rank documents by cosine similarity to the query embedding."""
        query_body = {
            "query": {
                "bool": {
                    "must": {
                        "script_score": {
                            "query": {"match_all": {}},
                            "script": {
                                # +1.0 keeps the score non-negative.
                                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                                "params": {"query_vector": query_embedding}
                            }
                        }
                    },
                    "filter": list(date_filter)
                }
            },
            "size": size
        }
        return execute(query_body, "embeddings", "search_with_embeddings")

    try:
        unique_hits = {}  # _id -> first hit seen; dicts preserve insertion order

        def collect(hits):
            """Add unseen hits until ``max_results`` distinct ones are held."""
            for hit in hits:
                if len(unique_hits) >= max_results:
                    return
                if not hit:
                    continue
                hit_id = hit.get('_id')
                if hit_id is not None:
                    unique_hits.setdefault(hit_id, hit)

        fields = ["t_asumarioNoHtml", "t_bodyNoHtml", "entities_asumario", "entities_body"]
        for field in fields:
            if len(unique_hits) >= max_results:
                break
            collect(search_in_field(field, batch_size))

        if len(unique_hits) < max_results:
            # Ask for max_results rather than just the shortfall: the top
            # embedding hits may repeat documents already collected, and this
            # size leaves enough unseen ones to fill the gap.
            collect(search_with_embeddings(max_results))

        final_results = []
        for hit in unique_hits.values():
            result = _format_hit(hit)
            if result is not None:
                final_results.append(result)

        print(f"Final results count: {len(final_results)}")
        return final_results

    except Exception as e:
        print(f"Error during realizar_busqueda: {e}")
        return []

@app.route('/search', methods=['POST'])
def search():
    """Handle a search request and return one page of results as JSON.

    Expects a JSON object body with the optional keys ``query``,
    ``startDate``, ``endDate``, ``sortBy`` (``'date'`` sorts newest first;
    anything else keeps the search order), ``searchType``, ``page`` and
    ``resultsPerPage``.

    Returns:
        200 with ``results``, ``totalResults``, ``page``, ``resultsPerPage``
        and ``originalQuery``; 400 when the body is not a JSON object or the
        pagination values are not integers; 500 when the search itself fails.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # body, so the problem can be reported as a JSON 400 below.
    data = request.get_json(silent=True)
    print(data)
    if not isinstance(data, dict):
        return jsonify({"error": "Se esperaba un objeto JSON en el cuerpo de la petición."}), 400

    keyword = data.get('query', '')
    start_date = data.get('startDate', '')
    end_date = data.get('endDate', '')
    sort_option = data.get('sortBy', 'relevance')
    search_type = data.get('searchType', 'BOJA')

    try:
        # Clamp to 1 so zero or negative values cannot produce negative
        # slice indices further down.
        page = max(1, int(data.get('page', 1)))
        results_per_page = max(1, int(data.get('resultsPerPage', 10)))
    except (TypeError, ValueError):
        return jsonify({"error": "Parámetros de paginación no válidos."}), 400

    try:
        results = realizar_busqueda(keyword, start_date, end_date, search_type)
        max_length = 500  # Maximum text length before truncation

        # Sort the full result set before slicing so pages are consistent.
        if sort_option == 'date':
            results.sort(key=lambda x: x['Fecha'], reverse=True)

        total_results = len(results)
        start_index = (page - 1) * results_per_page
        end_index = start_index + results_per_page
        paginated_results = results[start_index:end_index]

        # Only the page being returned needs its long texts shortened.
        for result in paginated_results:
            for key in ('Sumario', 'Cuerpo'):
                if len(result[key]) > max_length:
                    result[key] = result[key][:max_length] + '...'

        response = {
            "results": paginated_results,
            "totalResults": total_results,
            "page": page,
            "resultsPerPage": results_per_page,
            "originalQuery": keyword
        }

        print(response)
        return jsonify(response)
    except Exception as e:
        print(f"Error during search: {e}")
        return jsonify({"error": "Error al procesar la búsqueda."}), 500
@app.route('/', methods=['GET'])
def index():
    """Render and return the ``Boodi.html`` template for the root URL."""
    page = render_template('Boodi.html')
    return page

# Start the Flask server (debug mode disabled) only when this file is
# executed directly, not when it is imported as a module.
if __name__ == '__main__':
    app.run(debug=False)
