In [None]:
import pandas as pd
import numpy as np
import time
import pathlib
import os

from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
from langchain_core.prompts import PromptTemplate

from dotenv import load_dotenv

# Load environment variables

In [None]:
# Absolute, machine-specific location of the .env file. It is only printed here:
# the following cell reassigns `path_env` relative to the working directory.
path_env = pathlib.Path('/export/usuarios_ml4ds/cggamella/NP-Search-Tool/.env')
print(f"Ruta al archivo .env: {path_env}")

In [None]:
# Locate the .env file two directory levels above the current working directory.
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
# Bare expression so the notebook displays the resolved path.
path_env

In [None]:
# Load the variables declared in the .env file into the process environment.
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")

# os.environ only accepts str values, so assigning a missing (None) key would
# raise an opaque TypeError. Fail early with an actionable message instead.
if not api_key:
    raise RuntimeError(
        f"OPENAI_API_KEY is not defined in the environment or in {path_env}"
    )

os.environ["OPENAI_API_KEY"] = api_key

## Create VectorDB with parquet docs

In [None]:
# Absolute, machine-specific path of the parquet file that holds the source documents.
path = '/export/usuarios_ml4ds/cggamella/NP-Company-Process/data/DESCARGAS_ENTREGABLES/outsiders.parquet'
# Read the whole parquet file into a DataFrame.
df_out = pd.read_parquet(path)

def unify_colname(col):
    """Flatten a (possibly multi-level) column label into one dotted string.

    Args:
        col: Column label. Either an iterable of levels (e.g. a tuple) or a
            plain string that is already flat.

    Returns:
        The truthy levels joined with ".". A plain string is returned unchanged.
    """
    # A str is iterable character by character, so joining it would turn
    # "title" into "t.i.t.l.e". Returning it as-is keeps the renaming cell
    # idempotent when it is run again on already-flattened columns.
    if isinstance(col, str):
        return col
    # str() lets non-string levels (e.g. integers) take part in the join.
    return ".".join(str(el) for el in col if el)

# Flatten every column label into a single dotted string.
df_out.columns = [unify_colname(col) for col in df_out.columns]

# Remember the index level names before the index is turned into columns.
index_names = df_out.index.names
# Reset the index in place on the same DataFrame.
df_out.reset_index(inplace=True)
# Build `identifier` by joining the former index levels with '/', giving each row one id string.
df_out["identifier"] = df_out[index_names].astype(str).agg("/".join, axis=1)
# Keep only these two columns.
df_out = df_out[['identifier', 'title']]

In [None]:
# Text splitter: chunk_size is the number of characters per chunk and
# chunk_overlap the number of characters shared by consecutive chunks so that
# no information is lost at the boundaries.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# One Document per chunk; every row of df_out is treated as a separate source text.
documents = []
for identifier, title in zip(df_out['identifier'], df_out['title']):
    # Rows without a usable title (None/NaN/blank) cannot become a Document.
    if not isinstance(title, str) or not title.strip():
        continue
    for chunk in text_splitter.split_text(title):
        # Build a fresh metadata dict per chunk so the Documents do not alias
        # one shared mutable mapping.
        chunk_metadata = {"url": "local", "source": "initial", "identifier": identifier}
        documents.append(Document(page_content=chunk, metadata=chunk_metadata))

In [None]:
# Embed and store the texts
path_to_index = '/export/usuarios_ml4ds/cggamella/NP-Search-Tool/aux_scripts/RAG'
# Supplying a persist_directory will store the embeddings on disk
persist_directory = (pathlib.Path(path_to_index) / 'db').as_posix()

start = time.time()
# Define embeddings
embedding = OpenAIEmbeddings()

# Re-running Chroma.from_documents against an already populated directory
# re-embeds every chunk and appends duplicates to the stored collection.
# Reuse the persisted index when it exists; delete the directory to rebuild it.
index_dir = pathlib.Path(persist_directory)
if index_dir.is_dir() and any(index_dir.iterdir()):
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )
else:
    # The page_content of every chunk is embedded with OpenAIEmbeddings and the
    # resulting vector is stored together with the chunk metadata.
    vectorstore = Chroma.from_documents(
        documents=documents,
        embedding=embedding,
        persist_directory=persist_directory,
    )
# Total wall-clock time of loading or building the index
end = time.time()
print(f"Total time is {end - start} seconds")

In [None]:
# Raw prompt text. It has two placeholders: {acronym} (the term to expand) and
# {summaries} (the text placed under "Documents:").
prompt_template = """
Given the word "{acronym}", understand that an acronym word is a type of abbreviation formed by taking the initial
letters or parts of words from a phrase or term and combining them to form a new word. Acronyms are pronounced
as words themselves, rather than being spelled out letter by letter. 

For example, "aeat" stands for "agencia_estatal_de_administración_tributaria", "csic" stands for "consejo_superior_de_investigaciones_científicas", "ceip" stands for "colegio_educación_infantil_y_primaria", and "avda" stands for "avenida".
Additionally, abbreviations can be formed by taking the initial letters of a phrase (like acronyms), by using only some letters of a word (like "Dr." for "Doctor"), or by shortening a word (like "apt." for "apartmento"). 

Please provide the full expression of the acronym in the following dictionary format: 'acronym:expanded_word',
where spaces between words are replaced by underscores '_'. If the expanded word is too long or if there is any other issue, do NOT provide an answer.

Documents:
{summaries}

--------------------
If you find any issue finding the correct expression, your answer have the following format: ''. Otherwise, your answer should be 'acronym:expanded_word'.
"""

# Rebind the same name from the raw string to a PromptTemplate object.
prompt_template = PromptTemplate.from_template(template=prompt_template)

In [None]:
# Set up the chat LLM (gpt-4o) with a low sampling temperature
llm = ChatOpenAI(
    temperature=0.1,
    model_name='gpt-4o'
)

# Build the retrieval chain: the retriever returns the 5 nearest chunks from the
# vector store and the "stuff" chain type is configured with the custom prompt.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    chain_type="stuff",
    verbose=True,
    chain_type_kwargs={
        "verbose": True,
        "prompt": prompt_template,
    }
)

def generate_acronym_expansions(chain, acronyms):
    """Ask the retrieval chain to expand every acronym.

    Args:
        chain: Retrieval chain exposing ``invoke(dict)`` and returning a mapping
            with an ``"answer"`` entry. Its prompt must take an ``acronym``
            variable.
        acronyms: Iterable of acronym strings.

    Returns:
        Dict mapping each acronym to the stripped answer text of the model.
    """
    results = {}
    for acronym in acronyms:
        # The chain retrieves its own documents using `question` as the search
        # query and fills the prompt's documents slot itself. Passing the bare
        # acronym makes retrieval search for the acronym rather than for the
        # text of a pre-formatted prompt, and avoids a second, unused retrieval.
        response = chain.invoke({"question": acronym, "acronym": acronym})
        print(f"Respuesta del modelo para '{acronym}':\n{response}\n")
        # Keep only the answer text, without surrounding whitespace.
        results[acronym] = response['answer'].strip()
    return results


In [None]:
# Acronyms to look up (the trailing comment holds further candidates that are currently disabled)
acronyms = ['ghz','ecc','vga','s.l']# 's.a.', 'bop', 'pcap', 'ceip', 'jjmm', 'smp', 'avda', 'ffcc', 'itv']

# Generate the expansion of every acronym
acronym_expansions = generate_acronym_expansions(chain, acronyms)

# Print the results, one "acronym: expansion" line each
for acronym, expansion in acronym_expansions.items():
    print(f"{acronym}: {expansion}")

In [None]:
# Display the acronym -> expansion dictionary as the cell output.
acronym_expansions

# NP-Search-Tool: extracting acronyms with RAG

In [None]:
#!pip install chromadb
#!pip install -U langchain-openai

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from sentence_transformers import SentenceTransformer
import re

In [None]:
# Reload the same parquet file as in the first section, replacing the filtered `df_out`.
path = '/export/usuarios_ml4ds/cggamella/NP-Company-Process/data/DESCARGAS_ENTREGABLES/outsiders.parquet'
df_out = pd.read_parquet(path)

In [None]:
def unify_colname(col):
    """Flatten a (possibly multi-level) column label into one dotted string.

    Args:
        col: Column label. Either an iterable of levels (e.g. a tuple) or a
            plain string that is already flat.

    Returns:
        The truthy levels joined with ".". A plain string is returned unchanged.
    """
    # A str is iterable character by character, so joining it would turn
    # "title" into "t.i.t.l.e". Returning it as-is keeps the renaming cell
    # idempotent when it is run again on already-flattened columns.
    if isinstance(col, str):
        return col
    # str() lets non-string levels (e.g. integers) take part in the join.
    return ".".join(str(el) for el in col if el)

In [None]:
# Flatten every column label of the freshly loaded frame into a single dotted string.
df_out.columns = [unify_colname(col) for col in df_out.columns]

In [None]:
# Grab the index level names (per the original note: ['zip', 'file name', 'entry'] — TODO confirm against the data)
index_names = df_out.index.names
# Reset the index in place on the same DataFrame
df_out.reset_index(inplace=True)
# Use the former index levels joined with '/' as `identifier`, giving each row one id string
df_out["identifier"] = df_out[index_names].astype(str).agg("/".join, axis=1)

In [None]:
# Small sample for inspection. `.loc` slices by label and includes the end label,
# so after the index reset above this keeps labels 0..100 (101 rows).
df_filtered = df_out.loc[:100, ['identifier', 'title']]

In [None]:
# Compute the mean length (in characters) of the 'title' column of the sample
df_filtered['title_length'] = df_filtered['title'].apply(len)
average_length = df_filtered['title_length'].mean()
# Bare expression so the notebook displays the value
average_length

In [None]:
# Initialise the sentence-embedding model (requested on the 'cuda' device)
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2', device='cuda')
# Embed every title of the full `df_out` (not only the `df_filtered` sample)
embeddings = model.encode(df_out['title'].tolist())

In [None]:
def retrieve_context_similarity(acronym, embeddings, df_out, top_k=10, encoder=None):
    """Return the titles whose embeddings are most similar to the acronym.

    Args:
        acronym: Text that is embedded and used as the query.
        embeddings: 2-D array-like with one embedding per row of ``df_out``;
            row ``i`` must correspond to ``df_out['title'].iloc[i]``.
        df_out: DataFrame with a ``title`` column.
        top_k: Maximum number of titles to return (default 10).
        encoder: Object exposing ``encode(list_of_str)``. Defaults to the
            notebook-level ``model``.

    Returns:
        List of titles ordered from most to least similar; empty when there
        are no embeddings or ``top_k`` is not positive.
    """
    if encoder is None:
        encoder = model
    doc_matrix = np.asarray(embeddings, dtype=float)
    if doc_matrix.ndim != 2 or doc_matrix.shape[0] == 0 or top_k <= 0:
        return []
    query_vec = np.asarray(encoder.encode([acronym]), dtype=float).reshape(-1)
    # Cosine similarity with NumPy only: no cosine_similarity helper is
    # imported anywhere in this notebook.
    denom = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
    # Zero-norm vectors get similarity 0 instead of a division by zero.
    denom[denom == 0] = 1.0
    similarities = doc_matrix @ query_vec / denom
    # argsort is ascending: take the last top_k positions and reverse them.
    most_similar_indices = np.argsort(similarities)[-top_k:][::-1]
    return [df_out['title'].iloc[i] for i in most_similar_indices]

In [None]:
def retrieve_context_contain_acronym(acronym, df_out, max_docs=5, max_chars=70):
    """Return truncated titles that contain the acronym as a standalone token.

    Args:
        acronym: Acronym to look for (matched literally, case-insensitively).
        df_out: DataFrame with a ``title`` column; missing titles never match.
        max_docs: Maximum number of titles to return (default 5).
        max_chars: Each returned title is cut to this many characters (default 70).

    Returns:
        List with at most ``max_docs`` titles, each truncated to ``max_chars``.
    """
    # `\b` needs a word character on one side, so it can never match after an
    # acronym that ends in punctuation (e.g. "s.a." followed by a space).
    # Lookarounds express the real intent: the acronym must not be glued to a
    # neighbouring word character on either side.
    pattern = r'(?<!\w)' + re.escape(acronym) + r'(?!\w)'
    # A plain pattern string with case=False is equivalent to a compiled
    # IGNORECASE pattern and is accepted by every pandas string backend.
    mask = df_out['title'].str.contains(pattern, case=False, regex=True, na=False)
    titles = df_out.loc[mask, 'title'].head(max_docs).tolist()
    return [title[:max_chars] for title in titles]

In [None]:
# OpenAI LLM client created with its default settings; this rebinds the
# notebook-level `llm` name.
llm = OpenAI()


def generate_equivalent_expression(acronym):
    """Ask the LLM for the long form of ``acronym``.

    The titles returned by ``retrieve_context_similarity`` (queried with the
    notebook-level ``embeddings`` and ``df_out``) are joined with spaces and
    inserted into the prompt as context.

    Args:
        acronym: Acronym to expand.

    Returns:
        Text of the first generation produced for the prompt.
    """
    retrieved_titles = retrieve_context_similarity(acronym, embeddings, df_out)
    joined_context = " ".join(retrieved_titles)
    prompt = f"Given the acronym {acronym}, and the following context: {joined_context}, provide the large expression of the acronym. Please provide me the result in a dictionary format 'acronym:equivalence'"
    print("EL PROMPTING ES:", prompt)
    # A single prompt is sent, so only the first generation of the first
    # prompt is read back.
    llm_result = llm.generate([prompt])
    print("La respuesta es:", llm_result)
    first_generation = llm_result.generations[0][0]
    return first_generation.text

In [None]:
# Full list of acronyms to expand in this section.
acronyms = ['ghz','ecc','vga','s.l', 's.a.', 'bop', 'pcap', 'ceip', 'jjmm', 'smp', 'avda', 'ffcc', 'itv']  

In [None]:
# Expand every acronym one by one; filling the dict incrementally keeps the
# answers already obtained if a later call fails.
results = {}
for acronym in acronyms:
    expression = generate_equivalent_expression(acronym)
    results[acronym] = expression

In [None]:
# Results obtained with the retrieve_context_similarity function
print(results)

In [None]:
# Results labelled as coming from the retrieve_context_contain_acronym function.
# NOTE(review): generate_equivalent_expression only calls retrieve_context_similarity,
# so `results` is the same dict as in the previous cell unless that function is
# edited and the loop re-run — confirm how this output was produced.
print(results)

In [None]:
# Inspect one title by position. NOTE(review): 36711 is a hard-coded position with
# no stated origin — presumably an index reported by a retrieval run; confirm.
df_out['title'].iloc[36711]

In [None]:
# Split every title into words and keep those of a fixed length.
# NOTE: the original comment said 3 letters, but the filter below uses length 4 — confirm which is intended.
#words_3 = df_out['title'].str.split().explode().str.lower().apply(lambda x: x.strip(',.')).loc[lambda x: x.str.len() == 4]
# Convert the selected words into a list
#words_list = words_3.tolist()