## Imports

In [None]:
!pip install farm-haystack transformers ollama
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install sentence-transformers
!pip install "farm-haystack[inference]"

In [None]:
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import EmbeddingRetriever
from haystack.schema import Document
import ollama
import os
import pickle
import pandas as pd

In [3]:
TERMS_PATH = "terms.csv"
CUR_LANG = "es"
ID = "id"
ENGLISH = "en"
ARABIC = "ar"
SPANISH = "es"
FRENCH = "fr"
RUSSIAN = "ru"
CHINESE = "zh"
LANGUAGES = {ENGLISH: "English", ARABIC: "Arabic", SPANISH: "Spanish", FRENCH: "French", RUSSIAN: "Russian", CHINESE: "Chinese"}
def EMBEDDINGS_PATH(lang):
    return f"{lang}_embeddings.pkl"

In [44]:
key_terms = pd.read_csv(TERMS_PATH)
print(type(key_terms.iterrows()))


<class 'generator'>


## Prepare key terms embeddings

In [38]:
if (os.path.exists(EMBEDDINGS_PATH(CUR_LANG))):
    with open(EMBEDDINGS_PATH(CUR_LANG), 'rb') as f:
        document_store = pickle.load(f)
        retriever = EmbeddingRetriever(
            document_store=document_store,
            embedding_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            use_gpu=False 
        )
else:
    document_store = InMemoryDocumentStore(embedding_dim=384)
    key_terms = pd.read_csv(TERMS_PATH)
    documents = [Document(content=line[1][CUR_LANG]) for line in key_terms.iterrows()]
    document_store.write_documents(documents)
    retriever = EmbeddingRetriever(
            document_store=document_store,
            embedding_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
            use_gpu=False
        )
    document_store.update_embeddings(retriever)
    with open(EMBEDDINGS_PATH(CUR_LANG), 'wb') as f:
        pickle.dump(document_store, f)

## Prepare question embedding and question for Llama3.2

In [39]:
def get_relevant_key_terms(question, top_k=10):
    results = retriever.retrieve(query=question, top_k=top_k)
    return [doc.content for doc in results]

os.popen('ollama serve')
client = ollama.Client()

def ask_llama(question, relevant_key_terms):
    prompt = f"""
        Here is a search prompt by a user of the UN digital library:
        Question: {question}
        Here is a list of key terms. Each key term is in square brackets.
        Key Terms: {relevant_key_terms}
        Select ideally one, if necessary two key terms that are most relevant to the question.
        Output only the selected key terms as they are presented to you: each key term within square brackets, each set of square brackets separated by a semi-colon and no space.
        """

    return client.chat(model="llama3.2", messages=[{"role": "user", "content": prompt}])['message']['content']


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Error: listen tcp 127.0.0.1:11434: bind: address already in use


## Ask question

In [40]:
#question = "I would like to know more about the impact of world war 1 on the economy of Germany."
question = "españa"

In [41]:
# Get relevant key terms
key_terms = get_relevant_key_terms(question)

key_terms_array = ["[" + term.replace('"', '') + "]" for term in key_terms]
print("Key Terms:", key_terms_array)

# Join the key terms array into a single string separated by semi-colons
key_terms_string = ";".join(key_terms_array)
print("Relevant Key Terms:", key_terms_string)

Batches: 100%|██████████| 1/1 [00:00<00:00, 35.86it/s]

Key Terms: ['[s]', '[d]']
Relevant Key Terms: [s];[d]





In [42]:
is_answer_valid = False
while not is_answer_valid:
    answer = ask_llama(question, key_terms_string)
    print("Answer:", answer)

    answer_split = [term.strip() for term in answer.split(';')]
    is_answer_valid = isinstance(answer, str) and len(answer_split) <= 2
    is_answer_valid = is_answer_valid and all([term.strip() in key_terms_array for term in answer.split(';')])
    print(is_answer_valid)

Answer: [d];[s]
True


In [None]:
print(answer_split)
output = ""
for term in answer_split:
    output += f"subjectheading:{term}"
print(output)