Python code for a Bulgarian question-answering system. This version embeds documents in a ChromaDB collection (RobertaDB) using the rmihaylov/roberta-base-use-qa-bg model for retrieval, and the Hugging Face question-answering pipeline extracts an answer from the retrieved documents. The best-matching document is then passed to the INSAIT BGGPT step, which is meant to generate a refined answer (in this version that step is only a placeholder).

In [None]:
import os
import pandas as pd
import PyPDF2
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from chromadb import Client
from chromadb.config import Settings
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Initialize ChromaDB client
def initialize_chromadb():
    """Create the ChromaDB client that stores the document embeddings.

    The client is configured with the "duckdb+parquet" implementation and
    the "RobertaDB" persist directory.
    """
    # NOTE(review): `chroma_db_impl` looks like the legacy (pre-0.4) chromadb
    # configuration style -- confirm the installed version still accepts it.
    settings = Settings(
        chroma_db_impl="duckdb+parquet",
        persist_directory="RobertaDB",
    )
    return Client(settings)

# Load the QA pipeline
def load_qa_pipeline(model_name):
    """Build a Hugging Face question-answering pipeline for ``model_name``.

    The tokenizer is loaded first and the model second, both from the same
    checkpoint name, and handed to ``pipeline("question-answering", ...)``.
    """
    return pipeline(
        "question-answering",
        tokenizer=AutoTokenizer.from_pretrained(model_name),
        model=AutoModelForQuestionAnswering.from_pretrained(model_name),
    )

# Extract text from PDFs
def extract_text_from_pdfs(pdf_files):
    """Extract the text of every readable PDF in ``pdf_files``.

    Args:
        pdf_files: Iterable of paths to PDF files.

    Returns:
        A list with one string per successfully read PDF: the text of its
        pages joined with single spaces. Files that cannot be opened or
        parsed are reported on stdout and skipped (best effort).
    """
    texts = []
    for pdf_file in pdf_files:
        try:
            with open(pdf_file, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # A page without extractable text must not break the join
                # (and thereby discard the whole file), so anything falsy
                # returned by extract_text() is treated as an empty string.
                page_texts = [
                    page.extract_text() or ""
                    for page in reader.pages
                    if page
                ]
        except Exception as e:
            # Deliberately broad: any unreadable file is reported and skipped.
            print(f"Error reading {pdf_file}: {e}")
            continue
        texts.append(" ".join(page_texts))
    return texts

# Extract text from HTML files
def extract_text_from_html(html_files):
    """Return the visible text of each readable HTML file.

    Every path in ``html_files`` is opened as UTF-8 and parsed with
    BeautifulSoup's "html.parser"; the result of ``get_text()`` is collected.
    A file that raises is reported on stdout and skipped.
    """
    collected = []
    for path in html_files:
        try:
            with open(path, "r", encoding="utf-8") as handle:
                parsed = BeautifulSoup(handle, "html.parser")
                plain_text = parsed.get_text()
                collected.append(plain_text)
        except Exception as err:
            print(f"Error reading {path}: {err}")
    return collected

# Extract text from Excel files
def extract_text_from_xlsx(xlsx_files):
    """Collect every non-empty cell of the given Excel files as strings.

    Each workbook is read with ``pd.read_excel``; for every column the
    non-null values are converted to ``str`` and appended to the result,
    column by column. A file that raises is reported on stdout; values
    gathered before the failure are kept.
    """
    collected = []
    for path in xlsx_files:
        try:
            frame = pd.read_excel(path)
            for column_name in frame.columns:
                cells = frame[column_name].dropna()
                collected.extend(cells.astype(str).tolist())
        except Exception as err:
            print(f"Error reading {path}: {err}")
    return collected

# Load Q&A from CSV
def load_qa_from_csv(csv_file):
    """Load question/answer pairs from a CSV file as "question answer" texts.

    Args:
        csv_file: Path to a CSV file with ``question`` and ``answer`` columns.

    Returns:
        A list of strings, one per row, of the form ``"<question> <answer>"``.
        A missing cell is left out instead of producing a NaN entry, cells
        that are not strings (e.g. numeric answers) are converted with
        ``str``, and rows where both cells are missing are skipped. If the
        file cannot be read or a column is absent, the error is printed and
        an empty list is returned.
    """
    try:
        df = pd.read_csv(csv_file)
        rows = list(zip(df["question"], df["answer"]))
    except Exception as e:
        # Deliberately broad: an unreadable CSV must not abort the whole run.
        print(f"Error reading {csv_file}: {e}")
        return []

    texts = []
    for question, answer in rows:
        # Plain `question + " " + answer` yields NaN for a missing cell and
        # raises for non-string cells; build the text from present cells only.
        parts = [str(value) for value in (question, answer) if not pd.isna(value)]
        if parts:
            texts.append(" ".join(parts))
    return texts

# Embed and store data in ChromaDB
def embed_data_in_chromadb(client, collection_name, texts, qa_pipeline):
    """Store ``texts`` in a ChromaDB collection and return that collection.

    Args:
        client: ChromaDB client used to get or create the collection.
        collection_name: Name of the collection to fill.
        texts: Sequence of document texts. Entries that are not strings or
            are blank are skipped; the remaining ones keep the id
            ``doc_<index>`` derived from their position in ``texts``.
        qa_pipeline: Question-answering pipeline; only the checkpoint name of
            its model (``qa_pipeline.model.name_or_path``) is used, to select
            the embedding model.

    Returns:
        The collection, with the documents added.
    """
    # The embedding function is configured by model *name*; it does not accept
    # already-loaded tokenizer/model objects.
    embedding_function = SentenceTransformerEmbeddingFunction(
        model_name=qa_pipeline.model.name_or_path
    )
    collection = client.get_or_create_collection(
        collection_name, embedding_function=embedding_function
    )

    entries = [
        (f"doc_{idx}", text)
        for idx, text in enumerate(texts)
        if isinstance(text, str) and text.strip()
    ]

    # Add in modest batches: one call per batch instead of one per document,
    # and nothing is sent when there are no entries.
    # NOTE(review): with a persisted collection, re-running reuses the same
    # ids -- confirm how the installed chromadb version treats duplicates.
    batch_size = 100
    for start in range(0, len(entries), batch_size):
        batch = entries[start:start + batch_size]
        collection.add(
            ids=[doc_id for doc_id, _ in batch],
            documents=[text for _, text in batch],
        )
    return collection

# Query ChromaDB for answers
def query_chromadb(collection, question, qa_pipeline):
    """Retrieve candidate documents and pick the best extractive answer.

    Args:
        collection: ChromaDB collection to search.
        question: The user's question.
        qa_pipeline: Callable invoked as
            ``qa_pipeline(question=..., context=...)`` that returns a mapping
            with ``"answer"`` and ``"score"`` keys.

    Returns:
        A ``(answer, context)`` tuple. ``context`` is the document that gave
        the highest-scoring answer, or ``None`` when nothing was retrieved or
        every document failed. ``answer`` falls back to a fixed Bulgarian
        "no suitable answer found" message when no answer was produced.
    """
    no_answer = "Не е намерен подходящ отговор."

    # Never request more results than the collection holds, and skip the
    # query entirely when it is empty.
    available = collection.count()
    if not available:
        return no_answer, None

    results = collection.query(query_texts=[question], n_results=min(5, available))
    # "documents" holds one list of document strings per query text; only a
    # single query text is sent, so the first list is the one of interest.
    per_query_documents = results["documents"] or [[]]
    documents = [doc for doc in per_query_documents[0] if doc]

    if not documents:
        return no_answer, None

    # Use the QA pipeline to refine answers: keep the highest-scoring one.
    best_answer = None
    best_context = None
    best_score = -float("inf")

    for document in documents:
        try:
            answer = qa_pipeline(question=question, context=document)
        except Exception as e:
            # Best effort: a document the pipeline cannot process is skipped.
            print(f"Error processing document: {e}")
            continue
        if answer["score"] > best_score:
            best_answer = answer["answer"]
            best_context = document
            best_score = answer["score"]

    return best_answer or no_answer, best_context

# Generate answer using INSAIT BGGPT
def generate_bggpt_answer(context, question):
    """Return a placeholder "refined" answer for ``question``.

    No model is called here: the function prints a banner and returns a
    template string that embeds the context, the question and a fixed
    placeholder marker. Replace the body with a real INSAIT BGGPT call.
    """
    print("\n--- Using INSAIT BGGPT to refine the answer ---")
    context_line = f"Въз основа на контекста: {context}"
    answer_line = f"Отговор на въпроса '{question}': [INSAIT BGGPT Generated Answer]"
    return "\n".join((context_line, answer_line))

# Main function
def main():
    """Index the files in ``documents/`` and answer questions interactively.

    Steps: list the documents directory, build the ChromaDB client and the
    QA pipeline, extract text from PDF/HTML/XLSX files and the
    ``questions-answers.csv`` file, embed everything into the "RobertaDB"
    collection, then loop reading questions from stdin until the user types
    "изход" or input ends.
    """
    # Directory with documents
    documents_dir = "documents"
    try:
        # List the directory once; sorting makes the order in which texts are
        # collected (and therefore their positional ids) deterministic.
        file_names = sorted(os.listdir(documents_dir))
    except OSError as e:
        print(f"Error reading {documents_dir}: {e}")
        return

    def files_with_extension(extension):
        # Case-insensitive match so e.g. "Report.PDF" is picked up too.
        return [
            os.path.join(documents_dir, name)
            for name in file_names
            if name.lower().endswith(extension)
        ]

    pdf_files = files_with_extension(".pdf")
    html_files = files_with_extension(".html")
    xlsx_files = files_with_extension(".xlsx")
    csv_file = os.path.join(documents_dir, "questions-answers.csv")

    # Initialize ChromaDB
    client = initialize_chromadb()

    # Load QA pipeline
    model_name = "rmihaylov/roberta-base-use-qa-bg"
    qa_pipeline = load_qa_pipeline(model_name)

    # Extract texts
    pdf_texts = extract_text_from_pdfs(pdf_files)
    html_texts = extract_text_from_html(html_files)
    xlsx_texts = extract_text_from_xlsx(xlsx_files)
    qa_texts = load_qa_from_csv(csv_file)

    # Combine all texts
    all_texts = pdf_texts + html_texts + xlsx_texts + qa_texts

    # Embed data
    collection_name = "RobertaDB"
    collection = embed_data_in_chromadb(client, collection_name, all_texts, qa_pipeline)

    # Prompt user for questions
    print("Задайте въпрос на български език (или напишете 'изход', за да излезете):")
    while True:
        try:
            question = input("Вашият въпрос: ").strip()
        except EOFError:
            # Input stream closed (e.g. piped input ended): leave quietly.
            break
        if not question:
            continue
        if question.lower() == "изход":
            break
        initial_answer, context = query_chromadb(collection, question, qa_pipeline)
        if context:
            refined_answer = generate_bggpt_answer(context, question)
            print(f"Refined Answer: {refined_answer}")
        else:
            print(f"Отговор: {initial_answer}")

# Run the interactive pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


#### Data Embedding:
Embeds documents into a ChromaDB collection (RobertaDB) using rmihaylov/roberta-base-use-qa-bg.

#### Question Answering:
Retrieves top candidate documents from ChromaDB.
Uses qa_pipeline for initial question-answering based on the retrieved documents.

#### INSAIT BGGPT Integration:
Simulates a call to INSAIT’s BGGPT model to refine the answer. Replace the generate_bggpt_answer function with an actual API call or local model inference.

#### Multilingual Processing:
Documents and questions are expected to be in Bulgarian; the interactive prompt and the fallback "no answer found" message are also in Bulgarian.

#### Dependencies:
Install the required Python libraries:

pip install torch transformers sentence-transformers chromadb PyPDF2 beautifulsoup4 pandas openpyxl

#### Notes:
Place all documents (PDFs, HTML files, Excel files, and the CSV) in a documents directory.
Ensure the CSV file contains two columns: question and answer.
Modify generate_bggpt_answer to integrate with the actual INSAIT BGGPT model or API.