# In [None]:
import os
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from bs4 import BeautifulSoup
import pdfplumber

# Question-answering setup: one checkpoint name is used to load both the
# tokenizer and the model, which are then wrapped in a single pipeline.
model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# Module-level pipeline object; get_answer() calls it for every question.
qa_pipeline = pipeline(
    task="question-answering",
    model=model,
    tokenizer=tokenizer,
)

# Function to extract text from PDFs
def extract_text_from_pdfs(directory):
    """Concatenate the text of every PDF file found directly in *directory*.

    Files are visited in sorted filename order so the returned text is the
    same on every run (``os.listdir`` makes no ordering guarantee), and the
    ``.pdf`` extension is matched case-insensitively.

    Args:
        directory: Path of the folder to scan. Sub-folders are not entered.

    Returns:
        One string holding the text of each page that produced any, each
        page followed by a newline; ``""`` when nothing was extracted.

    Raises:
        OSError: If *directory* cannot be listed (e.g. it does not exist).
    """
    page_texts = []
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(directory, filename)
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Skip pages for which no text was extracted.
                if page_text:
                    page_texts.append(page_text + "\n")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(page_texts)

# Function to extract text from HTML files
def extract_text_from_htmls(directory):
    """Concatenate the visible text of every HTML file directly in *directory*.

    Files are visited in sorted filename order so the returned text is the
    same on every run (``os.listdir`` makes no ordering guarantee), and the
    ``.html`` / ``.htm`` extensions are matched case-insensitively.

    Args:
        directory: Path of the folder to scan. Sub-folders are not entered.

    Returns:
        One string with the ``get_text()`` output of each parsed file, each
        followed by a newline; ``""`` when no HTML file was found.

    Raises:
        OSError: If *directory* cannot be listed (e.g. it does not exist).
        UnicodeDecodeError: If a file is not valid UTF-8 (files are opened
            with ``encoding="utf-8"``).
    """
    documents = []
    for filename in sorted(os.listdir(directory)):
        # str.endswith accepts a tuple, so both extensions are checked at once.
        if not filename.lower().endswith((".html", ".htm")):
            continue
        html_path = os.path.join(directory, filename)
        with open(html_path, "r", encoding="utf-8") as html_file:
            soup = BeautifulSoup(html_file, "html.parser")
            documents.append(soup.get_text() + "\n")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(documents)

# Function to extract context from CSV files
def extract_context_from_csv(directory):
    """Turn the question/answer rows of every CSV in *directory* into text.

    Each row becomes one ``"Q: <question> A: <answer>"`` line. Cells are read
    as plain strings with NA detection switched off, so the text reaches the
    context exactly as written in the file: ``007`` stays ``007`` and ``N/A``
    or an empty cell is not turned into the literal ``nan``.

    Files are visited in sorted filename order so the result is the same on
    every run, and the ``.csv`` extension is matched case-insensitively.
    Files with no data rows (including completely empty files) contribute
    nothing.

    Args:
        directory: Path of the folder to scan. Sub-folders are not entered.

    Returns:
        One string with a newline-terminated line per CSV row; ``""`` when no
        rows were found.

    Raises:
        OSError: If *directory* cannot be listed (e.g. it does not exist).
        KeyError: If a CSV that has data rows lacks a ``question`` or
            ``answer`` column.
    """
    required_columns = ("question", "answer")
    qa_lines = []
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(".csv"):
            continue
        csv_path = os.path.join(directory, filename)
        try:
            # dtype=str + keep_default_na=False: no numeric inference and no
            # conversion of strings such as "N/A" or "" to NaN.
            df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        except pd.errors.EmptyDataError:
            # A file with nothing to parse has no Q/A pairs to contribute.
            continue
        if df.empty:
            continue
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            raise KeyError(
                f"{csv_path} is missing required column(s): {', '.join(missing)}"
            )
        # Defensive: make sure any value still missing renders as "" not "nan".
        df = df.fillna("")
        # Column-wise zip avoids iterrows(), which builds a Series per row.
        qa_lines.extend(
            f"Q: {question} A: {answer}\n"
            for question, answer in zip(df["question"], df["answer"])
        )
    return "".join(qa_lines)

# Function to process context
def preprocess_context(pdf_dir, html_dir, csv_dir):
    """Build the single context string used for question answering.

    Args:
        pdf_dir: Folder handed to ``extract_text_from_pdfs``.
        html_dir: Folder handed to ``extract_text_from_htmls``.
        csv_dir: Folder handed to ``extract_context_from_csv``.

    Returns:
        The PDF text, then the HTML text, then the CSV question/answer
        lines, concatenated in that order with nothing added in between.
    """
    sources = (
        extract_text_from_pdfs(pdf_dir),
        extract_text_from_htmls(html_dir),
        extract_context_from_csv(csv_dir),
    )
    return "".join(sources)

# Function to retrieve an answer using the model
def get_answer(question, context):
    """Ask the module-level ``qa_pipeline`` one question about *context*.

    Args:
        question: The question text.
        context: The text the answer is looked up in.

    Returns:
        The pipeline's result object, unchanged. The caller in this file
        reads its ``'answer'`` and ``'score'`` entries.
    """
    return qa_pipeline(question=question, context=context)

# Main function
if __name__ == "__main__":
    # Directories for the files
    pdf_dir = "pdf_files"
    html_dir = "html_files"
    csv_dir = "csv_files"

    # Process and combine the context
    print("Extracting and combining context from files...")
    context = preprocess_context(pdf_dir, html_dir, csv_dir)

    # The question-answering pipeline rejects an empty context, so stop here
    # with a clear message instead of failing on the first question.
    if not context.strip():
        raise SystemExit(
            "No text could be extracted from the input directories; "
            "nothing to answer questions from."
        )

    # Start the question-answering loop
    print("\nContext loaded. You can now ask questions.")
    while True:
        try:
            question = input("Enter your question (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt ends the session like typing "exit".
            print("\nGoodbye!")
            break

        if question.lower() == "exit":
            print("Goodbye!")
            break
        if not question:
            # The pipeline raises on an empty question; just prompt again.
            continue

        # Retrieve answer
        answer = get_answer(question, context)
        print("\nAnswer:")
        print(f"Text: {answer['answer']}")
        print(f"Confidence: {answer['score']:.2f}\n")


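#### Example Interaction
# (Illustrative only: the actual answer text and confidence depend on the
# documents that were loaded into the context.)
##### Input:
# Enter your question (or type 'exit' to quit): Какво е RoBERTa?
#   (Bulgarian for "What is RoBERTa?")

##### Output:
# Answer:
# Text: Модел за обработка на текст.
#   (Bulgarian for "A model for text processing.")
# Confidence: 0.95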