In [None]:
import os
import pandas as pd
from transformers import pipeline
from bs4 import BeautifulSoup
import pdfplumber

In [None]:
# Build the question-answering pipeline once at notebook level so that
# answer_question() can reuse the same object for every query.
# NOTE(review): the checkpoint name suggests an English SQuAD 2.0 model, while
# the example question further down is written in Bulgarian — confirm that this
# model is appropriate for the target language.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

In [None]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages in the PDF at ``pdf_path``.

    Each page that yields text contributes that text followed by a newline.
    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are skipped entirely.
    """
    with pdfplumber.open(pdf_path) as document:
        # The join consumes the generator inside the ``with`` block, so every
        # page is read while the PDF is still open.
        raw_pages = (page.extract_text() for page in document.pages)
        return "".join(chunk + "\n" for chunk in raw_pages if chunk)

In [None]:
# Function to extract text from an HTML file
def extract_text_from_html(html_path):
    """Return the text content of the HTML file at ``html_path``.

    The file is opened as UTF-8 text, parsed with BeautifulSoup's
    ``"html.parser"`` backend, and the result of ``get_text()`` is returned.
    """
    with open(html_path, mode="r", encoding="utf-8") as handle:
        document = BeautifulSoup(handle, "html.parser")
        plain_text = document.get_text()
    return plain_text

In [None]:
# Function to extract question-answer pairs from a CSV file
def extract_context_from_csv(csv_path):
    """Build a context string from a CSV file of question/answer pairs.

    The CSV must contain ``question`` and ``answer`` columns. Each row becomes
    one line of the form ``"Q: <question> A: <answer>"`` followed by a newline.

    Cells are read verbatim as text: numbers keep their original spelling
    (``42`` stays ``42`` rather than becoming ``42.0``), empty cells stay empty
    strings, and literal values such as ``NA`` are not turned into ``nan``.

    Args:
        csv_path: Path to the CSV file.

    Returns:
        The concatenated ``Q: ... A: ...`` lines, or ``""`` if there are no rows.

    Raises:
        KeyError: If the ``question`` or ``answer`` column is missing.
    """
    # dtype=str disables numeric type inference and keep_default_na=False
    # disables pandas' missing-value markers, so the text that reaches the QA
    # model is exactly what the file contains.
    qa_table = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    return "".join(
        f"Q: {question} A: {answer}\n"
        for question, answer in zip(qa_table["question"], qa_table["answer"])
    )

In [None]:
# Function to handle context extraction based on file type
def extract_context_from_files(file_paths):
    """Extract and concatenate context text from a list of files.

    The extractor is chosen from the lower-cased file extension:
    ``.pdf`` -> ``extract_text_from_pdf``, ``.html``/``.htm`` ->
    ``extract_text_from_html``, ``.csv`` -> ``extract_context_from_csv``.
    Each extracted text is followed by a newline. Files with any other
    extension are reported with a printed message and contribute nothing.
    """
    pieces = []
    for path in file_paths:
        suffix = os.path.splitext(path)[1].lower()
        if suffix == ".pdf":
            extracted = extract_text_from_pdf(path)
        elif suffix in (".html", ".htm"):
            extracted = extract_text_from_html(path)
        elif suffix == ".csv":
            extracted = extract_context_from_csv(path)
        else:
            print(f"Unsupported file type: {path}")
            continue
        pieces.append(extracted + "\n")
    return "".join(pieces)

In [None]:
# Function to answer a question given a context
def answer_question(question, context):
    """Answer ``question`` from ``context`` using the notebook-level ``qa_pipeline``.

    Returns whatever ``qa_pipeline(question=..., context=...)`` returns. If
    ``context`` is empty or whitespace-only, the pipeline is not called and
    the string ``"No context available for answering."`` is returned instead.
    """
    has_usable_context = bool(context.strip())
    if has_usable_context:
        return qa_pipeline(question=question, context=context)
    return "No context available for answering."

In [None]:
# Input files to use as context.
# Defined at the top level of the cell (no ``if __name__ == "__main__":``
# guard): a guard's body cannot continue into other notebook cells, and the
# cells that follow read ``file_paths`` as an ordinary notebook variable.
file_paths = [
    "example1.pdf",  # Replace with your PDF files
    "example2.html", # Replace with your HTML files
    "questions_answers.csv" # Replace with your CSV file
]

In [None]:
# Extract combined context from files
# (statement kept at column 0: an indented line after a column-0 comment is an
# IndentationError when the cell is executed on its own)
combined_context = extract_context_from_files(file_paths)

In [None]:
# Ask a question
question = "Какво е RoBERTa?"  # Example Cyrillic question
# ``result`` is the pipeline output, or a plain message string when the
# combined context is empty or whitespace-only.
result = answer_question(question, combined_context)

In [None]:
# Display the answer
print(f"Question: {question}")
if isinstance(result, dict):
    # A dict result is read for its "answer" and "score" entries.
    print(f"Answer: {result['answer']}")
    print(f"Confidence Score: {result['score']:.2f}")
else:
    # answer_question() returns a plain message string when no context exists.
    print(result)