# In [None]:
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import openai
import pandas as pd
import pytesseract
from PIL import Image
from PyPDF2 import PdfReader

# Set your OpenAI API key.
# Prefer the OPENAI_API_KEY environment variable so the real secret never has
# to be committed; the literal is only a placeholder used when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'key')

# Function to extract text from PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Pages are joined in document order with no separator between them.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A single string holding the extracted text; empty when no page
        yields any text.
    """
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        # A page with no extractable text may give a falsy result (e.g. None);
        # substituting "" keeps the join from raising TypeError.
        return "".join(page.extract_text() or "" for page in reader.pages)

# Function to extract text from image using pytesseract
def extract_text_from_image(image_path):
    """Run OCR on the image at *image_path* and return the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager releases the underlying file handle once OCR is done,
    # instead of leaving it open until garbage collection.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

# Function to query the OpenAI chat model, retrying on rate-limit errors.
# NOTE: despite its name this returns the model's text reply, not embedding vectors.
def generate_embeddings_with_retry(text: str, *, max_retries=None, wait_seconds=60) -> str:
    """Send *text* to the chat-completion endpoint and return the reply text.

    The text is sent as a single ``system`` message. Whenever the API raises
    ``openai.error.RateLimitError`` the call is retried after a pause.

    Args:
        text: Prompt to send to the model.
        max_retries: Maximum number of retries after a rate-limit error.
            ``None`` (the default) retries indefinitely.
        wait_seconds: Seconds to sleep before each retry.

    Returns:
        The content of the first choice, stripped of surrounding whitespace.

    Raises:
        openai.error.RateLimitError: When *max_retries* is set and exhausted.
    """
    retries = 0
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo-0613",
                messages=[
                    {"role": "system", "content": text}
                ],
                temperature=0.7,
                max_tokens=150,
            )
        except openai.error.RateLimitError:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise
            print(f"Rate limit reached. Waiting for {wait_seconds} seconds before retrying...")
            time.sleep(wait_seconds)
        else:
            return response.choices[0].message['content'].strip()

# Folder holding the documents (PDFs and images) to be processed
documents_folder = r"C:/Users/MOHIT MEHTA/MF"

# Questions that are put to every document
user_queries = [
    "What are the points to be followed while using Azure cloud resource.",
    "Detail me the steps to login to AWS Security Account or switch to another account",
]

# Empty table that collects one row per (query, document) pair
results_df = pd.DataFrame(
    columns=[
        "Query",
        "Answer",
        "Source",
        "Page Number",
    ],
)

# Guards appends to results_df: process_document is run on several worker
# threads, and "read len(), then write that row" is not atomic, so unguarded
# threads could pick the same index and overwrite each other's rows.
_results_lock = threading.Lock()


def process_document(document_file):
    """Answer every user query against one document and record the results.

    The file is looked up inside ``documents_folder``. PDFs are read with
    ``extract_text_from_pdf`` and ``.jpg``/``.jpeg``/``.png`` images with
    ``extract_text_from_image``; any other file is ignored. For each entry in
    ``user_queries`` one row is appended to the module-level ``results_df``.

    Args:
        document_file: File name (not full path) inside ``documents_folder``.

    Returns:
        None. Results are stored in ``results_df`` as a side effect.
    """
    document_path = os.path.join(documents_folder, document_file)

    # Match extensions case-insensitively so e.g. "SCAN.PDF" is not skipped.
    lowered_name = document_file.lower()
    if lowered_name.endswith(".pdf"):
        text_content = extract_text_from_pdf(document_path)
    elif lowered_name.endswith((".jpg", ".jpeg", ".png")):
        text_content = extract_text_from_image(document_path)
    else:
        return

    # Iterate through user queries
    for query in user_queries:
        # Combine user query and document text into a single prompt
        combined_text = f"User Query: {query}\nDocument Text: {text_content}"

        # Ask the model (retries on rate limiting)
        response_text = generate_embeddings_with_retry(combined_text)

        # Keep at most 1000 words; this also collapses runs of whitespace
        answer = " ".join(response_text.split()[:1000])

        new_row = {
            "Query": query,
            "Answer": answer,
            "Source": document_file,
            "Page Number": 1,  # Assuming the answer is on the first page for simplicity
        }
        # Only the DataFrame mutation is serialized; the slow model call above
        # still runs in parallel across threads.
        with _results_lock:
            results_df.loc[len(results_df)] = new_row

# Process documents concurrently.
# Every future's result is retrieved so that a failure in one document is
# reported; with a bare executor.map() whose results are never consumed, worker
# exceptions are discarded without any trace.
with ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(process_document, file_name): file_name
        for file_name in os.listdir(documents_folder)
    }
    for future, file_name in futures.items():
        try:
            future.result()
        except Exception as exc:  # top-level boundary: report and keep going
            print(f"Failed to process {file_name!r}: {exc}")

# Save results to Excel, next to the processed documents
results_df.to_excel(os.path.join(documents_folder, "docqa_results.xlsx"), index=False)
