In [9]:
import PyPDF2
import os
import nltk
from nltk.tokenize import sent_tokenize
from opensearchpy import OpenSearch


In [10]:
EMBEDDING_MODEL_NAME="BAAI/bge-small-en"

In [11]:
# extract text from data sources

def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            pdf_text += page.extract_text()
    return pdf_text

# Extract text from your PDF files
pdf_texts = []
source_files = os.listdir('data')
for pdf_file in source_files:
    pdf_texts.append(extract_text_from_pdf(f"data/{pdf_file}"))


In [12]:
# preprocess

# Download necessary NLTK data
nltk.download('punkt')

def preprocess_text(text):
    # Tokenize text into sentences
    sentences = sent_tokenize(text)
    return sentences

# Preprocess each extracted PDF text
preprocessed_texts = [preprocess_text(text) for text in pdf_texts]

[nltk_data] Downloading package punkt to /home/citi-ai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# indexing
from dotenv import load_dotenv
load_dotenv()
password = os.getenv("OPENSEARCH_INITIAL_ADMIN_PASSWORD")

# Initialize OpenSearch client
client = OpenSearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
    http_compress=True,
    http_auth=('admin', password)
)

# Create an index
index_name = 'documents'
client.indices.create(index=index_name, ignore=400)

# Index the preprocessed sentences
doc_id = 0
for text in preprocessed_texts:
    for sentence in text:
        document = {"content": sentence}
        client.index(index=index_name, id=doc_id, body=document)
        doc_id += 1


In [14]:
import openai

# Define the retrieval function
def retrieve_documents(query, index_name='documents', top_k=3):
    response = client.search(
        index=index_name,
        body={
            "query": {
                "multi_match": {
                    "query": query,
                    "fields": ["content"]
                }
            },
            "size": top_k
        }
    )
    return [hit['_source'] for hit in response['hits']['hits']]

# Define the generation function
openai.api_key = os.getenv("OPENAI_KEY")


# Function to generate a response using the OpenAI API
def generate_response(prompt, documents):
    context = " ".join([doc['content'] for doc in documents])
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"{context}\n\n{prompt}"}
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Use the appropriate model for your use case
        messages=messages,
        max_tokens=150
    )
    return response.choices[0].message['content'].strip()

# Example usage
query = "Tell me about Priority for client orders: order handling and recording."
retrieved_docs = [{"content": "Document content example 1."}, {"content": "Document content example 2."}]
print(generate_response(query, retrieved_docs))



APIRemovedInV1: 

You tried to access openai.Completion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
