In [25]:
import PyPDF2
import os
import nltk
from nltk.tokenize import sent_tokenize
from opensearchpy import OpenSearch, helpers
from sentence_transformers import SentenceTransformer


In [26]:

# Load the model
model = SentenceTransformer('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')


In [27]:
# extract text from data sources

def extract_text_from_pdf(pdf_path):
    pdf_text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num in range(len(reader.pages)):
            page = reader.pages[page_num]
            pdf_text += page.extract_text()
    return pdf_text

# Extract text from your PDF files
pdf_texts = []
source_files = os.listdir('data')
for pdf_file in source_files:
    pdf_texts.append(extract_text_from_pdf(f"data/{pdf_file}"))


In [28]:
# preprocess

# Download necessary NLTK data
nltk.download('punkt')

def preprocess_text(text):
    # Tokenize text into sentences
    sentences = sent_tokenize(text)
    # can add more cleaning here if necessary
    return sentences

# Preprocess each extracted PDF text
preprocessed_texts = [preprocess_text(text) for text in pdf_texts]

[nltk_data] Downloading package punkt to /home/citi-ai/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
preprocessed_texts

[['Code of Conduct for Persons \nLicensed by or Registered with the \nSecurities and Futures Commission\nCode of Conduct for Persons Licensed by or\nRegistered with the Securities and Futures \nCommission \n © Securities and Futures Commission 2024 \n1994 first edition  \n1996 second edition  \n1998 third edition  \n1999 fourth edition  \n2001 fifth edition  \n2003 sixth edition  \n2006 seventh edition  \n2010 eighth edition \nJune 2011 ninth edition  \nSeptember 2011 tenth edition \nOctober 2011 eleventh edition \nJune 2012 twelfth edition \nOctober 2013 thirteenth edition January 2014 Fourteenth edition \nMarch 2014 Fifteenth edition \nDecember 2015 Sixteenth edition March 2016 Seventeenth edition \nJune 2017 Eighteenth edition \nJuly 2018 Nineteenth edition August 2018 Twentieth edition\n \nNovember 2018 Twenty-first edition June 2019 Twenty-second edition July 2019 Twenty-third edition \nSeptember 2019 Twenty-fourth edition \nJune 2020 Twenty-fifth edition September 2020 Twenty-six

In [34]:
import openai

def generate_query_embedding(query):
    return model.encode(query, convert_to_tensor=False)


# Define the retrieval function
def retrieve_documents(query, index_name='documents', top_k=3):
    query_embedding = generate_query_embedding(query)
    response = client.search(
        index=index_name,
        body={
            "size": top_k,
            "query": {
                "knn": {
                    "embedding": {
                        "vector": query_embedding.tolist(),
                        "k": top_k
                    }
                }
            }
        }
    )
    return [hit['_source'] for hit in response['hits']['hits']]

# Define the generation function
openai.api_key = os.getenv("OPENAI_KEY")


def generate_response(prompt, documents):
    context = " ".join([doc['content'] for doc in documents])
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"{context}\n\n{prompt}"}
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=150
    )
    return response.choices[0].message['content'].strip()

def rag_system(query):
    retrieved_docs = retrieve_documents(query)
    response = generate_response(query, retrieved_docs)
    return response, retrieved_docs

# Example usage

# Example usage
query = "Tell me about Priority for client orders: order handling and recording."
#query = "Tell me about the use of special purpose vehicles."
print("response", rag_system(query)[0])
print("docs", rag_system(query)[1])



response Priority for client orders: order handling and recording is an essential aspect of financial services to ensure that client orders are handled promptly, fairly, and in compliance with regulations. 

This priority emphasizes that licensed or registered individuals should handle client orders fairly and in the sequence they are received. It is crucial to prioritize the execution of client orders accurately and promptly. Additionally, licensed professionals must take into account the client’s investment objectives, investment strategy, and financial position when providing recommendations.

Recording details of client order instructions and the time of receipt is crucial for maintaining transparency and ensuring that orders are handled correctly. In situations where the telephone recording system cannot be accessed, written records by hand should be utilized to document client orders accurately.

By following these guidelines, financial professionals can ensure
docs [{'content': 

In [32]:
retrieved_docs

[{'content': 'Use of Special Purpose Vehicles  \n \n7.5 The scheme may hold real estate through special purpose vehicles only  if: \n \n(a) the special purpose vehicles are legally and beneficially owned by the scheme;  \n (aa) the scheme has majority ownership and control of the special purpose \nvehicles;  \n \n Note:  The Commission expects the special purpose vehicles to be wholly owned by the scheme, except in special and limited circumstances, \nsuch as the need to comply with regulatory requirements in an \noverseas jurisdiction where such requirements are relevant to the scheme and/or its portfolio.',
  'embedding': [0.04224720224738121,
   0.3259965181350708,
   0.3852410614490509,
   0.326850026845932,
   -0.006995340343564749,
   -0.32421261072158813,
   0.4768611788749695,
   -0.6281512975692749,
   -0.5583289265632629,
   0.6207882165908813,
   -0.9162147641181946,
   0.6679205298423767,
   -0.7066274881362915,
   -0.09251774847507477,
   -0.119582898914814,
   0.142203986