In [2]:
import os
from typing import Any, Dict, Optional

import httpx
from dotenv import find_dotenv, load_dotenv


class HttpClient:
    """Small async HTTP helper tied to one base URL and one set of headers.

    Wraps a single ``httpx.AsyncClient``.  Usable as an async context
    manager: leaving the ``async with`` block closes the wrapped client.
    """

    def __init__(self, base_url: str, headers: Dict[str, str]) -> None:
        self.client = httpx.AsyncClient()
        self.headers = headers
        self.base_url = base_url

    async def __aenter__(self):
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.client.aclose()

    async def make_request(self, method: str, endpoint: str, data: Optional[Dict[str, Any]] = None) -> httpx.Response:
        """Send ``method`` to ``base_url`` + ``endpoint`` and return the response.

        ``data`` is passed to httpx as the ``json`` argument and the stored
        headers are attached to the request.  ``raise_for_status()`` is
        called on the response before it is returned.
        """
        target_url = "{}{}".format(self.base_url, endpoint)
        reply = await self.client.request(
            method,
            target_url,
            headers=self.headers,
            json=data,
        )
        reply.raise_for_status()
        return reply


class HttpHandler:
    """Decodes responses obtained from an ``HttpClient``-like object as JSON."""

    def __init__(self, http_client: "HttpClient") -> None:
        # Any object exposing an awaitable ``make_request(method, endpoint, data)`` works here.
        self.http_client = http_client

    async def get_json_response(self, method: str, endpoint: str, data: Optional[Dict[str, Any]] = None) -> Any:
        """Perform a request and return its decoded JSON body.

        Args:
            method: HTTP verb forwarded to ``make_request``.
            endpoint: Path forwarded to ``make_request``.
            data: Optional payload forwarded to ``make_request``.

        Returns:
            The result of ``response.json()``, or ``{}`` when the response
            text is empty.

        Raises:
            Whatever ``make_request`` or ``response.json()`` raise; errors
            are deliberately not intercepted so callers see the original
            exception and traceback.
        """
        response = await self.http_client.make_request(method, endpoint, data)
        # An empty body cannot be parsed as JSON, so it is reported as an empty dict.
        return response.json() if response.text else {}

from typing import Any, Dict, List


# REST paths of the Weaviate endpoints used by WeaviateClient.
SCHEMA_ENDPOINT = "/v1/schema"
GRAPHQL_ENDPOINT = "/v1/graphql"
OBJECTS_ENDPOINT = "/v1/objects"
BATCH_OBJECTS_ENDPOINT = "/v1/batch/objects"


class WeaviateClient:
    """Async helper for the Weaviate REST and GraphQL endpoints.

    All traffic goes through ``http_handler.get_json_response(method,
    endpoint, data)``; errors raised there propagate to the caller.
    """

    def __init__(self, http_handler: "HttpHandler") -> None:
        self.http_handler = http_handler

    async def get_schema(self) -> Dict[str, Any]:
        """Return the decoded reply of ``GET /v1/schema``."""
        return await self.http_handler.get_json_response("GET", SCHEMA_ENDPOINT)

    async def create_class(self, class_info: Dict[str, Any]) -> None:
        """POST ``class_info`` to the schema endpoint."""
        await self.http_handler.get_json_response("POST", SCHEMA_ENDPOINT, class_info)

    async def delete_class(self, class_name: str) -> None:
        """Send ``DELETE /v1/schema/<class_name>``."""
        endpoint = f"{SCHEMA_ENDPOINT}/{class_name}"
        await self.http_handler.get_json_response("DELETE", endpoint)

    async def create_object(self, data: Dict[str, Any], class_name: str) -> str:
        """Create one object with properties ``data`` and return the reply's ``id``."""
        payload = {"class": class_name, "properties": data}
        response = await self.http_handler.get_json_response("POST", OBJECTS_ENDPOINT, payload)
        return response.get("id")

    async def batch_create_objects(self, objects: List[Dict[str, Any]], class_name: str) -> bool:
        """Create several objects of ``class_name`` in one batch request.

        Args:
            objects: One properties dict per object to create.
            class_name: Class assigned to every object in the batch.

        Returns:
            ``True`` only when the reply is a non-empty list and every entry
            reports ``result.status == "SUCCESS"``; ``False`` otherwise
            (including an empty or non-list reply).
        """
        transformed_objects = [{"class": class_name, "properties": obj} for obj in objects]
        batch_data = {"objects": transformed_objects}
        response = await self.http_handler.get_json_response("POST", BATCH_OBJECTS_ENDPOINT, batch_data)
        # Inspect every entry: checking only the first one would hide
        # failures further down the batch, and indexing an empty reply raises.
        if not isinstance(response, list) or not response:
            return False
        return all(
            isinstance(item, dict) and (item.get("result") or {}).get("status") == "SUCCESS"
            for item in response
        )

    async def get_object(self, uuid: str, class_name: str) -> Dict[str, Any]:
        """Return the decoded reply of ``GET /v1/objects/<class_name>/<uuid>``."""
        endpoint = f"{OBJECTS_ENDPOINT}/{class_name}/{uuid}"
        return await self.http_handler.get_json_response("GET", endpoint)

    async def update_object(self, uuid: str, data: Dict[str, Any], class_name: str) -> bool:
        """PATCH the object at ``/v1/objects/<class_name>/<uuid>``; returns ``True`` if no error was raised.

        NOTE(review): ``data`` is sent as the request body unchanged, whereas
        ``create_object`` wraps its properties in a ``class``/``properties``
        payload -- confirm callers pass the shape the server expects.
        """
        endpoint = f"{OBJECTS_ENDPOINT}/{class_name}/{uuid}"
        await self.http_handler.get_json_response("PATCH", endpoint, data)
        return True

    async def delete_object(self, uuid: str, class_name: str) -> bool:
        """DELETE the object at ``/v1/objects/<class_name>/<uuid>``; returns ``True`` if no error was raised."""
        endpoint = f"{OBJECTS_ENDPOINT}/{class_name}/{uuid}"
        await self.http_handler.get_json_response("DELETE", endpoint)
        return True

    async def run_query(self, graphql_query: str) -> Dict[str, Any]:
        """POST ``graphql_query`` to the GraphQL endpoint and return the decoded reply."""
        return await self.http_handler.get_json_response("POST", GRAPHQL_ENDPOINT, {"query": graphql_query})


In [3]:
def word_wrap(string, n_chars=72):
    """Break ``string`` into lines by inserting newlines.

    While at least ``n_chars`` characters remain, the next ``n_chars``
    characters are examined: the line is cut at the last space inside that
    window (the space itself is dropped).  If the window contains no space,
    the line is cut after exactly ``n_chars`` characters and no character is
    discarded.  Whatever is left (fewer than ``n_chars`` characters) becomes
    the final line.

    Args:
        string: Text to wrap.
        n_chars: Size of the window examined for each line; must be >= 1.

    Returns:
        The wrapped text, lines joined with ``"\\n"``.

    Raises:
        ValueError: If ``n_chars`` is smaller than 1.
    """
    if n_chars < 1:
        raise ValueError("n_chars must be a positive integer")

    lines = []
    pos = 0
    total = len(string)
    # Iterative on purpose: one recursion level per output line would hit
    # Python's recursion limit on long texts.
    while total - pos >= n_chars:
        window = string[pos:pos + n_chars]
        cut = window.rfind(' ')
        if cut == -1:
            # No space to break on: hard-break and keep every character.
            lines.append(window)
            pos += n_chars
        else:
            lines.append(window[:cut])
            pos += cut + 1  # skip the space that became the line break
    lines.append(string[pos:])
    return '\n'.join(lines)

In [4]:
from pypdf import PdfReader

# Open the annual report PDF
reader = PdfReader("./try/microsoft_annual_report_2022.pdf")

# Stripped text of every page, keeping only the pages that yielded any text
pdf_texts = [
    page_text
    for page_text in (page.extract_text().strip() for page in reader.pages)
    if page_text
]

FileNotFoundError: [Errno 2] No such file or directory: './try/microsoft_annual_report_2022.pdf'

In [None]:
# Sanity check of the extraction: print the first non-empty page, wrapped by word_wrap
print(word_wrap(pdf_texts[0]))

1 Dear shareholders, colleagues, customers, and partners:  
We are
living through a period of historic economic, societal, and
geopolitical change. The world in 2022 looks nothing like 
the world in
2019. As I write this, inflation is at a 40 -year high, supply chains
are stretched, and the war in Ukraine is 
ongoing. At the same time, we
are entering a technological era with the potential to power awesome
advancements 
across every sector of our economy and society. As the
world’s largest software company, this places us at a historic

intersection of opportunity and responsibility to the world around us.
 
Our mission to empower every person and every organization on the
planet to achieve more has never been more 
urgent or more necessary.
For all the uncertainty in the world, one thing is clear: People and
organizations in every 
industry are increasingly looking to digital
technology to overcome today’s challenges and emerge stronger. And no

company is better positioned to help th

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

In [None]:
# First pass: character-based splitting (chunk_size=1000, no overlap), trying
# the separators in the listed order.
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)

# All pages are joined with blank lines and split as one document
joined_pages = '\n\n'.join(pdf_texts)
character_split_texts = character_splitter.split_text(joined_pages)

In [None]:
# Second pass: re-split every character chunk by token count
# (tokens_per_chunk=256, no overlap) and collect the pieces in one flat list.
token_splitter = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,
    chunk_overlap=0,
)

token_split_texts = []
for text in character_split_texts:
    token_split_texts.extend(token_splitter.split_text(text))

  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [None]:
# Print the first token-level chunk (wrapped) followed by the total number of chunks
print(word_wrap(token_split_texts[0]))
print(f"\nTotal chunks: {len(token_split_texts)}")

1 dear shareholders, colleagues, customers, and partners : we are
living through a period of historic economic, societal, and
geopolitical change. the world in 2022 looks nothing like the world in
2019. as i write this, inflation is at a 40 - year high, supply chains
are stretched, and the war in ukraine is ongoing. at the same time, we
are entering a technological era with the potential to power awesome
advancements across every sector of our economy and society. as the
world ’ s largest software company, this places us at a historic
intersection of opportunity and responsibility to the world around us.
our mission to empower every person and every organization on the
planet to achieve more has never been more urgent or more necessary.
for all the uncertainty in the world, one thing is clear : people and
organizations in every industry are increasingly looking to digital
technology to overcome today ’ s challenges and emerge stronger. and no

Total chunks: 349


In [None]:
# Display the first chunk unwrapped, exactly as it is stored in the list
token_split_texts[0]

'1 dear shareholders, colleagues, customers, and partners : we are living through a period of historic economic, societal, and geopolitical change. the world in 2022 looks nothing like the world in 2019. as i write this, inflation is at a 40 - year high, supply chains are stretched, and the war in ukraine is ongoing. at the same time, we are entering a technological era with the potential to power awesome advancements across every sector of our economy and society. as the world ’ s largest software company, this places us at a historic intersection of opportunity and responsibility to the world around us. our mission to empower every person and every organization on the planet to achieve more has never been more urgent or more necessary. for all the uncertainty in the world, one thing is clear : people and organizations in every industry are increasingly looking to digital technology to overcome today ’ s challenges and emerge stronger. and no'

In [None]:
client = WeaviateClient(HttpHandler(HttpClient(
    "http://localhost:8080", 
    {"X-OpenAI-Api-Key": "sk-proj-opIa80M26lL0hI6djNKvT3BlbkFJ7gYp5KAy2N7ZagJ4euUw"})
    )
)

schema = {
    "classes": [
        {
            "class": "MicrosoftAnnualReport2022Document",
            "vectorizer": "text2vec-openai",
            "description": "A document class to store documents used for knowledge base",
            "properties": [
                {
                    "name": "title",
                    "dataType": [
                        "string"
                    ],
                    "description": "The title of the document"
                },
                {
                    "name": "content",
                    "dataType": [
                        "text"
                    ],
                    "description": "The entire content of the document"
                }
            ]
        }
    ]
}

from pprint import pprint

for class_info in schema["classes"]:
    await client.create_class(class_info)

In [None]:
for content in token_split_texts:
    await client.create_object(
        class_name="MicrosoftAnnualReport2022Document",
        data={
            "title": 'microsoft_annual_report_2022',
            "content": content
        }
    )

In [None]:
async def get_class_objects(class_name):
    """Run a GraphQL ``Get`` query for ``class_name`` and return the reply.

    The query requests the ``title`` and ``content`` properties together
    with the ``_additional`` fields spelled out below.  It is executed via
    the module-level ``client`` and the decoded response is returned as-is.
    """
    graphql_query = f"""
    {{
      Get {{
        {class_name} {{
          _additional {{
            id
            creationTimeUnix
            lastUpdateTimeUnix
            distance
            certainty
            score
            vector
          }}
          title
          content
        }}
      }}
    }}
    """
    return await client.run_query(graphql_query)

In [None]:
class_name = "MicrosoftAnnualReport2022Document"

# Get and print the objects in the specified class
class_objects = await get_class_objects(class_name)
print(f"Objects in class '{class_name}':")

Objects in class 'MicrosoftAnnualReport2022Document':


In [None]:
# Display the raw GraphQL response.
# NOTE(review): the query requests `vector` for every object, so this output
# is very long -- consider displaying only a slice of it.
class_objects

{'data': {'Get': {'MicrosoftAnnualReport2022Document': [{'_additional': {'certainty': None,
      'creationTimeUnix': '1718252868139',
      'distance': None,
      'id': '0162b90e-b6bb-4be2-9cab-b0de2cf3df7a',
      'lastUpdateTimeUnix': '1718252868139',
      'score': '0',
      'vector': [-0.012650611,
       -0.034276497,
       -0.016099567,
       -0.013609394,
       0.005629522,
       -0.0002983713,
       -0.03744581,
       -0.012524105,
       -0.021639204,
       -0.020853532,
       0.03837796,
       0.027804712,
       -0.019441992,
       0.010553272,
       -0.017737487,
       0.0011402195,
       0.014621443,
       -0.0003322866,
       0.00843596,
       -0.046048224,
       -0.0100072995,
       0.005969091,
       -0.047805995,
       0.00033249467,
       -0.028310735,
       0.010473374,
       0.019721637,
       -0.027778078,
       0.036593556,
       0.0088154785,
       -0.006465128,
       -0.036833253,
       0.004321182,
       -0.0039583095,
       -0

In [None]:
import weaviate
import weaviate.classes as wvc
from weaviate.classes.query import MetadataQuery

# Read the OpenAI key from the environment (optionally populated from a
# local .env file) rather than embedding it in the notebook; a KeyError
# here means OPENAI_API_KEY is not set.
# NOTE(review): a literal key was previously committed in this cell -- it
# must be revoked and replaced.
load_dotenv(find_dotenv())

# Connect to the same local instance the REST client above talks to.
# "localhost" is used because 0.0.0.0 is a bind address, not a portable
# destination, and the port is given as an integer.
# NOTE(review): call clientv4.close() once the notebook is done with it.
clientv4 = weaviate.connect_to_local(
    host="localhost",
    port=8080,
    headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]},
)

In [None]:
query = "What is this document about?"

# near_text search over the report collection: five results, with distance metadata
reviews = clientv4.collections.get("MicrosoftAnnualReport2022Document")
response = reviews.query.near_text(
    query=query,
    limit=5,
    return_metadata=MetadataQuery(distance=True),
)

# Keep the chunk texts for the RAG step below
retrieved_documents = [o.properties['content'] for o in response.objects]

# Show each hit (wrapped), its distance, then a blank separator line
for o in response.objects:
    print(word_wrap(o.properties['content']), o.metadata.distance, "", sep="\n")

when the world around us does well. that ’ s what i believe will lead
to widespread human progress and ultimately improve the lives of
everyone. there is no more powerful input than digital technology to
drive the world ’ s economic output. this is the core thesis for our
being as a company, but it ’ s not enough. as we drive global economic
growth, we must also commit to creating a more inclusive, equitable,
sustainable, and trusted future. support inclusive economic growth we
must ensure the growth we drive reaches every person, organization,
community, and country. this starts with increasing access to digital
skills. this year alone, more than 23 million people accessed digital
skills training as part of our global skills initiative.
0.22096699476242065

1 dear shareholders, colleagues, customers, and partners : we are
living through a period of historic economic, societal, and
geopolitical change. the world in 2022 looks nothing like the world in
2019. as i write this, inflation i

In [5]:
import os
import openai
from openai import OpenAI

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

# Take the key from the environment instead of embedding it in the notebook;
# a KeyError here means OPENAI_API_KEY is not set.
# NOTE(review): a literal key was previously committed on this line -- it
# must be revoked and replaced.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Pass the key explicitly so the client uses exactly the key configured above.
openai_client = OpenAI(api_key=openai.api_key)

In [None]:
def rag(query, retrieved_documents, model="gpt-3.5-turbo", llm_client=None):
    """Answer ``query`` with a chat model, using only the retrieved text.

    Args:
        query: The user's question.
        retrieved_documents: Strings to present to the model; they are
            joined with blank lines into one "Information" section.
        model: Name of the chat model to call.
        llm_client: Object exposing ``chat.completions.create(model=...,
            messages=...)``.  Defaults to the module-level ``openai_client``.

    Returns:
        The message content of the first choice in the completion.
    """
    if llm_client is None:
        llm_client = openai_client

    information = "\n\n".join(retrieved_documents)

    messages = [
        {
            "role": "system",
            # Adjacent literals are concatenated, so the separating space
            # between the two sentences has to be written explicitly.
            "content": "You are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report. "
            "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
        },
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"}
    ]

    response = llm_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    return response.choices[0].message.content

In [None]:
# Ask the model the same question, grounded in the chunks retrieved above
output = rag(query=query, retrieved_documents=retrieved_documents)

# Print the answer wrapped by word_wrap
print(word_wrap(output))

The document is the annual report of Microsoft, a technology company
with a mission to empower every person and every organization on the
planet to achieve more. The report discusses the company's commitment
to creating a more inclusive, equitable, sustainable, and trusted
future through digital technology. It mentions that Microsoft is
focused on driving global economic growth and ensuring that the growth
reaches every person, organization, community, and country.
Additionally, the report highlights that more than 23 million people
accessed digital skills training as part of Microsoft's global skills
initiative in the recent year.


In [7]:
# Stand-alone chat completion (no retrieved context): a system prompt plus a greeting
chat_messages = [
    {"role": "system", "content": "You are an assistance to help students optimise their schedules, facilitate collaboration, and provide tailored support to enhance their learning experience and job readiness."},
    {"role": "user", "content": 'hello'},
]
completion = openai_client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=chat_messages,
)

In [16]:
# Display the text of the first choice returned by the completion above
completion.choices[0].message.content

'Hello! How can I assist you today?'