In [1]:
def set_jupyter_widescreen():
    """Widen the notebook page by injecting CSS width overrides.

    Displays an HTML style element that sets the width of the
    ``notebook-container``, ``menubar-container`` and
    ``maintoolbar-container`` divs.
    """
    from IPython.display import HTML, display

    # Kept as one literal so the injected markup stays exactly as authored.
    widescreen_css = """
    <style>
        div#notebook-container    {width: 95%; }
        div#menubar-container     {width: 65%; }
        div#maintoolbar-container {width: 99%; }
    </style>
    """
    display(HTML(data=widescreen_css))


# Apply the wider layout as soon as this cell runs.
set_jupyter_widescreen()

In [None]:
# installing packages for text extraction
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install langchain
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install unstructured==0.5.6
# !/Volumes/develop/anaconda3/envs/llm/bin/pip show pdfminer.six
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install --upgrade langchain pdfminer.six
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install pypdf
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install pymupdf
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install faiss-cpu

In [3]:
import openai
import os
import pandas as pd
import numpy as np
import tiktoken
from utils import *
from langchain.document_loaders import PyMuPDFLoader
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import  cosine_similarity
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Display configuration: do not truncate rows or columns when a DataFrame
# is rendered, and disable the chained-assignment check.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

In [4]:
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file, if there is one. `found` is kept
# for any later cell that reads it, but it is no longer used to decide
# whether the key is available: it describes the .env file, not the key.
found = load_dotenv(find_dotenv())

# Check the key itself. This covers both a .env file that does not define
# OPENAI_API_KEY (previously assigned None silently) and a key that is
# already exported in the environment without any .env file (previously
# reported as missing).
api_key = os.getenv('OPENAI_API_KEY')
if api_key:
    openai.api_key = api_key
else:
    print("couldn't find the key")

In [None]:
# from langchain.document_loaders import UnstructuredFileLoader
# loader = UnstructuredFileLoader('./sample_form_150522.pdf')
# documents = loader.load()
# documents_content = '\n'.join(doc.page_content for doc in documents)

### a. Load the PDF and extract its text

In [5]:
# Load the PDF; `data` is a sequence of documents exposing `.page_content`
# (presumably one per PDF page — confirm against the loader's documentation).
loader = PyMuPDFLoader("./sample_form_150522.pdf")
data = loader.load()

# Print the last 25 characters of every loaded document as a quick sanity
# check. Iterating over `data` itself (instead of a hard-coded count) keeps
# this working for a PDF of any length.
for i, page in enumerate(data):
    print(i, '---', page.page_content[-25:])

# Concatenate all documents into a single newline-separated string.
doc_content = '\n'.join(doc.page_content for doc in data)

0 ---  advisory business as of

1 --- to less than $50 billion

2 --- GISTRANT'S ADMINISTRATOR

3 --- n 2.A.(8) of Schedule D.

4 --- egistering with the SEC.

5 --- e Part 1A Instruction 4.

6 --- worth individuals)
0
$ 0

7 --- he investment company or

8 --- in rule 206(4)-1(a)(5))?

9 --- sets included in "Other"

10 --- (a) Interest
(b) Foreign

11 --- ures commission merchant

12 --- sed under Item 5.B.(2). 

13 --- xempt from registration?

14 --- ed.
No Information Filed

15 ---  same physical location?

16 --- AL PARTNERS GP VIII, LLC

17 --- er 
4.
Related Person's 

18 --- Page: 
15
Total Funds: 9

19 --- the following questions:

20 --- lands Monetary Authority

21 --- ment Company Act of
1940

22 --- tor in the private fund:

23 --- ion : 2 Record(s) Filed.

24 --- : 
BARCLAYS CAPITAL INC.

25 --- tity
identifier (if any)

26 --- ANK NATIONAL ASSOCIATION

27 --- t of the) private fund's

28 --- k
Country:
United States

29 --- n About the Private Fund

30 ---  or by a rela

### b. Split the documents into chunks

In [6]:
# Configure the splitter: chunks of up to 1000 units measured by `len`,
# with a 200-unit overlap setting between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
doc_chunks = text_splitter.split_text(doc_content)

# For every chunk, print its first and last 300 characters so the overlap
# between neighbouring chunks can be inspected.
for i, chunk in enumerate(doc_chunks):
    print(f'\nchunk: {i}\n')
    print(chunk[:300])
    print('-' * 50)
    print(chunk[-300:])
    print('*' * 100)


chunk: 0

FORM ADV
UNIFORM APPLICATION FOR INVESTMENT ADVISER REGISTRATION AND REPORT BY EXEMPT REPORTING ADVISERS
Primary Business Name: ONE WILLIAM STREET CAPITAL MANAGEMENT, LP.
CRD Number: 150522
Other-Than-Annual Amendment - All Sections
Rev. 10/2021
4/6/2023 8:04:11 AM
--------------------------------------------------
u are filing an umbrella registration, the
information in Item 1 should be provided for the filing adviser only. General Instruction 5 provides information to assist you with filing an umbrella registration.
A.  Your full legal name (if you are a sole proprietor, your last, first, and middle names):
****************************************************************************************************

chunk: 1

A.  Your full legal name (if you are a sole proprietor, your last, first, and middle names): 
ONE WILLIAM STREET CAPITAL MANAGEMENT, LP.
B.
(1) Name under which you primarily conduct your advisory business, if different from Item 1.A.
ONE WILLIAM STREET CAPIT

### c. Create embeddings and construct a vector database

In [8]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
# Not referenced below: the encoding is resolved from the model name instead.
embedding_encoding = "cl100k_base"
# Price in USD per token (0.0001 USD per 1000 tokens).
cost = 0.0001/1000

# Resolve the tokenizer for the embedding model so each chunk can be measured.
encoding = tiktoken.encoding_for_model(embedding_model)
print(f'encoding name for {embedding_model}: {encoding.name}')
print('--')

# One row per chunk: text, token count, embedding vector, estimated cost.
# NOTE: get_embedding is invoked once per chunk, so re-running this cell
# repeats every call.
df = pd.DataFrame({'text': doc_chunks})
df['tokens'] = df['text'].apply(lambda x: len(encoding.encode(x)))
df['embedding'] = df['text'].apply(lambda x: get_embedding(x, engine=embedding_model))
df['cost'] = cost * df['tokens']

# Sort once and read the token count by column label rather than by
# position, so the report stays correct if the column order changes.
largest_chunk = df.sort_values(by='tokens', ascending=False).iloc[0]
print(f"maximum tokens:(chunk number:{largest_chunk.name}, token numbers: {largest_chunk['tokens']})")

print('--')

print(f'Total embedding cost: ${np.round(df.cost.sum(),4)}')

# Save the chunks together with their embeddings.
df.to_csv("./sample_form_150522_chunks_with_embeddings.csv")

encoding name for text-embedding-ada-002: cl100k_base
--
maximum tokens:(chunk number:324, token numbers: 415)
--
Total embedding cost: $0.0068


In [10]:
# Preview the first rows of the chunk table (text, tokens, embedding, cost).
df.head()

Unnamed: 0,text,tokens,embedding,cost
0,FORM ADV\nUNIFORM APPLICATION FOR INVESTMENT A...,226,"[0.001449032686650753, 0.005998051725327969, -...",2.3e-05
1,A. Your full legal name (if you are a sole pr...,231,"[0.003249892732128501, 0.00674661248922348, -0...",2.3e-05
2,D.\n(1) If you are registered with the SEC as ...,263,"[-0.004217831883579493, 0.00452733738347888, 0...",2.6e-05
3,299 PARK AVENUE\nNumber and Street 2:\n25TH FL...,216,"[-0.019397292286157608, 0.005637296941131353, ...",2.2e-05
4,most recently completed fiscal year.\n(2) Days...,114,"[0.0030712413135915995, 0.009709085337817669, ...",1.1e-05


In [9]:
# Natural-language question passed to `ask` in the next cell.
query = 'how many employees do you have?'

In [None]:
# NOTE(review): `ask` is not defined in this notebook; presumably it comes
# from the wildcard `from utils import *` — confirm. It is given a copy of
# `df`, so the call operates on its own DataFrame object.
ask(query, df.copy(), verbose=True, max_tokens=1250)

In [None]:
# from langchain.vectorstores import FAISS
# from langchain.embeddings.openai import OpenAIEmbeddings
# from dotenv import load_dotenv, find_dotenv
# found = load_dotenv(find_dotenv())
# if found:
#     os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
# else:
#     print("couldn't find the key")


# def get_doc_search(texts, embedding_model=None):
#     if embedding_model is None:
#         embeddings = OpenAIEmbeddings()
#     else:
#         embeddings = OpenAIEmbeddings(model=embedding_model, deployment=embedding_model)
#     return FAISS.from_texts(texts, embeddings)

# query = 'how many employees do you have?'
# documents = doc_search.similarity_search(query)

# for d in documents:
#     if 'employee' in d.page_content:
#         print(d.page_content)
        
# from langchain.llms import OpenAI
# from langchain.chains.question_answering import load_qa_chain
# llm = OpenAI(max_tokens=250,
#              temperature=0,
#              top_p=1,
#              frequency_penalty=0,
#              presence_penalty=0)
# chain = load_qa_chain(llm, chain_type = "map_rerank",  
#                       return_intermediate_steps=True)

# from langchain.callbacks import get_openai_callback

# with get_openai_callback() as cb:
#     results = chain({"input_documents":documents, 
#                     "question": query},
#                     return_only_outputs=False)
#     print(f"Total Tokens: {cb.total_tokens}")
#     print(f"Prompt Tokens: {cb.prompt_tokens}")
#     print(f"Completion Tokens: {cb.completion_tokens}")
#     print(f"Total Cost (USD): ${cb.total_cost}")
    
# results = chain({
#                     "input_documents":documents, 
#                     "question": query
#                 }, 
#                 return_only_outputs=True)

# results['intermediate_steps'][0]['answer'].strip()