In [1]:
def set_jupyter_widescreen():
    """Widen the notebook page by rendering a CSS override as cell output.

    The stylesheet sets the width of the `notebook-container`,
    `menubar-container` and `maintoolbar-container` divs to 95%, 65% and
    99% respectively. Nothing is returned; the only effect is the
    displayed HTML.
    """
    from IPython.display import HTML, display

    widescreen_css = """
    <style>
        div#notebook-container    {width: 95%; }
        div#menubar-container     {width: 65%; }
        div#maintoolbar-container {width: 99%; }
    </style>
    """
    display(HTML(data=widescreen_css))


# Apply the wider layout as soon as this cell runs.
set_jupyter_widescreen()

In [None]:
# Install the packages needed for text extraction (uncomment to run once; the pip path below points at one specific local conda environment)
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install langchain
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install unstructured==0.5.6
# !/Volumes/develop/anaconda3/envs/llm/bin/pip show pdfminer.six
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install --upgrade langchain pdfminer.six
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install pypdf
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install pymupdf
# !/Volumes/develop/anaconda3/envs/llm/bin/pip install faiss-cpu

In [3]:
import openai
import os
import pandas as pd
import numpy as np
import tiktoken
from utils import *
from langchain.document_loaders import PyMuPDFLoader
from openai.embeddings_utils import get_embedding
from openai.embeddings_utils import  cosine_similarity
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Lift the row/column display caps and switch off the chained-assignment
# check by setting each of these pandas options to None.
for _option in ('display.max_rows', 'display.max_columns', 'mode.chained_assignment'):
    pd.set_option(_option, None)

In [4]:
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file, if there is one. `found` is kept
# for reference, but it is only load_dotenv's return value: it does not say
# whether OPENAI_API_KEY specifically is available.
found = load_dotenv(find_dotenv())

# Decide on the key itself rather than on `found`, so that a key exported in
# the shell (no .env file) is still used, and a .env file without the key is
# reported instead of silently assigning None.
if os.getenv('OPENAI_API_KEY'):
    openai.api_key = os.getenv('OPENAI_API_KEY')
else:
    print("couldn't find the key")

In [None]:
# from langchain.document_loaders import UnstructuredFileLoader
# loader = UnstructuredFileLoader('./sample_form_150522.pdf')
# documents = loader.load()
# documents_content = '\n'.join(doc.page_content for doc in documents)

### a. PDF loader to text

In [5]:
# Load the PDF; `data` is the list of documents returned by the loader
# (presumably one per PDF page — confirm against the loader's documentation).
loader = PyMuPDFLoader("./sample_form_150522.pdf")
data = loader.load()
#print(data[19].page_content)
# Print the last 25 characters of every loaded document as a quick sanity check.
# Iterating over `data` itself (instead of a fixed count of 82) avoids an
# IndexError on shorter files and skipped pages on longer ones.
for i, page in enumerate(data):
    print(i, '---', page.page_content[-25:])
# Concatenate all page texts into one string for chunking.
doc_content = '\n'.join(doc.page_content for doc in data)

0 ---  advisory business as of

1 --- to less than $50 billion

2 --- GISTRANT'S ADMINISTRATOR

3 --- n 2.A.(8) of Schedule D.

4 --- egistering with the SEC.

5 --- e Part 1A Instruction 4.

6 --- worth individuals)
0
$ 0

7 --- he investment company or

8 --- in rule 206(4)-1(a)(5))?

9 --- sets included in "Other"

10 --- (a) Interest
(b) Foreign

11 --- ures commission merchant

12 --- sed under Item 5.B.(2). 

13 --- xempt from registration?

14 --- ed.
No Information Filed

15 ---  same physical location?

16 --- AL PARTNERS GP VIII, LLC

17 --- er 
4.
Related Person's 

18 --- Page: 
15
Total Funds: 9

19 --- the following questions:

20 --- lands Monetary Authority

21 --- ment Company Act of
1940

22 --- tor in the private fund:

23 --- ion : 2 Record(s) Filed.

24 --- : 
BARCLAYS CAPITAL INC.

25 --- tity
identifier (if any)

26 --- ANK NATIONAL ASSOCIATION

27 --- t of the) private fund's

28 --- k
Country:
United States

29 --- n About the Private Fund

30 ---  or by a rela

### b. Split the documents into chunks

In [12]:
# Split the whole document text into overlapping chunks. Lengths are measured
# with `len`, i.e. in characters: chunk_size 1000 with an overlap of 200.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 200,
    length_function = len,
    is_separator_regex = False,
)
doc_chunks = text_splitter.split_text(doc_content)
#showing some examples of the chunks and overlaps
# Slice instead of indexing 0..3 so a document that yields fewer than four
# chunks is previewed without raising IndexError.
for i, chunk in enumerate(doc_chunks[:4]):
    print(f'\nchunk: {i}\n')
    print(chunk[:300])    # head of the chunk
    print('-' * 50)
    print(chunk[-300:])   # tail of the chunk, where the overlap with the next chunk shows up
    print('*' * 100)


chunk: 0

FORM ADV
UNIFORM APPLICATION FOR INVESTMENT ADVISER REGISTRATION AND REPORT BY EXEMPT REPORTING ADVISERS
Primary Business Name: ONE WILLIAM STREET CAPITAL MANAGEMENT, LP.
CRD Number: 150522
Other-Than-Annual Amendment - All Sections
Rev. 10/2021
4/6/2023 8:04:11 AM
--------------------------------------------------
u are filing an umbrella registration, the
information in Item 1 should be provided for the filing adviser only. General Instruction 5 provides information to assist you with filing an umbrella registration.
A.  Your full legal name (if you are a sole proprietor, your last, first, and middle names):
****************************************************************************************************

chunk: 1

A.  Your full legal name (if you are a sole proprietor, your last, first, and middle names): 
ONE WILLIAM STREET CAPITAL MANAGEMENT, LP.
B.
(1) Name under which you primarily conduct your advisory business, if different from Item 1.A.
ONE WILLIAM STREET CAPIT

### c. Create embeddings and construct a vector database

In [8]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # not referenced below; the encoding is resolved from the model name instead
cost = 0.0001/1000  # price per token, i.e. 0.0001 per 1,000 tokens (printed below with a $ sign)

# get the token numbers for the document chunks
encoding = tiktoken.encoding_for_model(embedding_model)
print(f'encoding name for {embedding_model}: {encoding.name}')
print('--')
#save embeddings
# One row per chunk: raw text, token count, embedding vector and estimated cost.
df = pd.DataFrame()
df['text'] = doc_chunks
df['tokens'] = df['text'].apply(lambda x: len(encoding.encode(x)))
# NOTE: this makes one embedding request per chunk every time the cell is run.
df['embedding'] = df.text.apply(lambda x: get_embedding(x, engine=embedding_model))
df['cost'] = cost*df['tokens']

# Report the longest chunk. idxmax + a label lookup replaces sorting the whole
# frame twice and reading the token count by column position.
if not df.empty:
    longest_chunk = df['tokens'].idxmax()
    print(f"maximum tokens:(chunk number:{longest_chunk}, token numbers: {df.loc[longest_chunk, 'tokens']})")

print('--')

print(f'Total embedding cost: ${np.round(df.cost.sum(),4)}')
# Persist chunks and embeddings next to the notebook (the index is written as the first column).
df.to_csv("./sample_form_150522_chunks_with_embeddings.csv")

encoding name for text-embedding-ada-002: cl100k_base
--
maximum tokens:(chunk number:324, token numbers: 415)
--
Total embedding cost: $0.0068


In [10]:
# Preview the first five chunks with their token counts, embeddings and per-chunk cost.
df.head()

Unnamed: 0,text,tokens,embedding,cost
0,FORM ADV\nUNIFORM APPLICATION FOR INVESTMENT A...,226,"[0.001449032686650753, 0.005998051725327969, -...",2.3e-05
1,A. Your full legal name (if you are a sole pr...,231,"[0.003249892732128501, 0.00674661248922348, -0...",2.3e-05
2,D.\n(1) If you are registered with the SEC as ...,263,"[-0.004217831883579493, 0.00452733738347888, 0...",2.6e-05
3,299 PARK AVENUE\nNumber and Street 2:\n25TH FL...,216,"[-0.019397292286157608, 0.005637296941131353, ...",2.2e-05
4,most recently completed fiscal year.\n(2) Days...,114,"[0.0030712413135915995, 0.009709085337817669, ...",1.1e-05


In [9]:
# Sample question used to try out the question-answering helper.
query = 'how many employees do you have?'

In [11]:
# `ask` is not defined in this notebook; presumably it comes from `from utils import *` — TODO confirm.
# A copy of `df` is passed, presumably so the helper cannot modify the shared frame — verify in utils.
# With verbose=True the recorded run prints the query embedding cost and the prompt/context sent to the model.
ask(query, df.copy(), verbose=True, max_tokens=1250)

query embedding cost:$7.000000000000001e-07
--------------------

Answer the following Question based on the Context only. Only answer from the Context. If you don't know the answer, say 'I don't know'.

SECTION 4 Successions
No Information Filed
Item 5 Information About Your Advisory Business - Employees, Clients, and Compensation
Responses to this Item help us understand your business, assist us in preparing for on-site examinations, and provide us with data we use when making
regulatory policy. Part 1A Instruction 5.a. provides additional guidance to newly formed advisers for completing this Item 5.
Employees
If you are organized as a sole proprietorship, include yourself as an employee in your responses to Item 5.A. and Items 5.B.(1), (2), (3), (4), and (5). If an
employee performs more than one function, you should count that employee in each of your responses to Items 5.B.(1), (2), (3), (4), and (5).
A.  Approximately how many employees do you have? Include full- and part-time em

'Approximately 96 employees.'

In [25]:
# Questions to run against the embedded form, addressed by index in the cells below.
# Spelling matters here: these strings are embedded and sent to the model as-is,
# so the typos in entries 6 and 10 ('numebr', 'mcuh') have been corrected.
queries = [
    'what is the name of the company?',
    'what is the address?',
    'list the names of the partners and managers?',
    'list the names of all private funds?',
    'is this a feeder fund?',
    'how many employees do you have?',
    'what is the total number of accounts?',
    'what percentage of your clients are non-United States persons?',
    'how much asset under management do you have under pooled investment vehicles?',
    'what is a Pooled Investment Vehicle?',
    'how much is under pension and profit sharing plans?',
    'give me the full table of indirect owners?',
]

In [27]:
# Ask queries[0] (company name) without the verbose prompt dump and print the answer.
# NOTE(review): the recorded answer names "ONE WILLIAM STREET CAPITAL INTERMEDIATE FUND, L.P.",
# while the form header printed in the chunk preview gives the Primary Business Name as
# "ONE WILLIAM STREET CAPITAL MANAGEMENT, LP." — looks like a retrieval miss; verify.
print(f'\nquestion: {queries[0]}')
answer = ask(queries[0], df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: what is the name of the company?
Answer: The name of the company is ONE WILLIAM STREET CAPITAL INTERMEDIATE FUND, L.P.


In [28]:
# Ask queries[1] (address) and print the question/answer pair.
q = queries[1]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: what is the address?
Answer: The address is 299 Park Avenue, 25th Floor, New York, New York, United States, 10171.


In [29]:
# Ask queries[2] (partners and managers) and print the question/answer pair.
q = queries[2]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: list the names of the partners and managers?
Answer: The names of the partners and managers are Alaina Danley, David Sherr, John D'Agostino, Kurt Locher, and Vanessa Gilman.


In [31]:
q = queries[3]
print(f'\nquestion: {q}')
# max_tokens=5000 is kept to demonstrate the failure: the recorded run was rejected
# because the prompt plus the requested completion exceeded the model's 4097-token
# context length. The error is caught and printed so that the notebook still survives
# Restart & Run All; on failure `answer` keeps whatever value it had before this cell.
try:
    answer = ask(q, df.copy(), verbose=False, max_tokens=5000)
    print(f'Answer: {answer}')
except openai.error.InvalidRequestError as err:
    print(f'{type(err).__name__}: {err}')


question: list the names of all private funds?


InvalidRequestError: This model's maximum context length is 4097 tokens. However, you requested 5557 tokens (557 in the messages, 5000 in the completion). Please reduce the length of the messages or completion.

In [33]:
# Ask queries[3] (private funds) with a completion budget of 2000 tokens.
# NOTE(review): the recorded answer lists a single fund, although the page tails printed
# after loading the PDF include "Total Funds: 9" — the retrieved context probably covers
# only part of the fund list; verify.
q = queries[3]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: list the names of all private funds?
Answer: ONE WILLIAM STREET CAPITAL PARTNERS II, L.P.


In [34]:
# Ask queries[4] (feeder fund); the recorded run answered "I don't know."
q = queries[4]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: is this a feeder fund?
Answer: I don't know.


In [35]:
# Ask queries[5] (employee count); same question as the verbose example earlier in the notebook.
q = queries[5]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: how many employees do you have?
Answer: Approximately 96 employees.


In [36]:
# Ask queries[6] (total accounts); in the recorded run the model reported that the
# figure was not present in the retrieved context.
q = queries[6]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: what is the total numebr of accounts?
Answer: The total number of accounts is not provided in the given context.


In [47]:
# Dump the raw extracted text of the document at index 7 of `data`; in the recorded run this
# is the client-type table, presumably printed to cross-check the answers by hand.
print(data[7].page_content)

(b) High net worth individuals
0
$ 0
(c) Banking or thrift institutions
0
$ 0
(d) Investment companies
0
$ 0
(e) Business development companies
0
$ 0
(f) Pooled investment vehicles (other than investment companies and
business development companies)
21
$ 7,622,640,223
(g) Pension and profit sharing plans (but not the plan participants or
government pension plans)
$ 486,203,304
(h) Charitable organizations
0
$ 0
(i) State or municipal government entities (including government pension
plans)
0
$ 0
(j) Other investment advisers
0
$ 0
(k) Insurance companies
0
$ 0
(l) Sovereign wealth funds and foreign official institutions
0
$ 0
(m) Corporations or other businesses not listed above
0
$ 0
(n) Other:  
0
$ 0
Compensation Arrangements
E.
You are compensated for your investment advisory services by (check all that apply):
(1)   A percentage of assets under your management
(2)
Hourly charges
(3)
Subscription fees (for a newsletter or periodical)
(4)
Fixed fees (other than subscription fees)
(5

In [37]:
# Ask queries[7] (share of non-United States clients) and print the question/answer pair.
q = queries[7]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: what percentage of your clients are non-United States persons?
Answer: Approximately 70% of the clients are non-United States persons.


In [38]:
# Ask queries[8] (assets under pooled investment vehicles).
# NOTE(review): the recorded answer is $8,108,843,527, but the page dump above shows
# $ 7,622,640,223 for pooled investment vehicles; the answer equals that figure plus the
# $ 486,203,304 listed for pension and profit sharing plans, so the model appears to have
# returned a total instead of the requested row — verify.
q = queries[8]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: how much asset under management do you have under pooled investment vehicles?
Answer: Based on the context provided, the amount of assets under management under pooled investment vehicles is $8,108,843,527.


In [39]:
# Ask queries[9] (definition of a pooled investment vehicle).
# NOTE(review): the recorded answer reads like a general definition rather than text taken
# from the form — check whether the "answer from the Context only" instruction is respected.
q = queries[9]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: what is a Pooled Investment Vehicle?
Answer: A Pooled Investment Vehicle is a type of investment fund where multiple investors pool their money together to invest in various securities or assets.


In [40]:
# Ask queries[10] (pension and profit sharing plans); the recorded answer, $ 486,203,304,
# matches row (g) of the page dump printed earlier.
q = queries[10]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: how mcuh is under pension and profit sharing plans?
Answer: $ 486,203,304 is under pension and profit sharing plans.


In [41]:
# Ask queries[11] (table of indirect owners); the recorded run answered "I don't know."
q = queries[11]
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: give me the full table of indirect owners?
Answer: I don't know.


In [42]:
# Retry the indirect-owners question with different wording (not taken from `queries`);
# the recorded run still reported that the list was not in the retrieved context.
q = 'what are the list of indirect owners?'
print(f'\nquestion: {q}')
answer = ask(q, df.copy(), verbose=False, max_tokens=2000)
print(f'Answer: {answer}')


question: what are the list of indirect owners?
Answer: Based on the given context, the list of indirect owners is not provided.


In [None]:
# from langchain.vectorstores import FAISS
# from langchain.embeddings.openai import OpenAIEmbeddings
# from dotenv import load_dotenv, find_dotenv
# found = load_dotenv(find_dotenv())
# if found:
#     os.environ["OPENAI_API_KEY"] = os.getenv('OPENAI_API_KEY')
# else:
#     print("couldn't find the key")


# def get_doc_search(texts, embedding_model=None):
#     if embedding_model is None:
#         embeddings = OpenAIEmbeddings()
#     else:
#         embeddings = OpenAIEmbeddings(model=embedding_model, deployment=embedding_model)
#     return FAISS.from_texts(texts, embeddings)

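# query = 'how many employees do you have?'
# doc_search = get_doc_search(doc_chunks)  # build the FAISS index first; `doc_search` was never assigned in this draft
# documents = doc_search.similarity_search(query)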

# for d in documents:
#     if 'employee' in d.page_content:
#         print(d.page_content)
        
# from langchain.llms import OpenAI
# from langchain.chains.question_answering import load_qa_chain
# llm = OpenAI(max_tokens=250,
#              temperature=0,
#              top_p=1,
#              frequency_penalty=0,
#              presence_penalty=0)
# chain = load_qa_chain(llm, chain_type = "map_rerank",  
#                       return_intermediate_steps=True)

# from langchain.callbacks import get_openai_callback

# with get_openai_callback() as cb:
#     results = chain({"input_documents":documents, 
#                     "question": query},
#                     return_only_outputs=False)
#     print(f"Total Tokens: {cb.total_tokens}")
#     print(f"Prompt Tokens: {cb.prompt_tokens}")
#     print(f"Completion Tokens: {cb.completion_tokens}")
#     print(f"Total Cost (USD): ${cb.total_cost}")
    
# results = chain({
#                     "input_documents":documents, 
#                     "question": query
#                 }, 
#                 return_only_outputs=True)

# results['intermediate_steps'][0]['answer'].strip()