In [18]:
import numpy as np

In [5]:
import getpass
import os

# Turn on LangSmith tracing for the LangChain calls made in this notebook.
os.environ["LANGSMITH_TRACING"] = "true"

# Never store the API key in the notebook itself: reuse a key that is already
# exported in the environment, otherwise prompt for it (getpass does not echo
# the typed value, so it does not end up in the cell output either).
# NOTE(review): any key that was ever committed in this cell must be treated as
# leaked and should be revoked and reissued.
if not os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass("Enter your LangSmith API key: ")

In [7]:
from langchain_core.documents import Document

# Two hand-written sample documents. Each one gets its own metadata dict, and
# both are tagged with the same "source" value.
documents = [
    Document(page_content=text, metadata={"source": "mammal-pets-doc"})
    for text in (
        "Dogs are great companions, known for their loyalty and friendliness.",
        "Cats are independent pets that often enjoy their own space.",
    )
]

In [9]:
from langchain_community.document_loaders import PyPDFLoader

# PDF to ingest. The path has no directory component, so the file is presumably
# expected in the kernel's working directory — TODO confirm.
file_path = "nke-10k-2023.pdf"
loader = PyPDFLoader(file_path)

# Load the PDF into `docs`. The recorded run produced 107 entries, which matches
# the 'total_pages' value shown in the documents' metadata.
docs = loader.load()
print(len(docs))

107


In [11]:
# Sanity check: show the first 200 characters of the first loaded document,
# followed by a blank line, then that document's metadata dict.
print(f"{docs[0].page_content[:200]}\n")
print(docs[0].metadata)

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
F

{'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': 'nke-10k-2023.pdf', 'total_pages': 107, 'page': 0, 'page_label': '1'}


In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # target size limit for each chunk
    chunk_overlap=200,  # amount shared between neighbouring chunks
    add_start_index=True,  # retrieved chunks in this notebook carry 'start_index' metadata
)
all_splits = text_splitter.split_documents(docs)

# Bare final expression so the notebook displays how many chunks were produced.
len(all_splits)

516

In [15]:
from langchain_core.embeddings import DeterministicFakeEmbedding

# Embedding model used for both indexing and querying; the recorded output
# shows it producing vectors of length 4096 (the `size` passed here).
# NOTE(review): as the class name says, this is a *fake* embedding model. The
# search results recorded further down do not match their queries (e.g. the
# "distribution centers" query returns an EBIT table), so the vectors presumably
# carry no semantic meaning. Swap in a real embedding model before judging
# retrieval quality.
embeddings = DeterministicFakeEmbedding(size=4096)

In [19]:
# Embed the text of the first two chunks (index 0 first, then index 1).
vector_1, vector_2 = [
    embeddings.embed_query(all_splits[position].page_content)
    for position in (0, 1)
]

# Both embeddings are expected to have the same dimensionality.
assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
# Show only the first ten components rather than the whole vector.
print(vector_1[:10])

Generated vectors of length 4096

[np.float64(0.7926341431206548), np.float64(-0.2812335879080366), np.float64(-0.10753613121424813), np.float64(0.3154027699615671), np.float64(-0.1859997540042947), np.float64(0.3255190366031182), np.float64(1.114889943253859), np.float64(-0.6823437996276469), np.float64(-0.5607505304952727), np.float64(0.8247465179470694)]


In [21]:
from langchain_core.vectorstores import InMemoryVectorStore

# Vector store that every search cell below queries. It is built around the
# `embeddings` object defined earlier in the notebook.
vector_store = InMemoryVectorStore(embeddings)

In [22]:
# Index every chunk in the vector store and keep the returned value in `ids`
# (presumably one identifier per stored chunk — TODO confirm).
# NOTE(review): re-running this cell without re-creating `vector_store` looks
# like it would index the same chunks a second time — confirm before re-running.
ids = vector_store.add_documents(documents=all_splits)

In [23]:
# Query the store with a plain-text question.
# NOTE: `results` is reassigned by each of the following search cells, so its
# contents depend on which cell ran last.
results = vector_store.similarity_search(
    "How many distribution centers does Nike have in the US?"
)

# Only the first (top-ranked) hit is printed.
print(results[0])

page_content='(3)    Global Brand Divisions revenues include NIKE Brand licensing and other miscellaneous revenues that are not part of a geographic operating segment.
(4)    Corporate revenues primarily consist of foreign currency hedge gains and losses related to revenues generated by entities within the NIKE Brand geographic operating segments and Converse, butmanaged through our central foreign exchange risk management program.
The primary financial measure used by the Company to evaluate performance is Earnings Before Interest and Taxes ("EBIT"). As discussed in Note 15 — Operating
Segments and Related Information in the accompanying Notes to the Consolidated Financial Statements, certain corporate costs are not included in EBIT.
The breakdown of EBIT is as follows:
(Dollars in millions) FISCAL 2023 FISCAL 2022 % CHANGE FISCAL 2021 % CHANGE
North America $ 5,454 $ 5,114 7 % $ 5,089 0 %
Europe, Middle East & Africa 3,531 3,293 7 % 2,435 35 %
Greater China 2,283 2,365 -3 % 3,243 -27

In [24]:
# Async variant of the search. The bare top-level `await` relies on the
# notebook kernel allowing awaits at cell level; in a plain .py script this
# call would have to run inside an async function.
results = await vector_store.asimilarity_search("When was Nike incorporated?")

# Only the first (top-ranked) hit is printed.
print(results[0])

page_content='Equipment 633 564 193 234 — 1,624 26 — 1,650 
Other — — — — 102 102 123 (72) 153 
TOTAL REVENUES $ 18,353 $ 12,479 $ 7,547 $ 5,955 $ 102 $ 44,436 $ 2,346 $ (72)$ 46,710 
Revenues by:
Sales to Wholesale Customers $ 9,621 $ 8,377 $ 4,081 $ 3,529 $ — $ 25,608 $ 1,292 $ — $ 26,900 
Sales through Direct to Consumer 8,732 4,102 3,466 2,426 — 18,726 931 — 19,657 
Other — — — — 102 102 123 (72) 153 
TOTAL REVENUES $ 18,353 $ 12,479 $ 7,547 $ 5,955 $ 102 $ 44,436 $ 2,346 $ (72)$ 46,710 
(1)
2023 FORM 10-K 83' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': 'nke-10k-2023.pdf', 'total_pages': 107, 'page': 85, 'page

In [25]:
# Note that providers implement different scores, so check what the number
# means for the store in use before comparing or thresholding on it.
# NOTE(review): for InMemoryVectorStore this score appears to be a cosine
# similarity (higher = closer match), not a distance. The recorded top-hit
# score of ~0.046 is consistent with a best match between unrelated fake
# vectors, whereas a cosine distance would sit near 1. Confirm against the
# library documentation.

# Each result is unpacked below as a (document, score) pair.
results = vector_store.similarity_search_with_score("What was Nike's revenue in 2023?")
doc, score = results[0]
print(f"Score: {score}\n")
print(doc)

Score: 0.04600777030762427

page_content='• Reduced consumer demand for our products, including as a result of a rise in unemployment rates, higher costs of borrowing, inflation and diminished consumer
confidence;
• Cancellation or postponement of sports seasons and sporting events in multiple countries, and bans on large public gatherings, which have reduced and in the future
could reduce consumer spending on our products and could impact the effectiveness of our arrangements with key endorsers;
• The risk that any safety protocols in NIKE-owned or affiliated facilities, including our offices, will not be effective or not be perceived as effective, or that any virus-
related illnesses will be linked or alleged to be linked to such facilities, whether accurate or not;
• Incremental costs resulting from the adoption of preventative measures and compliance with regulatory requirements, including providing facial coverings and hand' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40

In [26]:
# Embed the query text explicitly, then search with the resulting vector
# instead of passing the raw text to the store.
embedding = embeddings.embed_query("How were Nike's margins impacted in 2023?")

results = vector_store.similarity_search_by_vector(embedding)
# Only the first (top-ranked) hit is printed.
print(results[0])

page_content='the Company to hedge this risk are receive-fixed, pay-variable interest rate swaps. The Company had no interest rate swaps designated as fair value hedges as of
May 31, 2023.
NET INVESTMENT HEDGES
The Company has, in the past, hedged and may, in the future, hedge the risk of variability in foreign currency-denominated net investments in wholly-owned international
operations. All changes in fair value of the derivatives designated as net investment hedges are reported in Accumulated other comprehensive income (loss) along with
the foreign currency translation adjustments on those investments. The Company had no outstanding net investment hedges as of May 31, 2023.
UNDESIGNATED DERIVATIVE INSTRUMENTS
The Company may elect to enter into foreign exchange forwards to mitigate the change in fair value of specific assets and liabilities on the Consolidated Balance Sheets.' metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'cre

In [27]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    """Look up the single closest chunk for ``query`` in ``vector_store``.

    Args:
        query: Free-text question to search for.

    Returns:
        Whatever ``vector_store.similarity_search`` returns for ``k=1``,
        i.e. a list holding at most one document.
    """
    top_matches = vector_store.similarity_search(query, k=1)
    return top_matches


# Run the retriever over several queries in one call. The recorded output is a
# list of lists (one inner list of documents per query — presumably in input
# order; TODO confirm). Kept as the final bare expression so the notebook
# displays the result.
retriever.batch(
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)

[[Document(id='7675dbdf-0291-4f3f-8072-b356831c43d4', metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': 'nke-10k-2023.pdf', 'total_pages': 107, 'page': 38, 'page_label': '39', 'start_index': 1506}, page_content='(3)    Global Brand Divisions revenues include NIKE Brand licensing and other miscellaneous revenues that are not part of a geographic operating segment.\n(4)    Corporate revenues primarily consist of foreign currency hedge gains and losses related to revenues generated by entities within the NIKE Brand geographic operating segments and Converse, butmanaged through our central foreign exchange risk management p