# 下载依赖

In [7]:
# Install (quietly, upgrading to the newest release) the packages this notebook
# names explicitly: pypdf plus the LangChain community, OpenAI and Chroma packages.
# NOTE(review): later cells also import `langchain` and `langchain_text_splitters`,
# which are not listed here - presumably they arrive as transitive dependencies; confirm.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
%pip install -qU pypdf langchain_community langchain-openai langchain_chroma

In [9]:
pip list

Package                                  Version
---------------------------------------- ---------------------
absl-py                                  1.4.0
aiohttp                                  3.9.5
aiosignal                                1.3.1
alabaster                                0.7.16
albumentations                           1.3.1
altair                                   4.2.2
annotated-types                          0.7.0
anyio                                    3.7.1
argon2-cffi                              23.1.0
argon2-cffi-bindings                     21.2.0
array_record                             0.5.1
arviz                                    0.15.1
asgiref                                  3.8.1
astropy                                  5.3.4
astunparse                               1.6.3
async-timeout                            4.0.3
atpublic                                 4.1.0
attrs                                    23.2.0
audioread                            

# 下载PDF文件

In [11]:
import requests

# URL of the PDF to download
pdf_url = 'https://s1.q4cdn.com/806093406/files/doc_downloads/2023/414759-1-_5_Nike-NPS-Combo_Form-10-K_WR.pdf'

# Local path the downloaded file is written to
local_filename = './nke-10k-2023.pdf'

# Send the GET request.
# - stream=True defers the body so it can be written chunk by chunk instead of
#   being buffered in memory in one piece.
# - timeout keeps the cell from hanging indefinitely on a stalled connection.
# - the `with` block releases the connection back to the pool when done.
with requests.get(pdf_url, stream=True, timeout=30) as response:
    # Only write the file when the server answered 200 OK
    if response.status_code == 200:
        # Open the local file in binary-write mode and copy the body into it
        with open(local_filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f'PDF文件已下载到 {local_filename}')
    else:
        # Report the failure; nothing is written to disk in this case
        print('下载失败，状态码：', response.status_code)

PDF文件已下载到 ./nke-10k-2023.pdf


# 配置环境变量

In [13]:
import os
from google.colab import userdata
# Copy the OpenAI settings from Colab's `userdata` store into environment
# variables of the same name, in the same order as before.
for secret_name in ("OPENAI_API_KEY", "OPENAI_API_BASE"):
    os.environ[secret_name] = userdata.get(secret_name)

# Tracing flag and its API key (presumably for LangSmith tracing - confirm).
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = userdata.get('LANGCHAIN_API_KEY')

# 加载PDF文件

In [14]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the PDF on local disk
file_path = "./nke-10k-2023.pdf"

# Build the loader, then read the PDF into a list of documents
loader = PyPDFLoader(file_path)
docs = loader.load()

# Show how many documents the loader returned
document_count = len(docs)
print(document_count)

106


In [20]:
# Peek at the document at index 1: its first 100 characters, then its metadata
sample_doc = docs[1]
print(sample_doc.page_content[:100])
print(sample_doc.metadata)

UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K 
(Mark One)
☑ ANNU
{'source': './nke-10k-2023.pdf', 'page': 1}


# 新建大模型对象

In [21]:
from langchain_openai import ChatOpenAI

# Chat model shared by the chains built later in the notebook
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
)

# 文章片段切块、存入向量数据库，并转换为 retriever 检索器

In [23]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Give every chunk a deterministic id based on its position. With explicit ids,
# re-running this cell overwrites the same entries instead of adding a second
# copy of every chunk, which would otherwise make the retriever return
# duplicates. (Presumably the default in-memory collection is shared for the
# lifetime of the kernel - confirm.)
# Note: if the splitter settings change and fewer chunks are produced, entries
# with higher indices from an earlier run are not removed.
chunk_ids = [f"nke-10k-2023-chunk-{index}" for index in range(len(splits))]

# Embed the chunks with OpenAI embeddings and store them in Chroma
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    ids=chunk_ids,
)

# Expose the vector store through the retriever interface used by the RAG chain
retriever = vectorstore.as_retriever()

In [25]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering step; {context} is the placeholder
# for the retrieved document text.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Chat template: the system message above, then the user's question ({input})
chat_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

# Question + documents -> answer
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# RAG chain: question + similar document chunks -> final answer
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Another question that can be asked instead: "耐克2022收入是多少?"
question = "耐克2023年的增长是多少？"
results = rag_chain.invoke({"input": question})

# Display the full result as the cell output
results

{'input': '耐克2023年的增长是多少？',
 'context': [Document(page_content="GREATER CHINA\n(Dollars in millions) FISCAL 2023 FISCAL 2022 % CHANGE% CHANGE \nEXCLUDING \nCURRENCY \nCHANGES FISCAL 2021 % CHANGE% CHANGE \nEXCLUDING \nCURRENCY \nCHANGES\nRevenues by:\nFootwear $ 5,435 $ 5,416  0 %  8 % $ 5,748  -6 %  -10 %\nApparel  1,666  1,938  -14 %  -7 %  2,347  -17 %  -21 %\nEquipment  147  193  -24 %  -18 %  195  -1 %  -6 %\nTOTAL REVENUES $ 7,248 $ 7,547  -4 %  4 % $ 8,290  -9 %  -13 %\nRevenues by:    \nSales to Wholesale Customers $ 3,866 $ 4,081  -5 %  2 % $ 4,513  -10 %  -14 %\nSales through NIKE Direct  3,382  3,466  -2 %  5 %  3,777  -8 %  -12 %\nTOTAL REVENUES $ 7,248 $ 7,547  -4 %  4 % $ 8,290  -9 %  -13 %\nEARNINGS BEFORE INTEREST \nAND TAXES $ 2,283 $ 2,365  -3 %  $ 3,243  -27 %  \nFISCAL 2023 COMPARED TO FISCAL 2022  \n•Greater China revenues increased 4% on a currency-neutral basis, primarily due to higher revenues in the Jordan Brand, \npartially offset by lower revenues in Men's an

In [27]:
# Print the text of the first document in the result's "context" list
first_context_doc = results["context"][0]
print(first_context_doc.page_content)

GREATER CHINA
(Dollars in millions) FISCAL 2023 FISCAL 2022 % CHANGE% CHANGE 
EXCLUDING 
CURRENCY 
CHANGES FISCAL 2021 % CHANGE% CHANGE 
EXCLUDING 
CURRENCY 
CHANGES
Revenues by:
Footwear $ 5,435 $ 5,416  0 %  8 % $ 5,748  -6 %  -10 %
Apparel  1,666  1,938  -14 %  -7 %  2,347  -17 %  -21 %
Equipment  147  193  -24 %  -18 %  195  -1 %  -6 %
TOTAL REVENUES $ 7,248 $ 7,547  -4 %  4 % $ 8,290  -9 %  -13 %
Revenues by:    
Sales to Wholesale Customers $ 3,866 $ 4,081  -5 %  2 % $ 4,513  -10 %  -14 %
Sales through NIKE Direct  3,382  3,466  -2 %  5 %  3,777  -8 %  -12 %
TOTAL REVENUES $ 7,248 $ 7,547  -4 %  4 % $ 8,290  -9 %  -13 %
EARNINGS BEFORE INTEREST 
AND TAXES $ 2,283 $ 2,365  -3 %  $ 3,243  -27 %  
FISCAL 2023 COMPARED TO FISCAL 2022  
•Greater China revenues increased 4% on a currency-neutral basis, primarily due to higher revenues in the Jordan Brand, 
partially offset by lower revenues in Men's and Women's. NIKE Direct revenues increased 5%, due to comparable store


In [28]:
# Print the metadata of the first document in the result's "context" list
first_context_metadata = results["context"][0].metadata
print(first_context_metadata)

{'page': 42, 'source': './nke-10k-2023.pdf'}
