In [3]:
import os
import requests
from langchain.document_loaders import PyPDFLoader

In [4]:
# Read the OpenAI key from the environment instead of hard-coding it.
# Indexing os.environ (rather than .get) raises KeyError right here if the
# variable is not set.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
# arXiv PDFs to download, each pinned to its v1 revision.
URLS = [
    f"https://arxiv.org/pdf/{paper_id}.pdf"
    for paper_id in (
        "2306.06031v1",
        "2306.12156v1",
        "2306.14289v1",
        "2306.10973v1",
        "2306.13643v1",
    )
]

In [7]:
# Download every paper listed in URLS into ../files and parse each PDF with
# PyPDFLoader, collecting all returned documents in `ml_papers`.
# The list is reset here so re-running the cell does not accumulate duplicates.
ml_papers = []

# exist_ok=True replaces the check-then-create pair and is safe to re-run.
os.makedirs("../files", exist_ok=True)

for i, url in enumerate(URLS):
    filename = f"../files/paper_{i+1}.pdf"
    print(f"Downloading {filename}")

    # A timeout keeps a stalled connection from hanging the kernel forever.
    response = requests.get(url, timeout=60)
    # Stop on HTTP errors instead of saving an error page as a ".pdf".
    response.raise_for_status()

    with open(filename, "wb") as f:
        f.write(response.content)

    # Parse only after the `with` block has closed (and flushed) the file, so
    # the loader never reads a partially written PDF.
    loader = PyPDFLoader(filename)
    data = loader.load()
    ml_papers.extend(data)

Downloading ../files/paper_1.pdf
Downloading ../files/paper_2.pdf
Downloading ../files/paper_3.pdf
Downloading ../files/paper_4.pdf
Downloading ../files/paper_5.pdf


In [8]:
# Quick sanity check of the load step: container type, number of loaded
# documents, and one sample document.
(
    type(ml_papers),
    len(ml_papers),
    ml_papers[3],
)

(list,
 71,
 Document(page_content='Figure 1: FinGPT Framework.\n4.1 Data Sources\nThe first stage of the FinGPT pipeline involves the collec-\ntion of extensive financial data from a wide array of online\nsources. These include, but are not limited to:\n•Financial news: Websites such as Reuters, CNBC, Yahoo\nFinance, among others, are rich sources of financial news\nand market updates. These sites provide valuable informa-\ntion on market trends, company earnings, macroeconomic\nindicators, and other financial events.\n•Social media : Platforms such as Twitter, Facebook, Red-\ndit, Weibo, and others, offer a wealth of information in\nterms of public sentiment, trending topics, and immediate\nreactions to financial news and events.\n•Filings : Websites of financial regulatory authorities, such\nas the SEC in the United States, offer access to company\nfilings. These filings include annual reports, quarterly earn-\nings, insider trading reports, and other important company-\nspecific in

## Split the documents

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters. Lengths are measured in characters because the
# splitter is given length_function=len.
CHUNK_SIZE = 1500    # size limit passed to the splitter for each chunk
CHUNK_OVERLAP = 200  # overlap passed to the splitter for adjacent chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# Break the loaded documents into smaller chunks for embedding.
documents = text_splitter.split_documents(ml_papers)

# Fail here, with a clear message, rather than later inside the vector store
# if the load step produced nothing to split.
assert documents, "No chunks were produced; check that the PDFs were loaded."

In [10]:
# Quick sanity check of the split step: number of chunks and one sample chunk.
(
    len(documents),
    documents[10],
)

(202,
 Document(page_content='highly volatile, changing rapidly in response to news events\nor market movements.\nTrends , often observable through websites like Seeking\nAlpha, Google Trends, and other finance-oriented blogs and\nforums, offer critical insights into market movements and in-\nvestment strategies. They feature:\n•Analyst perspectives: These platforms provide access to\nmarket predictions and investment advice from seasoned\nfinancial analysts and experts.\n•Market sentiment: The discourse on these platforms can\nreflect the collective sentiment about specific securities,\nsectors, or the overall market, providing valuable insights\ninto the prevailing market mood.\n•Broad coverage: Trends data spans diverse securities and\nmarket segments, offering comprehensive market coverage.\nEach of these data sources provides unique insights into\nthe financial world. By integrating these diverse data types,\nfinancial language models like FinGPT can facilitate a com-\nprehensive 

## Embed the documents and ingest them into the vector database

In [15]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [14]:
# Number of chunks the retriever returns for each query.
RETRIEVER_TOP_K = 3

# Pass the API key explicitly, the same way the ChatOpenAI model in this
# notebook is configured, instead of relying on an implicit environment lookup.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=OPENAI_API_KEY,
)

# Embed every chunk and index it in a Chroma vector store.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
)

# Retriever view over the store, used by the question-answering chain.
retriever = vectorstore.as_retriever(search_kwargs={"k": RETRIEVER_TOP_K})

In [17]:
# Chat model that generates the answers; temperature 0.0 is requested to keep
# sampling randomness to a minimum.
chat = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
)

# Retrieval-augmented QA chain: the retriever supplies the relevant chunks and
# the chat model answers from them using the "stuff" chain type.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="stuff",
    llm=chat,
)

In [18]:
# Ask a question about the ingested papers. The chain's answer is the last
# expression of the cell, so the notebook displays it as the cell output.
query = "What is finGPT?"
qa_chain.run(query)

'FinGPT is an open-source framework designed for applying large language models (LLMs) in the financial domain. It aims to address the challenges of utilizing language models in finance, such as obtaining high-quality and up-to-date data, handling diverse data formats, and managing data quality inconsistencies. FinGPT adopts a data-centric approach, implementing rigorous cleaning and preprocessing methods to ensure high-quality data. It consists of four fundamental components: Data Source, Data Engineering, LLMs, and Applications, which work together to enable the functionality and adaptability of FinGPT in addressing dynamic financial data and market conditions.'