In [2]:
import util_funs.globalsettings as gs

In [10]:
# Document Loader: read the probe PDF into LangChain documents.
from langchain.document_loaders import PyPDFLoader

# The PDF location is read from the shared `gs` module (util_funs.globalsettings).
pdf_path = gs.the_files.PROBE_PDF
loader = PyPDFLoader(pdf_path)  # kept as a global: later cells pass `loader` to index creators
documents = loader.load()

In [11]:
# Text Splitter: break the loaded documents into chunks for embedding.
from langchain.text_splitter import CharacterTextSplitter

# Chunk size of 1000 with no overlap between consecutive chunks.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
text_splitter = CharacterTextSplitter(**splitter_options)
docs = text_splitter.split_documents(documents)

In [12]:
# Sanity check: show one chunk (index 10) of the splitter output held in `docs`.
print(docs[10])

page_content='The problem with using maximum likelihood estimators for this problem is that they are biased.\nA way to circumvent this is to use restricted maximum likelihood estimators (REML) [8]. In\nthis study, we adopted REML. However, we found that the choice of estimator, ML or REML,\nactually made little difference to the coefﬁcient estimates.\nWe require a special case of this model structure with just a random intercept for each account.\nThen for each account i,\nYi=αi+Xiβ+εi (2.10)\nwhere αi\x18N(0;σ2\nα)is the random intercept for each account.\nThe previous lagged balance, Yi;t\x00lis deliberately not included in the model structure on the\nright-hand side since this changes the formulation into an autoregressive panel model which is\nnot straightforward to estimate, since the predictor variable Yi;t\x00lwill itself be dependent on the\nrandom effect; a typical approach to estimating such a model is to use instrumental variables\n[15]. However, in this case, it is not clea

In [13]:
# Embeddings: the embedding object later handed to the FAISS vector store.
from langchain.embeddings import HuggingFaceEmbeddings

# No arguments are passed, so whatever defaults the class defines are used.
embeddings = HuggingFaceEmbeddings()

In [14]:
# Similarities: index the chunks in FAISS and look up those closest to a question.
from langchain.vectorstores import FAISS

# The question is independent of the index, so define it up front.
query = "What is this paper about?"

# Build the vector store from the splitter output using the embeddings defined earlier.
db = FAISS.from_documents(docs, embeddings)

# NOTE(review): `docs` is rebound here from the splitter chunks to the search hits.
# Later cells read `docs` as the hits; re-running this cell without re-running the
# splitter cell would therefore build the index from the hits only.
docs = db.similarity_search(query)

In [19]:
# Count the similarity-search hits (`docs` was rebound to the search result in the cell above).
len(docs)

4

In [20]:
# Show the text of the first similarity-search hit.
print(docs[0].page_content)

method, Section 3 describes the data used in this study along with data processing, Section 4
gives results in terms of model estimates and forecasts and Section 5 gives some conclusions
and discussion of possible future work.
1.1 Distribution of credit card balance
For this study, a positive value of credit card balance indicates the credit used and outstanding at
the end of each accounting month. Credit card balance typically has a right-skewed distribution
with lower frequencies of accounts having larger balances. Also, many values will be exactly
zero, representing those individuals who either do not use the credit card for several months
or who repay their balance in full. Additionally, it is possible to record a negative balance for
account months when the credit card holder overpays or a refund is paid to an account. Figure
1.1 shows the distribution of balance for the credit card data used in this study (this data is
described in more detail in Section 3) and it illustrates the

In [16]:
# Q&A: answer `query` with an LLM fed by a retriever over the FAISS store.
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

retriever = db.as_retriever()
llm = OpenAI()  # constructed with no arguments
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
)
# Last expression of the cell, so the answer string is displayed as the cell output.
qa_chain.run(query)

' This paper is about predicting the credit card balance of individuals using statistical models. It discusses the use of OLS regression, two-stage regression models, mixture regression and panel regression models. It also describes the evaluation framework used to assess the accuracy of the predictions.'

In [29]:
# One line index creation - https://python.langchain.com/en/latest/modules/indexes/getting_started.html
from langchain.indexes import VectorstoreIndexCreator

# Creator built with no arguments; it is fed the same PDF loader used earlier.
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders([loader])

Using embedded DuckDB without persistence: data will be transient


In [38]:
# Append an instruction asking the model to begin its reply with a fixed marker.
answer_format_hint = ". Start your answer with: 'ANSWER: '"
index.query(query + answer_format_hint)

' ANSWER: This paper is about modeling and forecasting credit card balance at an account level. It discusses different models, such as ordinary least squares, two-stage regression, mixture regression, and panel regression. It also discusses the importance of previous lagged balance as a predictor variable.'

In [31]:
# Rebuild `index`, this time passing the FAISS store class, the embeddings and the
# text splitter defined in earlier cells explicitly. This rebinds the `index`
# created by the argument-less creator.
faiss_index_creator = VectorstoreIndexCreator(
    vectorstore_cls=FAISS,
    embedding=embeddings,
    text_splitter=text_splitter,
)
index = faiss_index_creator.from_loaders([loader])

In [32]:
# Ask the unmodified question against whichever `index` is currently bound.
index.query(query)

' This paper is about using statistical models to forecast credit card balance. It discusses the data used, the models used, and the evaluation of the models.'

In [41]:
import tempfile

# Print the path of a throwaway temporary file.
# NamedTemporaryFile is used instead of TemporaryFile because only the named
# variant is documented to expose its path via `.name` on every platform; the
# object returned by TemporaryFile has no `.file` attribute on POSIX, so
# `fp.file.name` raised AttributeError there.
# The `with` block closes the file even if printing fails.
with tempfile.NamedTemporaryFile() as fp:
    print(fp.name)

C:\Users\migue\AppData\Local\Temp\tmp7hbcscfb
