In [43]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import CharacterTextSplitter

loader = PyPDFLoader("2023.findings-acl.242.pdf")
pages = loader.load_and_split()
for p in pages:
    p.page_content = p.page_content.replace('\n', ' ')
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma
db = Chroma.from_documents(pages, embedding_function)

# query it
query = "Findings of the Association for Computational Linguistics"
docs = db.similarity_search(query,3)

# print results
print(docs)

[Document(page_content='Findings of the Association for Computational Linguistics: ACL 2023 , pages 3923–3931July 9-14, 2023 ©2023 Association for Computational LinguisticsA Simple Yet Strong Domain-Agnostic De-bias Methodfor Zero-Shot Sentiment ClassificationYang Zhao†, Tetsuya Nasukawa†, Masayasu Muraoka†, and Bishwaranjan Bhattacharjee♢†IBM Research - Tokyo, 19-21 Nihonbashi Hakozaki-cho, Chuo City, Tokyo, 103-8510, Japan,♢IBM Research, Yorktown Heights, New York 10598, USAyangzhao@ibm.com ,{nasukawa,mmuraoka}@jp.ibm.com ,bhatta@us.ibm.comAbstractZero-shot prompt-based learning has mademuch progress in sentiment analysis, and con-siderable effort has been dedicated to design-ing high-performing prompt templates. How-ever, two problems exist; First, large languagemodels are often biased to their pre-trainingdata, leading to poor performance in prompttemplates that models have rarely seen. Sec-ond, in order to adapt to different domains, re-designing prompt templates is usually requir

In [45]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
loader = PyPDFLoader("2023.findings-acl.242.pdf")
pages = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(pages)

In [41]:
print(pages[0])

page_content='Findings of the Association for Computational Linguistics: ACL 2023 , pages 3923–3931 July 9-14, 2023 ©2023 Association for Computational Linguistics A Simple Yet Strong Domain-Agnostic De-bias Method for Zero-Shot Sentiment Classification Yang Zhao†, Tetsuya Nasukawa†, Masayasu Muraoka†, and Bishwaranjan Bhattacharjee♢ †IBM Research - Tokyo, 19-21 Nihonbashi Hakozaki-cho, Chuo City, Tokyo, 103-8510, Japan, ♢IBM Research, Yorktown Heights, New York 10598, USA yangzhao@ibm.com ,{nasukawa,mmuraoka}@jp.ibm.com ,bhatta@us.ibm.com Abstract Zero-shot prompt-based learning has made much progress in sentiment analysis, and con- siderable effort has been dedicated to design- ing high-performing prompt templates. How- ever, two problems exist; First, large language models are often biased to their pre-training data, leading to poor performance in prompt templates that models have rarely seen. Sec- ond, in order to adapt to different domains, re- designing prompt templates is usuall

[Document(page_content='standing by generative pre-training.\nAlec Radford, Jeff Wu, Rewon Child, David Luan,\nDario Amodei, and Ilya Sutskever. 2019. Language\nmodels are unsupervised multitask learners.\nRichard Socher, Alex Perelygin, Jean Wu, Jason\nChuang, Christopher D. Manning, Andrew Ng, and\nChristopher Potts. 2013. Recursive deep models for\nsemantic compositionality over a sentiment treebank.\nInProceedings of the 2013 Conference on Empiri-\ncal Methods in Natural Language Processing , pages\n1631–1642, Seattle, Washington, USA. Association\nfor Computational Linguistics.\nZihao Zhao, Eric Wallace, Shi Feng, Dan Klein, and\nSameer Singh. 2021. Calibrate before use: Improv-\ning few-shot performance of language models. In\nProceedings of the 38th International Conference\non Machine Learning , volume 139 of Proceedings\nof Machine Learning Research , pages 12697–12706.\nPMLR.3928', metadata={'page': 5, 'source': '2023.findings-acl.242.pdf'}), Document(page_content='standing b

In [23]:
for d in docs:
    print(d.page_content)

standing by generative pre-training.
Alec Radford, Jeff Wu, Rewon Child, David Luan,
Dario Amodei, and Ilya Sutskever. 2019. Language
models are unsupervised multitask learners.
Richard Socher, Alex Perelygin, Jean Wu, Jason
Chuang, Christopher D. Manning, Andrew Ng, and
Christopher Potts. 2013. Recursive deep models for
semantic compositionality over a sentiment treebank.
InProceedings of the 2013 Conference on Empiri-
cal Methods in Natural Language Processing , pages
1631–1642, Seattle, Washington, USA. Association
for Computational Linguistics.
Zihao Zhao, Eric Wallace, Shi Feng, Dan Klein, and
Sameer Singh. 2021. Calibrate before use: Improv-
ing few-shot performance of language models. In
Proceedings of the 38th International Conference
on Machine Learning , volume 139 of Proceedings
of Machine Learning Research , pages 12697–12706.
PMLR.3928
standing by generative pre-training.
Alec Radford, Jeff Wu, Rewon Child, David Luan,
Dario Amodei, and Ilya Sutskever. 2019. Language
model

In [14]:
!pip3 install langchain_community

[33mDEPRECATION: Loading egg at /Users/xyd/anaconda3/lib/python3.11/site-packages/slither_analyzer-0.10.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
