In [2]:

from langchain.vectorstores import Chroma
from langchain.chains import VectorDBQA
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader
from chromadb.utils import embedding_functions
import chromadb
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
data_dir = "/home/msaad/workspace/honors-thesis/data-collection/data/"
loader = DirectoryLoader(
    data_dir + "vectordb_split2", 
    glob="./*.txt",
    use_multithreading=True
)
doc = loader.load()

len(doc)

The MIME type of '/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_split2/chunk_brockport.edu_about_president_college-handbook.txt' is 'message/news'. This file type is not currently supported in unstructured.


2501

In [4]:
# Splitting the text into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=200
)
texts = text_splitter.split_documents(doc)

# Count the number of chunks
len(texts)

4978

In [5]:
persist_directory = data_dir + "chroma_vectordb2"

# By default uses 'hkunlp/instructor-large'
embedding_function = HuggingFaceInstructEmbeddings(
    query_instruction="Represent the School paragraph for retrieval: "
)

# This also generates embeddings, which is quite GPU taxing.
vectordb = Chroma.from_documents(
    documents = texts,
    embedding = embedding_function,
    persist_directory = persist_directory
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/msaad/miniconda3/envs/thesis/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


max_seq_length  512


In [14]:
vectordb.persist()

In [15]:
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_function
)

In [6]:
retreiver = vectordb.as_retriever()

docs = retreiver.get_relevant_documents("How can I apply as an undergraduate?")

docs

[Document(page_content='In certain cases, international undergraduate applicants may be required to submit an academic credential evaluation. If this is required of you, you will be instructed to do so by the International Admissions office. This checklist requirement will also appear on your application.', metadata={'source': '/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_split2/chunk_brockport.edu_academics_international_education_student_services_academic_cred_evaluation.html.txt'}),
 Document(page_content='is designed to introduce eligible undergraduates to leading scholars. Funding is often provided. Each scientific area has its own application/website. Applications are typically due in December/January.', metadata={'source': '/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_split2/chunk_brockport.edu_academics_biology_research-opportunities.txt'}),
 Document(page_content='We encourage you to review our undergraduate programs of study and notif

In [26]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

retreiver.get_relevant_documents("How can I apply as an undergraduate?")

[Document(page_content='is designed to introduce eligible undergraduates to leading scholars. Funding is often provided. Each scientific area has its own application/website. Applications are typically due in December/January.', metadata={'source': '/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_filestore/chunk_2352.txt'}),
 Document(page_content='To apply for any of these positions, please indicate so on your graduate school application. For any questions about the position, please contact', metadata={'source': '/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_filestore/chunk_2342.txt'}),
 Document(page_content='In certain cases, international undergraduate applicants may be required to submit an academic credential evaluation. If this is required of you, you will be instructed to do so by the International Admissions office. This checklist requirement will also appear on your application.', metadata={'source': '/home/msaad/workspace/honors-thesis/d

In [27]:
retriever.search_type

'similarity'

In [28]:
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True
)

In [24]:
# Cite sources
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [31]:
query = "How can I apply as a prospective undergrad?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 You can apply as a prospective undergrad by carefully reading the description of the thesis option and verifying your eligibility. You should also begin exploring this option at least two semesters ahead of your intended graduation and complete and submit a Thesis Application Form to the Thesis Coordinator. Additionally, you should review the application instructions page for the program you are interested in.


Sources:
/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_filestore/chunk_2342.txt
/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_filestore/chunk_2352.txt
/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_filestore/chunk_2630.txt
/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_filestore/chunk_1548.txt
/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_filestore/chunk_2080.txt


In [32]:
llm_response

{'query': 'How can I apply as a prospective undergrad?',
 'result': ' You can apply as a prospective undergrad by carefully reading the description of the thesis option and verifying your eligibility. You should also begin exploring this option at least two semesters ahead of your intended graduation and complete and submit a Thesis Application Form to the Thesis Coordinator. Additionally, you should review the application instructions page for the program you are interested in.',
 'source_documents': [Document(page_content='To apply for any of these positions, please indicate so on your graduate school application. For any questions about the position, please contact', metadata={'source': '/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_filestore/chunk_2342.txt'}),
  Document(page_content='is designed to introduce eligible undergraduates to leading scholars. Funding is often provided. Each scientific area has its own application/website. Applications are typically du