In [1]:
import os

import chromadb
from chromadb.utils import embedding_functions
from langchain.chains import RetrievalQA
from langchain.chains import VectorDBQA
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [14]:
# Copies (does not move) all admissions and scholarships-aid chunk files into a
# separate dir, for testing different semantic search ideas.
# One-off setup step: it is disabled by the `if False:` guard; change the guard
# to True to run it again.
if False:
    import os
    import shutil

    # The directory where the files are currently located
    source_dir = "/home/msaad/workspace/honors-thesis/data-collection/data/vectordb_filestore"
    # The directory that receives a copy of every matched file
    destination_dir = "/home/msaad/workspace/honors-thesis/data-collection/data/admissions-faid-chunks"

    # shutil.copy2 does not create missing parent directories, so make sure the
    # target exists before copying into it.
    os.makedirs(destination_dir, exist_ok=True)

    # Iterate over every entry in the source directory
    for filename in os.listdir(source_dir):
        # Keep only .txt files whose name contains one of the keywords
        if ("admissions" in filename or "scholarships-aid" in filename) and filename.endswith(".txt"):
            # copy2 leaves the original file in place in source_dir
            source_file = os.path.join(source_dir, filename)
            destination_file = os.path.join(destination_dir, filename)
            shutil.copy2(source_file, destination_file)

In [16]:
# Load the admissions / financial-aid chunk files: every file in data_dir that
# matches the "./*.txt" glob becomes an entry in `doc`.
data_dir = "/home/msaad/workspace/honors-thesis/data-collection/data/admissions-faid-chunks"
loader = DirectoryLoader(data_dir, glob="./*.txt", use_multithreading=True)
doc = loader.load()

# Number of documents loaded
len(doc)

123

In [17]:
# Re-split the already chunked documents into smaller, overlapping pieces.
CHUNK_SIZE = 256     # chunk_size handed to the splitter
CHUNK_OVERLAP = 32   # chunk_overlap handed to the splitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
texts = text_splitter.split_documents(doc)

# How many pieces the split produced
len(texts)

599

In [18]:
# Location intended for an on-disk copy of the index. Built with os.path.join:
# plain string concatenation dropped the path separator and produced
# ".../admissions-faid-chunkschroma_vectordb2" next to data_dir instead of a
# "chroma_vectordb2" folder inside it.
# NOTE(review): persist_directory is not passed to Chroma.from_documents below,
# so presumably nothing is written to disk — confirm whether persistence is wanted.
persist_directory = os.path.join(data_dir, "chroma_vectordb2")

# By default uses 'hkunlp/instructor-large'
# NOTE(review): only query_instruction is overridden, yet its text describes a
# *document*. Confirm whether embed_instruction (the document-side prompt) was
# meant instead; left unchanged here because it would alter retrieval results.
embedding_function = HuggingFaceInstructEmbeddings(
    query_instruction="Represent the School document for retrieval: "
)

# This also generates embeddings, which is quite GPU taxing.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embedding_function
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [24]:
# Try the retriever on its own with a sample question before using it in a QA chain.
# NOTE: `retreiver` is a misspelling of "retriever"; the spelling is kept because the
# RetrievalQA cell further down refers to this exact name.
retreiver = vectordb.as_retriever()
docs = retreiver.get_relevant_documents("How can I apply for financial aid??")

# Bare last expression so the notebook displays the retrieved documents.
docs

[Document(page_content='need of financial aid, please contact the Financial Aid Office before you apply.', metadata={'source': '/home/msaad/workspace/honors-thesis/data-collection/data/admissions-faid-chunks/chunk_brockport.edu_admissions_apply_non-degree.txt'}),
 Document(page_content='The Free Application for Federal Student Aid (FAFSA) is the first step to determine eligibility for federal and state financial aid programs. You are encouraged to file your The FSA ID is your identifier to let you access your personal information in', metadata={'source': '/home/msaad/workspace/honors-thesis/data-collection/data/admissions-faid-chunks/chunk_brockport.edu_admissions_aid_financial_aid_applications_forms.html.txt'}),
 Document(page_content='In order to be considered for grant money awarded by the federal government, you need to complete the Free Application For Federal Student Aid (FAFSA) application each year. If you are interested in finding out more information about the FAFSA, includin

In [20]:
# Question-answering chain on top of the retriever built earlier.
# return_source_documents=True makes the response carry a "source_documents"
# entry, which process_llm_response reads to list the cited files.
qa_llm = OpenAI()  # presumably picks up its API key from the environment — verify
qa_chain = RetrievalQA.from_chain_type(
    llm=qa_llm,
    chain_type="stuff",
    retriever=retreiver,
    return_source_documents=True,
    verbose=True,
)

# Cite sources
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each retrieved document.

    Args:
        llm_response: Mapping with a 'result' entry (the answer) and a
            'source_documents' entry (an iterable of objects whose ``metadata``
            mapping has a 'source' key).

    Returns:
        None. Everything is written to standard output.
    """
    answer = llm_response['result']
    print(answer)
    print('\n\nSources:')

    # Lazily pull the 'source' metadata out of each document, one line per document.
    cited_paths = (document.metadata['source'] for document in llm_response["source_documents"])
    for cited_path in cited_paths:
        print(cited_path)

In [27]:
# Ask one question end to end: run the QA chain, then print the answer and its sources.
# NOTE(review): the index is built only from the admissions / financial-aid chunks in
# data_dir, so a question about academic programs may have no supporting text. That
# presumably explains an "I don't know" answer here; confirm against the corpus.
query = "Does Brockport have a computer science program?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
 I don't know.


Sources:
/home/msaad/workspace/honors-thesis/data-collection/data/admissions-faid-chunks/chunk_brockport.edu_admissions_apply_transfer.txt
/home/msaad/workspace/honors-thesis/data-collection/data/admissions-faid-chunks/chunk_brockport.edu_admissions_contact_communication-team.txt
/home/msaad/workspace/honors-thesis/data-collection/data/admissions-faid-chunks/chunk_brockport.edu_admissions_information_counselors.txt
/home/msaad/workspace/honors-thesis/data-collection/data/admissions-faid-chunks/chunk_brockport.edu_admissions_information_eop.txt
