In [None]:
# Colab helper used to attach Google Drive to this runtime
from google.colab import drive

# Mount under a custom location; the Drive paths used later in this
# notebook are written against this directory.
mountpoint = "/content/my_drive"
drive.mount(mountpoint)


Mounted at /content/my_drive


In [None]:
# Python packages. Use the %pip magic rather than !pip so the packages are
# installed into the environment of the running kernel.
%pip install --upgrade --quiet langchain langchain-openai langchain-chroma beautifulsoup4
# "unstructured[pdf]" already pulls in the base "unstructured" package, so a
# separate bare install is not needed.
%pip install "unstructured[pdf]"
%pip install pytesseract

# System binaries (not pip-installable): PDF utilities and the Tesseract OCR engine.
!apt-get install -y poppler-utils
!apt-get install -y tesseract-ocr


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
from langchain.document_loaders import TextLoader,DirectoryLoader,WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter
#from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [None]:
# Load every file found in the Drive folder as LangChain documents.
loader = DirectoryLoader("/content/my_drive/MyDrive/chatgpt-retrieval/Lums")
# Alternative source (disabled): load a single web page instead of the folder.
#loader = WebBaseLoader("https://pdc.lums.edu.pk/bakery.php?CUID=VkZaU2FtVkZOVVZSV0c5NlZGZHdjbVZyTlRaaE0yYzk=")
docs = loader.load()

# Split on newlines with chunk_size=1000 and chunk_overlap=100; the resulting
# chunks (`splits`) are what gets embedded and indexed further down.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_documents(docs)

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import os
from getpass import getpass

# Never store the OpenAI key in the notebook itself. Reuse OPENAI_API_KEY when
# the runtime already provides it; otherwise prompt for it without echoing.
# NOTE(review): a literal key was previously committed on this line; treat it
# as leaked and revoke it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

embedding = OpenAIEmbeddings()

# Where the Chroma index is written on Drive.
# NOTE(review): this folder sits inside the directory that DirectoryLoader
# reads above, so a re-run may try to load the index files as documents;
# consider moving it outside (the reload cell uses the same path) - TODO confirm.
persist_directory = "/content/my_drive/MyDrive/chatgpt-retrieval/Lums/persist"

# Embed the chunks and build the vector store backed by persist_directory.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

# save the database so we can use it later
vectordb.persist()

# check that the database has been created and print the number of stored chunks
print(vectordb._collection.count())

775


In [None]:
# Query the vector store directly and keep the three best-matching chunks
# together with their scores.
question = "What is fee strucutre?"
docs = vectordb.similarity_search_with_score(question, k=3)

# Each hit is a (document, score) pair: print the score, then the chunk text.
for chunk, score in docs:
    print("\n")
    print(score)
    print(chunk.page_content)



0.4102347195148468
Fee Structure
Tuition Fee at LUMS is based on the number of credit hours taken in a semester. However, the tuition fee for Super Senior/Third-year law is the same as last year based on the 12-20 credit hours fee policy. Students are responsible for buying their own books and other reading material. For all undergraduate programmes at LUMS, a student is required to take at least 12 credit hours per semester to acquire full-time status.
•
• However, the credit hours requirements in first year varies amongst undergraduate programmes due to its specific design, therefore the tuition fee amount also varies according to the number of credit hours requirements in each programme. The program-wise full-time credit hours requirements for first-year students are as follows:
e
e


0.4126439094543457
Fee for FY 2022-23
The fee structure for 2022-2023 has already been shared with the students at the time of admission through fee information document. Furthermore, LUMS website ca

In [None]:
# Re-open the index saved on Drive, using the same embedding object as before.
persist_directory = "/content/my_drive/MyDrive/chatgpt-retrieval/Lums/persist"
vectordb = Chroma(
    embedding_function=embedding,
    persist_directory=persist_directory,
)

# Chat model used to answer questions (temperature 0).
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Retrieval QA chain with the library's default prompt, fed by the vector
# store's retriever.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

# Ask the question defined earlier and show only the answer text.
result = qa_chain({"query": question})
print(result["result"])

The fee structure at LUMS is based on the number of credit hours taken in a semester. The tuition fee varies depending on the number of credit hours required for each program. For the academic year 2022-2023, the per credit hour rate is Rs. 26,910/- for most undergraduate programs, while for Super Seniors/Third Year Law students, the per credit hour rate is Rs. 36,420/-. Students are responsible for purchasing their own books and reading materials. For specific details, students can refer to the fee information document provided at the time of admission or visit the LUMS website.


In [None]:
# Build prompt: the retrieved chunks fill {context}, the user query fills
# {question}; the model is told to admit when it does not know.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
print(QA_CHAIN_PROMPT)


# Rebuild the chain with the custom prompt and ask it to return the source
# documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

# Run chain: without this call `result` would still hold the answer produced
# by the previous chain, and the custom prompt would never be exercised.
result = qa_chain({"query": question})
print(result["result"])

input_variables=['context', 'question'] template="Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n{context}\nQuestion: {question}\nHelpful Answer:"
The fee structure at LUMS is based on the number of credit hours taken in a semester. The tuition fee varies depending on the number of credit hours required for each program. For the academic year 2022-2023, the per credit hour rate is Rs. 26,910/- for most undergraduate programs, while for Super Seniors/Third Year Law students, the per credit hour rate is Rs. 36,420/-. Students are responsible for purchasing their own books and reading materials. For specific details, students can refer to the fee information document provided at the time of admission or visit the LUMS website.


In [None]:
# Dump the whole response dict from the most recent chain call (not just the
# answer text) so every returned key can be inspected.
print(result)

{'query': 'What is fee strucutre?', 'result': 'The fee structure at LUMS is based on the number of credit hours taken in a semester. The tuition fee varies depending on the number of credit hours required for each program. For the academic year 2022-2023, the per credit hour rate is Rs. 26,910/- for most undergraduate programs, while for Super Seniors/Third Year Law students, the per credit hour rate is Rs. 36,420/-. Students are responsible for purchasing their own books and reading materials. For specific details, students can refer to the fee information document provided at the time of admission or visit the LUMS website.'}
