# 1. Environment Setup


In [None]:
!apt-get install -y libmagic1 poppler-utils tesseract-ocr pandoc
!pip install -U langchain langchain-community langchain-core \
chromadb transformers sentence-transformers bitsandbytes beautifulsoup4 \
unstructured[html] requests lxml pdfminer.six


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
pandoc is already the newest version (2.9.2.1-3ubuntu2).
tesseract-ocr is already the newest version (4.1.1-2.1build1).
libmagic1 is already the newest version (1:5.41-3ubuntu0.1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.11).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.


# 2. Crawl & load the website content

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader

urls = [
    "https://diu.ac/",
    "https://diu.ac/#",
]

loader = UnstructuredURLLoader(urls=urls)
raw_docs = loader.load()

In [None]:
raw_docs

[Document(metadata={'source': 'https://diu.ac/'}, page_content='slider1\n\nWelcome To Dhaka International University\n\nDhaka International University, rated among the top private universities of Bangladesh, is an institution that promotes eastern culture and values, and meaningfully blends eastern and western thoughts and innovation.\n\nApply Now\n\nslider2\n\nAdmission Open Now\n\nUndergraduate - Postgraduate 2025\n\nJoin us for higher study\n\nApply Now\n\nslider3\n\nResult Based Scholarship\n\nAvail Upto 50% Merit Scholarships\n\nScholarship\n\nBuilding Leaders of Tomorrow\n\nTo achieve this, DIU is taking various proactive steps by engaging with professionals and industry leaders graduated from different universities of the country to gain valuable insights that will:\n\nDesign a State-of-the-Art Career Placement Centre: Tailored to address the dynamic demands of the job market, ensuring our students are equipped for success.\n\nEnhance Student Skills Development: Identifying core

# 3. Split documents into manageable chunks

In [None]:
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = splitter.split_documents(raw_docs)

# 4. Embed the chunks & build ChromaDB vector store

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Use a lightweight HF embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Create (or connect to) a local Chroma collection
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name="DIU_website"
)

# 5. Load Llama 2 via HuggingFace and wrap as a LangChain LLM

In [None]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# model_name = "meta-llama/Llama-2-7b-chat-hf"
model_name = "daryl149/llama-2-7b-chat-hf"

# 1. Tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          use_auth_token=True,)



model = AutoModelForCausalLM.from_pretrained(model_name,
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             use_auth_token=True,
                                              load_in_8bit=True,
                                              #load_in_4bit=True
                                             )



# 2. Wrap in a text-generation pipeline
pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 512,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )


# 3. LangChain LLM
llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0})




tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0
  llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0})


In [None]:
llm.predict("Please provide a concise summary of the Book Unmasking AI")

  llm.predict("Please provide a concise summary of the Book Unmasking AI")


"Please provide a concise summary of the Book Unmasking AI: The Ethical And Social Implications Of The Fourth Industrial Revolution by Ramez Naam, including the main arguments and conclusions.\nUnmasking AI: The Ethical And Social Implications Of The Fourth Industrial Revolution is a book written by Ramez Naam that explores the ethical and social implications of artificial intelligence (AI) and the Fourth Industrial Revolution. Here is a concise summary of the book's main arguments and conclusions:\nArguments:\n1. AI is transforming society faster than we realize: Naam argues that AI is changing the world faster than we realize, with far-reaching consequences for our economy, society, and politics. He emphasizes that we need to understand the implications of AI and prepare for the challenges ahead.\n2. AI is a double-edged sword: Naam highlights the potential benefits of AI, such as improved healthcare and transportation, but also the risks, including job displacement, increased inequa

# 6. Assemble the RetrievalQA chain


In [None]:
from langchain.chains import RetrievalQA

retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever
)

In [None]:
query = "Tell me about Dhaka International University?"
answer = qa_chain.run(query)
print(answer)

  answer = qa_chain.run(query)


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Shameem Haider Patwary

Chairman, BOT, Dhaka International University

founder

founder

Dhaka International University, one of the top private universities in Bangladesh, was founded on April 7, 1995, by the late Alhaj Professor Dr. A.B.M Mafizul Islam Patwari. An academic and edupreneur, he believed that knowledge is power. Our dream is to ensure our graduates excel both academically and morally.

Dr. S. Quadir Patwari

Vice-Chairman, BOT, Dhaka International University

CURRENT STATISTICS: OUR STRENGTH

Academic Programs

38

Convocatons

CURRENT FOREIGN STUDENTS

92

CURRENT STUDENTS

14992

FULL TIME TEACHERS AND EMPLOYEES

536

NUMBER OF ALUMNI

28210

OUR PARTNERS

Shameem Haider Patwary

Chairman, BOT, Dhaka International University

founder

founder

Dhaka International University, one of the top private universitie

# For github issue fix


In [None]:
!python fix_notebook_widgets_metadata.py Web_Intelligence_powered_by_Open_LLM.ipynb


python3: can't open file '/content/fix_notebook_widgets_metadata.py': [Errno 2] No such file or directory
