In [1]:
!pip install -q torch transformers accelerate bitsandbytes sentence-transformers unstructured[all-docs]==0.15.0 langchain chromadb langchain_community

In [2]:
!pip install unstructured==0.15.0




In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!mkdir -p "./documents"
!wget https://www.gov.nl.ca/ecc/files/env-protection-pesticides-business-manuals-applic-chapter7.pdf -O "./documents/env-protection-pesticides-business-manuals-applic-chapter7.pdf"
!wget https://ipm.ifas.ufl.edu/pdfs/Citrus_IPM_090913.pptx -O "./documents/Citrus_IPM_090913.pptx"
!wget https://www.gutenberg.org/ebooks/45957.epub3.images -O "./documents/45957.epub"
!wget https://blog.fifthroom.com/what-to-do-about-harmful-garden-and-plant-insects-and-pests.html -O "./documents/what-to-do-about-harmful-garden-and-plant-insects-and-pests.html"

--2024-10-22 13:19:27--  https://www.gov.nl.ca/ecc/files/env-protection-pesticides-business-manuals-applic-chapter7.pdf
Resolving www.gov.nl.ca (www.gov.nl.ca)... 98.143.128.70
Connecting to www.gov.nl.ca (www.gov.nl.ca)|98.143.128.70|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1914250 (1.8M) [application/pdf]
Saving to: ‘./documents/env-protection-pesticides-business-manuals-applic-chapter7.pdf’


2024-10-22 13:19:29 (2.27 MB/s) - ‘./documents/env-protection-pesticides-business-manuals-applic-chapter7.pdf’ saved [1914250/1914250]

--2024-10-22 13:19:29--  https://ipm.ifas.ufl.edu/pdfs/Citrus_IPM_090913.pptx
Resolving ipm.ifas.ufl.edu (ipm.ifas.ufl.edu)... 128.227.68.231
Connecting to ipm.ifas.ufl.edu (ipm.ifas.ufl.edu)|128.227.68.231|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4248570 (4.1M) [application/vnd.openxmlformats-officedocument.presentationml.presentation]
Saving to: ‘./documents/Citrus_IPM_090913.pptx’


2024-10-22

You can use the Unstructured library to preprocess documents one by one and write your own script to walk through a directory.
Alternatively, Unstructured provides source connectors for several storage systems: S3 buckets, blob storage, SFTP, and others.
Here we use the Local source connector. [Unstructured documentation](https://docs.unstructured.io/open-source/ingest/source-connectors/overview)

In [5]:
import logging

# Handle on the ingest pipeline's logger (the name was previously misspelled
# as "unstructed.ingest", which silently created an unrelated logger).
logger = logging.getLogger("unstructured.ingest")

# Drop the first handler installed on the root logger -- presumably the one the
# notebook host pre-installs, to avoid duplicated log lines.
# Guarded so the cell can be re-run, or run on a kernel whose root logger has
# no handlers, without raising IndexError.
if logger.root.handlers:
    logger.root.removeHandler(logger.root.handlers[0])

In [6]:
import os
from unstructured.ingest.connector.local import SimpleLocalConfig
from unstructured.ingest.interfaces import PartitionConfig, ProcessorConfig, ReadConfig
from unstructured.ingest.runner import LocalRunner

In [7]:
# Directory where the ingest pipeline writes one JSON file per processed document.
output_path = '/content/local-ingest-output'
# Create it from Python so the path literal lives in exactly one place
# (exist_ok=True makes the cell safe to re-run).
os.makedirs(output_path, exist_ok=True)

In [8]:
# NOTE(review): every "!" command runs in its own throwaway subshell, so this
# "cd" does not change the notebook's working directory; the %cd magic would be
# needed if a directory change were actually intended.
!cd ..

This example can use the Unstructured API, which requires an Unstructured API key. You can get yours from the Unstructured website. The free Unstructured API is capped at 1000 pages, and offers better OCR models for image-based documents than a local installation of Unstructured.
If you omit the two API parameters (`partition_by_api=True` and `api_key`) — the cell below does this by setting `partition_by_api=False` — the documents will be processed locally, but you may need to install additional dependencies if the documents require OCR and/or document understanding models. Namely, you may need to install poppler and tesseract in this case, which you can get with brew on macOS:

Copied
!brew install poppler
!brew install tesseract

In [9]:
# Run the Unstructured ingest pipeline over the local documents directory:
# read every file, partition it into elements, and write one JSON per document.
runner = LocalRunner(
    processor_config=ProcessorConfig(
        # logs verbosity
        verbose=True,
        # the local directory to store outputs (reuses the path defined earlier
        # so the writer and the reader of the JSON files cannot drift apart)
        output_dir=output_path,
        work_dir='/content/',
        num_processes=2,
    ),
    read_config=ReadConfig(),
    partition_config=PartitionConfig(
        # Partition locally. To use the hosted API instead, set this to True and
        # read the key from the environment -- never paste a key into the notebook.
        partition_by_api=False,
        # api_key=os.environ["UNSTRUCTURED_API_KEY"],
    ),
    connector_config=SimpleLocalConfig(
        input_path="/content/documents",
        # whether to get the documents recursively from given directory
        recursive=True,
    ),
)
runner.run()

2024-10-22 13:19:43,681 MainProcess INFO     running pipeline: DocFactory -> Reader -> Partitioner -> Copier with config: {"reprocess": false, "verbose": true, "work_dir": "/content/", "output_dir": "/content/local-ingest-output", "num_processes": 2, "raise_on_error": false}
2024-10-22 13:19:43,866 MainProcess INFO     Running doc factory to generate ingest docs. Source connector: {"processor_config": {"reprocess": false, "verbose": true, "work_dir": "/content/", "output_dir": "/content/local-ingest-output", "num_processes": 2, "raise_on_error": false}, "read_config": {"download_dir": "", "re_download": false, "preserve_downloads": false, "download_only": false, "max_docs": null}, "connector_config": {"input_path": "/content/documents", "recursive": true, "file_glob": null}}
2024-10-22 13:19:43,882 MainProcess INFO     processing 4 docs via 2 processes
2024-10-22 13:19:43,896 MainProcess INFO     Calling Reader with 4 docs
2024-10-22 13:19:43,897 MainProcess INFO     Running source nod

In [10]:
from unstructured.staging.base import elements_from_json

# Collect the partitioned elements of every document the ingest run wrote.
# Only *.json outputs are read, so stray files or sub-directories in the output
# folder are skipped instead of being handed to the JSON loader, and the names
# are sorted so the element order (and the chunking that follows) is the same
# on every run rather than depending on os.listdir's arbitrary ordering.
element = []
for filename in sorted(os.listdir(output_path)):
  if not filename.endswith(".json"):
    continue
  filepath = os.path.join(output_path, filename)
  element.extend(elements_from_json(filepath))

In [11]:
from unstructured.chunking.title import chunk_by_title

# Split the parsed elements into title-delimited chunks, passing the size
# limits explicitly, one keyword per line.
chunk_elements = chunk_by_title(
    elements=element,
    max_characters=512,
    combine_text_under_n_chars=200,
)

In [12]:
# Sanity check: number of chunks produced by chunk_by_title.
len(chunk_elements)

159

In [13]:
from langchain_core.documents import Document

In [14]:
# Wrap every chunk in a LangChain Document, carrying the chunk's metadata along.
documents = []
for chunk_element in chunk_elements:
  metadata = chunk_element.metadata.to_dict()
  # Expose the originating file under "source" as well. Chunks whose metadata
  # has no filename are kept (without a source) instead of raising KeyError.
  if "filename" in metadata:
    metadata["source"] = metadata["filename"]
  # Drop "languages" if present; pop() with a default never raises, unlike del.
  metadata.pop("languages", None)
  documents.append(Document(page_content=chunk_element.text, metadata=metadata))


In [15]:
# All three helpers come from langchain_community; the `langchain.embeddings`
# and `langchain.vectorstores` paths are deprecated re-export shims.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import utils as chroma_utils

# ChromaDB doesn't support complex metadata (e.g. lists), so we drop it.
docs = chroma_utils.filter_complex_metadata(documents)
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
vectorstore = Chroma.from_documents(docs, embeddings)
# Instructs the retriever to return the top 3 results.
# (The spelling "retriver" is kept because later cells reference this name.)
retriver = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [19]:
from huggingface_hub import notebook_login

# Opens the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably required because the Llama checkpoint loaded later
# is access-gated -- confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.chains import RetrievalQA

In [20]:
# Checkpoint to load, and the 4-bit (NF4, double-quantised, bfloat16 compute)
# bitsandbytes configuration it is loaded with.
model_name = "meta-llama/Llama-3.1-8B-Instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Generation stops on either the tokenizer's EOS token or the <|eot_id|> token.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [21]:
# Text-generation pipeline around the quantised model, wrapped for LangChain.
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # Low-temperature sampling with a mild repetition penalty.
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    # Return only the newly generated text (not the prompt), capped at 200 tokens.
    return_full_text=False,
    max_new_tokens=200,
    eos_token_id=terminators,
    # Set the padding token explicitly: this is the value generate() falls back
    # to anyway, but stating it avoids the "Setting `pad_token_id` to
    # `eos_token_id`" warning printed on every call.
    pad_token_id=tokenizer.eos_token_id,
)
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [36]:
# Prompt in the Llama 3.1 chat layout: nothing precedes the first header, and
# each role header is followed by a blank line before the message body.
# (<|begin_of_text|> is not written here; it is expected to be added when the
# prompt is tokenized.)
prompt_template = (
    "<|start_header_id|>user<|end_header_id|>\n\n"
    "You are an assistant for answering questions using provided context.\n"
    "You are given the extracted parts of a long document and a question. Provide a conversational answer.\n"
    "If you don't know the answer, just say \"I do not know.\" Don't make up an answer.\n"
    "Question: {question}\n"
    "Context: {context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)

prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Retrieval-augmented QA chain: retrieved chunks are inserted as {context};
# the source documents are returned alongside the generated answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriver,
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,
)

In [31]:
# Sample query passed to qa_chain in the following cells.
question = "Are aphids a pest?"


In [32]:
# Run retrieval + generation for the question and display only the answer text.
qa_response = qa_chain.invoke(question)
qa_response["result"]


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


"Yes, aphids are considered a pest. According to the text, they suck the nutrient-rich liquids out of plants, multiply quickly, and need to be controlled immediately. In fact, the author mentions having a problem with aphids this growing season and considering moving their plants due to the infestation. The text also describes the signs of aphid infestation, such as the sticky residue called honeydew left behind by the aphids. So, it's clear that aphids are a pest that gardeners should be aware of and take steps to manage."

In [37]:
# Return source documents.
# Ask the retriever directly: these are the documents the chain passes to the
# LLM as context, and this avoids running (and re-sampling) a second full
# generation just to read them.
retriver.invoke(question)

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


[Document(metadata={'file_directory': '/content/documents', 'filename': 'what-to-do-about-harmful-garden-and-plant-insects-and-pests.html', 'filetype': 'text/html', 'last_modified': '2024-10-22T11:52:42', 'orig_elements': 'eJylU9tu2zAM/RXCz3HqS9Im2RcUG7qXAnvoioCR6FiLLRmS3Cwo+u8j5TQt1u5pMBA71NHh4SH58JxRRz3ZuDU620C2rotdtSoWy+tyXVXNat3cKFK45Keqm12TzSDrKaLGiIx/zuRjG9zoFb3+p63yxK9EWBXVIi+LvKqgrDflerMo5nVRFcsboUro3mnTmI/wcrOsNovqghu8UxTC38B75q2KTbGYL4qyLGrBD+R7E4JxNmzPUh+eM04kIuu6XK1eHhk2+k64rpSzkT240k6NYka4OrYY8+hy7XLcuTHmLfq+Gbt8j16TzdHqfOjQxtzYQCqGKUIhhnkb+y57YfrGdLTVxvO586fPM2VnnMVetGX/nfjMF09D4ov0O169HvC9/Yh7CuJHRnafPaZoiP9owv37Jgzo3waFh4IWy7LeUb1c7/RNpdQKa1wUN9dY7mqVDJDkAr6NP8eqKNcBBm96gqkWY/fA4iGVA4EwODtLkdiSJzABrIutwI7OB+IwWjiXPd2UsjnyhFpQJx5D2BGO0bBjE2+Yww8SCEMhOOhH1UJkETPonaXTlLBzTwKKDoRikhfexIUJZSI0RF0ATQ0PuKTkG42Row+qNP96d/pEFxdzmMMtYMdrok/QogYEaTYcTWwBh9bowNWyAXvvjkIxuSMOxIRnEzXtJg2mEWVHDPBr5CIZaciLsp6rmsM3F87ErTtKmDzbpXijxOc+ST4QDdCfLtXegjaa7eI0nji3Z9Oc7Yyl1/6wFuwOpCdmXs2G0r4h2

In [38]:
# TODO: try hybrid search