## Example: using Unstructured to ingest data from Google Cloud Storage (GCS)

In [None]:
!pip install "unstructured[gcs,all-docs]==0.15.0" langchain chromadb bitsandbytes sentence-transformers "unstructured-ingest[gcs]" langchain-community


In [6]:
import os
from google.cloud import storage


In [9]:
# Point the Google Cloud client libraries at the service-account key.
# The value is a placeholder: replace it with the path to your JSON key file.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="json file to key")

In [11]:
# Argument vector for the `unstructured-ingest` CLI using the GCS source connector.
# Secrets are read from the environment instead of being written into the notebook:
#   * UNSTRUCTURED_API_KEY            -> --api-key (falls back to an empty string)
#   * GOOGLE_APPLICATION_CREDENTIALS  -> --service-account-key (falls back to the
#     placeholder "json file to key")
# "gs://" is a placeholder remote URL: fill in the bucket (and optional prefix) to ingest.
command = [
    "unstructured-ingest",
    "gcs",
    "--remote-url", "gs://",
    "--download-dir", "/content/documents",
    "--output-dir", "/content/output",
    "--num-processes", "2",
    "--api-key", os.environ.get("UNSTRUCTURED_API_KEY", ""),
    "--verbose",
    "--strategy", "hi_res",
    "--partition-by-api",
    "--partition-endpoint", "https://api.unstructured.io/general/v0/general",
    "--service-account-key", os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "json file to key"),
]

In [26]:
import subprocess

# Execute the ingest CLI, buffering both stdout and stderr in memory.
result = subprocess.run(command, capture_output=True)

# A non-zero exit status means the CLI reported a failure.
if result.returncode != 0:
    print("Command failed!")
    print(f"Exit code: {result.returncode}")
    # Show what the process wrote to stderr.
    print(result.stderr.decode('utf-8'))
else:
    print("Command executed successfully!")
    # Show what the process wrote to stdout.
    print(result.stdout.decode('utf-8'))


Command executed successfully!



In [13]:
import json

In [27]:
from unstructured.staging.base import elements_from_json

# Directory the ingest command was told to write its results to (`--output-dir`).
output_path = '/content/output'
if not os.path.isdir(output_path):
    # Fail loudly, as a plain directory listing would, rather than yield an empty list.
    raise FileNotFoundError(f"Ingest output directory not found: {output_path}")

# Collect the elements of every JSON result file under `output_path`.
# The tree is walked recursively and only `*.json` files are parsed, so nested
# output folders and stray non-JSON files do not break the load. Directories and
# files are visited in sorted order to make the resulting element order reproducible.
element = []
for dirpath, dirnames, filenames in os.walk(output_path):
    dirnames.sort()  # in-place sort controls the order os.walk descends in
    for filename in sorted(filenames):
        if not filename.endswith(".json"):
            continue
        filepath = os.path.join(dirpath, filename)
        element.extend(elements_from_json(filepath))

In [28]:
from unstructured.chunking.title import chunk_by_title

# Split the loaded elements into chunks with the title-based chunking strategy,
# using a 512-character maximum and a 200-character combine threshold.
chunk_elements = chunk_by_title(
    elements=element,
    max_characters=512,
    combine_text_under_n_chars=200,
)

In [29]:
from langchain_core.documents import Document

In [30]:
# Convert every chunk into a LangChain `Document`, carrying its metadata along.
documents = []
for chunk_element in chunk_elements:
    metadata = chunk_element.metadata.to_dict()
    # Expose the originating file name under the "source" key.
    metadata["source"] = metadata["filename"]
    # Remove the "languages" entry when present; `pop` with a default avoids a
    # KeyError for chunks whose metadata has no such field.
    metadata.pop("languages", None)
    documents.append(Document(page_content=chunk_element.text, metadata=metadata))


In [31]:
# All three integrations are imported from `langchain_community`; the
# `langchain.embeddings` / `langchain.vectorstores` paths are deprecated re-exports.
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import utils as chroma_utils

# ChromaDB doesn't support complex metadata (e.g. lists), so those fields are dropped.
docs = chroma_utils.filter_complex_metadata(documents)
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
vectorstore = Chroma.from_documents(docs, embeddings)
# Instruct the retriever to return the top 3 results by similarity.
# NOTE: the spelling `retriver` is kept on purpose; the QA-chain cell refers to this name.
retriver = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [32]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook (renders a login widget).
# NOTE(review): presumably required so the model download further down is authorized — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
# `PromptTemplate` and `HuggingFacePipeline` are imported from `langchain_core` /
# `langchain_community` rather than the deprecated `langchain.prompts` / `langchain.llms` paths.
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.chains import RetrievalQA

In [34]:
# Checkpoint identifier used for both the model and its tokenizer.
model_name = "meta-llama/Llama-3.1-8B-Instruct"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized causal LM, then the matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token ids collected as terminators: the tokenizer's EOS id and the id of "<|eot_id|>".
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [35]:
# Build a transformers text-generation pipeline around the loaded model and tokenizer.
text_generation_pipeline = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # Sampling settings.
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    # Output settings: only the newly generated text, at most 200 new tokens.
    max_new_tokens=200,
    return_full_text=False,
    eos_token_id=terminators,
)

# Wrap the pipeline so it can be used as a LangChain LLM.
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [36]:
# Prompt text with two placeholders, {question} and {context}, wrapped in the
# header / end-of-turn marker tokens written out literally below.
prompt_template = """
<|start_header_id|>user<|end_header_id|>
You are an assistant for answering questions using provided context.
You are given the extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "I do not know." Don't make up an answer.
Question: {question}
Context: {context}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Retrieval-QA chain: uses `retriver` for lookup, `llm` for generation, and also
# returns the retrieved source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=retriver,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

In [41]:
# Question passed to the retrieval-QA chain in the cells below.
question = "what is Nguyen Tien Nam?"


In [42]:
# Run the chain and display the generated answer ("query" is the chain's input key).
qa_chain.invoke({"query": question})["result"]


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


'Based on the provided context, it appears that Nguyen Tien Nam is a job applicant, specifically a candidate for the Data Scientist position at Ekimetrics. He has expressed his interest in the role and highlighted his relevant expertise in deep learning and computer vision.'

In [None]:
# Show the documents retrieved for the question.
# The retriever is queried directly: these are the documents the chain reports as
# "source_documents", and this avoids running a second full LLM generation just to read them.
retriver.invoke(question)