### Get Started

In [1]:
import os
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings

In [3]:
# Anchor the database folder to the notebook's working directory.
# The earlier dirname(abspath(__name__)) form only produced this directory by
# accident: __name__ is the string "__main__" in a notebook, not a file path.
DIR = os.getcwd()
DB_PATH = os.path.join(DIR, 'db/')
DB_PATH

'/home/sanchez/VScode_consolidated/task1/db/'

In [4]:
# Open the persistent ChromaDB store rooted at DB_PATH, using the default
# settings, tenant and database.
# NOTE(review): PersistentClient is given a filesystem path, so this presumably
# works on local files rather than connecting to a running server — confirm.
client = chromadb.PersistentClient(
  path=DB_PATH,
  settings=Settings(),
  tenant=DEFAULT_TENANT,
  database=DEFAULT_DATABASE,)


In [None]:
# Create the "weka" collection on the first run, or reuse it when it already
# exists. create_collection() raises if the name is already present in the
# persisted store, which made this cell fail on every re-run.
collection = client.get_or_create_collection("weka")

In [51]:
# Seed the collection with three toy sentences, one per category.
documents = ["This is a beach", "The forest is green", "Urban areas are crowded"]
# One metadata record per sentence, in the same order as `documents`.
metadatas = [{"category": label} for label in ("beach", "forest", "urban")]
# String ids "1".."3", matching the document positions.
ids = [str(position) for position in range(1, len(documents) + 1)]

collection.add(ids=ids, metadatas=metadatas, documents=documents)

In [46]:
# Semantic search: ask the collection for the three entries closest to the query text.
results = collection.query(
    n_results=3,
    query_texts=["weka supporting dataset formats"],
)

# Show the raw response dictionary (ids, distances, metadatas, documents, ...).
print(results)

{'ids': [['2', '3', '4']], 'distances': [[0.8691518582313926, 0.9016820837823974, 0.9688559958091476]], 'metadatas': [[None, None, None]], 'embeddings': None, 'documents': [['the default setting of 16 to 64MB is usually too small. If you get error s that\nclasses are not found, check your CLASSPATH : does it include weka.jar ? You\ncan explicitly set CLASSPATH via the-cpcommand line option as well.\nWe will begin by describing basic concepts and ideas. Then, we will desc ribe\ntheweka.filters package, which is used to transform input data, e.g. for\npreprocessing, transformation, feature generation and so on.', 'Then we will focus on the machine learning algorithms themselves. The se\nare called Classiﬁers in WEKA. We will restrict ourselves to common set tings\nfor all classiﬁers and shortly note representatives for all main app roaches in\nmachine learning.\nAfterwards, practical examples are given.\nFinally, in the docdirectory of WEKA you ﬁnd a documentation of all java\nclasses wi

### SentenceTransformer

In [53]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [54]:
# Load the all-MiniLM-L6-v2 sentence-embedding model.
# NOTE(review): the name `model` is rebound later in this notebook (llmware
# model, then the Qwen causal LM), so cells must be run in order.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [60]:
# Encode the same word in upper and lower case, then check whether the two
# embeddings are element-for-element identical.
DOG_embeddings, dog_embeddings = (model.encode([word]) for word in ("DOG", "dog"))
(DOG_embeddings == dog_embeddings).all()

np.True_

In [62]:
# Encode the three degrees of "good" in one pass.
good_embeddings, better_embeddings, best_embeddings = (
  model.encode([word]) for word in ("good", "better", "best")
)
# Pairwise exact-equality checks between the three embeddings.
print(
  (good_embeddings == better_embeddings).all(),
  (good_embeddings == best_embeddings).all(),
  (better_embeddings == best_embeddings).all()
)

False False False


In [64]:
# Compare a two-word phrase against a single word with a related meaning.
very_good_embeddings, better_embeddings_embeddings = (
  model.encode([phrase]) for phrase in ("very good", "better")
)
print((very_good_embeddings == better_embeddings_embeddings).all())

False


### Add LangChain

In [79]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

In [80]:
# Embedding function backed by the all-MiniLM-L6-v2 model; it is handed to the
# Chroma vector store below.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [81]:
# LangChain wrapper around a Chroma collection persisted under "db/".
# NOTE(review): "documnet_collection" looks like a typo for
# "document_collection"; it is left as-is because renaming it would point the
# store at a different persisted collection — confirm before changing.
vector_store = Chroma(
    persist_directory="db/",
    embedding_function=embeddings,
    collection_name="documnet_collection",
)

In [82]:
# Add documents
from uuid import uuid4
from langchain_core.documents import Document

# (page_content, source) for each sample post; ids 1..10 follow list order.
_SAMPLE_POSTS = [
    ("I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
    ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
    ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
    ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
    ("Is the new iPhone worth the price? Read this review to find out.", "website"),
    ("The top 10 soccer players in the world right now.", "website"),
    ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
    ("The stock market is down 500 points today due to fears of a recession.", "news"),
    ("I have a bad feeling I am going to get deleted :(", "tweet"),
]

# Build one Document per row and bind them to the usual document_N names.
(
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
) = [
    Document(page_content=text, metadata={"source": origin}, id=number)
    for number, (text, origin) in enumerate(_SAMPLE_POSTS, start=1)
]


In [161]:
# Three sample posts created without an explicit id.
document_11, document_12, document_13 = [
    Document(page_content=text, metadata={"source": origin})
    for text, origin in [
        ("I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
        ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
        ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ]
]

In [163]:
documents = [
    document_11,
    document_12,
    document_13,
]

# No ids are passed, so id assignment is left to the vector store; the ids it
# used are returned and shown as the cell output. (The previous
# `ids = [11, 12, 13]` list was never handed to add_documents and has been
# removed as misleading dead code.)
vector_store.add_documents(documents=documents)

['0638bc76-8c5d-48e4-8c75-c3fcaf38478f',
 '33fa3b82-e01c-4313-8707-b97cb2db4044',
 '3c48a076-f043-4e4e-b663-7a1406080471']

### Upload to database

In [1]:
import os
# Anchor the database folder to the notebook's working directory.
# dirname(abspath(__name__)) only resolved to this directory by accident,
# because __name__ is the string "__main__" rather than a file path.
DIR = os.getcwd()
DB_PATH = os.path.join(DIR, 'db/')

from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
import chromadb
# Open the persistent Chroma store rooted at DB_PATH with default settings.
persistent_client = chromadb.PersistentClient(
    path=DB_PATH,
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,)

In [2]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding function backed by all-MiniLM-L6-v2; passed to the Chroma store in the next cell.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [3]:
from langchain_chroma import Chroma
# Bind LangChain's Chroma wrapper to the "weka" collection of the persistent
# client, embedding texts with the HuggingFace model configured above.
collection_name = "weka"
vector_store = Chroma(
  embedding_function=embeddings,
  collection_name=collection_name,
  client=persistent_client,
)

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Chunking policy shared by the split helpers below:
#   - sizes are measured with len(), i.e. in characters
#   - chunks of up to 500 with a 50 overlap
#   - split on paragraph breaks first, then lines, then spaces, then anywhere
#   - separators are plain strings (not regexes) and are dropped from the chunks
#   - each chunk's start offset is recorded in its metadata
splitter = RecursiveCharacterTextSplitter(
  separators=['\n\n', '\n', ' ', ''],
  is_separator_regex=False,
  keep_separator=False,
  chunk_size=500,
  chunk_overlap=50,
  length_function=len,
  add_start_index=True,
)

In [5]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.docstore.document import Document

In [6]:
# Split the large document into small documents
async def split_document(Document:Document):
  """Chunk one document's text with the shared splitter.

  Returns a 2-tuple: the document's own metadata, and the list of chunk
  documents created from its page_content.
  """
  source_metadata = Document.metadata
  text_chunks = splitter.create_documents([Document.page_content])
  return source_metadata, text_chunks

In [7]:
from typing import List
# Split text into chunks
async def split_text(input:str|List[str]):
  if isinstance(input,str):
    return splitter.create_documents([input])
  if isinstance(input,List[str]):
    return splitter.create_documents(input)
  return ''

In [9]:
# Load the pdf file
async def load_pdf(pdf_file):
  """Stream a PDF page by page, yielding each page's split result.

  For every page the loader produces, a running 1-based page number is
  printed and the value returned by split_document for that page is yielded.
  """
  # The loader is created per call, inside the function.
  pdf_loader = PyPDFLoader(
    pdf_file,
    extract_images=True,
    headers=None,
    extraction_mode="plain",
  )
  page_number = 0
  async for page in pdf_loader.alazy_load():
    page_number += 1
    page_document = Document(
      page_content=page.page_content,
      metadata=page.metadata,
    )
    print("page:", page_number)

    # Hand back one page's worth of chunks at a time.
    yield await split_document(page_document)

In [10]:
async def upload_pdf(pdf_file:str):
  """Load a PDF, chunk it page by page, and add the chunks to the vector store.

  For each page, the page-level metadata is merged into every chunk's
  metadata, the chunks are added to `vector_store`, and the number of stored
  chunks is printed.

  Args:
    pdf_file: path of the PDF to upload.
  """
  async for metadata, chunks in load_pdf(pdf_file):
    if not chunks:
      # A page that yields no chunks has nothing to store. The earlier id
      # bookkeeping indexed ids[-1] here and raised IndexError on such pages.
      print(0)
      continue

    for chunk in chunks:
      chunk.metadata.update(metadata)

    # No explicit ids are passed, so id assignment is left to the vector store.
    # The integer ids computed previously were never handed over and have been
    # dropped as dead code.
    results = vector_store.add_documents(documents=chunks)
    print(len(results))

In [11]:
async def main():
  """Upload the WekaManual_13to15.pdf excerpt into the vector store."""
  await upload_pdf("WekaManual_13to15.pdf")
# Top-level await: relies on the notebook's already-running event loop.
await main()

page: 1
4
page: 2
5
page: 3
3


In [12]:
# Retrieve the single most similar chunk for the question and show it.
results = vector_store.similarity_search(
  query="what are weka supported dataset formats?",
  k=1,
)
print(results)

[Document(metadata={'page': 2, 'source': 'WekaManual_13to15.pdf', 'start_index': 889}, page_content='the main() routine of weka.core.Instances :\njava weka.core.Instances data/soybean.arff\nweka.core oﬀers some other useful routines, e.g. converters.C45Loader and\nconverters.CSVLoader ,whichcanbeusedtoimportC45datasetsandcomma/tab-\nseparated datasets respectively, e.g.:\njava weka.core.converters.CSVLoader data.csv > data.arf f\njava weka.core.converters.C45Loader c45_filestem > data .arff')]


### Integrate LLM Model

In [13]:
from llmware.models import ModelCatalog

In [14]:
# Load the "bling-phi-3-gguf" model from the llmware catalog with sampling
# disabled and temperature 0.0. This rebinds `model`, which earlier held the
# SentenceTransformer.
model = ModelCatalog().load_model("bling-phi-3-gguf", temperature=0.0, sample=False)

In [15]:
# Use the chunk retrieved by the similarity search above as the context,
# instead of a pasted copy of its text, so the answer always reflects what the
# vector store actually returned.
response = model.inference(
  "what dataset file format does weka support",
  add_context=results[0].page_content,
)
response

{'llm_response': 'weka supports the ARFF format.',
 'usage': {'input': 160,
  'output': 8,
  'total': 168,
  'metric': 'tokens',
  'processing_time': 60.769243240356445}}

#### Qwen2.5

In [5]:
from transformers import pipeline
# Text-generation pipeline backed by the Qwen2.5-0.5B-Instruct checkpoint.
pipe = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")

In [7]:
# Conversation so far: a single user turn.
messages = [dict(role="user", content="Who are you?")]

# Only the text of the most recent message is passed to the pipeline.
latest_message = messages[-1]
input_text = latest_message["content"]

In [8]:
# Generate one continuation of the raw user text.
# NOTE(review): the recorded run warns that `max_length` is set without
# explicit truncation, and the recorded reply stops mid-sentence; presumably a
# `max_new_tokens` budget was intended — confirm.
response = pipe(input_text, max_length=50, num_return_sequences=1)

# Print the generated text of the first (only) returned sequence.
print("Generated Response:", response[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Generated Response: Who are you? I am an AI language model created by Alibaba Cloud, and my primary purpose is to assist users in generating human-like text. My ability to understand natural language and generate coherent responses allows me to be used for a variety of applications such


## Pipeline to connect database and model

In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# @title set up vectorstore
import os
# Anchor the database folder to the notebook's working directory.
# dirname(abspath(__name__)) only resolved to this directory by accident,
# because __name__ is the string "__main__" rather than a file path.
DIR = os.getcwd()
DB_PATH = os.path.join(DIR, 'db/')

from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
import chromadb
# Open the persistent Chroma store rooted at DB_PATH with default settings.
persistent_client = chromadb.PersistentClient(
  path=DB_PATH,
  settings=Settings(),
  tenant=DEFAULT_TENANT,
  database=DEFAULT_DATABASE,)

from langchain_huggingface import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

from langchain_chroma import Chroma
collection_name = "weka"
# LangChain view of the "weka" collection, using the embedding model above.
vectorstore = Chroma(
  client=persistent_client,
  collection_name=collection_name,
  embedding_function=embeddings,
)
# Retriever interface over the store, used as the first step of the RAG chain.
retriever = vectorstore.as_retriever()

In [20]:
from langchain import hub
# Pull the "rlm/rag-prompt" template from the LangChain hub.
# NOTE(review): `prompt` is rebound by ChatPromptTemplate.from_template(template)
# further down, so this pulled template looks unused — confirm before relying on it.
prompt = hub.pull("rlm/rag-prompt")



In [37]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from transformers import AutoModelForCausalLM, AutoTokenizer

In [40]:
# Checkpoint shared by the causal LM here and the tokenizer loaded next.
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# Load the causal LM, letting the library choose dtype and device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto",
)

In [42]:
# Tokenizer for the same checkpoint as the model (model_name).
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [55]:
# RAG prompt text with two placeholders: {context} receives the retrieved
# chunks and {question} receives the user's question.
template = """Answer the question based only on the following context:

{context}

Question: {question}
"""

In [54]:
# Build a chat prompt from the template string; its input variables are
# `context` and `question`. This rebinds `prompt`.
prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

In [56]:
def format_docs(docs):
    """Join the page_content of each document into one string.

    Documents are kept in order and separated by a blank line; an empty
    input produces an empty string.
    """
    contents = []
    for doc in docs:
        contents.append(doc.page_content)
    return "\n\n".join(contents)

In [57]:
# Inputs for the prompt: "context" is the retriever's hits formatted into one
# string, and "question" is the incoming query passed through unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
# Pipe those inputs into the prompt template; the chain stops at the rendered prompt.
chain = rag_inputs | prompt
chain.invoke("what dataset file format does weka support")

ChatPromptValue(messages=[HumanMessage(content='Answer the question based only on the following context:\n\nthe main() routine of weka.core.Instances :\njava weka.core.Instances data/soybean.arff\nweka.core oﬀers some other useful routines, e.g. converters.C45Loader and\nconverters.CSVLoader ,whichcanbeusedtoimportC45datasetsandcomma/tab-\nseparated datasets respectively, e.g.:\njava weka.core.converters.CSVLoader data.csv > data.arf f\njava weka.core.converters.C45Loader c45_filestem > data .arff\n\n14 CHAPTER 1. A COMMAND-LINE PRIMER\n1.2 Basic concepts\n1.2.1 Dataset\nA set of data items, the dataset, is a very basic concept of machine learning. A\ndatasetisroughlyequivalenttoatwo-dimensionalspreadsheetor databasetable.\nIn WEKA, it is implemented by the weka.core.Instances class. A dataset is\na collection of examples, each one of class weka.core.Instance . Each Instance\nconsists of a number of attributes, any of which can be nominal (= one of a\n\nversions, date/time attribute ty

In [58]:
# Render the RAG prompt through the chain instead of keeping a pasted copy of
# its output, so the context always matches what the retriever returns.
# The chain yields a prompt value with a single human message; its content is
# the text sent to the model.
query = chain.invoke("what dataset file format does weka support").messages[0].content

In [59]:
# Chat-format conversation: a fixed system instruction followed by the user turn.
messages = [
    {"role": "system", "content": "You are a chatbot. You are here to help users with their questions."},
    {"role": "user", "content": query} # prompt text (context + question)
]
messages

[{'role': 'system',
  'content': 'You are a chatbot. You are here to help users with their questions.'},
 {'role': 'user',
  'content': "Answer the question based only on the following context:\n\nthe main() routine of weka.core.Instances :\njava weka.core.Instances data/soybean.arff\nweka.core oﬀers some other useful routines, e.g. converters.C45Loader and\nconverters.CSVLoader ,whichcanbeusedtoimportC45datasetsandcomma/tab-\nseparated datasets respectively, e.g.:\njava weka.core.converters.CSVLoader data.csv > data.arf f\njava weka.core.converters.C45Loader c45_filestem > data .arff\n\n14 CHAPTER 1. A COMMAND-LINE PRIMER\n1.2 Basic concepts\n1.2.1 Dataset\nA set of data items, the dataset, is a very basic concept of machine learning. A\ndatasetisroughlyequivalenttoatwo-dimensionalspreadsheetor databasetable.\nIn WEKA, it is implemented by the weka.core.Instances class. A dataset is\na collection of examples, each one of class weka.core.Instance . Each Instance\nconsists of a number o

In [60]:
# Render the conversation with the tokenizer's chat template as plain text
# (not token ids), ending with the marker that opens the assistant's reply.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
text

"<|im_start|>system\nYou are a chatbot. You are here to help users with their questions.<|im_end|>\n<|im_start|>user\nAnswer the question based only on the following context:\n\nthe main() routine of weka.core.Instances :\njava weka.core.Instances data/soybean.arff\nweka.core oﬀers some other useful routines, e.g. converters.C45Loader and\nconverters.CSVLoader ,whichcanbeusedtoimportC45datasetsandcomma/tab-\nseparated datasets respectively, e.g.:\njava weka.core.converters.CSVLoader data.csv > data.arf f\njava weka.core.converters.C45Loader c45_filestem > data .arff\n\n14 CHAPTER 1. A COMMAND-LINE PRIMER\n1.2 Basic concepts\n1.2.1 Dataset\nA set of data items, the dataset, is a very basic concept of machine learning. A\ndatasetisroughlyequivalenttoatwo-dimensionalspreadsheetor databasetable.\nIn WEKA, it is implemented by the weka.core.Instances class. A dataset is\na collection of examples, each one of class weka.core.Instance . Each Instance\nconsists of a number of attributes, any o

In [61]:
# Tokenize the rendered prompt as a batch of one, returning PyTorch tensors,
# and move them to the device the model is on.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
model_inputs

{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,   6236,   6331,     13,
           1446,    525,   1588,    311,   1492,   3847,    448,    862,   4755,
             13, 151645,    198, 151644,    872,    198,  16141,    279,   3405,
           3118,   1172,    389,    279,   2701,   2266,   1447,   1782,   1887,
            368,  14021,    315,    582,   4554,   4871,   5337,   9436,   6260,
          10042,    582,   4554,   4871,   5337,   9436,    821,     14,    704,
             88,  17479,  16711,    542,    198,    896,   4554,   4871,    297,
         145730,    388,   1045,   1008,   5390,  29497,     11,    384,   1302,
             13,  88888,    727,     19,     20,   9181,    323,    198,  14166,
            388,    727,  17803,   9181,   1154,   8206,   4814,   1371,   2591,
            983,    474,     34,     19,     20,  65546,    437,  45386,  78859,
           6913,    325,  49600,  29425,  15576,     11,    384,   1302,     13,
            51

In [62]:
# Generate up to 512 new tokens; the returned sequences still begin with the
# prompt tokens.
generated_ids = model.generate(max_new_tokens=512, **model_inputs)
generated_ids

tensor([[151644,   8948,    198,   2610,    525,    264,   6236,   6331,     13,
           1446,    525,   1588,    311,   1492,   3847,    448,    862,   4755,
             13, 151645,    198, 151644,    872,    198,  16141,    279,   3405,
           3118,   1172,    389,    279,   2701,   2266,   1447,   1782,   1887,
            368,  14021,    315,    582,   4554,   4871,   5337,   9436,   6260,
          10042,    582,   4554,   4871,   5337,   9436,    821,     14,    704,
             88,  17479,  16711,    542,    198,    896,   4554,   4871,    297,
         145730,    388,   1045,   1008,   5390,  29497,     11,    384,   1302,
             13,  88888,    727,     19,     20,   9181,    323,    198,  14166,
            388,    727,  17803,   9181,   1154,   8206,   4814,   1371,   2591,
            983,    474,     34,     19,     20,  65546,    437,  45386,  78859,
           6913,    325,  49600,  29425,  15576,     11,    384,   1302,     13,
            510,  10042,    

In [63]:
# Drop the prompt prefix from every generated sequence so that only the newly
# produced tokens remain.
# NOTE: this rebinds `generated_ids`, so re-running this cell without
# re-running generation trims the sequences a second time.
completion_only = []
for prompt_ids, full_ids in zip(model_inputs.input_ids, generated_ids):
    completion_only.append(full_ids[len(prompt_ids):])
generated_ids = completion_only
generated_ids

[tensor([    54,  52313,  11554,   3807,  19856,    369,  10337,   3542,     13,
           3776,   4185,   3561,    374,    279,  27445,    320,   1092,   1728,
           6222,  11584,    657,  24979,      8,   3561,     13,  13293,   5411,
           3561,    374,    279,   1644,    542,    320,   3907,     12,   1130,
          15042,    568,   1205,   4554,   1083,   5707,  88888,    311,   2795,
           5257,   4494,    315,  29425,   1741,    438,    356,     19,     13,
             20,  29425,    323,  31683,  72692,   2750,  29465,  18104,      8,
           3542,     13, 151645])]

In [64]:
# Decode the token ids back to text, skipping special tokens, and keep the
# first sequence of the batch.
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response

'Weka supports several formats for dataset files. One common format is the CSV (Comma-Separated Values) format. Another popular format is the Arff (Attribute-Value Format). Weka also provides converters to load various types of datasets such as C4.5 datasets and comma-separated values (.csv) files.'