In [136]:
import weaviate

# Open a connection to the local Weaviate instance using the client defaults.
client = weaviate.connect_to_local()

# Readiness check against the server.
print(client.is_ready())  # Should print: `True`

# NOTE(review): the connection is left open here (the close call is commented
# out); a later query cell calls `client.collections.get(...)` without
# reconnecting — confirm nothing depends on it before enabling this line.
# client.close()

True


## 1. Create Collection

In [None]:
import weaviate
from weaviate.classes.config import Configure, Property, DataType

client = weaviate.connect_to_local()

# client.collections.delete("ResearchPapers")  # THIS WILL DELETE THE SPECIFIED COLLECTION(S) AND THEIR OBJECTS

try:
    # Make the cell idempotent: on a re-run the collection is already there, and
    # asking the server to create it again would only produce an error.
    # To rebuild it with a different configuration, delete it first (see the
    # commented-out line above).
    if client.collections.exists("ResearchPapers"):
        papers = client.collections.get("ResearchPapers")
        print("Collection 'ResearchPapers' already exists; skipping creation.")
    else:
        papers = client.collections.create(
            name="ResearchPapers",
            vectorizer_config=Configure.Vectorizer.text2vec_ollama(     # Configure the Ollama embedding integration
                api_endpoint="http://host.docker.internal:11434",       # Allow Weaviate from within a Docker container to contact your Ollama instance
                model="nomic-embed-text",                               # The model to use
            ),
            generative_config=Configure.Generative.ollama(              # Configure the Ollama generative integration
                api_endpoint="http://host.docker.internal:11434",       # Allow Weaviate from within a Docker container to contact your Ollama instance
                model="llama3.2",                                       # The model to use
            ),
            properties=[
                # Only `chunk_text` is embedded; the other fields are stored as plain metadata.
                Property(name="paper_title", data_type=DataType.TEXT, skip_vectorization=True),
                Property(name="doc_id", data_type=DataType.TEXT, skip_vectorization=True),
                Property(name="chunk_text", data_type=DataType.TEXT, skip_vectorization=False),
                Property(name="tag", data_type=DataType.TEXT, skip_vectorization=True),
            ]
        )

        print("Collection 'ResearchPapers' created successfully.")
except weaviate.exceptions.WeaviateBaseError as e:
    # Base class of the client's own exceptions, so schema/HTTP failures are
    # reported here and not only query (gRPC) failures.
    print(f"Error creating collection: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
finally:
    # Always release the connection, whether or not creation succeeded.
    client.close()


Collection 'ResearchPapers' created successfully.


/Users/moraish/Desktop/ams691/project_llm/.venv/lib/python3.9/site-packages/weaviate/collections/classes/config.py:1950: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for cls_field in self.model_fields:
            Please make sure to close the connection using `client.close()`.
  papers = client.collections.create(


## 2. Ingest Data

In [130]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import SentenceTransformersTokenTextSplitter
# from unstructured.partition.auto import partition

# Define paths
# NOTE(review): this single-file path is not referenced by the code in this cell;
# the ingestion loop in a later cell rebinds `pdf_path` for every PDF it finds.
# It is also an absolute, machine-specific location.
pdf_path = '/Users/moraish/Desktop/ams691/project_llm/data/1_Prefix-Tuning- Optimizing Continuous Prompts for Generation.pdf'

def get_chunks(document, chunk_size=1000, chunk_overlap=200):
    """Split a document into a list of text chunks.

    Args:
        document: Either a plain string, or a list of Document-like objects
            exposing a ``page_content`` attribute (e.g. the pages returned by
            a PDF loader).
        chunk_size: Forwarded to ``SentenceTransformersTokenTextSplitter``.
        chunk_overlap: Forwarded to ``SentenceTransformersTokenTextSplitter``.

    Returns:
        A list of chunk strings.

    Raises:
        ValueError: If ``document`` is neither a ``str`` nor a ``list``.
    """
    # Validate the input before building the splitter, so an unsupported
    # argument fails immediately instead of after the splitter has been set up.
    is_text = isinstance(document, str)
    if not is_text and not isinstance(document, list):
        raise ValueError("Unsupported document type: Expected str or list of Document objects.")

    # NOTE(review): this splitter appears to size chunks via `tokens_per_chunk`
    # rather than `chunk_size` — confirm that `chunk_size` has the intended effect.
    splitter = SentenceTransformersTokenTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    if is_text:
        return splitter.split_text(document)

    # Assume document is a list of Document objects with a "page_content" attribute
    return [piece.page_content for piece in splitter.split_documents(document)]

In [131]:
def process_document(filename, chunks):
    """Build one property dictionary per chunk of a paper.

    The file name must look like ``{paper_id}_{paper_title}.pdf``; only the
    first underscore separates the id from the title, so titles may contain
    underscores themselves.

    Args:
        filename: Path or bare file name of the source PDF.
        chunks: Iterable of chunk strings belonging to that PDF.

    Returns:
        A list of dicts with the keys ``paper_title``, ``doc_id``,
        ``chunk_text`` and ``tag`` (``tag`` is always an empty string).
        ``doc_id`` is ``"{paper_id}_chunk_{n}"`` with ``n`` starting at 1.

    Raises:
        ValueError: If the file is not a PDF, or its name does not contain a
            non-empty id and a non-empty title separated by an underscore.
    """
    base = os.path.basename(filename)
    if not base.lower().endswith(".pdf"):
        raise ValueError("Filename must be a PDF.")

    # Drop the 4-character ".pdf" suffix, then split at the first underscore only.
    name_without_ext = base[:-4]
    paper_id, separator, paper_title = name_without_ext.partition("_")

    # Reject names without an underscore as well as names with an empty id or
    # title ("_title.pdf", "7_.pdf"): an empty id would yield doc_ids such as
    # "_chunk_1" that cannot be traced back to a paper.
    if not separator or not paper_id or not paper_title:
        raise ValueError("Filename does not match the expected format '{paper_id}_{paper_title}.pdf'")

    # Generate a metadata object for each chunk (chunk numbering is 1-based).
    return [
        {
            "paper_title": paper_title,
            "doc_id": f"{paper_id}_chunk_{position}",
            "chunk_text": chunk,
            "tag": "",
        }
        for position, chunk in enumerate(chunks, start=1)
    ]



In [132]:
import os
from langchain_community.document_loaders import PyPDFLoader

# Assume get_chunks and process_document functions are defined in previous cells

# NOTE(review): absolute, machine-specific location — adjust when running elsewhere.
data_directory = "/Users/moraish/Desktop/ams691/project_llm/data/"

# Get all PDF files in the directory. os.listdir() returns names in arbitrary
# order, so sort them to keep the ingestion order (and therefore the order of
# `all_docs`) reproducible from run to run.
pdf_files = [
    os.path.join(data_directory, f)
    for f in sorted(os.listdir(data_directory))
    if f.lower().endswith(".pdf")
]

# Flat list of per-chunk property dictionaries across every PDF.
all_docs = []

for pdf_path in pdf_files:
    loader = PyPDFLoader(pdf_path)
    document = loader.load()

    # Get text chunks from the document using the defined get_chunks function
    chunks = get_chunks(document, chunk_size=1000, chunk_overlap=200)

    # Process the document to get metadata for each chunk using the defined process_document function
    docs = process_document(pdf_path, chunks)
    all_docs.extend(docs)


  return NameObject(ret)
  return NameObject(ret)
  return NameObject(ret)
  return NameObject(ret)
Ignoring wrong pointing object 31 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)


In [133]:
import weaviate
from weaviate.util import generate_uuid5

client = weaviate.connect_to_local()

try:
    # Get the collection "ResearchPapers"
    collection = client.collections.get("ResearchPapers")

    with collection.batch.dynamic() as batch:
        for doc in all_docs:   # each doc is a dictionary with keys (`paper_title`, `doc_id`, `chunk_text`, `tag`)
            # UUID derived from the object's content via generate_uuid5.
            obj_uuid = generate_uuid5(doc)
            batch.add_object(
                properties=doc,
                uuid=obj_uuid
            )
            # Give up early instead of pushing the rest of the data into a failing import.
            if batch.number_errors > 10:
                print("Batch import stopped due to excessive errors.")
                break

    # Report whatever could not be imported.
    failed_objects = collection.batch.failed_objects
    if failed_objects:
        print(f"Number of failed imports: {len(failed_objects)}")
        print(f"First failed object: {failed_objects[0]}")
finally:
    # Release the connection even when the import raises part-way through.
    client.close()


{'message': 'Failed to send 10 in a batch of 192', 'errors': {'send POST request: Post "http://host.docker.internal:11434/api/embed": read tcp 172.18.0.2:41010->192.168.65.2:11434: read: connection reset by peer'}}
{'message': 'Failed to send 10 objects in a batch of 192. Please inspect client.batch.failed_objects or collection.batch.failed_objects for the failed objects.'}


Number of failed imports: 10
First failed object: ErrorObject(message='send POST request: Post "http://host.docker.internal:11434/api/embed": read tcp 172.18.0.2:41010->192.168.65.2:11434: read: connection reset by peer', object_=BatchObject(collection='ResearchPapers', properties={'paper_title': 'TOWARDS A UNIFIED VIEW OF PARAMETER-EFFICIENT TRANSFER LEARNING', 'doc_id': '4_chunk_95', 'chunk_text': '##sncj / bpi6ww2asshjaynjiemhgkkl4xgbssnjmybjl + [UNK] + wsstose4sstmq + ovpzl719 / [UNK] + kwujkoucoufco + eeocfr8jrz / ppoggqx + w8m2tmcoaavmcrvurqjnghsqhghephmldoxikdp9kkkyolvbeb1rc8rxscuvzrcuuyzgvuctvqytiokesiqjljrtdkzvqbgduwsu / pxiueo9kungxvmjfvqqslu4vlxbr + 1tcxbymei7idzvw8w6phit2ryvcra9uwkx6yo5x + a / [UNK] / bzowwunxwxackhdc4e2xdbwxabtfzdfhz8fmbhjygrenb1bec / vg04jx / pecckuminuorko9ofujatmg5i + eklqvahzjatd0olqtqpjhhurjkknfv3 + 7ncuk6b + z0tdblnmo237osnel / j53b2fe9njkb1lw2esbatx1waia + qrmnlkfaqg3tecchnfi6gib7pq976ugceezikjl9tl6 / 5uzvpzfrrodxsvkecmpu5jvww1aynw4kyxja5aag8dg2ekqi

In [134]:
import json

def parse_query_return(query_return):
    """
    Parses a QueryReturn-like object into a JSON string.

    Args:
        query_return: Any object with an ``objects`` iterable. Each element
            must expose ``uuid``; ``collection``, ``properties`` and
            ``metadata`` are read when present.

    Returns:
        A JSON array (indented by 2 spaces) with one entry per object,
        containing ``uuid``, ``collection``, ``properties`` and a ``metadata``
        mapping. Metadata fields that are absent are emitted as ``null``.
        Values JSON cannot encode natively (e.g. datetimes, UUIDs) are
        rendered with ``str()``.
    """
    metadata_fields = (
        "creation_time",
        "last_update_time",
        "distance",
        "certainty",
        "score",
        "explain_score",
        "is_consistent",
        "rerank_score",
    )

    parsed_objects = []
    for obj in query_return.objects:
        metadata = getattr(obj, "metadata", None)
        parsed_objects.append({
            "uuid": str(obj.uuid),
            "collection": getattr(obj, "collection", None),
            "properties": getattr(obj, "properties", {}),
            "metadata": {field: getattr(metadata, field, None) for field in metadata_fields},
        })

    # default=str keeps serialization from failing on timestamps or UUIDs
    # found in the metadata or in the stored properties.
    return json.dumps(parsed_objects, indent=2, default=str)


In [137]:
import weaviate, json
import weaviate.classes as wvc

# Open a dedicated connection for this query instead of relying on a `client`
# left over from another cell: the ingestion cell closes its client when it
# finishes, so on a top-to-bottom run no open connection is available here.
client = weaviate.connect_to_local()
try:
    papers = client.collections.get("ResearchPapers")

    # Semantic search over the embedded chunks; certainty and distance are
    # requested so they show up in the parsed output.
    response = papers.query.near_text(
        query="What is LORA and how does it work?",
        distance=1,
        return_metadata=wvc.query.MetadataQuery(certainty=True, distance=True)
    )
finally:
    # `response` is already materialized, so the connection can be released.
    client.close()

In [138]:
# Serialize the query response to a JSON string with parse_query_return and show it.
json_output = parse_query_return(response)
print(json_output)


[
  {
    "uuid": "76f41b71-9230-5d36-96c5-4ae3078f20a6",
    "collection": "ResearchPapers",
    "properties": {
      "paper_title": "QLORA- Efficient Finetuning of Quantized LLMs",
      "tag": "",
      "doc_id": "34_chunk_27",
      "chunk_text": "##paca, we find that the most critical lora hyper - parameter is how many lora adapters are used in total and that lora on all linear transformer block layers are required to match full finetuning perfor - mance. other lora hyperparameters, such as the projection dimension r, do not affect performance ( see appendix a ). 1010 1011 t otal model bits 0. 60 0. 61 0. 62 0. 63 0. 64 0. 65 0. 66 0. 67mean zeroshot accuracy 4 - bit llama float nfloat nfloat + dq data type figure 3 : mean zero - shot accuracy over wino - grande, hellaswag, piqa, arc - easy, and arc - challenge using llama models with different 4 - bit data types. the normalfloat data type significantly improves the bit - for - bit accuracy gains compared to regular 4 - bit float

In [139]:
import weaviate
from weaviate.classes.query import MetadataQuery

# Reconnect to client
client = weaviate.connect_to_local(skip_init_checks=True)

# User question
question = "What is LoRA and how does it work?"

try:
    papers = client.collections.get("ResearchPapers")

    # Generate an answer using the most relevant chunks
    response = papers.generate.near_text(
        query=question,
        distance=0.8,
        limit=2,
        return_metadata=MetadataQuery(distance=True),  # optional
        # `{{chunk_text}}` renders as the literal `{chunk_text}` placeholder in the prompt.
        single_prompt=f"Answer this question using the given context: {question}\nContext: {{chunk_text}}",
        grouped_task=f"Based on the following context, answer the question: {question}"
    )
finally:
    # This cell opened the connection, so it also releases it — even if generation fails.
    client.close()

# View generated summary answer
print("🧠 Final Answer:\n", response.generated)


🧠 Final Answer:
 LoRA (Low-Rank Adaptation) is a method used in transformer-based language models to reduce the computational cost of fine-tuning large models on specific tasks. It works by adapting a low-rank matrix that represents the relationships between input and output embeddings, rather than fine-tuning the entire model.

In traditional fine-tuning methods, the entire model is updated based on the task-specific dataset, which can be computationally expensive. LoRA reduces this cost by using a separate adapter layer to adapt the input/output embeddings in a low-rank matrix format. This allows for efficient transfer learning and improved performance on specific tasks.

The key idea behind LoRA is that the relationships between input and output embeddings are largely learned during pre-training, and can be adapted to the specific task using a smaller number of parameters. By using a low-rank adaptation, LoRA reduces the number of parameters required for fine-tuning, making it more 

In [140]:
import weaviate
from weaviate.classes.query import MetadataQuery

# Reconnect to client
client = weaviate.connect_to_local(skip_init_checks=True)

# User question
question = "What are scaling laws in LLMs?"

try:
    papers = client.collections.get("ResearchPapers")

    # Generate an answer using the most relevant chunks
    response = papers.generate.near_text(
        query=question,
        distance=0.8,
        limit=2,
        return_metadata=MetadataQuery(distance=True),  # optional
        # `{{chunk_text}}` renders as the literal `{chunk_text}` placeholder in the prompt.
        single_prompt=f"Answer this question using the given context: {question}\nContext: {{chunk_text}}",
        grouped_task=f"Based on the following context, answer the question: {question}"
    )
finally:
    # This cell opened the connection, so it also releases it — even if generation fails.
    client.close()

# View generated summary answer
print("🧠 Final Answer:\n", response.generated)


🧠 Final Answer:
 Based on the provided context, scaling laws in LLMs (Large Language Models) refer to empirical regularities that describe how the performance of a language model improves with increasing model size and computational resources.

In this context, scaling laws are mathematically represented by equation (1.1), which describes the relationship between the number of parameters (n), the compute scaling factor (c), and the loss in nats/token. The equation is:

loss = 0.28 + (n-1e4) -0.16 + (c-1.4e-5) -0.17(c2.3e-12)

The paper discusses the implications of these scaling laws, including their potential to unlock new capabilities in machine learning models.

Scaling laws are seen as a shift in perspective away from focusing on specific neural architectures, loss functions, and training algorithms towards identifying broader commonalities that appear when studying machine learning across different scales. They suggest that many capabilities may lie on a spectrum that can be conti