## 1. Create Question Collection

In [1]:
import weaviate
from weaviate.classes.config import Configure, Property, DataType

# Connect to the locally running Weaviate instance. This happens outside the
# try block so that `client` is always bound when the `finally` clause runs.
client = weaviate.connect_to_local()

# client.collections.delete("ResearchPapers")  # THIS WILL DELETE THE SPECIFIED COLLECTION(S) AND THEIR OBJECTS


try:
    # Creating a collection whose name is already taken fails, which made this
    # cell error out on every re-run (e.g. Restart Kernel -> Run All). Check
    # first so the cell is idempotent.
    if client.collections.exists("questions"):
        questions = client.collections.get("questions")
        print("Collection 'questions' already exists; skipping creation.")
    else:
        questions = client.collections.create(
            name="questions",
            vectorizer_config=Configure.Vectorizer.text2vec_ollama(     # Configure the Ollama embedding integration
                api_endpoint="http://host.docker.internal:11434",       # Allow Weaviate from within a Docker container to contact your Ollama instance
                model="nomic-embed-text",                               # The model to use
            ),
            generative_config=Configure.Generative.ollama(              # Configure the Ollama generative integration
                api_endpoint="http://host.docker.internal:11434",       # Allow Weaviate from within a Docker container to contact your Ollama instance
                model="llama3.2",                                       # The model to use
            ),
            properties=[
                # Only the question text is vectorized; title and id are stored
                # as plain text with skip_vectorization=True.
                Property(name="paper_title", data_type=DataType.TEXT, skip_vectorization=True),
                Property(name="doc_id", data_type=DataType.TEXT, skip_vectorization=True),
                Property(name="question_text", data_type=DataType.TEXT, skip_vectorization=False)
            ]
        )

        print("Collection 'questions' created successfully.")
except weaviate.exceptions.WeaviateQueryError as e:
    print(f"Error creating collection: {e}")
    # Optionally, handle the error or exit
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    # Handle other exceptions
finally:
    # Always release the connection, whether or not creation succeeded.
    client.close()


Collection 'questions' created successfully.


/Users/moraish/Desktop/ams691/project_llm/.venv/lib/python3.9/site-packages/weaviate/collections/classes/config.py:1950: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  for cls_field in self.model_fields:


## 2. Data Ingestion

In [8]:
import pandas as pd

# The CSV is read with header=None, so the columns get the integer labels
# 0 and 1 instead of names taken from the first row.
df = pd.read_csv(
    "/Users/moraish/Desktop/ams691/project_llm/questions.csv",
    header=None,
)

In [9]:
# Preview the first rows of the raw frame (columns are labelled 0 and 1
# because the file was read with header=None).
df.head()

Unnamed: 0,0,1
0,1_Prefix-Tuning- Optimizing Continuous Prompts...,What is the key difference between prefix-tuni...
1,1_Prefix-Tuning- Optimizing Continuous Prompts...,In the context of table-to-text generation...
2,1_Prefix-Tuning- Optimizing Continuous Prompts...,What are the observed benefits of prefix-t...
3,1_Prefix-Tuning- Optimizing Continuous Prompts...,How does prefix-tuning perform in extrapol...
4,1_Prefix-Tuning- Optimizing Continuous Prompts...,How does the choice of prefix length impac...


In [11]:
import weaviate
import pandas as pd
from weaviate.util import generate_uuid5
import traceback

# --- 1. Read and Process CSV Data ---
def parse_question_row(raw_key, raw_question):
    """Turn one CSV row into the property dict stored in Weaviate.

    Args:
        raw_key: Value of column 0, expected to look like ``"<doc_id>_<paper title>"``.
            Only the first underscore separates the id from the title.
        raw_question: Value of column 1, the question text.

    Returns:
        A dict with the keys ``paper_title``, ``doc_id`` and ``question_text``
        (the question text is stripped of surrounding whitespace).

    Raises:
        ValueError: If the question is missing/blank, or if column 0 does not
            contain an underscore. The message is suitable for the skip log.
    """
    # pandas represents an empty CSV cell as NaN; str(NaN) is the non-empty
    # string "nan", so missing values must be detected before converting.
    if pd.isna(raw_question):
        raise ValueError("Empty question text.")
    question_text = str(raw_question).strip()
    if not question_text:
        raise ValueError("Empty question text.")

    key_text = str(raw_key)
    parts = key_text.split("_", 1)
    if len(parts) != 2:
        raise ValueError(
            f"Column 0 format incorrect ('{key_text}'). Expected 'id_papername'."
        )

    doc_id_str, paper_title_str = parts
    return {
        "paper_title": paper_title_str,
        "doc_id": doc_id_str,
        "question_text": question_text,
    }


try:
    df = pd.read_csv("/Users/moraish/Desktop/ams691/project_llm/questions.csv", header=None)
    all_questions = []
    processed_count = 0
    error_count = 0

    for index, row in df.iterrows():
        try:
            all_questions.append(parse_question_row(row[0], row[1]))
            processed_count += 1
        except ValueError as e:
            # Expected, per-row validation failure: log it and keep going.
            print(f"Skipping row {index + 1}: {e}")
            error_count += 1
        except Exception as e:
            print(f"Error processing row {index + 1}: {e}")
            error_count += 1

    print(f"Successfully processed {processed_count} rows from CSV.")
    if error_count > 0:
        print(f"Skipped {error_count} rows due to errors or empty questions.")

except FileNotFoundError:
    print("Error: questions.csv not found at the specified path.")
    all_questions = [] # Ensure list is empty if file not found
except Exception as e:
    print(f"Error reading or processing CSV: {e}")
    all_questions = [] # Ensure list is empty on other errors

# --- 2. Ingest Data into Weaviate ---
if not all_questions:
    # Nothing came out of the CSV step, so there is nothing to send.
    print("No questions processed from CSV to ingest.")
else:
    client = None  # stays None if connecting fails, so `finally` can tell
    try:
        client = weaviate.connect_to_local()
        print("Connected to Weaviate.")

        questions_collection = client.collections.get("questions")
        print("Accessed 'questions' collection.")

        print(f"Starting batch import of {len(all_questions)} questions...")
        with questions_collection.batch.dynamic() as batch:
            for record in all_questions:
                try:
                    # The UUID is derived from the record itself, so the same
                    # record always maps to the same id.
                    batch.add_object(properties=record, uuid=generate_uuid5(record))
                except Exception as e:
                    # A bad record is reported and the loop moves on.
                    print(f"Error adding object to batch: {record}. Error: {e}")

        # Report objects that the batch could not import.
        failed_objects = questions_collection.batch.failed_objects
        if not failed_objects:
            print("All objects imported successfully.")
        else:
            print(f"Number of failed imports: {len(failed_objects)}")
            # Only the first five failures are printed to keep the output short.
            for position, failure in enumerate(failed_objects[:5], start=1):
                print(f"  Failed object {position}: {failure}")

    except weaviate.exceptions.WeaviateQueryError as e:
        print(f"Weaviate Query Error during ingestion: {e}")
        traceback.print_exc()
    except weaviate.exceptions.WeaviateStartUpError as e:
        print(f"Weaviate Connection Error: {e}. Is Weaviate running and accessible?")
        traceback.print_exc()
    except Exception as e:
        print(f"An unexpected error occurred during Weaviate ingestion: {e}")
        traceback.print_exc()
    finally:
        # Close the connection only if it was actually opened.
        if client:
            client.close()
            print("Weaviate client closed.")

Successfully processed 895 rows from CSV.
Connected to Weaviate.
Accessed 'questions' collection.
Starting batch import of 895 questions...
All objects imported successfully.
Weaviate client closed.


## Helper function to convert to JSON

In [18]:
import json


def parse_query_return(query_return):
    """Serialize the objects of a QueryReturn-like result to a JSON string.

    Args:
        query_return: Any object exposing an ``objects`` iterable. Each item
            must have a ``uuid`` attribute; ``collection``, ``properties`` and
            the individual ``metadata`` fields are read with ``getattr`` and
            fall back to ``None`` (``{}`` for properties) when absent.

    Returns:
        A JSON array (pretty-printed with ``indent=2``) with one entry per
        object, each holding ``uuid``, ``collection``, ``properties`` and a
        ``metadata`` dict.
    """
    metadata_fields = (
        "creation_time",
        "last_update_time",
        "distance",
        "certainty",
        "score",
        "explain_score",
        "is_consistent",
        "rerank_score",
    )
    parsed_objects = []

    for obj in query_return.objects:
        parsed_obj = {
            "uuid": str(obj.uuid),
            "collection": getattr(obj, "collection", None),
            "properties": getattr(obj, "properties", {}),
            "metadata": {
                field: getattr(obj.metadata, field, None) for field in metadata_fields
            },
        }
        parsed_objects.append(parsed_obj)

    # default=str: values json cannot encode natively (e.g. a datetime in
    # creation_time, or a UUID/date inside properties) are written as their
    # string form instead of raising TypeError.
    return json.dumps(parsed_objects, indent=2, default=str)

In [21]:
import weaviate.classes as wvc

# Open a fresh connection: the clients created by the earlier cells were
# closed in their `finally` blocks. NOTE: this cell does not close `client`.
client = weaviate.connect_to_local()
questions = client.collections.get('questions')

question = "What are the key issues identified in the data provenance?"

# Request both distance and certainty so they are populated in the metadata
# of every returned object; at most two objects are requested.
response = questions.query.near_text(
    query=question,
    distance=0.8,
    limit=2,
    return_metadata=wvc.query.MetadataQuery(certainty=True, distance=True),
)

print(response)




QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('eee90964-802a-51e9-a2fe-d414ead67e9d'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.14223623275756836, certainty=0.9288818836212158, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'paper_title': 'Documenting Large Webtext Corpora- A Case Study on the Colossal Clean Crawled Corpus', 'question_text': 'What are the key issues identified in the data provenance analysis for C4?', 'doc_id': '40'}, references=None, vector={}, collection='Questions'), Object(uuid=_WeaviateUUIDInt('750a3aba-8a99-5e92-98ea-b172b3daa925'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=0.3534918427467346, certainty=0.8232541084289551, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'paper_title': 'Documenting Large Webtext Corpora- A Case Study on the Colossal Clean Crawled Corpus', 'question_text': 'What challenges are associate

  questions = client.collections.get('questions')


In [22]:
# Convert the query result from the previous cell to a JSON string and show it.
json_output = parse_query_return(response)
print(json_output)


[
  {
    "uuid": "eee90964-802a-51e9-a2fe-d414ead67e9d",
    "collection": "Questions",
    "properties": {
      "paper_title": "Documenting Large Webtext Corpora- A Case Study on the Colossal Clean Crawled Corpus",
      "question_text": "What are the key issues identified in the data provenance analysis for C4?",
      "doc_id": "40"
    },
    "metadata": {
      "creation_time": null,
      "last_update_time": null,
      "distance": 0.14223623275756836,
      "certainty": 0.9288818836212158,
      "score": null,
      "explain_score": null,
      "is_consistent": null,
      "rerank_score": null
    }
  },
  {
    "uuid": "750a3aba-8a99-5e92-98ea-b172b3daa925",
    "collection": "Questions",
    "properties": {
      "paper_title": "Documenting Large Webtext Corpora- A Case Study on the Colossal Clean Crawled Corpus",
      "question_text": "What challenges are associated with the lack of documentation in large web-crawled datasets like C4?",
      "doc_id": "40"
    },
    "met