In [1]:
import weaviate
import uuid
import weaviate.classes.config as wvcc

import requests
import json

# making embedding
from langchain_huggingface import HuggingFaceEmbeddings

from weaviate.classes.config import (
    Property,
    DataType,
    Tokenization,
    Configure,
    VectorDistances
)

# Cloud Connection

In [2]:
#define a client
client = weaviate.connect_to_wcs(
    cluster_url="https://ep9buqisqh2agxcqhaqja.c0.us-central1.gcp.weaviate.cloud",
    auth_credentials=weaviate.classes.init.Auth.api_key("cBbDQMWkOgPEzzQYg2megPVbhrZ28P6mH33k"),
)

#checking client status
def check_client(client):
    try:
        # check client is ready or not
        response = client.is_ready()
        if response:
            print("Weaviate client is ready.")
        else:
            print("Weaviate client is not ready.")
    except Exception as e:
        print(f"Error checking Weaviate client status: {e}")


check_client(client)

Weaviate client is ready.


# preparee the data and Add Embeddings

## Extract data into a list

In [7]:
# Slice the data
data_list = []

def stream_and_slice_json(file_path, fraction=0.25):

    total_items = 0
    collected_items = 0

    with open(file_path, 'r') as file:
        for line in file:
            try:
                json_obj = json.loads(line)
                data_list.append(json_obj)
                collected_items += 1
                total_items += 1

                # Check if we've collected the desired fraction of the total items
                if collected_items >= total_items * fraction:
                    break
            except json.JSONDecodeError:
                print("Error decoding JSON in line:", line)

    return data_list

In [8]:
stream_and_slice_json('News_Category_Dataset_v3.json')

[{'link': 'https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9',
  'headline': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters',
  'category': 'U.S. NEWS',
  'short_description': 'Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.',
  'authors': 'Carla K. Johnson, AP',
  'date': '2022-09-23'}]

In [9]:
data_list[0]

{'link': 'https://www.huffpost.com/entry/covid-boosters-uptake-us_n_632d719ee4b087fae6feaac9',
 'headline': 'Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters',
 'category': 'U.S. NEWS',
 'short_description': 'Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.',
 'authors': 'Carla K. Johnson, AP',
 'date': '2022-09-23'}

## combine props for embedding

In [10]:
#define embedding model
from langchain_huggingface import HuggingFaceEmbeddings
# Load the HuggingFace model for creating embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
# generation of embeddings
def generate_embeddings(text):
    embeddings = embedding_model.embed_documents([text])

    # Assuming embeddings is a list of lists (one list per document)
    if embeddings and isinstance(embeddings[0], list):
        return embeddings[0]  # Return the first (and only) list of embeddings


# Combine the properties
def combine_properties(item):
    # Extract properties
    headline = item.get('headline', '')
    category = item.get('category', '')
    short_description = item.get('short_description', '')

    # Combine properties into a single text string
    combined_text = f"{headline} {category} {short_description}"
    return combined_text


# Generate IDs
def generate_unique_id():
    return str(uuid.uuid4())


In [115]:
def prepare_data_for_insertion(data_list):
    """
    Prepare data for insertion by generating unique IDs, combining properties, and creating embeddings.

    Args:
        data_list (list): List of dictionaries containing item properties.

    Returns:
        list: List of dictionaries ready for insertion.
    """
    prepared_data = []
    for item in data_list:
        # Generate the id for each item
        item_id = generate_unique_id()

        # Combine the properties
        combined_text = combine_properties(item)

        # Create Embeddings
        embeddings = generate_embeddings(combined_text)

        # Prepare the item for insertion
        prepared_item = {
            'id': item_id,  # ID property
            'headline': item.get('headline', ''),
            'category': item.get('category', ''),
            'short_description': item.get('short_description', ''),
            'embeddings': embeddings,  # Embeddings property
            'meta_data': item.get('meta_data', '')  # Optional additional property
        }
        prepared_data.append(prepared_item)

    return prepared_data


In [112]:
# prepared_data = prepare_data_for_insertion(data_list)

# # Print only the first 5 items from prepared_data
# for item in prepared_data[:5]:  # Slice the list to get the first 5 items
#     print(f"ID: {item['id']}")
#     print(f"Headline: {item['headline']}")
#     print(f"Category: {item['category']}")
#     print(f"Short Description: {item['short_description']}")
#     print(f"Embeddings: {item['embeddings']}")
#     print("\n")

ID: fef0b163-cf11-43b4-8c1f-e6ae84227bb8
Headline: Over 4 Million Americans Roll Up Sleeves For Omicron-Targeted COVID Boosters
Category: U.S. NEWS
Short Description: Health experts said it is too early to predict whether demand would match up with the 171 million doses of the new boosters the U.S. ordered for the fall.
Embeddings: [-0.020388882607221603, 0.03906365856528282, -0.04621047526597977, -0.008600885979831219, 0.03425996005535126, -0.042044125497341156, -0.012011573649942875, 0.10828500986099243, -0.022965937852859497, -0.017370129004120827, -0.037982843816280365, 0.05700824782252312, -0.0011722625931724906, 0.03296976536512375, 0.061620406806468964, 0.030108142644166946, 0.05930912867188454, -0.05908643454313278, 0.012857522815465927, 0.01886061578989029, -0.054279446601867676, -0.0192625280469656, 0.03139610216021538, 0.025165023282170296, -0.03422611206769943, -0.034655552357435226, -0.0898100957274437, -0.03096819296479225, 0.011081336066126823, -0.02853260189294815, -0.0

# Data Insertion

## **Create the collection and insert the data**

In [109]:

client.collections.delete("External_data_coll")



def define_collection(client, collection_name="External_data_coll"):
    try:
        # Reconnect the client
        client.connect()

        # Check if collection exists
        if client.collections.exists(collection_name):
            return 'Collection already exists.'

        # Define the collection
        client.collections.create(
            collection_name,  # Define the collection name
            vectorizer_config=Configure.Vectorizer.text2vec_huggingface(
                model="sentence-transformers/all-MiniLM-L6-v2", vectorize_collection_name=True
            ),  # Use HuggingFace model for vectorization

            vector_index_config=Configure.VectorIndex.hnsw(
                distance_metric=VectorDistances.COSINE
            ),

            # Define properties excluding 'id' and 'embeddings'
            properties=[
                Property(
                    name="headline",
                    data_type=DataType.TEXT,  # Use DataType.TEXT instead of STRING
                    vectorize_property_name=True,
                    tokenization=Tokenization.LOWERCASE
                ),
                Property(
                    name="category",
                    data_type=DataType.TEXT,  # Use DataType.TEXT instead of STRING
                    vectorize_property_name=True,
                    tokenization=Tokenization.LOWERCASE
                ),
                Property(
                    name="short_description",
                    data_type=DataType.TEXT,  # Use DataType.TEXT
                    vectorize_property_name=True,
                    tokenization=Tokenization.LOWERCASE
                ),
                Property(
                    name="meta_data",
                    data_type=DataType.TEXT,
                    vectorize_property_name=False
                ),
            ]
        )

        return "Collection created successfully"

    except Exception as e:
        return f"An error occurred: {str(e)}"


In [110]:
# Define the collection
result = define_collection(client)
print(result)

Collection created successfully


In [118]:
def insert_data_batch(client, collection_name, data_list):
    # Prepare the data for insertion
    prepared_data = prepare_data_for_insertion(data_list)

    try:
        # Access the collection
        collection = client.collections.get(collection_name)

        # Use batch operations for insertion
        with collection.batch.dynamic() as batch:
            # Add documents to the batch
            for item in prepared_data:
                # Document including properties
                document = {
                    'id': item['id'],  # ID
                    'embeddings': item['embeddings'],  # Embeddings
                    'properties': {
                        'headline': item['headline'],
                        'category': item['category'],
                        'short_description': item['short_description'],
                        'meta_data': item['meta_data']
                    }
                }
                # Insert document
                batch.add_object(properties=document['properties'])


        # Check for failed objects
        if collection.batch.failed_objects:
            print("Failed objects:")
            for failed_obj in collection.batch.failed_objects:
                print(failed_obj)

        return "Batch insertion completed successfully."

    except Exception as e:
        print(f"Error: {e}")
        # Print available methods for debugging
        print("Available methods: ", dir(collection))
        raise


In [119]:
insert_data_batch(client, 'External_data_coll', data_list)


'Batch insertion completed successfully.'

# Query to get data

In [129]:
def fetch_object_by_id(client, collection_name, object_id):
    try:
        # Access the collection
        collection = client.collections.get(collection_name)

        # Fetch the object by ID
        data_object = collection.query.fetch_object_by_id(object_id)  # Check if this method is correct

        # Check if data_object is not None
        if data_object is None:
            print("No object found with the given ID.")
            return None

        # Print properties of the fetched object
        print(data_object.properties)
        return data_object.properties

    except Exception as e:
        print(f"Error: {e}")
        # Print available methods for debugging
        print("Available methods: ", dir(collection))
        raise

# Example usage
object_id = "fef0b163-cf11-43b4-8c1f-e6ae84227bb8"
properties = fetch_object_by_id(client, 'External_data_coll', object_id)
if properties:
    print(properties)


No object found with the given ID.
