In [25]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.models import Distance, VectorParams
from sentence_transformers import SentenceTransformer

In [17]:
import os

# Credentials are read from the environment so that no secret is stored in the
# notebook (or in its version history). Any key that was previously committed
# here should be treated as leaked and rotated.
key = os.environ.get("QDRANT_API_KEY")
if not key:
    raise RuntimeError(
        "Set the QDRANT_API_KEY environment variable before running this cell."
    )

# The cluster URL is not a secret; it can be overridden with QDRANT_URL and
# otherwise falls back to the cluster this notebook was written against.
q_url = os.environ.get(
    "QDRANT_URL",
    "https://bf57de15-f343-4041-aacd-6d8daccab983.europe-west3-0.gcp.cloud.qdrant.io",
)

client = QdrantClient(q_url, api_key=key)

In [14]:
# Load the book summary table from 'book_sum.csv' (path is relative to the
# working directory) and print its schema. In the recorded run the frame has
# two non-null object columns: 'title' and 'summary'.
booksum_df = pd.read_csv('book_sum.csv')
booksum_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41871 entries, 0 to 41870
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    41871 non-null  object
 1   summary  41871 non-null  object
dtypes: object(2)
memory usage: 654.4+ KB


## Qdrant Creation Functions
- create_book_embeddings
  - creates vector embeddings from a book data frame
- upload_to_qdrant
  - takes the book vectors and uploads them to Qdrant
- process_book_data frame
  - run the book data frame through this to put it all together


In [26]:
def _column_values(series, as_text=False):
    """Return the series as a plain list with missing entries replaced by ""."""
    return [
        "" if missing else (str(value) if as_text else value)
        for value, missing in zip(series.tolist(), series.isna().tolist())
    ]


def create_book_embeddings(df, column_name="summary", model_name="all-MiniLM-L6-v2"):
    """
    Generate embeddings for a dataframe of book summaries.

    Parameters:
    - df: pandas DataFrame containing the book data. It must contain
      `column_name`; a 'title' column is optional (absent or missing titles
      are stored as "").
    - column_name: the column to generate embeddings for (defaults to "summary")
    - model_name: the sentence transformer model to use

    Returns:
    - (book_vectors, vector_size): `book_vectors` is a list of dicts with the
      keys "id" (the dataframe index label), "vector" (list of floats) and
      "payload" ({"title": ..., "summary": ...}); `vector_size` is the
      embedding dimension reported by the model.

    Raises:
    - KeyError: if `column_name` is not a column of `df`.
    """
    # Fail before loading the model if the text column is absent.
    if column_name not in df.columns:
        raise KeyError(column_name)

    # Load embedding model
    model = SentenceTransformer(model_name)

    # Get the model's output dimension
    vector_size = model.get_sentence_embedding_dimension()

    print(f"Creating embeddings for {len(df)} books using {model_name}...")

    # Missing text (NaN/None) is replaced by "" so that the encoder only ever
    # receives strings and the payload never carries NaN.
    texts = _column_values(df[column_name], as_text=True)
    if "title" in df.columns:
        titles = _column_values(df["title"])
    else:
        titles = [""] * len(df)

    # Encode all texts in one call
    vectors = model.encode(texts, show_progress_bar=True)

    # Build the point dicts column-wise instead of materialising a Series per
    # row with iterrows().
    book_vectors = [
        {
            "id": point_id,  # Use dataframe index as ID
            "vector": vector.tolist(),
            "payload": {
                "title": title,  # Store title in payload
                "summary": text,  # Store the embedded text for reference
                # Add additional metadata fields here if needed
            },
        }
        for point_id, vector, title, text in zip(df.index.tolist(), vectors, titles, texts)
    ]

    print(f"Created {len(book_vectors)} embeddings with dimension {vector_size}")
    return book_vectors, vector_size

def upload_to_qdrant(book_vectors, vector_size, client=None, url=None, api_key=None, 
                     collection_name="books", batch_size=100):
    """
    Upload book embeddings to Qdrant.

    WARNING: if a collection called `collection_name` already exists it is
    deleted and recreated, so any points previously stored in it are lost.

    Parameters:
    - book_vectors: list of points to upload; each item is either a dict with
      "id", "vector" and "payload" keys or a ready-made models.PointStruct
    - vector_size: dimensionality of the embedding vectors
    - client: an existing QdrantClient (optional)
    - url: Qdrant server URL (used only if client is not provided; when both
      are omitted a client for localhost:6333 is created)
    - api_key: Qdrant API key (used together with url)
    - collection_name: name for the Qdrant collection
    - batch_size: number of vectors to upload in each batch (must be >= 1)

    Returns:
    - QdrantClient instance

    Raises:
    - ValueError: if batch_size is smaller than 1
    """
    # Validate before touching the server: the existing collection is dropped
    # below, so a bad argument must not be discovered after that point.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Create client if not provided
    if client is None:
        if url is None:
            # Default to localhost if no URL provided
            client = QdrantClient(host="localhost", port=6333)
        else:
            client = QdrantClient(url=url, api_key=api_key)

    # Check if collection exists
    collections = client.get_collections().collections
    collection_exists = any(collection.name == collection_name for collection in collections)

    if collection_exists:
        print(f"Collection '{collection_name}' already exists. Recreating...")
        client.delete_collection(collection_name=collection_name)

    # Create collection with appropriate vector size
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=vector_size, 
            distance=models.Distance.COSINE
        ),
        optimizers_config=models.OptimizersConfigDiff(
            indexing_threshold=10000  # Optimize for larger collections
        )
    )

    # Print some info about what we're uploading
    print(f"Uploading {len(book_vectors)} vectors to collection '{collection_name}'")
    print(f"Using batch size of {batch_size}")

    # Upload vectors in batches
    total_batches = (len(book_vectors) - 1) // batch_size + 1
    for start in range(0, len(book_vectors), batch_size):
        # Build explicit PointStruct objects rather than relying on the client
        # to coerce plain dicts.
        batch = [
            models.PointStruct(**point) if isinstance(point, dict) else point
            for point in book_vectors[start:start + batch_size]
        ]
        client.upsert(
            collection_name=collection_name, 
            points=batch
        )
        batch_num = start // batch_size + 1
        print(f"Uploaded batch {batch_num}/{total_batches} ({len(batch)} points)")

    # Verify upload with an exact count; CollectionInfo.vectors_count is only
    # an approximation and may be None or unavailable on newer versions.
    stored = client.count(collection_name=collection_name, exact=True).count
    print(f"Upload complete! Collection now contains {stored} vectors.")

    return client

# Creating Qdrant for goodreads comic database

In [28]:
# Embed every book summary; returns the point dicts and the embedding dimension
sum_vectors, vector_length = create_book_embeddings(
    booksum_df,
    column_name="summary",
    model_name="all-MiniLM-L6-v2",
)

Creating embeddings for 41871 books using all-MiniLM-L6-v2...


Batches:   0%|          | 0/1309 [00:00<?, ?it/s]

Created 41871 embeddings with dimension 384


In [29]:
# Push the summary embeddings into the "books" collection, 100 points per request.
# The call is the cell's last expression, so the returned client is displayed.
upload_to_qdrant(
    sum_vectors,
    vector_length,
    client=client,
    collection_name="books",
    batch_size=100,
)

Collection 'books' already exists. Recreating...
Uploading 41871 vectors to collection 'books'
Using batch size of 100
Uploaded batch 1/419 (100 points)
Uploaded batch 2/419 (100 points)
Uploaded batch 3/419 (100 points)
Uploaded batch 4/419 (100 points)
Uploaded batch 5/419 (100 points)
Uploaded batch 6/419 (100 points)
Uploaded batch 7/419 (100 points)
Uploaded batch 8/419 (100 points)
Uploaded batch 9/419 (100 points)
Uploaded batch 10/419 (100 points)
Uploaded batch 11/419 (100 points)
Uploaded batch 12/419 (100 points)
Uploaded batch 13/419 (100 points)
Uploaded batch 14/419 (100 points)
Uploaded batch 15/419 (100 points)
Uploaded batch 16/419 (100 points)
Uploaded batch 17/419 (100 points)
Uploaded batch 18/419 (100 points)
Uploaded batch 19/419 (100 points)
Uploaded batch 20/419 (100 points)
Uploaded batch 21/419 (100 points)
Uploaded batch 22/419 (100 points)
Uploaded batch 23/419 (100 points)
Uploaded batch 24/419 (100 points)
Uploaded batch 25/419 (100 points)
Uploaded batch 

<qdrant_client.qdrant_client.QdrantClient at 0x2762c22d690>

# Reset Qdrant
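Deletes the `books` collection in Qdrant and recreates it empty with the same vector size and distance metric. All stored points are lost, so be careful!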

In [27]:
# Snapshot the vector settings of the live collection before dropping it
collection_info = client.get_collection(collection_name="books")
vector_params = collection_info.config.params.vectors
vector_size, distance = vector_params.size, vector_params.distance

# Drop the collection together with everything stored in it
client.delete_collection(collection_name="books")

# Bring it back empty; only the vector size and distance metric are carried over
client.create_collection(
    collection_name="books",
    vectors_config=models.VectorParams(size=vector_size, distance=distance),
)

True