In [2]:
import pandas as pd
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams
from sentence_transformers import SentenceTransformer

## Qdrant Creation Functions
- create_book_embeddings
-- creates vector embeddings from a books data frame
- upload_to_qdrant
-- takes the book vectors and uploads them to Qdrant
- process_book_dataframe
-- run the books data frame through this to do both steps end to end


In [3]:

def create_book_embeddings(df, column_name="summary", model_name="all-MiniLM-L6-v2"):
    """
    Generate embeddings for a dataframe of books.

    Parameters:
    - df: pandas DataFrame containing the book data
    - column_name: the column to generate embeddings for (defaults to "summary")
    - model_name: the sentence transformer model to use; either a model name
      (loaded with SentenceTransformer) or an already-loaded model object that
      provides encode() and get_sentence_embedding_dimension(), so callers can
      reuse one model across several calls

    Returns:
    - tuple (book_vectors, vector_size):
      - book_vectors: list of point dicts ("id", "vector", "payload") ready
        for Qdrant upload
      - vector_size: the embedding dimension reported by the model

    Raises:
    - KeyError: if df has rows but no column called column_name

    Notes:
    - Point IDs are the dataframe index value + 1, so the index must hold
      unique integers for the IDs to be unique.
    - Missing values in column_name are embedded (and stored) as "".
    """
    # Load the embedding model only when given a name; reuse a passed-in model
    if isinstance(model_name, str):
        model = SentenceTransformer(model_name)
    else:
        model = model_name

    # Get the model's output dimension
    vector_size = model.get_sentence_embedding_dimension()

    # Nothing to encode: keep the (points, size) return shape
    if len(df) == 0:
        return [], vector_size

    # Normalise missing values to "" so the encoder only ever sees strings
    texts = df[column_name].fillna("").astype(str).tolist()
    titles = df["title"].tolist() if "title" in df.columns else [""] * len(texts)

    # One batched encode call instead of one call per row
    vectors = model.encode(texts)

    book_vectors = [
        {
            "id": index + 1,  # Assign ID from the dataframe index
            "vector": vector.tolist() if hasattr(vector, "tolist") else list(vector),
            "payload": {"title": title, "summary": text},
        }
        for index, title, text, vector in zip(df.index, titles, texts, vectors)
    ]

    return book_vectors, vector_size

def upload_to_qdrant(book_vectors, vector_size, client=None, url=None, api_key=None,
                     collection_name="books", batch_size=100):
    """
    Upload book embeddings to Qdrant.

    The collection is recreated on every call, so this is meant for a full
    (re)load of the data rather than an incremental update.

    Parameters:
    - book_vectors: list of vector points to upload
    - vector_size: dimensionality of the embedding vectors
    - client: an existing QdrantClient (optional)
    - url: Qdrant server URL (required if client not provided)
    - api_key: Qdrant API key (required if client not provided)
    - collection_name: name for the Qdrant collection
    - batch_size: number of vectors to upload in each batch (must be >= 1)

    Returns:
    - QdrantClient instance

    Raises:
    - ValueError: if batch_size is less than 1, or if no client is given and
      url or api_key is missing
    """
    # Validate before touching the server: a zero step makes range() fail with
    # an unrelated message, and a negative step silently uploads nothing.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Create client if not provided
    if client is None:
        if url is None or api_key is None:
            raise ValueError("Either provide a client or both url and api_key")
        client = QdrantClient(url, api_key=api_key)

    # Create/recreate collection with appropriate vector size
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
    )

    # Ceiling division; evaluates to 0 for an empty list
    total_batches = (len(book_vectors) - 1) // batch_size + 1

    # Upload vectors in batches
    for i in range(0, len(book_vectors), batch_size):
        batch = book_vectors[i : i + batch_size]
        client.upsert(collection_name=collection_name, points=batch)
        print(f"Uploaded batch {i // batch_size + 1} of {total_batches}")

    print("Upload complete!")
    return client

def process_book_dataframe(df, column_name="summary", model_name="all-MiniLM-L6-v2",
                          url=None, api_key=None, collection_name="books", batch_size=100,
                          client=None):
    """
    End-to-end function to process a book dataframe and upload to Qdrant.

    Parameters:
    - df: pandas DataFrame containing the book data
    - column_name: the column to generate embeddings for
    - model_name: the sentence transformer model to use
    - url: Qdrant server URL (required if client not provided)
    - api_key: Qdrant API key (required if client not provided)
    - collection_name: name for the Qdrant collection
    - batch_size: number of vectors to upload in each batch
    - client: an existing QdrantClient to reuse (optional); forwarded to
      upload_to_qdrant, which builds one from url/api_key when this is None

    Returns:
    - QdrantClient instance
    """
    # Generate embeddings
    book_vectors, vector_size = create_book_embeddings(df, column_name, model_name)

    # Upload to Qdrant and hand back the client that was used
    return upload_to_qdrant(
        book_vectors,
        vector_size,
        client=client,
        url=url,
        api_key=api_key,
        collection_name=collection_name,
        batch_size=batch_size,
    )