# Qdrant Cloud Setup for Logistics Datapoints

This notebook sets up the Qdrant Cloud collection for storing and retrieving normalized datapoints.

In [14]:
# Install required packages
!pip install qdrant-client python-dotenv google-generativeai tqdm



In [23]:
# Import libraries
from qdrant_client import QdrantClient
from qdrant_client.http import models
import json
import os
import uuid
from tqdm.notebook import tqdm
import time
import sys
sys.path.append("..")
from utils.qdrant_client import get_qdrant_client, get_embedding, COLLECTION_NAME

# Load environment variables
load_dotenv()

# Check if environment variables are set
required_vars = ['QDRANT_URL', 'QDRANT_API_KEY', 'GOOGLE_API_KEY']
missing_vars = [var for var in required_vars if not os.getenv(var)]

if missing_vars:
    print(f"❌ Missing required environment variables: {', '.join(missing_vars)}")
    print("Please set these in your .env file")
else:
    print("✅ All required environment variables are set")

✅ All required environment variables are set


In [16]:
# Configure Google Generative AI
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))

# Function to generate embeddings using Google's text-embedding-004
def get_embedding(text):
    """Generate embeddings using Google's text-embedding-004 model"""
    try:
        embedding_response = genai.embed_content(
            model="models/text-embedding-004",
            content=text,
            task_type="retrieval_document"
        )
        
        return embedding_response["embedding"]
    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None

# Test embedding generation
test_embedding = get_embedding("This is a test for embedding generation")
print(f"Test embedding dimension: {len(test_embedding)}")

Test embedding dimension: 768


In [17]:
# Initialize Qdrant client
client = QdrantClient(
    url=os.getenv('QDRANT_URL'),
    api_key=os.getenv('QDRANT_API_KEY')
)

# Test connection
try:
    collections = client.get_collections().collections
    print(f"✅ Successfully connected to Qdrant Cloud")
    print(f"Available collections: {[c.name for c in collections]}")
except Exception as e:
    print(f"❌ Failed to connect to Qdrant Cloud: {e}")

✅ Successfully connected to Qdrant Cloud
Available collections: ['logistics_datapoints']


## Create Collection for Logistics Datapoints

In [7]:
# Create collection for logistics datapoints
collection_name = "logistics_datapoints"

# Check if collection already exists
collections = client.get_collections().collections
if any(c.name == collection_name for c in collections):
    print(f"Collection '{collection_name}' already exists")
    recreate = input("Do you want to recreate the collection? (y/n): ")
    if recreate.lower() == 'y':
        client.delete_collection(collection_name)
        print(f"Deleted existing collection '{collection_name}'")
    else:
        print("Keeping existing collection")

# Create collection if it doesn't exist or was deleted
collections = client.get_collections().collections
if not any(c.name == collection_name for c in collections):
    # Create collection with Google's embedding dimension (768)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=768,  # Google text-embedding-004 dimension
            distance=models.Distance.COSINE
        ),
        on_disk_payload=True,  # Store payload on disk for larger datasets
    )
    print(f"Created collection '{collection_name}'")
    
    # Create indices for efficient filtering
    index_fields = [
        "datapoint_id", 
        "datapoint_type", 
        "port_area", 
        "domain_area",
        "relevant_entity", 
        "regulation_category", 
        "regulation_subcategory", 
        "document_type",
        "source_document",
        "keywords"
    ]
    
    for field in index_fields:
        if field == "keywords":
            # Keywords is an array field
            client.create_payload_index(
                collection_name=collection_name,
                field_name=field,
                field_schema=models.PayloadSchemaType.KEYWORD
            )
        else:
            # Other fields are keyword fields
            client.create_payload_index(
                collection_name=collection_name,
                field_name=field,
                field_schema=models.PayloadSchemaType.KEYWORD
            )
    print(f"Created indices for fields: {', '.join(index_fields)}")

Collection 'logistics_datapoints' already exists
Deleted existing collection 'logistics_datapoints'
Created collection 'logistics_datapoints'
Created indices for fields: datapoint_id, datapoint_type, port_area, domain_area, relevant_entity, regulation_category, regulation_subcategory, document_type, source_document, keywords


## Load Normalized Datapoints

In [18]:
# Find all normalized datapoint files
normalized_dir = "../Data/Datapoints"
datapoint_files = glob.glob(f"{normalized_dir}/**/*_Datapoints.json", recursive=True)
print(f"Found {len(datapoint_files)} normalized datapoint files")

Found 8 normalized datapoint files


In [24]:
def load_datapoints(file_path, batch_size=100, collection_name=COLLECTION_NAME):
    """Load datapoints from a JSON file and add them to Qdrant collection."""
    client = get_qdrant_client()
    loaded = 0
    skipped = 0
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            datapoints = json.load(f)
            
        # Process datapoints in batches
        points = []
        
        for datapoint in datapoints:
            if 'regulation_detail' not in datapoint or not datapoint['regulation_detail']:
                skipped += 1
                continue
                
            # Generate embedding
            embedding = get_embedding(datapoint['regulation_detail'])
            if embedding is None:
                skipped += 1
                continue
                
            # Convert string ID to valid UUID
            datapoint_id = datapoint.get('datapoint_id')
            if datapoint_id:
                try:
                    # Check if datapoint_id is already a valid UUID
                    uuid.UUID(datapoint_id)
                    valid_id = datapoint_id
                except ValueError:
                    # Convert the string to a valid UUID using uuid5
                    valid_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, datapoint_id))
            else:
                valid_id = str(uuid.uuid4())
                
            # Add point to batch
            points.append(
                models.PointStruct(
                    id=valid_id,
                    vector=embedding,
                    payload=datapoint
                )
            )
            
            # Upload batch if it reaches batch_size
            if len(points) >= batch_size:
                client.upsert(
                    collection_name=collection_name,
                    points=points
                )
                loaded += len(points)
                points = []
                
        # Upload any remaining points
        if points:
            client.upsert(
                collection_name=collection_name,
                points=points
            )
            loaded += len(points)
            
        return loaded, skipped
    
    except Exception as e:
        print(f"Error loading datapoints from {file_path}: {e}")
        return 0, 0

In [25]:
# Load datapoints with checkpoints and better error handling
total_loaded = 0
total_skipped = 0
failed_files = []

# Make sure we're using the correct collection name
collection_name = COLLECTION_NAME  # Use the imported constant

for i, file_path in enumerate(tqdm(datapoint_files, desc="Processing files")):
    try:
        loaded, skipped = load_datapoints(file_path, collection_name=collection_name)
        total_loaded += loaded
        total_skipped += skipped
        
        # Add checkpoint after each file
        print(f"File {i+1}/{len(datapoint_files)}: {os.path.basename(file_path)}")
        print(f"  ✓ Loaded: {loaded}, Skipped: {skipped}")
        
        # Small pause between files to avoid rate limiting
        if i < len(datapoint_files) - 1:
            time.sleep(1)
            
    except Exception as e:
        print(f"❌ Error processing {os.path.basename(file_path)}: {e}")
        failed_files.append(file_path)
        continue

print(f"\n====== LOADING SUMMARY ======")
print(f"✅ Successfully loaded {total_loaded} datapoints")
print(f"⚠️ Skipped {total_skipped} datapoints (missing detail or embedding failed)")

if failed_files:
    print(f"\n❌ Failed to process {len(failed_files)} files:")
    for failed in failed_files:
        print(f"  - {os.path.basename(failed)}")
else:
    print("\n✅ All files processed successfully!")

# Verify points in collection using imported function
try:
    client = get_qdrant_client()
    count = client.count(collection_name=collection_name).count
    print(f"\n✓ Collection '{collection_name}' now contains {count} datapoints")
except Exception as e:
    print(f"\n❌ Error verifying collection: {e}")

Processing files:   0%|          | 0/8 [00:00<?, ?it/s]

File 1/8: Rotterdam_Datapoints.json
  ✓ Loaded: 82, Skipped: 0
File 2/8: Yangshan_Datapoints.json
  ✓ Loaded: 290, Skipped: 0
File 3/8: Hamburg_Datapoints.json
  ✓ Loaded: 65, Skipped: 0
File 4/8: Antwerp_Datapoints.json
  ✓ Loaded: 68, Skipped: 0
File 5/8: Singapore_Datapoints.json
  ✓ Loaded: 270, Skipped: 0
File 6/8: Riga_Datapoints.json
  ✓ Loaded: 128, Skipped: 0
File 7/8: IMO_Datapoints.json
  ✓ Loaded: 170, Skipped: 0
File 8/8: INCOTERMS_Datapoints.json
  ✓ Loaded: 273, Skipped: 0

✅ Successfully loaded 1346 datapoints
⚠️ Skipped 0 datapoints (missing detail or embedding failed)

✅ All files processed successfully!

✓ Collection 'logistics_datapoints' now contains 1345 datapoints


## Test Retrieval

In [26]:
# Test search by embedding
def search_datapoints(query, filter_conditions=None, limit=5):
    # Generate embedding for query
    query_embedding = get_embedding(query)
    
    # Build search filter
    search_filter = None
    if filter_conditions:
        must_conditions = []
        
        for field, value in filter_conditions.items():
            if field == "keywords":
                if isinstance(value, list):
                    must_conditions.append(
                        models.FieldCondition(
                            key=field,
                            match=models.MatchAny(any=value)
                        )
                    )
                else:
                    must_conditions.append(
                        models.FieldCondition(
                            key=field,
                            match=models.MatchValue(value=value)
                        )
                    )
            else:
                must_conditions.append(
                    models.FieldCondition(
                        key=field,
                        match=models.MatchValue(value=value)
                    )
                )
                
        search_filter = models.Filter(must=must_conditions)
    
    # Execute search
    results = client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        query_filter=search_filter,
        limit=limit
    )
    
    return results

# Test with a sample query
test_query = "Requirements for importing hazardous materials into Singapore"
test_results = search_datapoints(
    test_query,
    filter_conditions={"port_area": "Singapore"}
)

print(f"Query: {test_query}")
print("\nResults:")
for i, result in enumerate(test_results, 1):
    print(f"\n{i}. {result.payload.get('datapoint_id', 'No ID')} (Score: {result.score:.4f})")
    print(f"   Type: {result.payload.get('datapoint_type', 'N/A')}")
    print(f"   Entity: {result.payload.get('relevant_entity', 'N/A')}")
    print(f"   Detail: {result.payload.get('regulation_detail', 'N/A')}")
    if 'keywords' in result.payload:
        print(f"   Keywords: {', '.join(result.payload['keywords'])}")

Query: Requirements for importing hazardous materials into Singapore

Results:

1. SINGAPORE_DP_CUSTOMS_PERMIT_APPLICATION_TRADENET_021 (Score: 0.8363)
   Type: Procedure
   Entity: Importer
   Detail: Permit applications for importing goods into Singapore, including containerized cargo, must be submitted via TradeNet®.
   Keywords: permit application, import, TradeNet, containerized cargo, electronic submission

2. SINGAPORE_DP_STATUTORY_CERTIFICATES_VALID_DEPARTURE_097 (Score: 0.8220)
   Type: Requirement
   Entity: Vessel
   Detail: Vessel must ensure all statutory and mandatory certificates are in force when proceeding to sea from Singapore.
   Keywords: statutory certificates, mandatory certificates, valid certificates, seaworthiness, compliance

3. SINGAPORE_DP_SHT_NON_COMPLIANCE_DENIAL_ENTRY_066 (Score: 0.8211)
   Type: Regulation
   Entity: Tanker (Single Hulled)
   Detail: Vessels not complying with MARPOL Convention on SHTs may be denied entry into Singapore port.
   Keywords

  results = client.search(


In [27]:
# Test hybrid search with keywords
keyword_query = "container customs requirements"
keyword_results = search_datapoints(
    keyword_query,
    filter_conditions={"keywords": ["customs", "container"]}
)

print(f"Query: {keyword_query}")
print("\nResults:")
for i, result in enumerate(keyword_results, 1):
    print(f"\n{i}. {result.payload.get('datapoint_id', 'No ID')} (Score: {result.score:.4f})")
    print(f"   Type: {result.payload.get('datapoint_type', 'N/A')}")
    print(f"   Entity: {result.payload.get('relevant_entity', 'N/A')}")
    print(f"   Detail: {result.payload.get('regulation_detail', 'N/A')}")
    if 'keywords' in result.payload:
        print(f"   Keywords: {', '.join(result.payload['keywords'])}")

Query: container customs requirements

Results:

1. INCO_DP_248_Suitability_General_AnyModeTermsRecommendedContainer (Score: 0.8383)
   Type: Guideline
   Entity: General
   Detail: For container shipments, it is generally recommended to use Incoterms designed for any mode of transport (EXW, FCA, CPT, CIP, DAP, DPU, DDP).
   Keywords: suitability, container, any mode terms, exw, fca, cpt, cip, dap, dpu, ddp, recommended

2. INCO_DP_236_Suitability_EXW_Container_NotRecommended (Score: 0.8336)
   Type: Guideline
   Entity: EXW
   Detail: EXW (Ex Works) is generally discouraged for international container shipments due to buyer export complexities.
   Keywords: exw, ex works, suitability, container, not recommended, discouraged, export

3. INCO_DP_013_Obligation_EXW_Buyer_ImportClearance (Score: 0.8156)
   Type: Obligation
   Entity: Buyer
   Detail: Under EXW, the buyer is responsible for handling and paying for import clearance in the destination country.
   Keywords: exw, ex works, buy

  results = client.search(


## Collection Statistics

In [29]:
# Get collection statistics
collection_info = client.get_collection(collection_name)
print(f"Collection: {collection_name}")
print(f"Points count: {collection_info.vectors_count}")
print(f"Indexed vectors: {collection_info.indexed_vectors_count}")
print(f"Status: {collection_info.status}")

# Get counts by datapoint_type
try:
    collection = client.get_collection(collection_name)
    payload_types = {}
    for point in client.scroll(collection_name=collection_name, limit=10000)[0]:
        datapoint_type = point.payload.get("datapoint_type")
        if datapoint_type:
            payload_types[datapoint_type] = payload_types.get(datapoint_type, 0) + 1
    
    print("\nDatapoint types:")
    for dtype, count in sorted(payload_types.items(), key=lambda x: x[1], reverse=True):
        print(f"  - {dtype}: {count}")
except Exception as e:
    print(f"Error getting datapoint type statistics: {e}")

# Get counts by port_area
try:
    port_areas = {}
    for point in client.scroll(collection_name=collection_name, limit=10000)[0]:
        port_area = point.payload.get("port_area")
        if port_area:
            port_areas[port_area] = port_areas.get(port_area, 0) + 1
    
    print("\nPort areas:")
    for area, count in sorted(port_areas.items(), key=lambda x: x[1], reverse=True)[:10]: # Show top 10
        print(f"  - {area}: {count}")
except Exception as e:
    print(f"Error getting port area statistics: {e}")

Collection: logistics_datapoints
Points count: None
Indexed vectors: 0
Status: green

Datapoint types:
  - Requirement: 187
  - Cost Allocation: 134
  - Fact: 96
  - Regulation: 94
  - Procedure: 83
  - Obligation: 75
  - Definition: 49
  - Benefit: 35
  - Power: 32
  - Rule: 25
  - Guideline: 21
  - Penalty: 18
  - Tariff Value: 17
  - Purpose: 17
  - Document Content - Key Element: 16
  - Documentation Requirement: 15
  - Challenge: 13
  - Document Function: 13
  - Clarification: 11
  - Risk Transfer: 11
  - Instruction: 11
  - Digital Platform: 10
  - Checklist: 10
  - Document Content - Instruction: 10
  - Prohibition: 9
  - Recommendation: 8
  - Document Requirement: 8
  - Exemption: 8
  - Responsibility: 8
  - Rationale: 8
  - Notification Requirement: 7
  - Standard: 6
  - Difference: 6
  - Tariff Information: 6
  - Checklist Item: 6
  - Information Gap: 6
  - Service Feature: 6
  - Capability: 5
  - Guideline - Best Practice: 5
  - Advantage: 5
  - Customs Procedure: 5
  - Feat