In [1]:
import os
import sys
import json
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
import subprocess
from sklearn.model_selection import train_test_split

# Add project directory to path for imports
sys.path.append('..')

# Import our Qdrant utilities
from utils.qdrant_client import get_qdrant_client, get_embedding, create_collection

In [2]:
from dotenv import load_dotenv
import google.generativeai as genai
# Load environment variables
load_dotenv()

# Configure Google Generative AI
genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))

In [3]:
# Test connection to Qdrant
client = get_qdrant_client()
try:
    collections = client.get_collections().collections
    print(f"✅ Successfully connected to Qdrant Cloud")
    print(f"Available collections: {[c.name for c in collections]}")
except Exception as e:
    print(f"❌ Failed to connect to Qdrant Cloud: {e}")
    
# Test embedding generation
test_embedding = get_embedding("This is a test for embedding generation")
print(f"Test embedding dimension: {len(test_embedding)}")

✅ Successfully connected to Qdrant Cloud
Available collections: ['case_generation_references', 'logistics_datapoints', 'icl_logistics_case_examples']
Test embedding dimension: 768


## 2. Load and Explore Parquet Data

In [4]:
# Start caffeinate to prevent Mac from sleeping during exploration
print("Starting caffeinate for data exploration...")
caffeinate_process = subprocess.Popen(["caffeinate", "-i", "-d"], 
                                     stdout=subprocess.DEVNULL, 
                                     stderr=subprocess.DEVNULL)

try:
    # Load the evaluated cases
    data_dir = Path("../Data/GeneratedCases")
    evaluated_file = data_dir / "evaluated_cases_20250330_212702.parquet"
    
    if not evaluated_file.exists():
        print(f"⚠️ Could not find {evaluated_file}")
        # Find available parquet files
        parquet_files = list(data_dir.glob("*.parquet"))
        if parquet_files:
            print(f"Found these parquet files: {[f.name for f in parquet_files]}")
            evaluated_file = parquet_files[-1]  # Use the latest file
            print(f"Using: {evaluated_file}")
        else:
            raise FileNotFoundError(f"No parquet files found in {data_dir}")
    
    # Load the dataset
    print(f"Loading data from {evaluated_file}")
    df = pd.read_parquet(evaluated_file)
    print(f"Loaded {len(df)} cases")
    
    # Show columns
    print(f"\nDataFrame columns: {df.columns.tolist()}")
    
    # Display a sample
    print("\nSample row:")
    sample_row = df.iloc[0]
    for col in ['case_id', 'title']:
        print(f"{col}: {sample_row[col]}")
    print(f"enhanced_case: {sample_row['enhanced_case'][:100]}...")
    print(f"solution: {sample_row['solution'][:100]}...")
    
    # Display basic statistics
    if "realism_score" in df.columns:
        print("\nEvaluation Statistics:")
        metrics = ["realism_score", "complexity_score", "educational_value", "solution_quality"]
        for metric in metrics:
            if metric in df.columns:
                print(f"Average {metric}: {df[metric].mean():.2f}")
        
        # Count qualification status
        if "overall_qualification" in df.columns:
            qualified_count = df[df["overall_qualification"] == "QUALIFIED"].shape[0]
            print(f"Qualified cases: {qualified_count} ({qualified_count/len(df)*100:.1f}%)")
            
    # Create case text for embedding (combine title and case content)
    df["case_for_embedding"] = df["title"] + "\n\n" + df["enhanced_case"]
    print(f"\nAdded 'case_for_embedding' column (first 100 chars): {df['case_for_embedding'].iloc[0][:100]}...")
    
    # Split into train and test sets (80/20)
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    print(f"Split into: Train set: {len(train_df)} cases, Test set: {len(test_df)} cases")
    
    # Save train/test split for future reference
    train_df.to_parquet(data_dir / "train_cases.parquet")
    test_df.to_parquet(data_dir / "test_cases.parquet")
    print("✅ Saved train/test split to separate parquet files")
    
finally:
    # Make sure to terminate caffeinate
    if 'caffeinate_process' in locals():
        print("Terminating caffeinate...")
        caffeinate_process.terminate()

Starting caffeinate for data exploration...
Loading data from ../Data/GeneratedCases/evaluated_cases_20250330_212702.parquet
Loaded 307 cases

DataFrame columns: ['case_id', 'title', 'enhanced_case', 'solution', 'file_path', 'enhanced_case_length', 'solution_length', 'realism_score', 'complexity_score', 'educational_value', 'solution_quality', 'overall_qualification', 'evaluation_summary', 'improvement_suggestions']

Sample row:
case_id: case-20250329-203754-mtqgzd
title: Strangled Supply Lines: Navigating Baltic Port Congestion and Seafood Spoilage Risks
enhanced_case: **Scenario:** Nordic Seafood Imports (NSI), a leading importer of frozen seafood into Scandinavia an...
solution: ## Executive Summary Nordic Seafood Imports (NSI) is facing a critical supply chain disruption due t...

Evaluation Statistics:
Average realism_score: 8.11
Average complexity_score: 7.12
Average educational_value: 8.71
Average solution_quality: 7.74
Qualified cases: 304 (99.0%)

Added 'case_for_embedding' co

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245 entries, 240 to 102
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   case_id                  245 non-null    object 
 1   title                    245 non-null    object 
 2   enhanced_case            245 non-null    object 
 3   solution                 245 non-null    object 
 4   file_path                245 non-null    object 
 5   enhanced_case_length     245 non-null    int64  
 6   solution_length          245 non-null    int64  
 7   realism_score            243 non-null    float64
 8   complexity_score         243 non-null    float64
 9   educational_value        243 non-null    float64
 10  solution_quality         243 non-null    float64
 11  overall_qualification    243 non-null    object 
 12  evaluation_summary       243 non-null    object 
 13  improvement_suggestions  243 non-null    object 
 14  case_for_embedding       245 

In [6]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62 entries, 183 to 302
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   case_id                  62 non-null     object 
 1   title                    62 non-null     object 
 2   enhanced_case            62 non-null     object 
 3   solution                 62 non-null     object 
 4   file_path                62 non-null     object 
 5   enhanced_case_length     62 non-null     int64  
 6   solution_length          62 non-null     int64  
 7   realism_score            61 non-null     float64
 8   complexity_score         61 non-null     float64
 9   educational_value        61 non-null     float64
 10  solution_quality         61 non-null     float64
 11  overall_qualification    61 non-null     object 
 12  evaluation_summary       61 non-null     object 
 13  improvement_suggestions  61 non-null     object 
 14  case_for_embedding       62 no

## 3. Check Embedding Model Availability

In [11]:
# Test embedding generation
print("Testing embedding generation...")

# Try with a simple text
test_text = "This is a test case about maritime logistics and shipping containers."
test_embedding = get_embedding(test_text)

if test_embedding:
    print(f"✅ Embedding generation works! Vector length: {len(test_embedding)}")
    print(f"First 5 dimensions: {test_embedding[:5]}")
else:
    print("⚠️ Failed to generate test embedding. Check API key and connectivity.")
    # You might need to add code here to troubleshoot embedding generation

Testing embedding generation...
✅ Embedding generation works! Vector length: 768
First 5 dimensions: [0.017684225, 0.008641249, -0.014321081, -0.022147447, 0.062374268]


## 4. Create/Prepare Qdrant Collection

In [7]:
# Define collection name
CASE_COLLECTION_NAME = "icl_logistics_case_examples"

print(f"Setting up Qdrant collection '{CASE_COLLECTION_NAME}'...")
client = get_qdrant_client()

# Check if we can connect to Qdrant
try:
    # Get list of current collections
    collections = client.get_collections().collections
    print(f"Connected to Qdrant. Found {len(collections)} existing collections:")
    for c in collections:
        print(f" - {c.name}")
    
    # Check if collection already exists
    if any(c.name == CASE_COLLECTION_NAME for c in collections):
        recreate = input(f"Collection '{CASE_COLLECTION_NAME}' already exists. Recreate? (y/n): ")
        if recreate.lower() == 'y':
            client.delete_collection(CASE_COLLECTION_NAME)
            print(f"Deleted existing collection")
            
            # Create collection directly with client
            from qdrant_client.http import models
            
            # Create vector configuration
            # You might need to adjust the size based on your embedding model
            vector_size = 768  # Change this to match your model's embedding size
            
            # Get a test embedding to determine vector size
            test_text = "This is a test for embedding size"
            test_vector = get_embedding(test_text)
            if test_vector:
                vector_size = len(test_vector)
                print(f"Detected embedding size: {vector_size}")
            
            # Create the collection
            client.create_collection(
                collection_name=CASE_COLLECTION_NAME,
                vectors_config=models.VectorParams(
                    size=vector_size,
                    distance=models.Distance.COSINE
                )
            )
            print(f"✅ Created new collection: {CASE_COLLECTION_NAME}")
        else:
            print(f"Using existing collection")
    else:
        # Create collection directly with client
        from qdrant_client.http import models
        
        # Create vector configuration
        # You might need to adjust the size based on your embedding model
        vector_size = 768  # Default size for many embedding models
        
        # Get a test embedding to determine vector size
        test_text = "This is a test for embedding size"
        test_vector = get_embedding(test_text)
        if test_vector:
            vector_size = len(test_vector)
            print(f"Detected embedding size: {vector_size}")
        
        # Create the collection
        client.create_collection(
            collection_name=CASE_COLLECTION_NAME,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        print(f"✅ Created new collection: {CASE_COLLECTION_NAME}")
        
except Exception as e:
    print(f"⚠️ Error connecting to Qdrant: {str(e)}")

Setting up Qdrant collection 'icl_logistics_case_examples'...
Connected to Qdrant. Found 3 existing collections:
 - case_generation_references
 - logistics_datapoints
 - icl_logistics_case_examples
Using existing collection


## 5. Create DataFrame with Only Needed Columns

In [None]:
# Create simplified dataframe with only columns we need for Qdrant
qdrant_df = train_df[['case_id', 'title', 'solution', 'case_for_embedding']].copy()

# Display info about the simplified dataframe
print(f"Created simplified dataframe for Qdrant upload with {len(qdrant_df)} rows")
print(f"Columns: {qdrant_df.columns.tolist()}")

# Show a sample of what we'll upload
print("\nSample row for Qdrant:")
sample = qdrant_df.iloc[0]
print(f"ID: {sample['case_id']}")
print(f"Title: {sample['title']}")
print(f"Solution (first 100 chars): {sample['solution'][:100]}...")
print(f"Case for embedding (first 100 chars): {sample['case_for_embedding'][:100]}...")

In [9]:
qdrant_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245 entries, 240 to 102
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   case_id             245 non-null    object
 1   title               245 non-null    object
 2   solution            245 non-null    object
 3   case_for_embedding  245 non-null    object
dtypes: object(4)
memory usage: 9.6+ KB


In [11]:
qdrant_df.head()

Unnamed: 0,case_id,title,solution,case_for_embedding
240,case-20250330-081720-dj177a,**Baltic Salmon Run: Navigating Regulatory Hur...,## Executive Summary Baltic Breeze Seafood fac...,**Baltic Salmon Run: Navigating Regulatory Hur...
143,case-20250329-201544-4yus0r,Baltic Bottleneck: Optimizing Timber Exports A...,## Executive Summary Scandinavian Timber Expor...,Baltic Bottleneck: Optimizing Timber Exports A...
253,case-20250329-200538-utxdtj,Arctic Sunrise: Navigating Rotterdams Congesti...,## Executive Summary Neptune Shipping faces a ...,Arctic Sunrise: Navigating Rotterdams Congesti...
76,case-20250329-083901-mhaul2,Arctic Peril: GlobalTechs Smartphone Shipment ...,"Okay, here's a detailed solution to the ""Arcti...",Arctic Peril: GlobalTechs Smartphone Shipment ...
113,case-20250329-004330-witq8y,**Guangdong Gale: A Baltic Furniture Retailer ...,## Executive Summary Baltic Breeze Imports (BB...,**Guangdong Gale: A Baltic Furniture Retailer ...


In [13]:
def create_qdrant_point(row):
    """Create a Qdrant point from a dataframe row"""
    import uuid
    
    # Generate embedding
    case_embedding = get_embedding(row["case_for_embedding"])
    
    if not case_embedding:
        return None
    
    # Create simplified payload
    payload = {
        "case_id": row["case_id"],
        "title": row["title"],
        "solution": row["solution"]
    }
    
    # Create UUID based on case_id for consistent retrieval
    # This ensures the same case always gets the same UUID
    point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, row["case_id"]))
    
    # Create point
    point = models.PointStruct(
        id=point_id,  # Use UUID string
        vector=case_embedding,
        payload=payload
    )
    
    return point

## 6. Test Upload with 2 Records

In [14]:
# Test upload with just 2 records
print("Testing upload with 2 records...")

test_points = []
for idx, row in qdrant_df.head(2).iterrows():
    point = create_qdrant_point(row)
    if point:
        test_points.append(point)
        print(f"Created point for case: {row['case_id']}")

# Upload test points
if test_points:
    try:
        client.upsert(
            collection_name=CASE_COLLECTION_NAME,
            points=test_points
        )
        print(f"✅ Successfully uploaded {len(test_points)} test points to Qdrant")
        
        # Verify points were uploaded
        count = client.count(collection_name=CASE_COLLECTION_NAME).count
        print(f"Collection now contains {count} points")
        
    except Exception as e:
        print(f"⚠️ Error uploading test points: {e}")
else:
    print("⚠️ No test points were created - check embedding generation")

Testing upload with 2 records...
Created point for case: case-20250330-081720-dj177a
Created point for case: case-20250329-201544-4yus0r
✅ Successfully uploaded 2 test points to Qdrant
Collection now contains 2 points


## 7. Upload All Records

In [15]:
# Start caffeinate for the full upload process
print("Starting caffeinate for full upload process...")
caffeinate_process = subprocess.Popen(["caffeinate", "-i", "-d", "-m"], 
                                     stdout=subprocess.DEVNULL, 
                                     stderr=subprocess.DEVNULL)

try:
    # Upload remaining training cases to collection
    print(f"\nUploading all {len(qdrant_df) - 2} remaining training cases to Qdrant...")
    
    # Batch size for uploads
    BATCH_SIZE = 20
    
    # Skip the first 2 we already uploaded
    remaining_df = qdrant_df.iloc[2:]
    
    # Process in batches to avoid overloading API
    total_uploaded = 0
    
    for i in tqdm(range(0, len(remaining_df), BATCH_SIZE)):
        batch = remaining_df.iloc[i:i+BATCH_SIZE]
        points = []
        
        for _, row in batch.iterrows():
            point = create_qdrant_point(row)  # Using the same function that works
            if point:
                points.append(point)
        
        # Upload batch
        if points:
            try:
                client.upsert(
                    collection_name=CASE_COLLECTION_NAME,
                    points=points
                )
                total_uploaded += len(points)
            except Exception as e:
                print(f"Error uploading batch: {e}")
    
    # Verify total upload
    final_count = client.count(collection_name=CASE_COLLECTION_NAME).count
    print(f"✅ Upload complete! Collection contains {final_count} points")
    print(f"Successfully uploaded {total_uploaded} cases in this run + 2 test cases")
    
finally:
    # Make sure to terminate caffeinate when done
    if 'caffeinate_process' in locals():
        print("Terminating caffeinate - your Mac can sleep again...")
        caffeinate_process.terminate()

Starting caffeinate for full upload process...

Uploading all 243 remaining training cases to Qdrant...


  0%|          | 0/13 [00:00<?, ?it/s]

✅ Upload complete! Collection contains 245 points
Successfully uploaded 243 cases in this run + 2 test cases
Terminating caffeinate - your Mac can sleep again...


## 8. Test Retrieval with Mock Question

In [16]:
# Test the retrieval with a mock question
print("Testing case retrieval with a mock question...")

# Create a test query
mock_question = """
We are a European automotive parts manufacturer shipping components to assembly plants in Asia. 
Recently, we've experienced significant delays at the Suez Canal and container shortages. 
What's the best approach to mitigate these logistics challenges?
"""

# Generate embedding for mock question
mock_embedding = get_embedding(mock_question)

if mock_embedding:
    # Search for similar cases
    results = client.search(
        collection_name=CASE_COLLECTION_NAME,
        query_vector=mock_embedding,
        limit=2
    )
    
    # Display results
    print("\nMost relevant cases for in-context learning:")
    for i, result in enumerate(results):
        print(f"\nExample {i+1}: {result.payload['title']} (Similarity: {result.score:.2f})")
        print(f"Solution excerpt: {result.payload['solution'][:200]}...")
else:
    print("⚠️ Failed to generate embedding for mock question")

# Now test with a case from the test set
print("\nTesting retrieval with a case from test set...")

# Pick a random case from test set
test_case = test_df.sample(1).iloc[0]
print(f"Test case: {test_case['title']}")

# Generate embedding for test case
test_embedding = get_embedding(test_case["case_for_embedding"])

if test_embedding:
    # Search for similar cases
    results = client.search(
        collection_name=CASE_COLLECTION_NAME,
        query_vector=test_embedding,
        limit=2
    )
    
    # Display results
    print("\nMost relevant cases for in-context learning:")
    for i, result in enumerate(results):
        print(f"\nExample {i+1}: {result.payload['title']} (Similarity: {result.score:.2f})")
        print(f"Solution excerpt: {result.payload['solution'][:200]}...")
else:
    print("⚠️ Failed to generate embedding for test case")

# Test with manually created retrieval function
print("\nTesting with manually created retrieval function...")

# Import the function (if it exists)
try:
    from utils.case_retrieval import retrieve_similar_cases
    
    # Use the function with our mock question
    similar_examples = retrieve_similar_cases(
        mock_question, 
        num_examples=2,
        collection_name=CASE_COLLECTION_NAME  # Make sure this matches what we used above
    )
    
    # Display results
    for i, example in enumerate(similar_examples):
        print(f"\nExample {i+1}: {example['title']} (Similarity: {example['similarity_score']:.2f})")
        print(f"Solution excerpt: {example['solution'][:200]}...")
        
except ImportError:
    print("⚠️ Could not import retrieve_similar_cases - make sure utils/case_retrieval.py exists")
except Exception as e:
    print(f"⚠️ Error using retrieve_similar_cases: {e}")

Testing case retrieval with a mock question...


  results = client.search(



Most relevant cases for in-context learning:

Example 1: The Baltic Bottleneck: Navigating Semiconductor Supply Chain Disruptions in Northern Europe (Similarity: 0.77)
Solution excerpt: ## Executive Summary Sunbeam Electronics faces significant challenges in its semiconductor supply chain from China to Northern Europe, resulting in costly delays and potential contract losses. This so...

Example 2: Suez Squeeze: Nordic Seafoods Blockchain-Enabled Resilience (Similarity: 0.76)
Solution excerpt: ## Executive Summary Nordic Seafood Importers (NSI) faces significant disruptions due to Suez Canal congestion and geopolitical instability, impacting transit times, freight costs, and cold chain inte...

Testing retrieval with a case from test set...
Test case: The Jade Dragon Debacle: Navigating Congestion, Compliance, and Contingencies in the Baltic Supply Chain


  results = client.search(



Most relevant cases for in-context learning:

Example 1: Stranded Serenity: Navigating Baltic Port Congestion and Maritime Delays (Similarity: 0.91)
Solution excerpt: ## Executive Summary OceanTech Logistics faces a significant challenge due to the MV *Jade Dragon*'s engine malfunction and the resulting delays impacting Nordic Furnishings' critical furniture shipme...

Example 2: **Jade Dragon Delay: Navigating Congestion, Compliance, and Contractual Risk in Global Logistics** (Similarity: 0.90)
Solution excerpt: ## Executive Summary The "Jade Dragon" delay presents a significant challenge to StellarFlow Logistics, threatening both financial losses due to contractual penalties and reputational damage with a ke...

Testing with manually created retrieval function...

Example 1: The Baltic Bottleneck: Navigating Semiconductor Supply Chain Disruptions in Northern Europe (Similarity: 0.77)
Solution excerpt: ## Executive Summary Sunbeam Electronics faces significant challenges in its semic