# Electronic Parts Country of Origin Prediction

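A simple demo that predicts an electronic part's country of origin from its text description, using BGE-Large embeddings served through vLLM and a k-nearest-neighbors (KNN) classifier.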

## 1. Setup and Configuration

In [20]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

sys.path.append('app/utils/')
from vllm_client import create_vllm_client, get_embeddings

In [21]:
# Connection settings for the embedding service, read from environment
# variables (the workbench deployment is expected to provide them).
# Endpoint and model fall back to the demo defaults; the API key has no
# fallback and is None when the variable is unset.
VLLM_ENDPOINT = os.environ.get(
    'VLLM_ENDPOINT',
    'https://bge-large-arrow-embedding.apps.cluster-5qlcr.5qlcr.sandbox1342.opentlc.com',
)
EMBEDDING_MODEL = os.environ.get('EMBEDDING_MODEL', 'bge-large')
VLLM_API_KEY = os.environ.get('VLLM_API_KEY')

# Report the effective settings; only the key's presence is shown, never its value.
if VLLM_API_KEY:
    api_key_status = '[CONFIGURED]'
else:
    api_key_status = '[MISSING - set in deployment]'

print(f"Endpoint: {VLLM_ENDPOINT}")
print(f"Model: {EMBEDDING_MODEL}")
print(f"API Key: {api_key_status}")

Endpoint: https://bge-large-arrow-embedding.apps.cluster-5qlcr.5qlcr.sandbox1342.opentlc.com
Model: bge-large
API Key: [CONFIGURED]


In [22]:
# Local fallback configuration for runs where the environment is not set up.
#
# `setdefault` only fills in variables that are missing, so values provided by
# the deployment (in particular a real API key) are never overwritten by the
# placeholders below. To force a value, assign `os.environ[...]` directly.
# Prefer exporting VLLM_API_KEY before starting the kernel over pasting a key
# into the notebook, where it can leak through version control.
PLACEHOLDER_API_KEYS = ('', 'API_KEY', 'your-api-key-here')

os.environ.setdefault('VLLM_ENDPOINT', 'https://bge-large-arrow-embedding.apps.cluster-5qlcr.5qlcr.sandbox1342.opentlc.com')
os.environ.setdefault('EMBEDDING_MODEL', 'bge-large')
os.environ.setdefault('VLLM_API_KEY', 'API_KEY')  # Replace with your actual API key

# Re-bind the names the later cells pass to the client; without this, values
# set in this cell would never reach `create_vllm_client`.
VLLM_ENDPOINT = os.environ['VLLM_ENDPOINT']
EMBEDDING_MODEL = os.environ['EMBEDDING_MODEL']
VLLM_API_KEY = os.environ['VLLM_API_KEY']

# A key counts as configured only if it is not one of the known placeholders
# (the previous check compared against a placeholder this cell never wrote,
# so it reported [CONFIGURED] unconditionally).
api_key_configured = VLLM_API_KEY not in PLACEHOLDER_API_KEYS

print(f"Endpoint: {VLLM_ENDPOINT}")
print(f"Model: {EMBEDDING_MODEL}")
print(f"API Key: {'[CONFIGURED]' if api_key_configured else '[NEEDS CONFIGURATION]'}")

Endpoint: https://bge-large-arrow-embedding.apps.cluster-5qlcr.5qlcr.sandbox1342.opentlc.com
Model: bge-large
API Key: [CONFIGURED]


## 2. Load Data

In [23]:
# Read the synthetic electronics parts dataset into a DataFrame
parts_csv = 'app/data/synthetic_electronics_parts.csv'
df = pd.read_csv(parts_csv)

# Preview the first five rows (rendered as the cell's output)
df.head(5)

Unnamed: 0,Part_Description,Country_Of_Origin
0,"high-quality test equipment components, harsh ...",Germany
1,A certified connector (heavy duty) for automot...,Germany
2,Manufactured for chipset needs: network IC wit...,Taiwan
3,power semiconductor (high-bandwidth) with fast...,South Korea
4,A automotive assembly sensor assembly for OSAT...,Malaysia


## 3. Connect to vLLM and Generate Embeddings

In [26]:
# Build the client used for the embedding requests from the configured
# endpoint, model name and API key
client = create_vllm_client(
    VLLM_ENDPOINT,
    EMBEDDING_MODEL,
    VLLM_API_KEY,
)

In [28]:
# Generate embeddings for part descriptions
descriptions = df['Part_Description'].tolist()
print(f"🔄 Generating embeddings for {len(descriptions)} descriptions...")

embeddings = get_embeddings(client, descriptions, EMBEDDING_MODEL)

# Explicit None/length check rather than `if embeddings:` so the test is also
# well-defined if the helper returns an array instead of a list.
if embeddings is None or len(embeddings) == 0:
    # Fail loudly here: every later cell needs `X`, and merely printing would
    # either surface as a confusing NameError further down or, worse, silently
    # reuse a stale `X` left in the kernel by an earlier run.
    raise RuntimeError(
        "❌ Failed to generate embeddings. "
        "Check your vLLM endpoint and API key configuration"
    )

X = np.array(embeddings)

# Labels are taken from `df` row by row, so there must be exactly one
# fixed-length vector per description.
if X.ndim != 2 or X.shape[0] != len(descriptions):
    raise RuntimeError(
        f"Expected one embedding vector per description "
        f"({len(descriptions)} rows), got array of shape {X.shape}"
    )

print(f"✅ Generated embeddings with shape: {X.shape}")
print(f"   - Embedding dimension: {X.shape[1]}")

🔄 Generating embeddings for 500 descriptions...
An API error occurred: Connection error.


AttributeError: 'APIConnectionError' object has no attribute 'status_code'

## 4. Train KNN Classifier

In [None]:
# Turn the country names into integer class ids; `class_names[id]` maps an id
# back to its country.
label_encoder = LabelEncoder()
country_labels = df['Country_Of_Origin']
y = label_encoder.fit(country_labels).transform(country_labels)
class_names = label_encoder.classes_

# Summarise what the classifier will be trained on
n_samples, n_countries = len(X), len(class_names)
print(f"📊 Dataset info:")
print(f"   - Total samples: {n_samples}")
print(f"   - Number of countries: {n_countries}")
print(f"   - Countries: {list(class_names)}")

In [None]:
# Hold out 30% of the rows for evaluation, stratified by country and with a
# fixed seed so the split is repeatable. With 4 or fewer samples no split is
# attempted and the same rows are used for both training and testing.
if len(X) <= 4:
    X_train, X_test, y_train, y_test = X, X, y, y
    print(f"Small dataset: using all {len(X)} samples for both train and test")
else:
    split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = split_parts
    print(f"Split data: {len(X_train)} train, {len(X_test)} test")

In [None]:
# Fit a k-nearest-neighbours classifier on the training embeddings.
# k is 3, capped at the number of training rows so tiny datasets still work.
k = len(X_train) if len(X_train) < 3 else 3
print(f"🤖 Training KNN classifier with k={k}...")

knn_settings = {
    'n_neighbors': k,
    'weights': 'distance',  # nearer neighbours carry more weight in the vote
    'metric': 'cosine',     # neighbours are ranked by cosine distance
}
knn = KNeighborsClassifier(**knn_settings).fit(X_train, y_train)
print("✅ KNN training complete")

## 5. Evaluate Model

In [None]:
# Score the held-out rows and compare against uniform random guessing
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
random_baseline = 1/len(class_names)  # chance level if every country were equally likely

print(f"🎯 Model Performance:")
print(f"   - Accuracy: {accuracy:.1%}")
print(f"   - Baseline (random): {random_baseline:.1%}")
print(f"   - Improvement: {accuracy/random_baseline:.1f}x better than random")

In [None]:
# Show detailed per-country precision / recall / F1.
# `labels` pins the report to every encoded class id (0..n-1, as produced by
# the label encoder). Without it, scikit-learn infers the classes from
# y_test/y_pred, and if any country is absent from the test split the number
# of inferred classes no longer matches `target_names`.
print("\n📊 Detailed Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(class_names))),
    target_names=class_names,
))

In [None]:
# Show sample predictions for the first few test rows
print("\n🔍 Sample Predictions:")
print("=" * 80)

for i in range(min(len(X_test), 5)):
    # train_test_split shuffles the rows, so test row i is NOT descriptions[i].
    # Recover the row's original position by matching its embedding against X
    # (test rows are exact copies of rows of X, so equality is exact; if two
    # rows of X are identical the first match is used).
    matches = np.flatnonzero((np.asarray(X) == X_test[i]).all(axis=1))
    if len(matches) and matches[0] < len(descriptions):
        desc = descriptions[matches[0]]
    else:
        desc = "Sample description"
    actual = class_names[y_test[i]]
    predicted = class_names[y_pred[i]]

    print(f"\nSample {i+1}:")
    print(f"Description: {desc[:100]}{'...' if len(desc) > 100 else ''}")
    print(f"Actual: {actual}")
    print(f"Predicted: {predicted}")
    print(f"Result: {'✅ Correct' if actual == predicted else '❌ Incorrect'}")

## 6. Visualize Results

In [None]:
# Plot the confusion matrix (rows: actual country, columns: predicted country)
if len(class_names) > 1:
    # Pin the matrix to every encoded class id so it is always
    # len(class_names) x len(class_names). Otherwise a country missing from
    # both y_test and y_pred shrinks the matrix and the tick labels below no
    # longer line up with its rows/columns.
    cm = confusion_matrix(y_test, y_pred, labels=list(range(len(class_names))))

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
               xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix - Country of Origin Prediction')
    plt.xlabel('Predicted Country')
    plt.ylabel('Actual Country')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("Skipping confusion matrix (only one class)")

## 7. Test with New Examples

In [None]:
# Test with new part descriptions
test_descriptions = [
    "automotive-grade microcontroller with advanced safety features",
    "low-cost ceramic capacitor for consumer electronics",
    "precision optical sensor for industrial automation"
]

print("🧪 Testing with new examples:")
print("=" * 50)

# Generate embeddings for test descriptions.
# Uses the imported `get_embeddings(client, texts, model)` helper, the same
# call that produced the training embeddings, rather than a
# `client.get_embeddings` method that nothing else in this notebook relies on.
test_embeddings = get_embeddings(client, test_descriptions, EMBEDDING_MODEL)

# Explicit None/length check so the test is well-defined for lists and arrays
if test_embeddings is not None and len(test_embeddings) > 0:
    test_X = np.array(test_embeddings)
    test_predictions = knn.predict(test_X)
    test_probabilities = knn.predict_proba(test_X)

    for i, desc in enumerate(test_descriptions):
        predicted_country = class_names[test_predictions[i]]
        # Highest class probability for this row, shown as the confidence
        confidence = test_probabilities[i].max()

        print(f"\nTest {i+1}: {desc}")
        print(f"Predicted Country: {predicted_country}")
        print(f"Confidence: {confidence:.2f}")
else:
    print("❌ Failed to generate embeddings for test examples")

## Summary

🎉 **Demo Complete!**

This notebook demonstrates:
- ✅ BGE-Large embeddings via vLLM API
- ✅ KNN classification for country prediction
- ✅ Model evaluation and visualization
- ✅ Real-time inference with new examples

The model uses semantic embeddings from transformer models to understand the relationship between part descriptions and their likely countries of origin.