# Electronic Parts Country of Origin Prediction

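A simple demo that predicts an electronic part's country of origin from its text description, using BGE-Large embeddings served through vLLM and a k-nearest-neighbors (KNN) classifier.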

## 1. Setup and Configuration

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import sys
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import seaborn as sns
load_dotenv()
sys.path.append('../utils/')
from vllm_client import create_vllm_client, get_embeddings

In [None]:
# Connection settings for the vLLM service. All three values are read from
# environment variables (expected to be provided by the workbench deployment);
# any variable that is not set comes back as None.

ENDPOINT, EMBEDDING_MODEL, API_KEY = (
    os.getenv(var_name) for var_name in ('ENDPOINT', 'EMBEDDING_MODEL', 'API_KEY')
)

# Report the configuration. The API key itself is never printed, only
# whether one is present.
print(
    f"Endpoint: {ENDPOINT}",
    f"Model: {EMBEDDING_MODEL}",
    f"API Key: {'[CONFIGURED]' if API_KEY else '[MISSING - set in deployment]'}",
    sep="\n",
)

## 2. Load Data

In [None]:
# Read the synthetic electronics parts CSV into a DataFrame. The path is
# relative to the notebook's working directory.

df = pd.read_csv(
    filepath_or_buffer='../data/synthetic_electronics_parts_1k.csv',
)

# Last expression of the cell: renders a preview of the first rows.
df.head()

## 3. Connect to vLLM and Generate Embeddings

In [None]:
# Build the client object that the embedding calls below are made through,
# using the endpoint, model name and API key from the configuration cell.
client = create_vllm_client(
    ENDPOINT,
    EMBEDDING_MODEL,
    API_KEY,
)

In [None]:
# Generate embeddings for part descriptions.
#
# Produces:
#   descriptions - list of description strings, in DataFrame row order
#   X            - NumPy array built from the returned embeddings; later
#                  cells use it as the classifier's feature matrix
descriptions = df['Part_Description'].tolist()
print(f"🔄 Generating embeddings for {len(descriptions)} descriptions...")

embeddings = get_embeddings(client, descriptions, EMBEDDING_MODEL)

if not embeddings:
    print("❌ Failed to generate embeddings")
    print("Check your vLLM endpoint and API key configuration")
    # Stop the run here: without X every later cell would either fail with a
    # NameError or silently reuse a stale X left over from an earlier run.
    raise RuntimeError("Embedding generation failed; cannot continue without X")

X = np.array(embeddings)

# One embedding per description is required, otherwise features and labels
# would be misaligned in the cells that follow.
if len(X) != len(descriptions):
    raise ValueError(
        f"Expected {len(descriptions)} embeddings but received {len(X)}"
    )

print(f"✅ Generated embeddings with shape: {X.shape}")
print(f"   - Embedding dimension: {X.shape[1]}")

## 4. Train KNN Classifier

In [None]:
# Turn the country names into integer class ids. `class_names` is the
# encoder's class array; later cells index it with those ids to get the
# country name back.
country_column = df['Country_Of_Origin']
label_encoder = LabelEncoder()
label_encoder.fit(country_column)
y = label_encoder.transform(country_column)
class_names = label_encoder.classes_

# Quick summary of what will be fed to the classifier.
print(f"📊 Dataset info:")
print(f"   - Total samples: {len(X)}")
print(f"   - Number of countries: {len(class_names)}")
print(f"   - Countries: {list(class_names)}")

In [None]:
# Hold out 20% of the samples for evaluation. The split is stratified on the
# labels and uses a fixed random_state so that re-running the cell yields the
# same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
print(f"Split data: {len(X_train)} train, {len(X_test)} test")


In [None]:
# Train KNN classifier
#
# N_NEIGHBORS is the number of nearest training samples consulted for each
# prediction. It is defined once here so the value passed to the classifier
# and the value shown in the log message cannot drift apart (the message
# previously referenced an undefined name and raised NameError).
N_NEIGHBORS = 3

knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
print(f"🤖 Training KNN classifier with k={N_NEIGHBORS}...")
knn.fit(X_train, y_train)
print("✅ KNN training complete")

## 5. Evaluate Model

In [None]:
# Predict a class id for every held-out sample, then score the fraction of
# predictions that match the true labels.
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"🎯 Model Performance:")
print(f"   - Accuracy: {accuracy:.1%}")

In [None]:
# Show detailed results: a per-country breakdown from classification_report.
#
# `labels` is passed explicitly as every encoded class id so that the rows of
# the report always line up with `class_names`. Without it, a class missing
# from both y_test and y_pred makes the number of reported classes differ
# from len(target_names) and the call raises ValueError.
print("\n📊 Detailed Classification Report:")
print(
    classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(class_names)),
        target_names=class_names,
    )
)

## 6. Visualize Results

In [None]:
# Create confusion matrix and draw it as an annotated heatmap
# (rows = actual country, columns = predicted country).
if len(class_names) > 1:
    # Pass every encoded class id explicitly so the matrix always has one
    # row/column per entry of class_names; otherwise a class absent from both
    # y_test and y_pred would shrink the matrix and misalign the tick labels.
    cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
               xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion Matrix - Country of Origin Prediction')
    plt.xlabel('Predicted Country')
    plt.ylabel('Actual Country')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    # A 1x1 matrix carries no information, so the plot is skipped.
    print("Skipping confusion matrix (only one class)")

In [None]:
# Show sample predictions: description, actual and predicted country for the
# first few held-out samples.
print("\n🔍 Sample Predictions:")
print("=" * 80)

# X_test / y_test are in shuffled order, so position i in the test set is NOT
# row i of the original data. Recover the original row positions of the test
# samples by repeating the split (same size, seed and stratification) on the
# row indices instead of on the features.
_, test_rows = train_test_split(
    np.arange(len(descriptions)), test_size=0.2, random_state=42, stratify=y
)

# Guard: if the split parameters used to build X_test ever change, fail
# loudly instead of printing descriptions that belong to other samples.
assert np.array_equal(X[test_rows], X_test), (
    "Recovered test rows do not match X_test; keep the split parameters in sync"
)

for i, row in enumerate(test_rows[:5]):
    desc = descriptions[row]
    actual = class_names[y_test[i]]
    predicted = class_names[y_pred[i]]

    print(f"\nSample {i+1}:")
    # Long descriptions are truncated to 100 characters for display.
    print(f"Description: {desc[:100]}{'...' if len(desc) > 100 else ''}")
    print(f"Actual: {actual}")
    print(f"Predicted: {predicted}")
    print(f"Result: {'✅ Correct' if actual == predicted else '❌ Incorrect'}")

## 7. Test with New Examples

In [None]:
# Classify a handful of hand-written part descriptions that are not part of
# the loaded dataset.
test_descriptions = [
    "high-frequency RF transistor for mobile communication systems",
    "robust power MOSFET for electric vehicle inverters",
    "miniature MEMS accelerometer for wearable fitness trackers",
    "radiation-hardened FPGA for aerospace applications",
    "multi-layer ceramic inductor for high-efficiency DC-DC converters",
    "ultra-low-power Bluetooth SoC for IoT smart home devices",
    "thermally stable voltage regulator for telecom base stations",
    "automated LiDAR module for autonomous delivery drones",
    "waterproof piezoelectric buzzer for marine instrumentation",
    "energy-harvesting rectifier circuit for remote sensors"
]

print("🧪 Testing with new examples:")
print("=" * 50)

# Embed the new descriptions with the same client and model used for the
# training data.
test_embeddings = get_embeddings(client, test_descriptions, EMBEDDING_MODEL)

if not test_embeddings:
    print("❌ Failed to generate embeddings for test examples")
else:
    test_X = np.array(test_embeddings)
    test_predictions = knn.predict(test_X)
    test_probabilities = knn.predict_proba(test_X)

    for i, desc in enumerate(test_descriptions):
        # Map the predicted class id back to its country name.
        predicted_country = class_names[test_predictions[i]]
        # "Confidence" is the largest per-class probability for this sample.
        confidence = test_probabilities[i].max()

        print(f"\nTest {i+1}: {desc}")
        print(f"Predicted Country: {predicted_country}")
        print(f"Confidence: {confidence:.2f}")

## Summary

🎉 **Demo Complete!**

This notebook demonstrates:
- ✅ BGE-Large embeddings via vLLM API
- ✅ KNN classification for country prediction
- ✅ Model evaluation and visualization
- ✅ Real-time inference with new examples

The classifier works on semantic embeddings produced by a transformer model: each part description is mapped to a vector, and a new description is assigned the country of origin that is most common among its nearest neighbors in that vector space.