In [1]:
import json
import zlib
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Maps a model tier name to the sentence-transformers checkpoint it loads.
MODEL_CONFIGS = dict(
    lightweight="all-MiniLM-L6-v2",        # 22M params, 384d
    performance="BAAI/bge-large-en-v1.5",  # 335M params, 1024d
)

In [3]:
def get_device(device: str = "auto") -> str:
    """Resolve the device string used for computation.

    Any value other than ``"auto"`` is returned unchanged. For ``"auto"``
    the first available backend wins, checked in the order CUDA, MPS, CPU.
    """
    if device != "auto":
        return device

    if torch.cuda.is_available():
        return "cuda"

    # Older torch builds have no `mps` backend module, hence the hasattr guard.
    mps_usable = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
    if mps_usable:
        return "mps"  # Apple Silicon GPU

    return "cpu"

In [4]:
def setup_model(model_tier: str = "lightweight", device: str = "auto") -> SentenceTransformer:
    """Instantiate the sentence transformer for the requested tier.

    Args:
        model_tier: Key into ``MODEL_CONFIGS``; an unknown key raises KeyError.
        device: Device string passed through ``get_device`` before loading.

    Returns:
        The loaded ``SentenceTransformer`` instance.
    """
    resolved_device = get_device(device)
    checkpoint = MODEL_CONFIGS[model_tier]

    print(f"Loading {checkpoint} on {resolved_device}...")
    encoder = SentenceTransformer(checkpoint, device=resolved_device)

    embedding_dim = encoder.get_sentence_embedding_dimension()
    print(f"Model loaded. Embedding dimension: {embedding_dim}")

    return encoder

In [5]:
def load_json_data(json_path: str) -> List[Dict]:
    """Load token records from a UTF-8 JSON file.

    The file may hold either the token collection directly or an object that
    wraps it under a ``"tokens"`` key; in the latter case the wrapped value
    is returned.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The token collection found in the file.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Unwrap only when the top level is an object. On a list, `in` checks
    # element membership, so a list containing the string "tokens" would be
    # indexed with a string and raise TypeError.
    is_wrapped = isinstance(data, dict) and 'tokens' in data
    tokens = data['tokens'] if is_wrapped else data
    print(f"Loaded {len(tokens)} tokens from {json_path}")

    return tokens

In [6]:
def extract_label_features(tokens: List[Dict]) -> Tuple[np.ndarray, List[str], Dict]:
    """
    Extract and encode label features from tokens.

    Each token is read as ``token['labels'][label_type] ->
    {'value': ..., 'confidence': ...}``. Every label type seen in any token
    contributes two columns, in sorted label-type order: the encoded value
    multiplied by its confidence, and the confidence itself. A token that
    lacks a label type gets ``0.0`` in both columns, so all rows have the
    same width and the columns line up with ``feature_names``.

    Value encoding per label type:
        * types whose value set contains ``'null'`` or has more than 10
          distinct values are integer-encoded with a ``LabelEncoder``;
        * otherwise the value is converted with ``float``; ``'null'`` and
          ``None`` map to ``0.0``;
        * values that cannot be converted fall into one of 1000 buckets via
          a CRC32 of their text. The built-in ``hash`` is not used because
          string hashes are salted per process and would differ between
          kernel restarts.

    Returns:
        label_features: Standardized feature matrix
        feature_names: Names of features, in column order
        encoders: Dict of label encoders for later use

    Note:
        The ``StandardScaler`` fitted here is not returned.
    """
    # Collect all label types and values
    all_labels = {}
    for token in tokens:
        for label_type, label_info in token['labels'].items():
            all_labels.setdefault(label_type, set()).add(label_info['value'])

    # One fixed column order shared by the feature rows and the feature names
    label_types = sorted(all_labels)

    # Create label encoders for categorical variables
    label_encoders = {}
    for label_type in label_types:
        values = all_labels[label_type]
        if 'null' in values or len(values) > 10:  # Categorical encoding
            label_encoders[label_type] = LabelEncoder()
            label_encoders[label_type].fit(list(values))

    # Extract features for each token
    label_data = []
    for token in tokens:
        token_labels = token['labels']
        token_features = []
        for label_type in label_types:
            label_info = token_labels.get(label_type)
            if label_info is None:
                # Label absent on this token: neutral value, zero confidence
                token_features.extend([0.0, 0.0])
                continue

            value = label_info['value']
            confidence = label_info['confidence']

            if label_type in label_encoders:
                # Categorical: encode and weight by confidence
                encoded_val = label_encoders[label_type].transform([value])[0]
                token_features.extend([encoded_val * confidence, confidence])
                continue

            # Numerical or string hash
            if value is None or value == 'null':
                num_val = 0.0
            else:
                try:
                    num_val = float(value)
                except (TypeError, ValueError):
                    # Process-independent bucket for non-numeric values
                    num_val = float(zlib.crc32(str(value).encode('utf-8')) % 1000)
            token_features.extend([num_val * confidence, confidence])

        label_data.append(token_features)

    # Convert to numpy and standardize
    label_features = np.array(label_data, dtype=float)
    if label_features.size > 0:
        label_features = StandardScaler().fit_transform(label_features)

    # Generate feature names in the same order as the columns
    feature_names = []
    for label_type in label_types:
        feature_names.extend([f"{label_type}_value", f"{label_type}_confidence"])

    print(f"Extracted {len(feature_names)} label features")

    return label_features, feature_names, label_encoders

In [7]:
def extract_texts(tokens: List[Dict]) -> List[str]:
    """Return the ``'token'`` entry of every record, in input order."""
    texts = []
    for record in tokens:
        texts.append(record['token'])
    return texts

In [8]:
def create_semantic_embeddings(model: SentenceTransformer, texts: List[str]) -> np.ndarray:
    """Encode ``texts`` with ``model`` and return the resulting array.

    The encoder is asked for NumPy output with ``normalize_embeddings=True``
    and a visible progress bar.
    """
    print("Generating semantic embeddings...")

    encode_options = {
        'show_progress_bar': True,
        'convert_to_numpy': True,
        'normalize_embeddings': True,
    }
    embeddings = model.encode(texts, **encode_options)

    print(f"Semantic embeddings shape: {embeddings.shape}")
    return embeddings

In [9]:
def combine_embeddings(semantic_embeddings: np.ndarray,
                       label_features: np.ndarray) -> np.ndarray:
    """Append label feature columns to the semantic embeddings.

    When ``label_features`` holds no elements, the semantic embeddings are
    returned as-is (same object); otherwise the two arrays are joined along
    axis 1.
    """
    has_label_features = label_features.size > 0

    if not has_label_features:
        combined = semantic_embeddings
    else:
        print("Combining semantic and label features...")
        combined = np.concatenate((semantic_embeddings, label_features), axis=1)

    print(f"Final embedding shape: {combined.shape}")
    return combined

In [10]:
def save_embeddings(embeddings: np.ndarray,
                    texts: List[str],
                    feature_names: List[str],
                    model_tier: str,
                    output_dir: str = "embeddings_output") -> Dict[str, str]:
    """Write the embeddings, a metadata JSON and a token-mapping CSV.

    All three files go into ``<output_dir>/<model_tier>/``, which is created
    if needed.

    Returns:
        Dict with the string paths of the written files under the keys
        ``'embeddings'``, ``'metadata'`` and ``'token_mapping'``.
    """
    tier_dir = Path(output_dir) / model_tier
    tier_dir.mkdir(parents=True, exist_ok=True)

    # 1) Raw embedding matrix
    embedding_file = tier_dir / f"embeddings_{model_tier}.npy"
    np.save(embedding_file, embeddings)

    # 2) Metadata describing how the matrix was produced
    metadata_file = tier_dir / f"metadata_{model_tier}.json"
    metadata = dict(
        model_tier=model_tier,
        model_name=MODEL_CONFIGS[model_tier],
        embedding_dim=embeddings.shape[1],
        label_features=len(feature_names),
        num_tokens=len(texts),
        feature_names=feature_names,
    )
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)

    # 3) Row index -> token text lookup table
    token_file = tier_dir / f"token_mapping_{model_tier}.csv"
    token_columns = {
        'index': range(len(texts)),
        'token': texts,
        'embedding_file': str(embedding_file),
    }
    pd.DataFrame(token_columns).to_csv(token_file, index=False)

    print(f"Saved embeddings to {embedding_file}")
    print(f"Saved metadata to {metadata_file}")
    print(f"Saved token mapping to {token_file}")

    return {
        'embeddings': str(embedding_file),
        'metadata': str(metadata_file),
        'token_mapping': str(token_file),
    }

In [11]:
def load_saved_embeddings(embedding_file: str) -> np.ndarray:
    """Read an embeddings array back from ``embedding_file`` via ``np.load``."""
    stored_embeddings = np.load(embedding_file)
    return stored_embeddings

In [12]:
def get_embedding_stats(embeddings: np.ndarray) -> Dict:
    """Summarize an embeddings array.

    Returns a dict with the array's shape, its mean/std/min/max taken over
    all elements, and its size in mebibytes under ``'memory_mb'``.
    """
    bytes_per_mb = 1024 * 1024

    stats = {'shape': embeddings.shape}
    for stat_name in ('mean', 'std', 'min', 'max'):
        stats[stat_name] = getattr(embeddings, stat_name)()
    stats['memory_mb'] = embeddings.nbytes / bytes_per_mb

    return stats

In [13]:
# Quick workflow functions for common operations
def create_full_embeddings(json_path: str,
                           model_tier: str = "lightweight",
                           output_dir: str = "embeddings_output") -> Tuple[np.ndarray, Dict[str, str]]:
    """Run the whole pipeline for one model tier.

    Steps, in order: load the token JSON, load the model, build label
    features, encode the token texts, join both feature sets, and write the
    result to disk.

    Returns:
        The combined embedding matrix and the dict of written file paths.
    """
    # Input records and their raw texts
    token_records = load_json_data(json_path)
    token_texts = extract_texts(token_records)

    # Encoder for the requested tier (device is auto-selected)
    encoder = setup_model(model_tier)

    # Label-derived columns; the fitted encoders are not needed here
    label_matrix, label_names, _unused_encoders = extract_label_features(token_records)

    # Text embeddings, then both feature sets side by side
    text_vectors = create_semantic_embeddings(encoder, token_texts)
    merged_embeddings = combine_embeddings(text_vectors, label_matrix)

    # Persist everything
    saved_files = save_embeddings(
        merged_embeddings, token_texts, label_names, model_tier, output_dir
    )

    return merged_embeddings, saved_files

In [14]:
def compare_model_tiers(json_path: str, output_dir: str = "embeddings_output"):
    """Run the full pipeline once per model tier and print summary stats.

    Returns:
        Dict keyed by ``'lightweight'`` and ``'performance'``; each entry
        holds that tier's ``'embeddings'`` array and ``'files'`` path dict.
    """
    comparison = {}

    print("=== Lightweight Model ===")
    light_vectors, light_files = create_full_embeddings(json_path, "lightweight", output_dir)
    light_stats = get_embedding_stats(light_vectors)
    print(f"\nLightweight stats: {light_stats}")
    comparison['lightweight'] = {'embeddings': light_vectors, 'files': light_files}

    print("\n=== Performance Model ===")
    perf_vectors, perf_files = create_full_embeddings(json_path, "performance", output_dir)
    perf_stats = get_embedding_stats(perf_vectors)
    print(f"\nPerformance stats: {perf_stats}")
    comparison['performance'] = {'embeddings': perf_vectors, 'files': perf_files}

    return comparison

In [15]:
# Run the pipeline with its default tier and output directory on the local token file.
create_full_embeddings(json_path="./output.json")

Loaded 1182 tokens from ./output.json
Loading all-MiniLM-L6-v2 on mps...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded. Embedding dimension: 384
Extracted 8 label features
Generating semantic embeddings...


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Semantic embeddings shape: (1182, 384)
Combining semantic and label features...
Final embedding shape: (1182, 392)
Saved embeddings to embeddings_output/lightweight/embeddings_lightweight.npy
Saved metadata to embeddings_output/lightweight/metadata_lightweight.json
Saved token mapping to embeddings_output/lightweight/token_mapping_lightweight.csv


(array([[-0.06433426,  0.049358  ,  0.0284998 , ...,  1.00532391,
         -0.75986787,  0.59055148],
        [-0.04567493,  0.00633722,  0.00670272, ..., -1.17343088,
         -0.75986787,  0.59055148],
        [-0.04477897,  0.07996801, -0.00450986, ..., -1.17343088,
         -0.79263576,  0.3787057 ],
        ...,
        [-0.01809768, -0.02533447, -0.01309905, ...,  1.00532391,
          1.0669421 ,  1.01424304],
        [-0.10999466,  0.1284283 , -0.01778519, ..., -1.17343088,
         -1.022011  , -1.10421477],
        [-0.0313945 , -0.09264886,  0.04959343, ...,  1.00532391,
          1.0669421 ,  1.01424304]], shape=(1182, 392)),
 {'embeddings': 'embeddings_output/lightweight/embeddings_lightweight.npy',
  'metadata': 'embeddings_output/lightweight/metadata_lightweight.json',
  'token_mapping': 'embeddings_output/lightweight/token_mapping_lightweight.csv'})