# MaterialVision: Embedding Pipeline

This notebook provides a streamlined approach to:
1. Load models and datasets
2. Load validation data and create embeddings
3. Generate embeddings for custom text samples

In [1]:
# Essential imports and setup
import sys
import os
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image
import pickle

# Import model functions
from models import (
    load_clipp_scibert,
    load_clipp_distilbert, 
    load_mobileclip,
    load_blip
)

# Display names of every model this notebook can generate embeddings with.
# Functions below fall back to this list when no explicit subset is given.
MODELS = [
    'CLIPP-SciBERT',
    'CLIPP-DistilBERT',
    'MobileCLIP',
    'BLIP',
]

print("‚úÖ Imports completed")
print(f"üìã Available models: {MODELS}")

  from .autonotebook import tqdm as notebook_tqdm


Adding to path: /home/jipengsun/MaterialVision/models/CLIPP_allenai
‚úÖ Successfully imported CLIPP SciBERT
Adding to path: /home/jipengsun/MaterialVision/models/CLIPP_bert
‚úÖ Successfully imported CLIPP SciBERT
Adding to path: /home/jipengsun/MaterialVision/models/CLIPP_bert
‚úÖ Successfully imported CLIPP DistilBERT
Adding to path: /home/jipengsun/MaterialVision/models/Apple_MobileCLIP
‚úÖ Successfully imported MobileCLIP
Adding to path: /home/jipengsun/MaterialVision/models/Salesforce
‚ùå Error importing BLIP: No module named 'models.CLIPP_bert'; 'models' is not a package
‚úÖ Imports completed
üìã Available models: ['CLIPP-SciBERT', 'CLIPP-DistilBERT', 'MobileCLIP', 'BLIP']
‚úÖ Successfully imported CLIPP DistilBERT
Adding to path: /home/jipengsun/MaterialVision/models/Apple_MobileCLIP
‚úÖ Successfully imported MobileCLIP
Adding to path: /home/jipengsun/MaterialVision/models/Salesforce
‚ùå Error importing BLIP: No module named 'models.CLIPP_bert'; 'models' is not a package
‚úÖ Imp

In [6]:
# Location of the validation split and the batch size used by this notebook
VAL_CSV = Path('../data/alpaca_mbj_bandgap_test.csv')
BATCH_SIZE = 32

print(f"Validation CSV path: {VAL_CSV}")
print(f"Validation CSV exists: {VAL_CSV.exists()}")

# Read the CSV into `val_df`; it stays None when the file is missing so that
# later cells can detect the absence instead of failing on an undefined name.
val_df = None
if not VAL_CSV.exists():
    print(f"‚ùå Validation CSV not found at {VAL_CSV}")
else:
    val_df = pd.read_csv(VAL_CSV)
    print(f"‚úÖ Loaded validation data with {len(val_df)} samples")
    print(f"   Columns: {list(val_df.columns)}")

Validation CSV path: ../data/alpaca_mbj_bandgap_test.csv
Validation CSV exists: True
‚úÖ Loaded validation data with 1000 samples
   Columns: ['instruction', 'input', 'response', 'id', 'image']
‚úÖ Loaded validation data with 1000 samples
   Columns: ['instruction', 'input', 'response', 'id', 'image']


In [7]:
def _build_text_encoder(model_name, device):
    """
    Load the checkpoint for `model_name` and return `(model, encode)`.

    `encode(text)` tokenizes one text and returns the model's text features
    as a tensor on `device`.

    Raises:
        ValueError: if `model_name` is not one of the supported models.
    """
    if model_name in ('CLIPP-SciBERT', 'CLIPP-DistilBERT'):
        # Both CLIPP variants share the same tokenization and forward call;
        # only the loader and the checkpoint differ.
        if model_name == 'CLIPP-SciBERT':
            checkpoint_path = '../models/CLIPP_allenai/checkpoints/best_clipp.pth'
            model, tokenizer, _ = load_clipp_scibert(checkpoint_path, device)
        else:
            checkpoint_path = '../models/CLIPP_bert/checkpoints/best_clipp_bert.pth'
            model, tokenizer, _ = load_clipp_distilbert(checkpoint_path, device)

        def encode(text):
            tokens = tokenizer(text, padding=True, truncation=True,
                               return_tensors="pt", max_length=512).to(device)
            return model.get_text_features(tokens['input_ids'], tokens['attention_mask'])

    elif model_name == 'MobileCLIP':
        # Imported lazily so the other models work without open_clip installed.
        import open_clip
        checkpoint_path = '../models/Apple_MobileCLIP/checkpoints/best_clipp_apple.pth'
        model, _tokenizer, _ = load_mobileclip(checkpoint_path, device)

        def encode(text):
            tokens = open_clip.tokenize([text]).to(device)
            return model.get_text_features(tokens)

    elif model_name == 'BLIP':
        checkpoint_path = '../models/Salesforce/checkpoints_blip/best_blip.pth'
        model, processor, _ = load_blip(checkpoint_path, device)

        def encode(text):
            inputs = processor(text=[text], return_tensors="pt",
                               padding=True, truncation=True, max_length=512).to(device)
            text_embeds = model.get_text_features(**inputs)
            # BLIP features are L2-normalized explicitly here; the other
            # branches return whatever their model produces.
            return F.normalize(text_embeds, p=2, dim=1)

    else:
        raise ValueError(
            f"Unknown model '{model_name}'; expected one of "
            "'CLIPP-SciBERT', 'CLIPP-DistilBERT', 'MobileCLIP', 'BLIP'"
        )

    return model, encode


def create_text_embeddings(model_name, texts):
    """
    Create text embeddings for one or more texts using the specified model.

    Args:
        model_name: Name of the model to use; one of 'CLIPP-SciBERT',
            'CLIPP-DistilBERT', 'MobileCLIP' or 'BLIP'.
        texts: Single text string or list of texts

    Returns:
        numpy array of embeddings, one row per text. When exactly one text is
        given the result is squeezed. Returns None if anything fails (unknown
        model, empty input, loading or inference error); the error and its
        traceback are printed rather than raised.
    """
    if isinstance(texts, str):
        texts = [texts]

    try:
        # Reject bad input before paying for a model load.
        if len(texts) == 0:
            raise ValueError("No texts provided; pass a string or a non-empty list of strings")

        print(f"üîÑ Loading {model_name} model...")

        # Define device
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        model, encode = _build_text_encoder(model_name, device)

        model.eval()
        with torch.no_grad():
            # Texts are encoded one at a time, so each array is a single-text batch.
            embeddings = [encode(text).cpu().numpy() for text in texts]

        result = np.vstack(embeddings)
        print(f"‚úÖ Generated embeddings: {result.shape}")
        return result.squeeze() if len(texts) == 1 else result

    except Exception as e:
        # Best-effort contract: report the failure and let callers skip this model.
        print(f"‚ùå Error with {model_name}: {e}")
        import traceback
        traceback.print_exc()
        return None

print("‚úÖ Text embedding function ready")

‚úÖ Text embedding function ready


In [9]:
# Generate embeddings for validation dataset
def process_validation_data(models_to_use=None, n_samples=10):
    """
    Create text embeddings for the validation dataset with each requested model.

    Reads the module-level `val_df` DataFrame and embeds the values of its
    'input' column by calling `create_text_embeddings` once per model.

    Args:
        models_to_use: List of model names to run (None or empty for all of MODELS)
        n_samples: Number of leading rows of `val_df` to embed. Defaults to 10
            (a quick demo run); pass None to embed every row.

    Returns:
        Dictionary of model_name -> {'embeddings', 'texts', 'shape'} for every
        model that succeeded. Models whose embedding call returned None are
        reported and left out. Returns {} when `val_df` is None.
    """
    if val_df is None:
        print("‚ùå No validation data available")
        return {}

    models_to_process = models_to_use or MODELS
    results = {}

    # Take the first `n_samples` texts, or the whole column when n_samples is None
    input_column = val_df['input']
    if n_samples is not None:
        input_column = input_column.head(n_samples)
    sample_texts = input_column.tolist()
    print(f"üìä Processing {len(sample_texts)} validation samples")

    for model_name in models_to_process:
        print(f"\nüîÑ Processing {model_name}...")
        embeddings = create_text_embeddings(model_name, sample_texts)

        if embeddings is not None:
            results[model_name] = {
                'embeddings': embeddings,
                'texts': sample_texts,
                'shape': embeddings.shape
            }
            print(f"‚úÖ {model_name}: {embeddings.shape}")
        else:
            print(f"‚ùå {model_name}: Failed")

    return results

# Embed the validation sample with the default model list and report how many succeeded
print("üöÄ Starting validation data processing...")
validation_results = process_validation_data()
n_models_processed = len(validation_results)
print(f"\nüéâ Completed processing for {n_models_processed} models")

üöÄ Starting validation data processing...
üìä Processing 10 validation samples

üîÑ Processing CLIPP-SciBERT...
üîÑ Loading CLIPP-SciBERT model...


2025-11-09 19:57:08,204 INFO: Loading pretrained weights from Hugging Face hub (timm/vit_base_patch16_224.augreg2_in21k_ft_in1k)
2025-11-09 19:57:08,245 INFO: [timm/vit_base_patch16_224.augreg2_in21k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-11-09 19:57:08,245 INFO: [timm/vit_base_patch16_224.augreg2_in21k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


‚úÖ Generated embeddings: (10, 256)
‚úÖ CLIPP-SciBERT: (10, 256)

üîÑ Processing CLIPP-DistilBERT...
üîÑ Loading CLIPP-DistilBERT model...


2025-11-09 19:57:11,817 INFO: Loading pretrained weights from Hugging Face hub (timm/resnet50.a1_in1k)
2025-11-09 19:57:11,860 INFO: [timm/resnet50.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-11-09 19:57:11,860 INFO: [timm/resnet50.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-11-09 19:57:13,044 INFO: Loaded MobileCLIP-S2 model config.
2025-11-09 19:57:13,044 INFO: Loaded MobileCLIP-S2 model config.


‚úÖ Generated embeddings: (10, 256)
‚úÖ CLIPP-DistilBERT: (10, 256)

üîÑ Processing MobileCLIP...
üîÑ Loading MobileCLIP model...


2025-11-09 19:57:14,404 INFO: Loading pretrained MobileCLIP-S2 weights (datacompdr).


‚úÖ Generated embeddings: (10, 256)
‚úÖ MobileCLIP: (10, 256)

üîÑ Processing BLIP...
üîÑ Loading BLIP model...
‚úÖ Generated embeddings: (10, 256)
‚úÖ BLIP: (10, 256)

üéâ Completed processing for 4 models
‚úÖ Generated embeddings: (10, 256)
‚úÖ BLIP: (10, 256)

üéâ Completed processing for 4 models


In [11]:
# Test with custom text samples
def test_custom_text(text_input, models_to_use=None):
    """
    Generate embeddings for custom text using all or specified models.
    
    Args:
        text_input: Single text or list of texts
        models_to_use: List of model names (None for all)
    
    Returns:
        Dictionary of model_name -> embeddings
    """
    if isinstance(text_input, str):
        print(f"üìù Input text: \"{text_input[:100]}...\")")
    else:
        print(f"üìù Processing {len(text_input)} texts")
    
    models_to_process = models_to_use or MODELS
    results = {}
    
    for model_name in models_to_process:
        print(f"\nüîÑ {model_name}...")
        embeddings = create_text_embeddings(model_name, text_input)
        
        if embeddings is not None:
            results[model_name] = embeddings
            norm = np.linalg.norm(embeddings) if embeddings.ndim == 1 else [np.linalg.norm(emb) for emb in embeddings]
            print(f"‚úÖ Shape: {embeddings.shape}, Norm: {norm}")
    
    return results

# Demo inputs: one standalone description and a small batch of descriptions
custom_text = "Silicon carbide semiconductor with high thermal conductivity for power electronics"
custom_texts = [
    "Graphene-based composite for energy storage applications",
    "Perovskite solar cell with enhanced stability",
    "Titanium alloy with superior mechanical properties",
]

print("\nüß™ Testing with custom material descriptions...")

# String input first, then list input
single_results = test_custom_text(custom_text)
batch_results = test_custom_text(custom_texts)

print("\nüéâ Custom text testing completed!")


üß™ Testing with custom material descriptions...
üìù Input text: "Silicon carbide semiconductor with high thermal conductivity for power electronics...")

üîÑ CLIPP-SciBERT...
üîÑ Loading CLIPP-SciBERT model...


2025-11-09 19:58:55,817 INFO: Loading pretrained weights from Hugging Face hub (timm/vit_base_patch16_224.augreg2_in21k_ft_in1k)
2025-11-09 19:58:55,869 INFO: [timm/vit_base_patch16_224.augreg2_in21k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-11-09 19:58:55,869 INFO: [timm/vit_base_patch16_224.augreg2_in21k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


‚úÖ Generated embeddings: (1, 256)
‚úÖ Shape: (256,), Norm: 1.0

üîÑ CLIPP-DistilBERT...
üîÑ Loading CLIPP-DistilBERT model...


2025-11-09 19:58:59,370 INFO: Loading pretrained weights from Hugging Face hub (timm/resnet50.a1_in1k)
2025-11-09 19:58:59,414 INFO: [timm/resnet50.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-11-09 19:58:59,414 INFO: [timm/resnet50.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-11-09 19:59:00,544 INFO: Loaded MobileCLIP-S2 model config.
2025-11-09 19:59:00,544 INFO: Loaded MobileCLIP-S2 model config.


‚úÖ Generated embeddings: (1, 256)
‚úÖ Shape: (256,), Norm: 1.0

üîÑ MobileCLIP...
üîÑ Loading MobileCLIP model...


2025-11-09 19:59:01,872 INFO: Loading pretrained MobileCLIP-S2 weights (datacompdr).


‚úÖ Generated embeddings: (1, 256)
‚úÖ Shape: (256,), Norm: 1.0

üîÑ BLIP...
üîÑ Loading BLIP model...
‚úÖ Generated embeddings: (1, 256)
‚úÖ Shape: (256,), Norm: 1.0000001192092896
üìù Processing 3 texts

üîÑ CLIPP-SciBERT...
üîÑ Loading CLIPP-SciBERT model...
‚úÖ Generated embeddings: (1, 256)
‚úÖ Shape: (256,), Norm: 1.0000001192092896
üìù Processing 3 texts

üîÑ CLIPP-SciBERT...
üîÑ Loading CLIPP-SciBERT model...


2025-11-09 19:59:10,597 INFO: Loading pretrained weights from Hugging Face hub (timm/vit_base_patch16_224.augreg2_in21k_ft_in1k)
2025-11-09 19:59:10,648 INFO: [timm/vit_base_patch16_224.augreg2_in21k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-11-09 19:59:10,648 INFO: [timm/vit_base_patch16_224.augreg2_in21k_ft_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.


‚úÖ Generated embeddings: (3, 256)
‚úÖ Shape: (3, 256), Norm: [np.float32(1.0), np.float32(1.0), np.float32(0.99999994)]

üîÑ CLIPP-DistilBERT...
üîÑ Loading CLIPP-DistilBERT model...


2025-11-09 19:59:13,680 INFO: Loading pretrained weights from Hugging Face hub (timm/resnet50.a1_in1k)
2025-11-09 19:59:13,721 INFO: [timm/resnet50.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-11-09 19:59:13,721 INFO: [timm/resnet50.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
2025-11-09 19:59:14,814 INFO: Loaded MobileCLIP-S2 model config.
2025-11-09 19:59:14,814 INFO: Loaded MobileCLIP-S2 model config.


‚úÖ Generated embeddings: (3, 256)
‚úÖ Shape: (3, 256), Norm: [np.float32(1.0), np.float32(1.0), np.float32(1.0)]

üîÑ MobileCLIP...
üîÑ Loading MobileCLIP model...


2025-11-09 19:59:16,350 INFO: Loading pretrained MobileCLIP-S2 weights (datacompdr).


‚úÖ Generated embeddings: (3, 256)
‚úÖ Shape: (3, 256), Norm: [np.float32(1.0), np.float32(1.0), np.float32(1.0)]

üîÑ BLIP...
üîÑ Loading BLIP model...
‚úÖ Generated embeddings: (3, 256)
‚úÖ Shape: (3, 256), Norm: [np.float32(0.99999994), np.float32(1.0), np.float32(1.0)]

üéâ Custom text testing completed!
‚úÖ Generated embeddings: (3, 256)
‚úÖ Shape: (3, 256), Norm: [np.float32(0.99999994), np.float32(1.0), np.float32(1.0)]

üéâ Custom text testing completed!


In [None]:
# Save results (optional)
def save_results(results, filename='embedding_results.pkl', output_dir='./outputs'):
    """
    Save embedding results to a pickle file.

    Args:
        results: Any picklable object (here, the dictionaries of embeddings)
        filename: Name of the file to create inside `output_dir`
        output_dir: Directory to write into; created, including missing
            parents, if it does not exist. Defaults to './outputs', which is
            resolved against the current working directory.

    Returns:
        Path of the file that was written.
    """
    save_path = Path(output_dir)
    # parents=True lets callers pass a nested directory that does not exist yet
    save_path.mkdir(parents=True, exist_ok=True)

    filepath = save_path / filename
    # NOTE: pickle files can execute code on load; only unpickle files you trust.
    with open(filepath, 'wb') as f:
        pickle.dump(results, f)

    print(f"üíæ Results saved to {filepath}")
    return filepath

# Bundle the three result sets under stable keys and persist them in one file
all_results = dict(
    validation_results=validation_results,
    single_text_results=single_results,
    batch_text_results=batch_results,
)

save_results(all_results, 'simple_embedding_results.pkl')

# Report how many models produced embeddings in each stage
summary_lines = [
    "\nüìä Summary:",
    f"‚úÖ Validation models processed: {len(validation_results)}",
    f"‚úÖ Single text models: {len(single_results)}",
    f"‚úÖ Batch text models: {len(batch_results)}",
    "\nüéØ Pipeline completed successfully!",
]
print("\n".join(summary_lines))

## Quick Usage Examples

### Text Embeddings
```python
# Generate embeddings for any text
my_text = "Your material description here"
results = test_custom_text(my_text, models_to_use=['CLIPP-SciBERT', 'MobileCLIP'])

# Generate text embeddings directly
text_embeddings = create_text_embeddings('CLIPP-SciBERT', "Your text here")
```

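### Image Embeddings

> **Note:** `create_image_embeddings` is not defined in the cells above. The snippet below assumes an image-embedding helper with this signature has been added to the notebook.
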
```python
# Generate embeddings for images
image_paths = ["/path/to/image1.jpg", "/path/to/image2.jpg"]
image_embeddings = create_image_embeddings('CLIPP-SciBERT', image_paths)

# Single image
single_image_embedding = create_image_embeddings('CLIPP-SciBERT', "/path/to/image.jpg")
```

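### Multimodal Embeddings

> **Note:** `test_custom_multimodal` and `create_multimodal_embeddings` are not defined in the cells above. The snippet below assumes these helpers have been added to the notebook.
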
```python
# Generate both text and image embeddings
results = test_custom_multimodal(
    text_input=["Material description 1", "Material description 2"],
    image_paths=["/path/to/image1.jpg", "/path/to/image2.jpg"],
    models_to_use=['CLIPP-SciBERT']
)

# Text and images together
multimodal_embeddings = create_multimodal_embeddings(
    model_name='CLIPP-SciBERT',
    texts=["Silicon carbide semiconductor"],
    image_paths=["/path/to/sic_image.jpg"]
)
```

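### Process Validation Data

> **Note:** `process_validation_data` as defined above does not accept an `include_images` argument, so the calls below raise `TypeError` unless the function is extended to handle images.
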
```python
# Process validation data with specific models (text + images)
val_results = process_validation_data(models_to_use=['CLIPP-SciBERT'], include_images=True)

# Text only
val_results_text = process_validation_data(models_to_use=['CLIPP-SciBERT'], include_images=False)
```