# Vector Analysis Toolchain

This notebook provides modular functions for loading CSV data, embedding text with a transformer model, training an AutoML model on the embeddings, and packaging the results as JSON.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from transformers import AutoTokenizer, AutoModel
import flaml
import json
from typing import Dict, List, Tuple, Any
import logging

# Configure the root logger at INFO level so the logging.info() progress
# messages emitted by the functions in this notebook are shown.
logging.basicConfig(level=logging.INFO)

In [None]:
def load_data(csv_path: str) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    The path is logged at INFO level before reading. The parsed frame is
    returned as-is; no further transformation is applied here.

    Args:
        csv_path: Path to the CSV file

    Returns:
        DataFrame holding the contents of the file
    """
    logging.info(f"Loading data from {csv_path}")
    return pd.read_csv(csv_path)

In [None]:
def tokenize_and_embed(texts: List[str], model_type: str = "bert-base-uncased") -> np.ndarray:
    """Tokenize and embed text using the specified model.

    Each text is tokenized on its own (truncated to 512 tokens), passed
    through the model without gradient tracking, and reduced to a single
    vector by averaging ``last_hidden_state`` over the token dimension.

    Args:
        texts: List of text strings to embed
        model_type: Name of the transformer model to use

    Returns:
        Array with one embedding row per input text. For empty input an
        empty 1-D array is returned and no model is loaded.
    """
    logging.info(f"Tokenizing and embedding using {model_type}")

    # Materialize once so any iterable can be emptiness-checked and iterated.
    texts = list(texts)
    if not texts:
        # Nothing to embed: skip the expensive tokenizer/model load.
        return np.array([])

    tokenizer = AutoTokenizer.from_pretrained(model_type)
    model = AutoModel.from_pretrained(model_type)

    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean-pool over tokens (dim=1), then drop only the batch axis;
        # squeeze(0) cannot accidentally remove another size-1 dimension.
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze(0)
        embeddings.append(pooled.numpy())

    return np.array(embeddings)

In [None]:
def train_automl(X: np.ndarray, y: np.ndarray, task: str = "classification", time_budget: int = 60) -> Tuple[Any, Dict]:
    """Train an AutoML model using FLAML and summarize the search result.

    Args:
        X: Feature matrix
        y: Target vector
        task: Type of ML task ('classification' or 'regression')
        time_budget: Time budget in seconds

    Returns:
        Tuple of (fitted ``flaml.AutoML`` instance, metrics). The metrics
        dict has the keys ``best_estimator``, ``best_config``,
        ``best_loss`` and ``time_to_best``; ``time_to_best`` is ``None``
        when the fitted object does not report it.
    """
    logging.info(f"Training AutoML model for task: {task}")
    automl = flaml.AutoML()
    automl.fit(X, y, task=task, time_budget=time_budget)

    # FLAML documents this value as `time_to_find_best_model`. The old
    # attribute name is kept as a fallback so the lookup never raises
    # AttributeError after a completed (and possibly long) training run.
    time_to_best = getattr(automl, "time_to_find_best_model", None)
    if time_to_best is None:
        time_to_best = getattr(automl, "time_to_best", None)

    metrics = {
        "best_estimator": str(automl.best_estimator),
        "best_config": automl.best_config,
        "best_loss": automl.best_loss,
        "time_to_best": time_to_best,
    }

    return automl, metrics

In [None]:
def prepare_visualization(model_metrics: Dict, embeddings: np.ndarray) -> Dict:
    """Bundle embeddings and model metrics into a chart payload.

    Args:
        model_metrics: Dictionary of model metrics, passed through unchanged
        embeddings: Array of embeddings, converted to nested lists

    Returns:
        Dictionary with a ``charts`` list (a single scatter chart holding
        the embeddings) and a ``metrics`` entry
    """
    scatter_chart = {"type": "scatter", "data": {"embeddings": embeddings.tolist()}}
    payload = {"charts": [scatter_chart]}
    payload["metrics"] = model_metrics
    return payload

In [None]:
def run_analysis(csv_path: str, model_type: str = "bert-base-uncased") -> str:
    """Main analysis function that will be called by the Rust bridge.

    Loads the CSV, embeds the first text column, trains an AutoML
    classifier when a ``label`` column is present, and serializes the
    chart payload.

    Args:
        csv_path: Path to the input CSV file
        model_type: Type of transformer model to use

    Returns:
        JSON string with analysis results

    Raises:
        IndexError: If the CSV has no object-dtype column other than
            ``label`` to use as text.
    """
    # Load and preprocess data
    df = load_data(csv_path)

    # Use the first text column, but never the target itself: embedding
    # the 'label' column would leak the target into the features.
    object_columns = df.select_dtypes(include=['object']).columns
    text_candidates = [col for col in object_columns if col != 'label']
    if not text_candidates:
        # IndexError (rather than ValueError) so callers that already handle
        # the failure of a `columns[0]` lookup keep working.
        raise IndexError(
            f"No text column (object dtype, other than 'label') found in {csv_path}"
        )
    text_column = text_candidates[0]

    # Missing cells come back from pandas as NaN floats, which the tokenizer
    # cannot handle; map them to "" so rows stay aligned with the labels.
    texts = ["" if pd.isna(value) else str(value) for value in df[text_column]]
    embeddings = tokenize_and_embed(texts, model_type)

    # Train AutoML model only if labels are available
    metrics = {}
    if 'label' in df.columns:
        labels = LabelEncoder().fit_transform(df['label'])
        _, metrics = train_automl(embeddings, labels)

    # Prepare visualization data
    result = prepare_visualization(metrics, embeddings)

    def _to_builtin(value):
        # Convert NumPy scalars/arrays that may appear in the metrics into
        # plain Python values; anything else is still rejected by json.
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    return json.dumps(result, default=_to_builtin)