## Corpus Generation via ABsynth

### Import the absynth corpus generator

In [1]:
from absynth.lexicon import Vocabulary, LexiconGenerator
from absynth.sentence import SentenceGenerator, FrameManager
from absynth.corpus import SyntheticCorpusGenerator
from absynth.visualization import Visualizer

### Create generator with default settings

In [None]:
# Baseline run: a generator constructed with no arguments, asked for 2000 sentences.
generator_default = SyntheticCorpusGenerator()
corpus_default = generator_default.generate_corpus(num_sentences=2000)

### Create generator with custom settings

In [None]:
# Custom vocabulary: one entry per word category, giving the size requested
# for that category. Passed to LexiconGenerator as `vocab_sizes` below.
vocab = Vocabulary(
    {
        "noun": 300,
        "transitive_verb": 40,
        "intransitive_verb": 25,
        "communication_verb": 20,
        "motion_verb": 20,
        "change_verb": 15,
        "adjective": 40,
        "adverb": 25,
        "location": 150,
        "temporal": 35,
        "instrument": 25,
        "preposition": 15,
        "conjunction": 10,
        "determiner": 8,
    }
)

In [None]:
# Lexicon generator configured from the custom vocabulary, with a fixed seed.
lexicon = LexiconGenerator(
    vocab_sizes=vocab,           # Custom vocabulary sizes
    num_clusters=5,              # Number of semantic clusters to create
    zipfian_alpha=1.05,          # Alpha parameter for Zipfian distribution
    error_bias=0.00001,          # Error bias for word generation
    random_seed=42               # For reproducible generation
)

In [None]:
# Assemble the custom pipeline: a FrameManager and the custom lexicon feed the
# sentence generator, and both lexicon and sentence generator are then handed
# to the corpus generator.
templates = FrameManager()
sentence_generator = SentenceGenerator(lexicon, templates)
generator_custom = SyntheticCorpusGenerator(
    lexicon=lexicon,
    sentence_generator=sentence_generator,
)

In [None]:
# Generate 2000 sentences with explicit sampling weights instead of the defaults.
corpus_custom = generator_custom.generate_corpus(
    num_sentences=2000,
    # Weights over the three complexity levels; the listed values add up to 1.0.
    complexity_distribution={"simple": 0.55, "medium": 0.35, "complex": 0.10},
    # Weights over seven semantic frames (0.1 + 6 x 0.15), also adding up to 1.0.
    # Keep the total at 1.0 when editing individual entries.
    semantic_frame_distribution={
        "transitive_action": 0.1,
        "transitive_with_location": 0.15,
        "motion_with_source": 0.15,
        "temporal_action": 0.15,
        "instrumental_action": 0.15,
        "multi_action": 0.15,
        "temporal_complex": 0.15,
    }
)

### Evaluate quality

In [None]:
# Evaluate the default corpus with the suitability calculation switched on.
evaluation_default = generator_default.evaluate_corpus(
    corpus_default,
    calculate_suitability=True,
)

In [None]:
# Evaluate the custom corpus with the suitability calculation switched on.
evaluation_custom = generator_custom.evaluate_corpus(
    corpus_custom,
    calculate_suitability=True,
)

### Save corpus


In [None]:
# Write the custom corpus to three JSON files in the working directory:
# one via save(), and two via export() with different `format` values.
corpus_custom.save("corpus_full.json", indent=2)
corpus_custom.export("corpus_semantic.json", format="semantic_annotations", indent=2)
corpus_custom.export("corpus_sentences.json", format="sentences_only", indent=2)

### Visualize the corpus statistics

In [None]:
# Visualize the default corpus.
# NOTE(review): the next cell creates another Visualizer with the same
# log_dir ('./plots') — check whether its output replaces these plots.
visualizer = Visualizer(log_dir='./plots')
visualizer.visualize(corpus_default)

In [None]:
# Visualize the custom corpus; rebinds `visualizer` to a fresh instance.
# NOTE(review): uses the same log_dir ('./plots') as the default-corpus cell —
# check whether this replaces the plots produced there.
visualizer = Visualizer(log_dir='./plots')
visualizer.visualize(corpus_custom)

# TRACE for Model Analysis

## Tokenizer creation and data loading 

In [None]:
# NOTE(review): `trace` is also the name of a Python standard-library module;
# make sure the project's `trace` package is the one found on the import path.
from trace.tokenizer import create_tokenizer_from_data
# NOTE(review): the save cell earlier in this notebook writes "corpus_full.json"
# (plus two exports) into the working directory, not "./data/corpus.json".
# Confirm this file exists, or point CORPUS_PATH at the saved corpus.
CORPUS_PATH = "./data/corpus.json"
# The corpus file itself is passed as the tokenizer's `vocab_file`.
tokenizer = create_tokenizer_from_data(vocab_file=CORPUS_PATH)
# Reused below as `vocab_size` in the model configuration.
VOCAB_SIZE = tokenizer.get_vocab_size()

In [None]:
from trace.dataloader import get_dataloader
# Build train/validation/test loaders from the same corpus file used for the
# tokenizer. val_split and test_split are 0.1 each.
# max_length=16 matches max_seq_length=16 in the model configuration below.
# NOTE(review): batch_size is 32 here, while the TrainingConfig cells further
# down pass batch_size=128 — confirm which value actually governs batching.
train_loader, val_loader, test_loader = get_dataloader(
    corpus_path=CORPUS_PATH,
    tokenizer=tokenizer,
    batch_size=32,
    max_length=16,
    val_split=0.1,
    test_split=0.1
)

## Building a transformer 

In [None]:
# Configuration for a small decoder-only transformer running on CPU.
from trace.transformer import Transformer, TransformerConfig

model_config = TransformerConfig(
    model_type="decoder_only",  # "encoder_only", "decoder_only", "encoder_decoder"
    vocab_size=VOCAB_SIZE,  # Taken from the tokenizer built above
    d_model=96,  # Hidden dimension
    num_heads=3,  # Attention heads (96 / 3 = 32 if d_model is split evenly — TODO confirm)
    num_decoder_layers=2,  # Number of layers
    d_ff=384,  # Feed-forward dimension (4 x d_model)
    max_seq_length=16,  # Maximum sequence length; same value as the dataloader's max_length
    dropout=0.1,
    device="cpu"  # "cpu" or "cuda"
)

In [None]:
# Instantiate the model from the configuration defined above.
model = Transformer.from_config(model_config)

In [None]:
# Bare expression: as the last statement of a notebook cell, this displays the model.
model

## Training configuration 

In [None]:
from trace.training import Trainer, TrainingConfig

# No track_* flags are passed, so tracking follows TrainingConfig's defaults
# (per the original note, all tracking is enabled — confirm against TrainingConfig).
# The Trainer cell below is given training_config_custom, so this object is
# not used by the cells shown here; it serves as a reference configuration.
training_config_default = TrainingConfig(
    epochs=30,
    learning_rate=1e-3,
    batch_size=128,
    device="cpu",
)

In [None]:
# Short (3-epoch) run with the analysis modules chosen explicitly.
# This is the configuration passed to Trainer below.
training_config_custom = TrainingConfig(
    epochs=3,
    learning_rate=1e-3,
    batch_size=128,
    device="cpu",

    # Analysis modules: only intrinsic-dimension tracking is switched on here;
    # set the other flags to True to enable them.
    track_hessian=False,  # Loss landscape analysis
    track_linguistic_probes=False,  # POS probing
    track_semantic_probes=False,  # Semantic role probing
    track_intrinsic_dimensions=True,  # Representation dimensionality
    track_pos_performance=False,  # Output POS accuracy
    track_semantic_roles_performance=False,  # Output semantic accuracy

    # Analysis frequency and visualization
    track_interval=500,  # Analyze every 500 steps
    save_visualization=True,  # Generate plots
    show_plots=True,
    plots_path="./analysis_results"  # Save results here
)

In [None]:
# Build the trainer from the custom config, the tokenizer, and the model.
trainer = Trainer(training_config_custom, tokenizer, model)

## Running TRACE

In [None]:
# Run training on the three loaders built earlier; train() returns a pair that
# is unpacked into the best loss and the analysis results.
best_loss, analysis_results = trainer.train(
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
)