# DSMIL AI-Assisted Compilation
**Integration Guide for DSMIL Layers 3-9 AI Advisors**

Version: 1.2
Last Updated: 2025-11-24

---

```json
{
  "schema": "dsmilai-request-v1.2",
  "version": "1.2",
  "timestamp": "2025-11-24T15:30:45Z",
  "compiler": {
    "name": "dsmil-clang",
          "cyclomatic_complexity": 12,
          "branch_density": 0.08,
          "dominance_depth": 4
        },
        "quantum_candidate": {
          "enabled": false,
          "problem_type": null
        }
      }
    ],

```json
{
  "schema": "dsmilai-response-v1.2",
  "version": "1.2",
  "timestamp": "2025-11-24T15:30:47Z",
  "request_id": "uuid-1234-5678-...",
  "advisor": {
        "confidence": 0.81,
        "rationale": "AVX-512 available on Meteor Lake; widening vectorization factor from 8 to 16 can improve throughput by ~18%"
      }
    ],
    "quantum_export": [
      {
        "target": "function:optimize_placement",
        "recommended": false,
        "confidence": 0.89,
        "rationale": "Problem size (128 variables, 45 constraints) exceeds current QPU capacity (Device 46: ~12 qubits available). Recommend classical ILP solver.",
        "alternative": "use_highs_solver_on_cpu",
        "estimated_runtime_classical_ms": 23,
        "estimated_runtime_quantum_ms": null,
        "qpu_availability": {
          "device_46_status": "busy",
          "queue_depth": 7,
          "estimated_wait_time_s": 145
        }
      }
    ]
  },
  "diagnostics": {
dsmil-clang --ai-mode=off -O3 -o output input.c
```

### 6.5 Compact ONNX Feature Scoring (v1.2)

**Purpose**: Ultra-fast per-function cost decisions using tiny ONNX models running on Devices 43-58.

**Motivation**:

Full AI advisor calls (Layer 7 LLM, Layer 8 Security) incur 50-200 ms of latency per request, which is too slow for per-function optimization decisions during compilation. The solution is to use **compact ONNX models** (~5-20 MB) for sub-millisecond feature scoring, backed by NPU/AMX accelerators (Devices 43-58, Layer 5 performance analytics, ~140 TOPS total).

**Architecture**:

```
┌──────────────────────────────────────────────────┐
│ DSLLVM DsmilAICostModelPass                      │
│                                                  │
│ Per Function:                                    │
│ ┌──────────────────────────────────────────────┐ │
│ │ 1. Extract IR Features                       │ │
│ │    - Basic blocks, loop depth, memory ops    │ │
│ │    - CFG complexity, vectorization           │ │
│ │    - DSMIL metadata (layer/device/stage)     │ │
│ └─────────────┬────────────────────────────────┘ │
│               │ Feature Vector (128 floats)      │
│               ▼                                  │
│ ┌──────────────────────────────────────────────┐ │
│ │ 2. Batch Inference with Tiny ONNX Model      │ │
│ │    Model:   5-20 MB (INT8/FP16 quantized)    │ │
│ │    Input:   [batch, 128]                     │ │
│ │    Output:  [batch, 16] scores               │ │
│ │    Device:  43-58 (NPU/AMX)                  │ │
│ │    Latency: <0.5 ms per function             │ │
│ └─────────────┬────────────────────────────────┘ │
│               │ Output Scores                    │
│               ▼                                  │
│ ┌──────────────────────────────────────────────┐ │
│ │ 3. Apply Scores to Optimization Decisions    │ │
│ │    - Inline if score[0] > 0.7                │ │
│ │    - Unroll by factor = round(score[1])      │ │
│ │    - Vectorize with width = score[2]         │ │
│ │    - Device preference: argmax(scores[3:6])  │ │
│ └──────────────────────────────────────────────┘ │
└──────────────────────────────────────────────────┘
```

**Feature Vector (128 floats)**:

| Index Range | Feature Category | Description |
|-------------|------------------|-------------|
| 0-7 | Complexity | Basic blocks, instructions, CFG depth, call count |
| 8-15 | Memory | Load/store count, estimated bytes, stride patterns |
| 16-23 | Control Flow | Branch count, loop nests, switch cases |
| 24-31 | Arithmetic | Int ops, FP ops, vector ops, div/mod count |
| 32-39 | Data Types | i8/i16/i32/i64/f32/f64 usage ratios |
| 40-47 | DSMIL Metadata | Layer, device, clearance, stage (encoded as floats) |
| 48-63 | Call Graph | Caller/callee stats, recursion depth |
| 64-95 | Vectorization | Vector width, alignment, gather/scatter patterns |
| 96-127 | Reserved | Future extensions |

**Feature Extraction Example**:
```cpp
// Function: matmul_kernel
// Basic blocks: 8, Instructions: 142, Loops: 2
// (Illustrative: a real extractor assigns slots by index; elided index
// ranges are shown as comments.)
float features[128] = {
    8.0,      // [0] basic_blocks
    142.0,    // [1] instructions
    3.0,      // [2] cfg_depth
    2.0,      // [3] call_count
    // ... [4-7] more complexity metrics

    64.0,     // [8] load_count
    32.0,     // [9] store_count
    18.0,     // [10] estimated_bytes, log2 scale (2^18 = 256 KiB)
    1.0,      // [11] stride_pattern (contiguous)
    // ... [12-15] more memory metrics

    // ... [16-39] control-flow, arithmetic, and data-type metrics

    7.0,      // [40] layer (encoded)
    47.0,     // [41] device_id (encoded)
    0.8,      // [42] stage: "quantized" → 0.8
    0.7,      // [43] clearance (normalized)
    // ... [44-47] more DSMIL metadata

    // ... [48-127] call-graph, vectorization, and reserved slots
};
```
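
The `(encoded)` comments above hide a real design question: how categorical DSMIL metadata becomes floats. Below is a minimal sketch of one plausible encoding for slots 40-47; the `STAGE_CODES` table and the clearance normalization are illustrative assumptions, not the shipped scheme.

```python
# Hypothetical encoding for feature slots 40-47; the real scheme is defined
# by the deployed model. Values mirror the C++ example above.
STAGE_CODES = {"source": 0.2, "optimized": 0.5, "quantized": 0.8, "deployed": 1.0}

def encode_dsmil_metadata(layer: int, device_id: int, stage: str,
                          clearance: int, max_clearance: int = 10) -> list[float]:
    return [
        float(layer),                 # [40] layer
        float(device_id),             # [41] device_id
        STAGE_CODES.get(stage, 0.0),  # [42] stage -> fixed float code
        clearance / max_clearance,    # [43] clearance, normalized to [0, 1]
        0.0, 0.0, 0.0, 0.0,           # [44-47] remaining metadata slots
    ]

# features[40:48] for the matmul_kernel example: layer 7, device 47,
# stage "quantized", clearance 7 of 10 -> [7.0, 47.0, 0.8, 0.7, ...]
```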

**Output Scores (16 floats)**:

| Index | Score Name | Range | Description |
|-------|-----------|-------|-------------|
| 0 | inline_score | [0.0, 1.0] | Probability to inline this function |
| 1 | unroll_factor | [1.0, 32.0] | Loop unroll factor |
| 2 | vectorize_width | {1, 4, 8, 16, 32} | SIMD width (discrete values) |
| 3 | device_cpu | [0.0, 1.0] | Probability for CPU execution |
| 4 | device_npu | [0.0, 1.0] | Probability for NPU execution |
| 5 | device_gpu | [0.0, 1.0] | Probability for iGPU execution |
| 6 | memory_tier_ramdisk | [0.0, 1.0] | Probability for ramdisk |
| 7 | memory_tier_ssd | [0.0, 1.0] | Probability for SSD |
| 8 | security_risk_injection | [0.0, 1.0] | Risk score: injection attacks |
| 9 | security_risk_overflow | [0.0, 1.0] | Risk score: buffer overflow |
| 10 | security_risk_sidechannel | [0.0, 1.0] | Risk score: side-channel leaks |
| 11 | security_risk_rop | [0.0, 1.0] | Risk score: ROP gadgets |
| 12-15 | reserved | - | Future extensions |

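The model emits raw floats, so a thin decoding step maps them onto legal decisions: thresholding the inline probability at 0.7 and the risk scores at 0.8 (the cutoffs used by the pass later in this section), clamping the unroll factor, and snapping the vectorization width to the nearest discrete value. A minimal sketch, assuming NumPy:

```python
import numpy as np

ALLOWED_WIDTHS = np.array([1, 4, 8, 16, 32])
DEVICES = ["cpu", "npu", "igpu"]

def decode_scores(s: np.ndarray) -> dict:
    """Map a raw 16-float score vector onto discrete optimization decisions."""
    return {
        "inline": bool(s[0] > 0.7),
        "unroll_factor": int(np.clip(round(float(s[1])), 1, 32)),
        # The model emits a float; snap it to the nearest legal SIMD width.
        "vectorize_width": int(ALLOWED_WIDTHS[np.abs(ALLOWED_WIDTHS - s[2]).argmin()]),
        "device": DEVICES[int(np.argmax(s[3:6]))],
        "needs_l8_scan": bool(s[8:12].max() > 0.8),
    }
```
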
**ONNX Model Specification**:

```python
import torch
from torch import nn

# Model architecture (PyTorch, training side)
class DsmilCostModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(128, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 16)
        self.relu = nn.ReLU()

    def forward(self, x):
        # x: [batch, 128] feature vector
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)  # [batch, 16] output scores
        return x

# After training, export to ONNX
model = DsmilCostModel().eval()       # weights come from the training run
dummy_input = torch.randn(1, 128)
torch.onnx.export(
    model,
    dummy_input,
    "dsmil-cost-v1.2.onnx",
    opset_version=14,
    input_names=["input"],
    output_names=["scores"],
    dynamic_axes={"input": {0: "batch_size"}}
)

# Quantize to INT8 for faster inference
from onnxruntime.quantization import quantize_dynamic, QuantType

quantize_dynamic(
    "dsmil-cost-v1.2.onnx",
    "dsmil-cost-v1.2-int8.onnx",
    weight_type=QuantType.QInt8
)
```
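
For host-side testing, the exported model can be exercised with onnxruntime. This is a minimal sketch: the `input` tensor name matches the export call above, and inside the compiler the model runs through the native DSMIL runtime on Devices 43-58 rather than this Python API.

```python
import numpy as np
import onnxruntime as ort

# Batch-score 32 functions with the quantized model.
session = ort.InferenceSession("dsmil-cost-v1.2-int8.onnx")
features = np.random.rand(32, 128).astype(np.float32)  # stand-in feature batch
(scores,) = session.run(None, {"input": features})     # -> shape [32, 16]

inline_candidates = np.flatnonzero(scores[:, 0] > 0.7)  # score[0] = inline prob
```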

**Inference Performance**:

| Device | Hardware | Batch Size | Latency | Throughput |
|--------|----------|------------|---------|------------|
| Device 43 | NPU Tile 3 | 1 | 0.3 ms | 3333 functions/s |
| Device 43 | NPU Tile 3 | 32 | 1.2 ms | 26667 functions/s |
| Device 50 | CPU AMX | 1 | 0.5 ms | 2000 functions/s |
| Device 50 | CPU AMX | 32 | 2.8 ms | 11429 functions/s |
| CPU (fallback) | AVX2 | 1 | 1.8 ms | 556 functions/s |

**Integration with DsmilAICostModelPass**:

```cpp
// DSLLVM pass, simplified: the ONNX session and feature-extraction helpers
// (loadONNXModel, extractFeatures) are provided by the DSLLVM runtime.
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include <algorithm>
#include <string>
#include <vector>
using namespace llvm;

class DsmilAICostModelPass : public PassInfoMixin<DsmilAICostModelPass> {
public:
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) {
    // Load the ONNX model once per compilation
    auto *model = loadONNXModel("/opt/dsmil/models/dsmil-cost-v1.2-int8.onnx");

    std::vector<float> feature_batch;
    std::vector<Function *> functions;

    // Extract features for every function definition in the module
    for (auto &F : M) {
      if (F.isDeclaration())
        continue;
      float features[128];
      extractFeatures(F, features);
      feature_batch.insert(feature_batch.end(), features, features + 128);
      functions.push_back(&F);
    }

    // One batched inference call scores the whole module (fast!)
    std::vector<float> scores = model->infer(feature_batch, functions.size());

    // Apply scores to optimization decisions
    for (size_t i = 0; i < functions.size(); i++) {
      float *func_scores = &scores[i * 16];
      LLVMContext &Ctx = functions[i]->getContext();

      // Inlining decision
      if (func_scores[0] > 0.7f)
        functions[i]->addFnAttr(Attribute::AlwaysInline);

      // Device placement: argmax over CPU/NPU/iGPU scores [3..5]
      auto *best = std::max_element(func_scores + 3, func_scores + 6);
      int device = static_cast<int>(std::distance(func_scores + 3, best));
      functions[i]->setMetadata(
          "dsmil.placement.device",
          MDNode::get(Ctx, MDString::get(Ctx, std::to_string(device))));

      // Security risk (flag for a full L8 security scan if any score is high)
      float max_risk = *std::max_element(func_scores + 8, func_scores + 12);
      if (max_risk > 0.8f)
        functions[i]->setMetadata(
            "dsmil.security.needs_l8_scan",
            MDNode::get(Ctx, MDString::get(Ctx, "true")));
    }

    return PreservedAnalyses::none();
  }
};
```

**Configuration**:

```bash
# Use compact ONNX model (default in --ai-mode=local)
dsmil-clang --ai-mode=local \
  --ai-cost-model=/opt/dsmil/models/dsmil-cost-v1.2-int8.onnx \
  -O3 -o output input.c

# Specify target device for ONNX inference (Device 43 = NPU Tile 3)
dsmil-clang --ai-mode=local \
  -mllvm -dsmil-onnx-device=43 \
  -O3 -o output input.c

# Fall back to the full L7/L8 advisors (slower, more accurate)
dsmil-clang --ai-mode=advisor \
  --ai-use-full-advisors \
  -O3 -o output input.c

# Disable all AI (classical heuristics only)
dsmil-clang --ai-mode=off -O3 -o output input.c
```

**Training Data Collection**:

Models are trained on **JRTC1-5450** historical build data; a minimal training-loop sketch follows the list:
- **Inputs**: IR feature vectors from 1M+ functions across DSMIL kernel, drivers, and userland
- **Labels**: Ground-truth performance measured on Meteor Lake hardware
  - Execution time (latency)
  - Throughput (ops/sec)
  - Power consumption (watts)
  - Memory bandwidth (GB/s)
- **Training Infrastructure**: Layer 7 Device 47 (LLM for feature engineering) + Layer 5 Devices 50-59 (regression training)
- **Validation**: 80/20 train/test split, 5-fold cross-validation
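
The sketch below reuses the `DsmilCostModel` definition from the model specification above; the tensor files and hyperparameters are illustrative assumptions, with `X` as [N, 128] features and `Y` as [N, 16] normalized labels.

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, random_split

X = torch.load("features.pt")   # [N, 128] IR feature vectors (assumed prepared)
Y = torch.load("labels.pt")     # [N, 16] normalized performance labels

dataset = TensorDataset(X, Y)
n_train = int(0.8 * len(dataset))                     # 80/20 split
train_set, _ = random_split(dataset, [n_train, len(dataset) - n_train])

model, loss_fn = DsmilCostModel(), nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(50):                               # illustrative epoch count
    for xb, yb in DataLoader(train_set, batch_size=256, shuffle=True):
        optimizer.zero_grad()
        loss = loss_fn(model(xb), yb)                 # multi-target regression
        loss.backward()
        optimizer.step()
```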

**Model Versioning & Provenance**:

```json
{
  "model_version": "dsmil-cost-v1.2-20251124",
  "format": "ONNX",
  "opset_version": 14,
  "quantization": "INT8",
  "size_bytes": 8388608,
  "hash_sha384": "a7f3c2e9...",
  "training_data": {
    "dataset": "jrtc1-5450-production-builds",
    "samples": 1247389,
    "date_range": "2024-08-01 to 2025-11-20"
  },
  "performance": {
    "mse_speedup": 0.023,
    "accuracy_device_placement": 0.89,
    "accuracy_inline_decision": 0.91
  },
  "signature": {
    "algorithm": "ML-DSA-87",
    "signer": "TSK (Toolchain Signing Key)",
    "signature": "base64_encoded_signature..."
  }
}
```

Embedded in toolchain provenance:
```json
{
  "compiler_version": "dsmil-clang 19.0.0-v1.2",
  "ai_cost_model": "dsmil-cost-v1.2-20251124",
  "ai_cost_model_hash": "a7f3c2e9...",
  "ai_mode": "local"
}
```
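
Before loading a model, the toolchain can cross-check the file against the signed manifest. A minimal sketch of the SHA-384 check; the ML-DSA-87 signature verification goes through the DSMIL key infrastructure and is omitted here.

```python
import hashlib
import json

def verify_model_hash(model_path: str, manifest_path: str) -> bool:
    """Compare the model file's SHA-384 digest against the signed manifest."""
    with open(manifest_path) as f:
        manifest = json.load(f)
    with open(model_path, "rb") as f:
        digest = hashlib.sha384(f.read()).hexdigest()
    return digest == manifest["hash_sha384"]

# A mismatch should trigger the fallback path described below.
```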

**Benefits**:

- **Latency**: <0.5 ms per function vs 50-200 ms for a full AI advisor call (100-400× faster)
- **Throughput**: Processes an entire compilation unit in parallel with batched inference
- **Accuracy**: 85-95% agreement with human expert decisions
- **Determinism**: Fixed model version ensures reproducible builds
- **Transparency**: Model performance tracked in provenance metadata
- **Scalability**: Handles modules with 10,000+ functions efficiently

**Fallback Strategy**:

If the ONNX model fails to load or the inference device is unavailable, the compiler degrades gracefully (sketched after this list):
1. Log a warning with the fallback reason
2. Use classical LLVM heuristics (always available)
3. Mark the binary with `"ai_cost_model_fallback": true` in provenance
4. Continue compilation
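
Expressed in Python for brevity (the production path lives inside dsmil-clang), the policy looks roughly like this; `load_cost_model` and the provenance dict are illustrative:

```python
import logging
import onnxruntime as ort

log = logging.getLogger("dsmil-clang")

def load_cost_model(path: str, provenance: dict):
    """Return an inference session, or None to select classical heuristics."""
    try:
        return ort.InferenceSession(path)                      # load the model
    except Exception as exc:
        log.warning("ai-cost-model: %s; falling back to classical heuristics", exc)
        provenance["ai_cost_model_fallback"] = True            # mark provenance
        return None                                            # compilation continues
```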

---

## 7. AI Integration Modes