In [1]:
# import os

# # 设置环境变量
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# # 打印环境变量以确认设置成功
# print(os.environ.get('HF_ENDPOINT'))

import subprocess
import os

# Source the host's proxy script in a bash subprocess and copy every variable
# whose `env` line contains "proxy" into this kernel's environment.
# This is best-effort: on a machine without bash or without the script the
# notebook keeps running, but says so instead of silently doing nothing.
try:
    # Invoke bash directly with an argv list; wrapping `bash -c` in
    # shell=True only spawned a second, redundant shell.
    result = subprocess.run(
        ["bash", "-c", "source /etc/network_turbo && env | grep proxy"],
        capture_output=True,
        text=True,
    )
except OSError as exc:
    # bash itself could not be started.
    print(f"Proxy setup skipped: {exc}")
    output = ""
else:
    output = result.stdout
    if result.returncode != 0:
        # Non-zero means the script could not be sourced or no line matched.
        print(f"Proxy setup skipped (exit code {result.returncode}): {result.stderr.strip()}")

for line in output.splitlines():
    var, sep, value = line.partition('=')
    # Skip lines without a NAME=VALUE shape (e.g. continuation lines of a
    # multi-line value).
    if sep and var:
        os.environ[var] = value

In [2]:
import json
import random
import re
import os
from datasets import load_dataset
from openai import OpenAI
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# ==========================================
# 1. API configuration https://bailian.console.aliyun.com/
# ==========================================
# Read the key from the environment so it never has to be committed with the
# notebook. The "sk-" placeholder is kept as the fallback so the cell still
# runs (and can still be edited in place) when the variable is not set.
API_KEY = os.getenv("DASHSCOPE_API_KEY", "sk-")
MODEL_ID = "qwen3-max"

# OpenAI-compatible client pointed at the DashScope endpoint.
client = OpenAI(
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    api_key=API_KEY,
)

In [4]:
# ==========================================
# 2. Data preparation (BioMap protein solubility)
# ==========================================
print("Loading BioMap Solubility dataset...")
try:
    # Load the dataset through the Hugging Face `datasets` loader.
    dataset_path = "biomap-research/solubility_prediction"
    dataset = load_dataset(dataset_path)

    # Few-shot examples are drawn from the train split; evaluation uses the
    # dataset's own test split.
    train_ds = dataset['train']
    test_ds = dataset['test']

    print(f"Columns: {train_ds.column_names}")
    # The rest of this cell reads the 'seq' and 'label' columns.

    # -------------------------------------------------
    # 3. Build the few-shot examples (short sequences only)
    # -------------------------------------------------
    shot_num = 5
    max_len_for_shot = 200  # cap example length to keep the prompt small
    test_sample_size = 100  # number of test sequences sent to the model

    def get_short_samples(ds, label_val, limit, max_len):
        """Return up to `limit` rows of `ds` with the given label and a short sequence.

        Only the first 5000 rows are scanned, in order, and the scan stops as
        soon as `limit` rows have been collected. A row qualifies when
        ``row['label'] == label_val`` and ``len(row['seq']) <= max_len``.
        Fewer than `limit` rows are returned if not enough qualify.
        """
        candidates = []
        for i in range(min(len(ds), 5000)):
            item = ds[i]
            if item['label'] == label_val and len(item['seq']) <= max_len:
                candidates.append(item)
                if len(candidates) >= limit:
                    break
        return candidates

    # Pick `shot_num` short examples for label 1 and for label 0.
    shots_pos = get_short_samples(train_ds, 1, shot_num, max_len_for_shot)
    shots_neg = get_short_samples(train_ds, 0, shot_num, max_len_for_shot)

    # -------------------------------------------------
    # 4. Build the evaluation sample
    # -------------------------------------------------
    # Test sequences are not length-filtered; they are sampled at random with
    # a fixed seed so the same subset is drawn on every run.
    test_candidates = list(test_ds)
    random.seed(42)
    # Clamp the sample size: random.sample raises ValueError when asked for
    # more items than the population holds.
    combined_test_data = random.sample(
        test_candidates, min(test_sample_size, len(test_candidates))
    )

    print(f"Few-Shot Examples (Len < {max_len_for_shot}): {len(shots_pos)} Pos / {len(shots_neg)} Neg")
    print(f"Test Data Prepared: {len(combined_test_data)} sequences")

except Exception as e:
    print(f"Error loading dataset: {e}")
    # Re-raise so the notebook stops here with the real traceback. The
    # previous handler called traceback.print_exc() without importing
    # `traceback` (a NameError that hid the actual error) and then exit(),
    # which is not appropriate inside a notebook kernel.
    raise

# ==========================================
# 5. Format the few-shot text block
# ==========================================
def format_protein_examples(pos_list, neg_list):
    """Render the few-shot reference examples as one text block.

    Args:
        pos_list: records for class 1; each must expose a 'seq' string.
        neg_list: records for class 0; each must expose a 'seq' string.

    Returns:
        A string made of a title line, a class-1 section and a class-0
        section. Each example gets its own line, numbered from 1 within its
        section, with the sequence truncated to its first 500 characters.
    """
    # Truncation is a safety net in case an unexpectedly long sequence slips in.
    pos_lines = [
        f"Example_Pos_{n}: {record['seq'][:500]}\n"
        for n, record in enumerate(pos_list, 1)
    ]
    neg_lines = [
        f"Example_Neg_{n}: {record['seq'][:500]}\n"
        for n, record in enumerate(neg_list, 1)
    ]

    parts = ["Reference Examples (Ground Truth):\n"]
    parts.append("\n--- Class 1 (Soluble / Target) ---\n")
    parts.extend(pos_lines)
    parts.append("\n--- Class 0 (Insoluble / Aggregating) ---\n")
    parts.extend(neg_lines)
    return "".join(parts)

# Text block holding the few-shot reference examples for the prompt.
examples_text_block = format_protein_examples(shots_pos, shots_neg)

# ==========================================
# 6. Build the JSON payload for the test batch
# ==========================================
# Ids are 1-based positions in combined_test_data. The model only sees the id
# and the sequence; the label stays local in id_to_ground_truth for scoring.
prompt_data_list = [
    {
        "id": position,
        "sequence": record['seq']  # protein sequence
    }
    for position, record in enumerate(combined_test_data, 1)
]
id_to_ground_truth = {
    position: record['label']
    for position, record in enumerate(combined_test_data, 1)
}

print("Prompt data constructed.")

Loading BioMap Solubility dataset...
Columns: ['seq', 'label']
Few-Shot Examples (Len < 200): 5 Pos / 5 Neg
Test Data Prepared: 100 sequences
Prompt data constructed.


In [5]:
# ==========================================
# 3. Build the prompt (protein version - structure / physicochemical-property aware)
# ==========================================

# System prompt: steers the model toward amino-acid arrangement patterns
# (charge, hydrophobicity), defines the two classes, and fixes the output
# format to a JSON list of {"id", "prediction"} records.
system_prompt = """You are an expert in Protein Sequence Analysis and Pattern Recognition.

1. The Task:
You are analyzing amino acid sequences to categorize them into two functional classes based on their physicochemical properties.
* **Class 1**: Sequences that likely fold into stable structures (Soluble). They often balance hydrophobic cores with hydrophilic surfaces.
* **Class 0**: Sequences that are prone to aggregation or instability (Insoluble). They may have exposed hydrophobic patches or repetitive patterns.

2. The Strategy (Few-Shot Learning):
* Study the "Reference Examples" provided below.
* Pay attention to the composition of amino acids (e.g., K, R, D, E vs. L, I, V, F) and their ordering.
* Classify the "Test Batch" by determining which reference group they structurally resemble.

3. Output Rules:
* Return a RAW JSON object: `[{"id": 1, "prediction": 1}, {"id": 2, "prediction": 0}, ...]`
"""

# User prompt: standard few-shot layout - the reference examples first, then
# the instruction, then the whole test batch serialized as indented JSON.
user_prompt = f"""{examples_text_block}

=========================================
**INSTRUCTION:**
Analyze the patterns in the examples above (amino acid composition, length, motifs).
Now, classify the following NEW protein sequences based on these patterns.

**Test Batch Data:**
{json.dumps(prompt_data_list, indent=2)}
"""

In [6]:
# ==========================================
# 4. Call the model through the DashScope OpenAI-compatible API
# ==========================================
# The client is configured with the DashScope base URL, so the log line names
# that provider (the previous "Volcengine" label was misleading).
print("-" * 30)
print(f"Calling DashScope Model: {MODEL_ID}...")

try:
    response = client.chat.completions.create(
        model=MODEL_ID,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.1,  # low temperature to keep the output format stable
        top_p=0.9,
    )

    # `content` can be None; treat that as an empty reply so it is reported
    # downstream as a parse failure rather than as a failed API call.
    full_content = (response.choices[0].message.content or "").strip()
    print("Response received.")
    # Show the first 200 characters for debugging.
    print(f"Response snippet: {full_content[:200]}...")

except Exception as e:
    # Best effort: leave an empty reply so the evaluation cell still runs and
    # reports that nothing could be parsed.
    print(f"API Call Failed: {e}")
    full_content = ""

------------------------------
Calling Volcengine Model: qwen3-max...
Response received.
Response snippet: [{"id": 1, "prediction": 1}, {"id": 2, "prediction": 0}, {"id": 3, "prediction": 0}, {"id": 4, "prediction": 1}, {"id": 5, "prediction": 0}, {"id": 6, "prediction": 1}, {"id": 7, "prediction": 1}, {"i...


In [7]:
# ==========================================
# 5. 解析结果与评估 (自动处理标签反转版)
# ==========================================
import re
import json
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def parse_llm_json(text):
    """Extract the list of prediction records from an LLM reply.

    Candidates are tried in order, and a failure of one no longer aborts the
    others:
      1. the contents of a ```json fenced block,
      2. the span from the first '[' to the last ']',
      3. the whole text,
      4. a JSON array decoded starting at each '[' in turn (handles replies
         where surrounding prose contains other brackets).

    Args:
        text: raw model reply; anything that is not a non-empty string
            yields [].

    Returns:
        A list of dicts, or [] when no JSON array of objects can be found.
        Non-list JSON and lists holding non-dict items are rejected, because
        the caller reads every item with ``.get``.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    def _is_record_list(value):
        # A usable result is a list whose items are all JSON objects.
        return isinstance(value, list) and all(isinstance(entry, dict) for entry in value)

    candidates = []
    fenced = re.search(r"```json\s*(\[.*?\])\s*```", text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))
    bracket_span = re.search(r"\[.*\]", text, re.DOTALL)
    if bracket_span:
        candidates.append(bracket_span.group(0))
    candidates.append(text)

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
        if _is_record_list(parsed):
            return parsed

    # Last resort: decode an array at each '[' and keep the first non-empty
    # list of objects; trailing text after the array is ignored.
    decoder = json.JSONDecoder()
    for opening in re.finditer(r"\[", text):
        try:
            parsed, _ = decoder.raw_decode(text, opening.start())
        except ValueError:
            continue
        if parsed and _is_record_list(parsed):
            return parsed

    return []

# Parse the model reply into a list of {"id": ..., "prediction": ...} records.
predictions_list = parse_llm_json(full_content)

y_true = []
y_pred_raw = []  # predictions exactly as the model returned them

print("-" * 30)
if not predictions_list:
    print("Failed to parse JSON.")
else:
    print(f"Parsed {len(predictions_list)} predictions.")

    seen_ids = set()
    for item in predictions_list:
        if not isinstance(item, dict):
            continue
        p_id = item.get('id')
        p_val = item.get('prediction')

        # Keep only the first valid 0/1 prediction for each known id, so a
        # repeated id cannot be counted twice.
        if p_id in id_to_ground_truth and p_id not in seen_ids and p_val in [0, 1]:
            seen_ids.add(p_id)
            y_true.append(id_to_ground_truth[p_id])
            y_pred_raw.append(int(p_val))

    # Make dropped items visible: metrics below cover scored items only.
    n_missing = len(id_to_ground_truth) - len(y_true)
    if n_missing:
        print(f"Warning: {n_missing} of {len(id_to_ground_truth)} test sequences have no valid prediction and are excluded from the metrics.")

    if y_true:
        # 1. Accuracy of the predictions as returned.
        acc_raw = accuracy_score(y_true, y_pred_raw)
        print(f"\n[Original] Accuracy: {acc_raw:.2%}")

        # 2. Anti-correlation check.
        # NOTE(review): the flip is decided from the ground-truth labels of
        # this same test sample, so the "corrected" accuracy can never fall
        # below 50% and is not an unbiased estimate. Report acc_raw as the
        # honest number; the raw value is kept in result_log for that reason.
        final_y_pred = y_pred_raw
        is_flipped = False

        if acc_raw < 0.5:
            print("\n⚠️ Detected Label Flipping (Accuracy < 50%)!")
            print("The model found the pattern but swapped the labels.")
            print("Inverting predictions (0->1, 1->0)...")

            # Invert every prediction: 0 becomes 1 and 1 becomes 0.
            final_y_pred = [1 - y for y in y_pred_raw]
            is_flipped = True

        # 3. Final metrics.
        acc_final = accuracy_score(y_true, final_y_pred)
        print(f"\n[Corrected] Final Accuracy: {acc_final:.2%}")

        print("\nClassification Report (After Correction):")
        # labels=[0, 1] pins the class order and keeps the report from
        # raising when only one class is present. The names match the
        # solubility classes defined in the prompt (0 = insoluble,
        # 1 = soluble); the previous "Unbound/Bound" names came from a
        # different task.
        print(classification_report(
            y_true,
            final_y_pred,
            labels=[0, 1],
            target_names=["Insoluble (0)", "Soluble (1)"],
        ))

        # Confusion matrix: rows are true labels, columns are predictions.
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_true, final_y_pred, labels=[0, 1]))

        result_log = {
            "accuracy": acc_final,
            "accuracy_raw": acc_raw,
            "flipped": is_flipped,
            "predictions": predictions_list
        }
    else:
        print("No valid matching IDs found.")

------------------------------
Parsed 100 predictions.

[Original] Accuracy: 60.00%

[Corrected] Final Accuracy: 60.00%

Classification Report (After Correction):
              precision    recall  f1-score   support

 Unbound (0)       0.62      0.69      0.65        54
   Bound (1)       0.57      0.50      0.53        46

    accuracy                           0.60       100
   macro avg       0.60      0.59      0.59       100
weighted avg       0.60      0.60      0.60       100


Confusion Matrix:
[[37 17]
 [23 23]]
