In [None]:
import os
import numpy as np
import joblib  # 用于保存和加载模型或中间数据
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestRegressor


def load_labse_model():
    """
    加载 LaBSE 模型 (需要提前安装 sentence-transformers)
    例如: pip install sentence-transformers
    """
    print("Loading LaBSE model...")
    model = SentenceTransformer('sentence-transformers/LaBSE')
    print("LaBSE model loaded successfully!")
    return model


def compress_embedding(embedding, num_groups=16):
    """
    对向量进行简单的分组聚合压缩：
      - 将向量等分成 num_groups 组
      - 每组内元素求和，再除以 sqrt(组大小)，保证数值稳定
      - 返回压缩后向量 (num_groups 维)
    """
    vec_len = len(embedding)
    group_size = vec_len // num_groups
    compressed = []

    for i in range(num_groups):
        start_idx = i * group_size
        end_idx = start_idx + group_size
        group_values = embedding[start_idx:end_idx]
        # 注意: 这里假设 embedding.length 是 num_groups 的整数倍，否则需额外处理
        s = np.sum(group_values)
        # 除以 sqrt(组大小) 确保数值平衡
        s /= np.sqrt(group_size)
        compressed.append(s)

    return np.array(compressed)


def get_embedding_vectors(model, instruction_text, user_input_text, 
                          d_app=4, d_user=16):
    """
    获取应用层语义 (instruction) 与用户层语义 (user_input) 的 LaBSE 向量，
    并根据指定的分组大小进行压缩。
    
    :param model:          已加载的 LaBSE 模型
    :param instruction_text: 指令文本
    :param user_input_text:  用户输入文本
    :param d_app:          对指令向量进行分组压缩后的维度
    :param d_user:         对用户向量进行分组压缩后的维度
    :return: (compressed_app_vec, compressed_user_vec)
    """
    # 获取原始 768 维 LaBSE 向量
    app_vec = model.encode(instruction_text)
    user_vec = model.encode(user_input_text)

    # 压缩
    compressed_app = compress_embedding(app_vec, num_groups=d_app)
    compressed_user = compress_embedding(user_vec, num_groups=d_user)
    
    return compressed_app, compressed_user


def prepare_training_data(model, data_samples):
    """
    给定多个 (instruction, user_input, user_input_length, generation_length) 样本，
    生成训练所需的特征和标签:
    
    data_samples 格式示例：
    [
        {
          "instruction": "Translate to German",
          "user_input": "I love you",
          "user_input_length": 3,
          "actual_gen_length": 5
        },
        ...
    ]
    这里的 actual_gen_length 即为训练标签 (生成长度)
    """
    X = []
    y = []

    for sample in data_samples:
        instr = sample["instruction"]
        u_input = sample["user_input"]
        u_length = sample["user_input_length"]
        gen_length = sample["actual_gen_length"]  # 真实的请求生成长度

        # 获取两个压缩后的向量
        c_app, c_user = get_embedding_vectors(model, instr, u_input)
        
        # 拼接 (应用层向量, 用户层向量, 用户输入长度)
        features = np.concatenate([c_app, c_user, [u_length]])
        
        X.append(features)
        y.append(gen_length)

    return np.array(X), np.array(y)


def train_random_forest(X, y, n_estimators=100, save_path="rf_model.pkl"):
    """
    训练随机森林回归模型，并将模型进行持久化保存
    """
    print("Training Random Forest Regressor...")
    rf_regressor = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    rf_regressor.fit(X, y)
    print("Training completed!")

    if save_path:
        joblib.dump(rf_regressor, save_path)
        print(f"Random Forest model saved to {save_path}")

    return rf_regressor


def main():
    # 1. 加载 LaBSE
    labse_model = load_labse_model()

    # 2. 这里假设你已经准备好了一个 data_samples 列表，包含训练数据
    #    可以从文件或数据库中读取
    data_samples = [
        {
            "instruction": "Translate to German",
            "user_input": "I love you",
            "user_input_length": 3,
            "actual_gen_length": 5
        },
        {
            "instruction": "Fix bugs in the following code",
            "user_input": "def add(a,b): return a+b",
            "user_input_length": 7,
            "actual_gen_length": 12
        },
        # ... 你可以在这里补充更多的数据样本
    ]

    # 3. 生成训练数据 (特征 X 与 标签 y)
    X, y = prepare_training_data(labse_model, data_samples)

    # 4. 存储预处理好的特征和标签，以免多次重复生成
    np.save("X_features.npy", X)
    np.save("y_labels.npy", y)
    print("Features and labels saved to .npy files.")

    # 5. 训练随机森林模型并持久化
    rf_model = train_random_forest(X, y, n_estimators=50, save_path="rf_model.pkl")

    # 6. 演示如何加载并做推理
    loaded_rf = joblib.load("rf_model.pkl")
    print("Model loaded from rf_model.pkl.")

    # 例如对一条新数据做预测（仅示例）
    new_instruction = "Translate to Spanish"
    new_user_input = "Hello World"
    new_user_length = 2
    
    c_app_new, c_user_new = get_embedding_vectors(labse_model, new_instruction, new_user_input)
    new_feature = np.concatenate([c_app_new, c_user_new, [new_user_length]]).reshape(1, -1)
    pred_generation_len = loaded_rf.predict(new_feature)[0]

    print(f"Predicted generation length for the new request is: {pred_generation_len}")


if __name__ == "__main__":
    main()


In [1]:
import numpy as np
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestRegressor
def load_labse_model():
    """
    加载 LaBSE 模型 (需要提前安装 sentence-transformers)
    """
    print("Loading LaBSE model...")
    # 如果你本地已经下载了 LaBSE，也可以改成本地路径或路径别名
    model = SentenceTransformer('sentence-transformers/LaBSE')
    print("LaBSE model loaded successfully!")
    return model
labse_model = load_labse_model()

  from .autonotebook import tqdm as notebook_tqdm


Loading LaBSE model...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Error while downloading from https://cdn-lfs.hf.co/sentence-transformers/LaBSE/77d8e1f2dbab6eb5d3c261ce9d3dbf1e3c69e02938c95f934f94f42c22dfa31f?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1743701583&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MzcwMTU4M319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9zZW50ZW5jZS10cmFuc2Zvcm1lcnMvTGFCU0UvNzdkOGUxZjJkYmFiNmViNWQzYzI2MWNlOWQzZGJmMWUzYzY5ZTAyOTM4Yzk1ZjkzNGY5NGY0MmMyMmRmYTMxZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=nYUohlttwuR6IKKX17fjwOjP1-IoPCatroebkUHLpVta%7EV25gb0N3Q5KhQVMu0eW2YXJUjHwh5uDFp9eBga7PCZZoi7zJ3MNMr1GxsFSg48LyDBWWoVs18qLyeT

KeyboardInterrupt: 

In [3]:
from transformers import BertModel, BertTokenizer

# 加载分词器和模型
tokenizer = BertTokenizer.from_pretrained('sentence-transformers/LaBSE')
model = BertModel.from_pretrained('sentence-transformers/LaBSE')

# 编码句子
sentences = ["这是一个示例句子。", "这是另一个句子。"]
inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)

# 获取句子嵌入
outputs = model(**inputs)
embeddings = outputs.pooler_output


OSError: Can't load tokenizer for 'sentence-transformers/LaBSE'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'sentence-transformers/LaBSE' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [2]:
from sentence_transformers import SentenceTransformer
print("hi")
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('LaBSE',cache_folder=r"D:\math\A-reasoning_demo\Models\LaBSE")
embeddings = model.encode(sentences)
print(embeddings)

hi


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 

In [3]:
# 首先下载一个预训练模型
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
 
# 然后提供一些句子给模型
sentences = ['This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.']
sentence_embeddings = model.encode(sentences)
 
# 现在有了一个带有嵌入的NumPy数组列表
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\25670\\.cache\\huggingface\\hub\\models--sentence-transformers--all-MiniLM-L6-v2\\snapshots\\c9745ed1d9f207416be6d2e6f8de32d1f16199bf\\1_Pooling\\config.json'

In [5]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

# 手动加载组件
model_path = "D:/math/A-reasoning_demo/Models/LaBSE"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# 组合为 SentenceTransformer
sbert_model = SentenceTransformer(modules=[model, tokenizer])

# 测试
embeddings = sbert_model.encode(["This is an example sentence"])
print(embeddings)

TypeError: transformers.models.bert.tokenization_bert_fast.BertTokenizerFast is not a Module subclass

In [2]:
# 怎么区分 instruction 和 user input
import cpuinfo
info = cpuinfo.get_cpu_info()
print("AVX supported:", info['flags'].count('avx') > 0)
print("AVX2 supported:", info['flags'].count('avx2') > 0)

ModuleNotFoundError: No module named 'cpuinfo'