In [None]:
!pip install faiss-gpu

In [2]:
import torch
from transformers import BertTokenizer, AlbertModel
import numpy as np
from tqdm import tqdm
import json

In [None]:
def get_embeddings(texts, model, tokenizer, device, batch_size=32):
    """Encode texts batch by batch and return one vector per text.

    Args:
        texts: Sequence of strings. Anything after the first tab character in
            an entry is treated as annotation and is not encoded.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing a mapping of tensors for a list of
            strings (called with ``return_tensors="pt"``, padding and
            truncation to 512 tokens).
        device: Device the tokenized tensors are moved to before the forward
            pass.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        NumPy array built from the hidden state at sequence position 0 of
        every text (the [CLS] slot for BERT-style tokenizers), in input order.
    """
    collected = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts):
        # Keep only the raw text in front of the first tab; drop annotations.
        raw_batch = [entry.split('\t')[0] for entry in texts[start:start + batch_size]]

        encoded = tokenizer(
            raw_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Inference only: no autograd bookkeeping is needed.
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
            first_token_vectors = hidden_states[:, 0, :].cpu().numpy()
            collected.extend(first_token_vectors)

    return np.array(collected)

def spherical_mean(vectors, device="cuda", max_iter=100, tolerance=1e-7):
    """Return the unit-length mean direction of a set of vectors.

    The estimate starts at the normalised vector sum. It is then refined by
    repeatedly averaging the parts of the inputs that are orthogonal to the
    current centre, stepping the centre along that average and renormalising.

    NOTE(review): the centre stops moving exactly when the arithmetic mean of
    the inputs is parallel to it, so the fixed point of this update is the
    normalised vector sum (the extrinsic mean direction), not a geodesic
    (Karcher) mean. Starting from the normalised sum, the loop is therefore
    expected to stop within the first few iterations.

    Args:
        vectors: 2-D NumPy array with one vector per row.
        device: Torch device used for the computation. A CUDA device is
            replaced by the CPU when CUDA is not available.
        max_iter: Upper bound on refinement iterations.
        tolerance: Convergence threshold, applied both to the change of the
            centre between iterations and to the norm of the update step.

    Returns:
        1-D NumPy array holding the unit-length centre.

    Raises:
        ValueError: If ``vectors`` is not a non-empty 2-D array, or if the
            vectors sum to a zero / non-finite vector (no mean direction).
    """
    vectors = np.ascontiguousarray(vectors)
    if vectors.ndim != 2 or vectors.shape[0] == 0:
        raise ValueError("spherical_mean expects a non-empty 2-D array of row vectors")

    # Fall back to the CPU instead of failing when CUDA was requested but is absent.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        device = "cpu"

    vectors_t = torch.from_numpy(vectors)
    if not vectors_t.is_floating_point():
        # Norms and divisions below need a floating dtype.
        vectors_t = vectors_t.float()
    vectors_t = vectors_t.to(device)

    # Initial estimate: the normalised vector sum.
    resultant = torch.sum(vectors_t, dim=0)
    resultant_norm = torch.norm(resultant)
    if not bool(torch.isfinite(resultant_norm)) or resultant_norm.item() == 0.0:
        # Dividing by this norm would silently produce a NaN centre.
        raise ValueError(
            "spherical_mean is undefined: the input vectors sum to a zero or non-finite vector"
        )
    center = resultant / resultant_norm

    prev_center = None

    for _ in range(max_iter):
        # Converged when the centre is (numerically) parallel to the previous one.
        if prev_center is not None:
            if torch.abs(1 - torch.dot(center, prev_center)) < tolerance:
                break

        prev_center = center.clone()

        # Remove from every vector its component along the current centre,
        # leaving the part orthogonal to the centre.
        dots = torch.matmul(vectors_t, center)
        along_center = dots.unsqueeze(1) * center.unsqueeze(0)
        tangent_vectors = vectors_t - along_center
        mean_direction = torch.mean(tangent_vectors, dim=0)

        # A vanishing update step also means convergence.
        if torch.norm(mean_direction) < tolerance:
            break

        # Step along the averaged orthogonal part, then renormalise. The step
        # is orthogonal to the centre, so the new norm is at least 1.
        new_center = center + mean_direction
        center = new_center / torch.norm(new_center)

    return center.cpu().numpy()

def select_diverse_samples_faiss(embeddings,texts, target_size, device="cuda"):
    """Greedily pick a spread-out subset of samples using FAISS inner-product search.

    Selection starts at sample 0. Each step computes the mean direction of
    everything selected so far (``spherical_mean``) and adds the unselected
    sample whose inner product with that direction is lowest, found by
    querying the index with the negated centre. The index holds only the
    samples that were unselected when it was built and is rebuilt every
    ``rebuild_interval`` selections; in between, already-selected hits are
    skipped via ``selected_mask``.

    Args:
        embeddings: 2-D array-like with one embedding per row. It is copied
            to float32 and L2-normalised internally; the caller's array is
            not modified.
        texts: Sequence aligned with ``embeddings``; only used to print the
            text of the starting sample.
        target_size: Number of samples to select. Values larger than the
            number of embeddings are clamped to it.
        device: ``"cuda"`` / ``"cuda:N"`` (string or ``torch.device``) to use
            a FAISS GPU index when FAISS reports a GPU; anything else uses a
            CPU index. The value is also forwarded to ``spherical_mean``.

    Returns:
        List of Python ints: the selected row indices in selection order.
        Empty when there are no embeddings or ``target_size <= 0``.

    Raises:
        RuntimeError: Re-raised when the selection step fails several times
            in a row (instead of retrying forever).
    """
    import faiss

    num_vectors = len(embeddings)
    if num_vectors == 0 or target_size <= 0:
        return []
    # Cannot select more samples than exist.
    target_size = min(target_size, num_vectors)

    # Use the GPU only when requested and when FAISS actually sees one.
    device_name = str(device)
    use_gpu = device_name.startswith("cuda") and faiss.get_num_gpus() > 0
    # One resources object is shared by every index rebuilt below.
    gpu_resources = faiss.StandardGpuResources() if use_gpu else None
    gpu_device_id = (torch.device(device_name).index or 0) if use_gpu else 0

    def create_index(vectors):
        """Build a flat inner-product index over already-normalised vectors."""
        d = vectors.shape[1]
        if use_gpu:
            config = faiss.GpuIndexFlatConfig()
            config.device = gpu_device_id
            config.useFloat16 = True
            new_index = faiss.GpuIndexFlatIP(gpu_resources, d, config)
        else:
            new_index = faiss.IndexFlatIP(d)

        # Add in chunks to bound the size of each transfer.
        batch_size = 50000
        for i in range(0, len(vectors), batch_size):
            new_index.add(vectors[i:i + batch_size])

        return new_index

    # Work on a private, C-contiguous float32 copy: normalize_L2 is in-place.
    embeddings = np.array(embeddings, dtype='float32', order='C')
    faiss.normalize_L2(embeddings)

    # Deterministic starting sample.
    initial_idx = 0
    print(f"起始向量 { initial_idx } 原始文本 { texts[initial_idx] }")
    selected_indices = [initial_idx]
    selected_mask = np.zeros(num_vectors, dtype=bool)
    selected_mask[initial_idx] = True

    # Search parameters.
    rebuild_interval = 500  # rebuild the index after this many selections
    initial_k = 100
    k_step = 100
    max_k = min(1000, num_vectors)
    max_attempts = 5
    max_consecutive_errors = 3  # give up instead of retrying forever

    index = None
    index_ids = None  # position in the index -> row in `embeddings`
    consecutive_errors = 0

    with tqdm(total=target_size-1, desc="选择多样化样本") as pbar:
        while len(selected_indices) < target_size:
            try:
                if index is None or len(selected_indices) % rebuild_interval == 0:
                    # Drop the old index by rebinding rather than `del`, so the
                    # name stays defined if we break or fail before rebuilding.
                    index = None
                    if use_gpu:
                        torch.cuda.empty_cache()

                    unselected = np.where(~selected_mask)[0]
                    if len(unselected) == 0:
                        print("所有点都已被选择")
                        break

                    index_ids = unselected
                    index = create_index(embeddings[unselected])

                # Mean direction of everything selected so far.
                center = spherical_mean(embeddings[selected_indices], device)
                center = center.reshape(1, -1).astype('float32')

                current_k = initial_k
                attempts = 0
                found = False
                index_size = int(index.ntotal)

                while not found and attempts < max_attempts:
                    # Never ask for more neighbours than the index holds.
                    k = min(current_k, index_size)
                    # Max inner product with -center == min inner product with center.
                    _, neighbours = index.search(-center, k)

                    for local_idx in neighbours[0]:
                        if local_idx < 0:
                            # FAISS pads missing results with -1.
                            continue
                        orig_idx = int(index_ids[local_idx])
                        if not selected_mask[orig_idx]:
                            selected_indices.append(orig_idx)
                            selected_mask[orig_idx] = True
                            pbar.update(1)
                            found = True
                            break

                    if not found:
                        if k >= index_size:
                            # The whole index was scanned; widening cannot help.
                            break
                        current_k = min(current_k + k_step, max_k)
                        attempts += 1

                # Fall back to a random unselected sample if the search found none.
                if not found:
                    unselected = np.where(~selected_mask)[0]
                    if len(unselected) > 0:
                        random_idx = int(np.random.choice(unselected))
                        selected_indices.append(random_idx)
                        selected_mask[random_idx] = True
                        print(f"随机选择一个未选择的点 {random_idx}")
                        pbar.update(1)
                    else:
                        print("所有点都已被选择")
                        break

                consecutive_errors = 0

            except RuntimeError as e:
                print(f"处理时出错: {e}")
                consecutive_errors += 1
                if use_gpu:
                    torch.cuda.empty_cache()
                if consecutive_errors >= max_consecutive_errors:
                    # The same step keeps failing; retrying would loop forever.
                    raise
                # Force a fresh index on the next attempt.
                index = None
                continue

    # Release the final index.
    index = None
    if use_gpu:
        torch.cuda.empty_cache()

    return selected_indices

In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the ALBERT encoder for the same checkpoint,
# then move the encoder onto the chosen device.
model_name = "voidful/albert_chinese_large"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = AlbertModel.from_pretrained(model_name)
model = model.to(device)

In [None]:
def load_translation_corpus(corpus_path):
    """Read a translation corpus JSON file and return its ``pairs`` entry.

    Args:
        corpus_path: Path to a UTF-8 encoded JSON file whose top-level object
            has a ``"pairs"`` key.

    Returns:
        The value stored under ``"pairs"``.

    Raises:
        KeyError: If the file has no ``"pairs"`` key.
    """
    with open(corpus_path, 'r', encoding='utf-8') as corpus_file:
        corpus = json.load(corpus_file)
    return corpus['pairs']

translation_pairs = load_translation_corpus("./translation_corpus.json")

# Embed element 1 of every pair (the Chinese side, per the original notebook comment).
chinese_texts = [pair[1] for pair in translation_pairs]
embeddings = get_embeddings(chinese_texts, model, tokenizer, device)

# 70% of the pairs go to the training split.
train_size = int(len(translation_pairs) * 0.7)
# Pick the training samples with the FAISS-based diversity selection.
selected_indices = select_diverse_samples_faiss(embeddings,chinese_texts, train_size, device=str(device))

# Split into training and validation sets. Membership is tested against a set
# (O(1) per lookup); testing against the list made this step quadratic.
selected_index_set = {int(i) for i in selected_indices}
train_pairs = [translation_pairs[i] for i in selected_indices]
val_pairs = [pair for i, pair in enumerate(translation_pairs) if i not in selected_index_set]

# Every pair must land in exactly one split.
assert len(train_pairs) + len(val_pairs) == len(translation_pairs), \
    "train/validation split does not partition the corpus"

# Assemble the output payload.
result = {
    "train": train_pairs,
    "validation": val_pairs,
}

# Write the result as UTF-8 JSON, keeping non-ASCII characters unescaped.
with open("translation_dataset.json", "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)