In [None]:
# huggingface 处理大文件
#1 数据加载
## 1.1 加载本地数据
## 1.2 分块加载本地大数据
#2 token and embedding数据


In [34]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

In [8]:
data_files = {"train": "/home/ec2-user/project/AI/Study/Week1/data/train.csv", "validation": "/home/ec2-user/project/AI/Study/Week1/data/val.csv"}
down_dataset = load_dataset("csv",data_files=data_files)

In [9]:
next(iter(down_dataset))
down_dataset

DatasetDict({
    train: Dataset({
        features: ['prot_id', 'seq', 'seq_len', 'pdb_filename', 'ptm', 'mean_plddt', 'emb_filename', 'label', 'source'],
        num_rows: 963
    })
    validation: Dataset({
        features: ['prot_id', 'seq', 'seq_len', 'pdb_filename', 'ptm', 'mean_plddt', 'emb_filename', 'label', 'source'],
        num_rows: 275
    })
})

In [36]:
big_dataset_1 = load_dataset("csv",data_files=data_files,
    split="train",
    streaming=True,
    cache_dir="../test/dataset"
    )

In [27]:
big_dataset
#next(iter(big_dataset))

IterableDataset({
    features: Unknown,
    num_shards: 1
})

In [17]:
!pip install -U sentence-transformers   

Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.2.0-py3-none-any.whl (493 kB)
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-5.2.0


In [None]:
# token and embedding my big dataset

In [38]:
from datasets import load_dataset
import torch
import numpy as np
import os
import json
from transformers import AutoModel, AutoTokenizer
import itertools
from tqdm import tqdm  # 进度条，便于监控

In [46]:
# 导入必备库
from datasets import load_dataset
import torch
import numpy as np
import os
import json
from transformers import AutoModel, AutoTokenizer

# ===================== 1. 基础配置（改这里的路径即可） =====================
DATA_FILE = "/home/ec2-user/project/AI/Study/Week1/data/train.csv"  # 你的CSV文件
SAVE_DIR = "/home/ec2-user/project/AI/Study/Week1/test/dataset/embedding"  # 保存embedding的文件夹
CHUNK_SIZE = 64  # 每次处理64条数据（按需改）
MODEL_NAME = "facebook/esm2_t6_8M_UR50D"  # ESM-2小模型
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # 自动选GPU/CPU

In [47]:
# ===================== 2. 加载模型和分词器（一次性加载） =====================
print("加载ESM-2模型...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).eval().to(DEVICE)  # 推理模式

# 创建保存文件夹
os.makedirs(SAVE_DIR, exist_ok=True)

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


加载ESM-2模型...


In [None]:
# ===================== 3. 流式读取数据+计算embedding =====================
# 加载CSV为流式数据集（不占内存）
dataset = load_dataset("csv", data_files=DATA_FILE, split="train", streaming=True)
dataset_iter = iter(dataset)  # 转为迭代器。举例子：next(dataset_iter) 即可取一条数据
chunk_idx = 0 # 块索引
# 逐块处理数据
while True: 
    # 1 这是把数据以64条为一块取出来 2是进行序列处理
    batch_data =[] # batch_data 存储64块数据
    for _ in range(CHUNK_SIZE):
        try:
            item = next(dataset_iter)
            batch_data.append(item)
        except StopIteration:
            break
    if not batch_data:
        break  # 数据取完，退出循环

    # 2. 提取序列和ID，过滤空值。 用来保存为 embedding
    seqs = [item["seq"] for item in batch_data if item["seq"]] # 列表表达式提取序列
    prot_ids = [item["prot_id"] for item in batch_data if item["seq"]]
    if not seqs:
        chunk_idx +=1
        continue

    # 3 分词+计算embedding
    inputs = tokenizer(seqs, return_tensors='pt', padding=True, truncation=True, max_length=1024).to(DEVICE)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:,0,:].cpu().numpy()

    # 4. 保存结果（JSON格式，ID和embedding一一对应）
    emb_dict = {pid: emb.tolist() for pid, emb in zip(prot_ids, embeddings)}
    json_path = os.path.join(SAVE_DIR, f"emb_chunk_{chunk_idx}.json")
    with open(json_path, "w") as f:
        json.dump(emb_dict, f)
    
    #print(f"第{chunk_idx}块：保存{len(emb_dict)}条embedding")
    chunk_idx += 1


def load_all_embeddings(save_dir):
    """加载所有保存的embedding，返回{prot_id: embedding}的字典"""
    all_emb = {}
    for file in os.listdir(save_dir):
        if file.startswith("emb_chunk_") and file.endswith(".json"):
            with open(os.path.join(save_dir, file), "r") as f:
                all_emb.update(json.load(f))
    return all_emb
all_emb = load_all_embeddings(SAVE_DIR)
print(f"\n全部加载完成！共{len(all_emb)}条embedding")
print(all_emb["REP|ywg_BAD18935.1"]) #一维矩阵，长320


全部加载完成！共963条embedding
[-0.007527138572186232, 0.5384337306022644, 0.1969233602285385, 0.3183571398258209, -0.022737350314855576, 0.04540286958217621, -0.47190532088279724, 0.0961337462067604, -0.24226747453212738, -0.8840938806533813, 0.32341644167900085, 0.24364060163497925, -0.813580334186554, -0.17932505905628204, -0.37175115942955017, -0.048180922865867615, 0.17804019153118134, -0.05104481428861618, -0.3904653787612915, -0.5403650403022766, -0.20769819617271423, -0.267784059047699, -0.34677940607070923, -0.0736531913280487, 0.16310368478298187, 0.3685562312602997, -0.40533530712127686, 0.10851006954908371, 0.18702229857444763, -0.06625205278396606, -0.001945540658198297, 0.0906536728143692, -0.35018134117126465, 0.17474716901779175, 0.13165134191513062, -0.16855557262897491, -0.18857955932617188, -0.32526490092277527, 0.1622246354818344, -0.4267810881137848, 0.21074829995632172, 0.011185672134160995, -0.3855058252811432, -0.12766128778457642, 0.294915109872818, 0.46019843220710754