# In [None]:
# ============================================================
# RNA sequence feature extraction using DNABERT-6
# ============================================================

import os, re
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

# --------------------
# Config (edit these paths to your data)
# --------------------
# Input tables: either [id, sequence] or just [sequence].
CIRC_SEQ = "data/circRNA_sequence.csv"
MIR_SEQ = "data/miRNA_sequence.csv"

# Output tables holding one embedding row per sequence.
OUT_CIRC = "data/circRNA_Extractedfeatures.csv"
OUT_MIR = "data/miRNA_Extractedfeatures.csv"

# Checkpoint identifier passed to the tokenizer/model loaders.
MODEL_ID = "zhihan1996/DNA_bert_6"
# k-mer size used when splitting sequences into overlapping windows.
K = 6

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# --------------------
# DNABERT model
# --------------------
# Load the tokenizer and encoder for MODEL_ID, then move the encoder to the
# selected device and switch it to evaluation mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID)
model = model.to(device)
model = model.eval()

# --------------------
# Helpers
# --------------------
# Matches any single character that is not one of A, C, G or T.
DNA_RE = re.compile(r"[^ACGT]")

def clean_dna(seq: str) -> str:
    """Normalise a raw sequence to the alphabet {A, C, G, T, N}.

    The input is coerced to ``str`` and upper-cased, every ``U`` is rewritten
    as ``T``, and each remaining character outside ACGT is replaced by ``N``
    (characters are substituted, not removed).
    """
    upper = str(seq).upper()
    as_dna = upper.replace("U", "T")
    return DNA_RE.sub("N", as_dna)

def kmers(seq: str, k=6):
    """Return the overlapping length-``k`` windows of the cleaned sequence.

    The sequence is normalised with ``clean_dna`` first. Windows that contain
    ``N`` are left out, and a sequence shorter than ``k`` yields an empty list.
    """
    cleaned = clean_dna(seq)
    # range() is empty when the sequence is shorter than k, so short inputs
    # fall through to an empty result without a separate length check.
    windows = (cleaned[start:start + k] for start in range(len(cleaned) - k + 1))
    return [window for window in windows if "N" not in window]

@torch.no_grad()
def dnabert_embed_texts(texts, batch_size=16):
    """Embed k-mer sentences as mean-pooled DNABERT hidden states.

    Args:
        texts: Sequence of strings, each a space-separated k-mer "sentence".
        batch_size: Number of sentences tokenised and run through the model
            per forward pass.

    Returns:
        A NumPy array with one row per input text and one column per hidden
        unit. An empty ``texts`` yields a ``(0, hidden_size)`` array instead
        of raising.

    Note:
        Inputs are truncated to 512 tokens, so only the leading part of a long
        sequence contributes to its embedding. The mean is taken over every
        position the attention mask marks as real, which includes any special
        tokens the tokenizer adds.
    """
    if len(texts) == 0:
        # np.vstack() rejects an empty list, so return a well-formed empty
        # matrix whose width still matches the model's hidden size.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    feats = []
    for start in range(0, len(texts), batch_size):
        toks = tokenizer(
            texts[start:start + batch_size],
            padding=True, truncation=True, max_length=512,
            return_tensors="pt",
        )
        toks = {name: tensor.to(device) for name, tensor in toks.items()}
        hidden = model(**toks).last_hidden_state        # [B, T, H]
        mask = toks["attention_mask"].unsqueeze(-1)     # [B, T, 1]
        # Zero out padded positions, then divide by the number of real tokens;
        # clamp() guards against division by zero for an all-padding row.
        pooled = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1)
        feats.append(pooled.cpu().numpy())
    return np.vstack(feats)

def embed_sequences(path: str, outfile: str, label: str, batch_size: int = 16):
    """Read sequences from a CSV, embed them with DNABERT, and save the features.

    Args:
        path: CSV read without a header row. One column means sequences only;
            otherwise the first column is the ID and the second the sequence
            (any further columns are ignored).
        outfile: Destination CSV; rows are indexed by sequence ID under the
            column name ``ID``.
        label: Name used in log messages and to synthesise IDs
            (``<label>_1``, ``<label>_2``, ...) when the input has none.
        batch_size: Number of sequences per DNABERT forward pass.

    Sequences for which ``kmers`` returns nothing are skipped and reported.
    If no sequence survives, a table with no rows is written.
    """
    print(f"[INFO] Reading {label} sequences from {path}")
    df = pd.read_csv(path, header=None)  # assumes 1 column (sequence) or 2 (id, sequence)

    if df.shape[1] == 1:   # no IDs
        seqs = df.iloc[:, 0].astype(str).tolist()
        ids = [f"{label}_{n}" for n in range(1, len(seqs) + 1)]
    else:                  # first col = ID, second col = sequence
        ids = df.iloc[:, 0].astype(str).tolist()
        seqs = df.iloc[:, 1].astype(str).tolist()

    print(f"[INFO] Processing {len(seqs)} {label} sequences...")
    texts, keep_ids, skipped = [], [], []
    for idx, s in zip(ids, seqs):
        km = kmers(s, K)
        if km:
            texts.append(" ".join(km))
            keep_ids.append(idx)
        else:
            skipped.append(idx)

    if skipped:
        # Surface dropped rows so a shorter output table is never a surprise.
        preview = ", ".join(skipped[:5])
        more = ", ..." if len(skipped) > 5 else ""
        print(
            f"[WARN] Skipped {len(skipped)} {label} sequences "
            f"with no valid {K}-mer: {preview}{more}"
        )

    print(f"[INFO] Embedding {len(texts)} {label} sequences with DNABERT...")
    if texts:
        feats = dnabert_embed_texts(texts, batch_size=batch_size)
    else:
        # Nothing to embed: skip the model call and write a header-only table.
        feats = np.empty((0, model.config.hidden_size), dtype=np.float32)

    out_df = pd.DataFrame(feats, index=keep_ids)
    out_df.index.name = "ID"
    out_df.to_csv(outfile)
    print(f"[INFO] Saved {outfile}  shape={out_df.shape}")

# --------------------
# Run for circRNA & miRNA
# --------------------
if __name__ == "__main__":
    # Both RNA types go through the same pipeline: (input CSV, output CSV, label).
    jobs = (
        (CIRC_SEQ, OUT_CIRC, "circRNA"),
        (MIR_SEQ, OUT_MIR, "miRNA"),
    )
    for src_path, dst_path, rna_label in jobs:
        embed_sequences(src_path, dst_path, rna_label)
    print("[DONE] Feature extraction complete")
