In [3]:
"""
RNA Codon & Amino Acid Feature Extraction Pipeline
--------------------------------------------------

Description:
This script converts numerical embeddings into binary, DNA, and RNA sequences.
Then, it computes codon-level and amino-acid-level features and saves them into CSV files.

Output files:
    - feature1.csv: Codon probability features
    - feature2.csv: Amino acid normalized features
    - feature3.csv: Combined features
"""

In [4]:
# ==========================================================
# Imports
# ==========================================================
import pandas as pd
from itertools import product
from tqdm import tqdm

In [None]:
# ==========================================================
# Configuration
# ==========================================================
# --- File locations --------------------------------------------------------
INPUT_PATH = "../embeddings/embeddings_final.csv"  # embeddings CSV read by main()
CODON_FEATURES_FILE = "feature1.csv"  # codon probability features
AMINO_FEATURES_FILE = "feature2.csv"  # amino acid features
FINAL_FEATURES_FILE = "feature3.csv"  # codon + amino acid features side by side

# --- Encoding parameters ---------------------------------------------------
NUMBER_OF_BITS = 16  # zero-padded width of each value's binary string
CODON_LENGTH = 3  # nucleotides per codon

# Each pair of bits is translated into one DNA base.
BINARY_TO_DNA_MAP = dict(zip(("00", "01", "10", "11"), "AGCT"))
# Table for str.translate(): replaces "T" with "U".
DNA_TO_RNA = {ord("T"): ord("U")}

# Codons grouped by the label they translate to; flattened into the
# codon -> label lookup below.
_AMINO_TO_CODONS = {
    "Phe": ("UUU", "UUC"),
    "Leu": ("UUA", "UUG", "CUU", "CUC", "CUA", "CUG"),
    "Ser": ("UCU", "UCC", "UCA", "UCG", "AGU", "AGC"),
    "Ile": ("AUU", "AUC", "AUA"),
    "Met": ("AUG",),
    "Val": ("GUU", "GUC", "GUA", "GUG"),
    "Pro": ("CCU", "CCC", "CCA", "CCG"),
    "Thr": ("ACU", "ACC", "ACA", "ACG"),
    "Ala": ("GCU", "GCC", "GCA", "GCG"),
    "Tyr": ("UAU", "UAC"),
    "His": ("CAU", "CAC"),
    "Gln": ("CAA", "CAG"),
    "Asn": ("AAU", "AAC"),
    "Lys": ("AAA", "AAG"),
    "Asp": ("GAU", "GAC"),
    "Glu": ("GAA", "GAG"),
    "Cys": ("UGU", "UGC"),
    "Trp": ("UGG",),
    "Arg": ("CGU", "CGC", "CGA", "CGG", "AGA", "AGG"),
    "Gly": ("GGU", "GGC", "GGA", "GGG"),
    "Stop": ("UAA", "UAG", "UGA"),
}

CODON_TO_AMINO = {
    codon: amino for amino, codons in _AMINO_TO_CODONS.items() for codon in codons
}

In [6]:
# Convert a numeric embedding value to a binary string
def convert_to_binary(x: float, bits: "int | None" = None) -> str:
    """Encode a numeric value as a fixed-width, zero-padded binary string.

    The value is truncated toward zero with ``int()`` before it is encoded.

    Args:
        x: Numeric value to encode.
        bits: Number of characters in the result. When omitted, the
            module-level ``NUMBER_OF_BITS`` is used.

    Returns:
        A string of exactly ``bits`` characters, each ``"0"`` or ``"1"``.

    Raises:
        ValueError: If ``bits`` is not positive, or if the truncated value is
            negative or does not fit in ``bits`` bits. Errors raised by
            ``int(x)`` itself (e.g. for NaN) propagate unchanged.
    """
    width = NUMBER_OF_BITS if bits is None else bits
    if width <= 0:
        raise ValueError(f"bits must be a positive integer, got {width!r}")

    value = int(x)
    # format() would emit a leading "-" for negatives and more than `width`
    # digits for large values; either breaks the fixed-width assumption that
    # the 2-bit chunking downstream relies on, so reject them here.
    if not 0 <= value < (1 << width):
        raise ValueError(
            f"value {x!r} cannot be encoded as an unsigned {width}-bit binary string"
        )
    return format(value, f"0{width}b")

In [7]:
# Convert binary string to DNA sequence
def binary_to_dna(binary_str: str) -> str:
    """Translate a binary string into DNA bases, two bits per base.

    The string is read left to right in chunks of two characters and each
    chunk is looked up in ``BINARY_TO_DNA_MAP``.

    Args:
        binary_str: String of ``"0"``/``"1"`` characters.

    Returns:
        The concatenated DNA bases, one per 2-character chunk.

    Raises:
        KeyError: If a chunk is not a key of ``BINARY_TO_DNA_MAP`` (for
            example the 1-character tail of an odd-length input).
    """
    bases = []
    for start in range(0, len(binary_str), 2):
        bit_pair = binary_str[start : start + 2]
        bases.append(BINARY_TO_DNA_MAP[bit_pair])
    return "".join(bases)

In [8]:
# Convert DNA sequence to RNA sequence
def dna_to_rna(dna_seq: str) -> str:
    """Return the RNA form of a DNA sequence.

    Applies the ``DNA_TO_RNA`` translation table to ``dna_seq``; characters
    not covered by the table are left unchanged.

    Args:
        dna_seq: DNA sequence string.

    Returns:
        The translated sequence.
    """
    rna_seq = dna_seq.translate(DNA_TO_RNA)
    return rna_seq

In [9]:
# Concatenate all RNA columns in a row into a single RNA sequence string
def join_rna_row(row: pd.Series) -> str:
    """Concatenate every value of a row into one string.

    Args:
        row: Series whose values are strings, joined in index order.

    Returns:
        All values of ``row`` joined with no separator.
    """
    fragments = row.values
    return "".join(fragments)

In [None]:
# Return all possible RNA codons
def generate_codons() -> list:
    """List every 3-letter string over the alphabet ``"AGCU"``.

    Returns:
        All 64 combinations, ordered with the last position varying fastest
        and each position cycling through ``A``, ``G``, ``C``, ``U``.
    """
    alphabet = "AGCU"
    return [
        first + second + third
        for first in alphabet
        for second in alphabet
        for third in alphabet
    ]

In [None]:
# Compute codon frequency distribution for one RNA sequence
def compute_codon_probabilities(
    sequence: str, codons: list, all_codon_counts: list
) -> dict:
    """Compute the relative frequency of each codon in one RNA sequence.

    The sequence is cut into consecutive, non-overlapping chunks of
    ``CODON_LENGTH`` characters. Chunks that are not in ``codons`` (including
    a shorter trailing chunk) are not counted, but they still contribute to
    the denominator.

    Args:
        sequence: RNA sequence to analyse.
        codons: Codons to report; defines the keys and their order.
        all_codon_counts: Output list. The raw count dictionary for this
            sequence is appended to it on every call, including for an empty
            sequence, so it stays aligned with the processed sequences.

    Returns:
        Mapping of each codon in ``codons`` to its count divided by the number
        of chunks. For an empty sequence every value is ``0.0``.
    """
    codon_counts = dict.fromkeys(codons, 0)
    chunks = [
        sequence[start : start + CODON_LENGTH]
        for start in range(0, len(sequence), CODON_LENGTH)
    ]
    total_codons = len(chunks)

    for chunk in chunks:
        if chunk in codon_counts:
            codon_counts[chunk] += 1

    all_codon_counts.append(codon_counts)

    # An empty sequence has no chunks; report zeros instead of dividing by 0.
    if total_codons == 0:
        return {codon: 0.0 for codon in codons}
    return {codon: codon_counts[codon] / total_codons for codon in codons}

In [None]:
# Aggregate codon frequencies into amino acid-level normalized features
def compute_amino_features(
    codon_count_row: dict, amino_acids: list, total_codons: "float | None" = None
) -> dict:
    """Aggregate per-codon counts into normalized per-amino-acid features.

    Each codon's count is added to the amino acid it maps to in
    ``CODON_TO_AMINO``; the totals are then divided by the number of codons.

    Args:
        codon_count_row: Mapping of codon to its count for one sequence.
        amino_acids: Amino acid labels to report; defines the keys and their
            order.
        total_codons: Normalization denominator. When omitted, the sum of the
            counts in ``codon_count_row`` is used, so the features of a row
            sum to 1 regardless of sequence length. Pass ``8192`` to reproduce
            the previously hard-coded normalization.

    Returns:
        Mapping of each label in ``amino_acids`` to its normalized count. If
        the denominator is zero, every value is ``0.0``.

    Raises:
        KeyError: If a codon is missing from ``CODON_TO_AMINO`` or maps to a
            label that is not in ``amino_acids``.
    """
    amino_counts = {aa: 0.0 for aa in amino_acids}

    for codon, freq in codon_count_row.items():
        amino_counts[CODON_TO_AMINO[codon]] += freq

    if total_codons is None:
        total_codons = sum(codon_count_row.values())

    # No codons at all: every aggregated count is 0, so report zeros rather
    # than dividing by zero.
    if not total_codons:
        return {aa: 0.0 for aa in amino_acids}
    return {aa: amino_counts[aa] / total_codons for aa in amino_acids}

In [None]:
# ==========================================================
# Main Pipeline
# ==========================================================
def main():
    """Run the full feature-extraction pipeline.

    Reads the embeddings CSV at ``INPUT_PATH``, converts every cell to a
    binary string, then to DNA and RNA, joins each row into one RNA sequence,
    and writes three CSV files: codon probabilities
    (``CODON_FEATURES_FILE``), amino acid features (``AMINO_FEATURES_FILE``)
    and both concatenated column-wise (``FINAL_FEATURES_FILE``).
    """
    print("Loading embeddings...")
    embeddings = pd.read_csv(INPUT_PATH)

    print("Converting numeric values to binary...")
    bits_frame = embeddings.map(convert_to_binary)

    print("Translating binary → DNA → RNA...")
    rna_frame = bits_frame.map(binary_to_dna).map(dna_to_rna)

    print("Combining RNA sequences row-wise...")
    rna_sequences = rna_frame.apply(join_rna_row, axis=1)

    print("Generating codon frequency features...")
    codon_alphabet = generate_codons()
    # Filled as a side effect of compute_codon_probabilities: one raw count
    # dictionary per sequence, reused below for the amino acid features.
    per_row_counts = []
    codon_rows = []
    for rna in tqdm(rna_sequences, desc="Processing RNA sequences"):
        codon_rows.append(
            compute_codon_probabilities(rna, codon_alphabet, per_row_counts)
        )

    codon_table = pd.DataFrame(codon_rows)
    codon_table.to_csv(CODON_FEATURES_FILE, index=False)
    print(f"Codon features saved → {CODON_FEATURES_FILE}")

    print("Computing amino acid features...")
    amino_labels = list(set(CODON_TO_AMINO.values()))
    amino_labels.sort()
    amino_rows = []
    for counts in tqdm(per_row_counts, desc="Computing amino acid features"):
        amino_rows.append(compute_amino_features(counts, amino_labels))

    amino_table = pd.DataFrame(amino_rows)
    amino_table.to_csv(AMINO_FEATURES_FILE, index=False)
    print(f"Amino acid features saved → {AMINO_FEATURES_FILE}")

    print("Combining final feature set...")
    combined_table = pd.concat([codon_table, amino_table], axis=1)
    combined_table.to_csv(FINAL_FEATURES_FILE, index=False)
    print(f"Final features saved → {FINAL_FEATURES_FILE}")

    print("Pipeline completed successfully!")

In [None]:
# ==========================================================
# Entry Point
# ==========================================================
# Run the pipeline only when this code is the entry point, so that importing
# these definitions elsewhere does not trigger the file reads and writes
# performed by main().
if __name__ == "__main__":
    main()