# Mutagen-PX: Tumor-specific gene mutation simulator

Welcome to **Mutagen-PX**, a lightweight Python toolkit designed to generate patient-specific “mutated” gene sequences based on TCGA (The Cancer Genome Atlas) mutation profiles.  

This notebook allows you to:
- Upload a text file of the reference gene sequence (FASTA format)
- Upload patient mutation profiles (TSV format)
- Generate a multi-FASTA file containing sequences that reflect tumor-specific mutations
- Easily download the results for downstream analysis

Mutagen-PX can run on **any gene** by providing the appropriate reference sequence and mutation table.

Sample data is available in the [Mutagen-PX GitHub repository](https://github.com/mahvin92/Mutagen-PX).


In [None]:
from google.colab import files

# Prompt for the two input files through Colab's upload dialog. The later
# cells open the files by name, so upload 'tp53_seq.txt' (reference sequence)
# and 'tp53_profile.tsv' (mutation profile), or change ref_file / profile_file
# in the next cell to match the names you upload.
# The return value is kept in `uploaded` but is not read by the cells below.
uploaded = files.upload()

In [None]:
import pandas as pd

# -----------------------------
# PARAMETERS (set these in Colab)
# -----------------------------
# gene_name is used twice: it builds the profile column names
# "<gene_name> start" / "<gene_name> end" that the main cell looks up, and it
# prefixes the output file name.
gene_name = "TP53"
# Input files, opened by these names from the current working directory.
ref_file = "tp53_seq.txt"        # uploaded reference FASTA
profile_file = "tp53_profile.tsv"  # uploaded mutation profile TSV
# Multi-FASTA written by the main cell, one record per sample.
output_file = f"{gene_name}-patients.fasta"
# -----------------------------

def read_fasta(filename):
    """Return the sequence text of a FASTA file as a single string.

    Lines that begin with ">" are treated as headers and dropped. Every other
    line is stripped of surrounding whitespace and the pieces are concatenated
    in file order, so a file holding several records yields all of their
    sequences joined together.
    """
    with open(filename) as handle:
        return "".join(
            row.strip() for row in handle if not row.startswith(">")
        )

def write_fasta(sequences, filename):
    """Write a mapping of sample name -> sequence as a multi-FASTA file.

    Each entry produces a ">name" header line followed by the sequence wrapped
    at 80 characters per line. An empty sequence produces the header only.
    The target file is opened in "w" mode, so existing content is replaced.
    """
    line_width = 80
    with open(filename, "w") as out:
        for name, bases in sequences.items():
            out.write(f">{name}\n")
            offset = 0
            while offset < len(bases):
                out.write(bases[offset:offset + line_width] + "\n")
                offset += line_width

def _as_position(value):
    """Convert a coordinate cell (int, float or numeric text) to an int."""
    text = str(value).strip()
    try:
        return int(text)
    except ValueError:
        # Float-formatted coordinates such as 105.0 / "105.0" appear when the
        # table column was read as floating point; accept them as well.
        return int(float(text))


def apply_mutations(ref_seq, mutations):
    """Apply one sample's mutation records to a reference sequence.

    Args:
        ref_seq: Reference sequence as a string.
        mutations: Iterable of dict-like records with the keys 'start', 'end',
            'reference' and 'change'. 'start' and 'end' are 1-based, inclusive
            positions in ``ref_seq``. A record whose 'start' is the text
            "no variant" (any letter case) is ignored.

    Returns:
        The mutated sequence as a string. When every record is "no variant"
        (or ``mutations`` is empty) the result equals ``ref_seq``.

    Raises:
        ValueError: If 'start' or 'end' of a variant record is not numeric
            (for example a missing value).

    Edit semantics, chosen per record:
        * reference == "-" and change != "-": insert ``change``.
        * change == "-" and reference != "-": delete positions start..end.
        * otherwise: replace positions start..end with ``change``.
    """
    # Parse coordinates once, up front, so that the ordering below always uses
    # real integers. Sorting on the raw cell would misorder float-typed
    # coordinates (e.g. 105.0), letting an earlier indel shift later edits.
    edits = []
    for mut in mutations:
        if str(mut['start']).strip().lower() == "no variant":
            continue
        edits.append(
            (
                _as_position(mut['start']),
                _as_position(mut['end']),
                mut['reference'],
                mut['change'],
            )
        )

    # Apply from the highest start position to the lowest: an insertion or
    # deletion then never moves the coordinates of edits still to be applied.
    # The sort is stable, so records sharing a start keep their input order.
    edits.sort(key=lambda edit: edit[0], reverse=True)

    seq = list(ref_seq)
    for start, end, ref, alt in edits:
        index = start - 1  # 1-based position -> 0-based list index

        if ref == "-" and alt != "-":
            # NOTE(review): the insertion lands immediately before the base at
            # `start`; confirm this matches the profile's coordinate convention.
            seq[index:index] = list(alt)
        elif alt == "-" and ref != "-":
            del seq[index:end]
        else:
            seq[index:end] = list(alt)
    return "".join(seq)

# -----------------------------
# MAIN EXECUTION
# -----------------------------
# Load the reference sequence and the per-sample mutation table.
ref_seq = read_fasta(ref_file)
df = pd.read_csv(profile_file, sep="\t")

# Map the gene-specific coordinate columns onto the generic keys that
# apply_mutations reads from each record.
column_aliases = {
    f"{gene_name} start": "start",
    f"{gene_name} end": "end",
    "reference": "reference",
    "change": "change",
}

patient_sequences = {}

# One output sequence per sample; all rows of a sample are applied together.
for sample, group in df.groupby("sample"):
    mutations = group.rename(columns=column_aliases).to_dict("records")

    # A sample whose rows are all "no variant" keeps the reference as-is.
    has_variant = any(
        str(m['start']).lower() != "no variant" for m in mutations
    )
    patient_sequences[sample] = (
        apply_mutations(ref_seq, mutations) if has_variant else ref_seq
    )

write_fasta(patient_sequences, output_file)
print(f"✅ Generated mutated sequences: {output_file}")


In [None]:
# Download the multi-FASTA written by the previous cell. `files` is the
# google.colab helper imported in the first cell.
files.download(output_file)