# Generate artificial sequence data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import sys
import random
import os
sys.path.append("..")
from global_variables import *
from escape_map import EscapeMap, load_escape_map_from_csv,gen_artif_data, score_seq_batch

  dmean_v_dw = np.dot(s1.T, V)
  mean_V = np.dot(weights, V) / sum_weights


Loaded 29 KD vectors




In [3]:
# Make the helper modules importable: appends pgm_dir + './utilities'
# (i.e. the string '.././utilities') to sys.path. The entry is a relative
# path, so it is resolved against the current working directory.
pgm_dir=('../')
sys.path.append(pgm_dir+'./utilities')

# Project-local imports; presumably resolved through the sys.path entries
# appended above and in the first cell ("..") — confirm where each module lives.
import utilities, Proteins_utils, sequence_logo, plots_utils
import rbm, RBM_utils  # rbm.py must be alongside this file

In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Seed NumPy's global RNG (optional).
# NOTE(review): this only makes the sampling repeatable if gen_artif_data
# draws its randomness through np.random — confirm.
np.random.seed(42)

# Load the model parameters from CSV, then give every entry of the raw
# concentration vector (one per element of KD_VECTORS) the same value, -12.0.
model = load_escape_map_from_csv("../fitness/params_sigmoid_diff.csv")
model.raw_concentrations = np.full(len(KD_VECTORS), -12.0, dtype=np.float64)

# Reference factor for the sweep: total_beta = coeff_beta / exp(raw_beta).
base_beta_factor = 1.0 / np.exp(model.raw_beta)

# Sweep values and sampler settings.
BETA_LIST = [0.5, 0.8, 1, 2, 3]
N_SEQUENCES_PER_BETA = 100
N_CHAINS = 5
WARMING_STEPS = 1000
STEPS_BETWEEN = 100

# Keyword arguments handed to gen_artif_data; identical for every beta.
# (An init_seq=INIT_SEQ argument could be added here; it is left disabled.)
SAMPLER_KWARGS = dict(
    n_sequences=N_SEQUENCES_PER_BETA,
    n_chains=N_CHAINS,
    warming_steps=WARMING_STEPS,
    steps_between_sampling=STEPS_BETWEEN,
)

# Accumulators shared by all betas: FASTA text chunks, score records, and
# a running index used to build unique sequence ids.
fasta_lines = []
score_rows = []
seq_counter = 0

for coeff_beta in BETA_LIST:
    beta_value = float(coeff_beta)

    # The same model object is reused; only its total_beta scalar changes.
    model.total_beta = base_beta_factor * beta_value

    # Sample a batch of sequences at the current beta.
    seqs = gen_artif_data(model, **SAMPLER_KWARGS)

    # Scores are divided by the current total_beta before being stored.
    scores = score_seq_batch(model, seqs) / model.total_beta

    # Convert the batch to strings and record one FASTA entry + one score row each.
    seqs_str = Proteins_utils.num2seq(seqs)
    for seq_str, sc in zip(seqs_str, scores):
        seq_id = f"seq{seq_counter}"
        seq_counter += 1
        fasta_lines.append(f">{seq_id} | beta={coeff_beta:.3f}\n{seq_str}\n")
        score_rows.append({
            "seq_id": seq_id,
            "score": float(sc),
            "coeff_beta": beta_value,
        })

# One FASTA file holding the sequences generated for every beta.
with open("generated_sequences_beta.fasta", "w") as fasta_file:
    fasta_file.write("".join(fasta_lines))
print("All sequences saved to 'generated_sequences_beta.fasta'")

# One CSV holding seq_id, score and coeff_beta for every sequence.
scores_df = pd.DataFrame(score_rows, columns=["seq_id", "score", "coeff_beta"])
scores_df.to_csv("generated_sequences_scores_beta.csv", index=False)
print("Scores saved to 'generated_sequences_scores_beta.csv'")

# Quick sanity checks
print("Total sequences:", len(scores_df))
print(scores_df.head())


Generating sequences...


100%|██████████| 5/5 [00:25<00:00,  5.15s/it]


output shape: (100, 178)
Generating sequences...


100%|██████████| 5/5 [00:25<00:00,  5.20s/it]


output shape: (100, 178)
Generating sequences...


100%|██████████| 5/5 [00:26<00:00,  5.25s/it]


output shape: (100, 178)
Generating sequences...


100%|██████████| 5/5 [00:26<00:00,  5.37s/it]


output shape: (100, 178)
Generating sequences...


100%|██████████| 5/5 [00:28<00:00,  5.66s/it]

output shape: (100, 178)
All sequences saved to 'generated_sequences_beta.fasta'
Scores saved to 'generated_sequences_scores_beta.csv'
Total sequences: 500
  seq_id      score  coeff_beta
0   seq0  57.397344         0.5
1   seq1  57.756809         0.5
2   seq2  57.669490         0.5
3   seq3  57.567324         0.5
4   seq4  57.736968         0.5





In [37]:
import os

INPUT_FASTA  = "generated_sequences_beta.fasta"       # FASTA file to read; same filename the beta-sweep cell writes
# Destination file for the prefixed copy of the records.
OUTPUT_FASTA = "generated_sequences_beta_prefixed.fasta"
# String prepended to every sequence read from INPUT_FASTA.
PREFIX = "TNLCPFGEVFNATRFA"

def read_fasta(path):
    """Parse a FASTA file into a list of ``(header, sequence)`` tuples.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one ``(header, sequence)`` tuple per ``>`` line, in file
        order. ``header`` is the full header line (leading ``>`` included,
        trailing newline removed). ``sequence`` is the concatenation of the
        whitespace-stripped lines that follow it, and may be empty.

    Notes:
        Empty lines are skipped. Sequence lines that appear before the first
        header are discarded. A file without any header yields ``[]``.
    """
    records = []
    current_header = None
    pieces = []
    with open(path, "r") as handle:
        for raw_line in handle:
            text = raw_line.rstrip("\n")
            if text == "":
                continue
            if text[:1] != ">":
                # Sequence data: accumulate until the next header (or EOF).
                pieces.append(text.strip())
                continue
            # New header: close the record that was in progress, if any.
            if current_header is not None:
                records.append((current_header, "".join(pieces)))
            current_header = text
            pieces = []
    # Flush the final record, which has no following header to close it.
    if current_header is not None:
        records.append((current_header, "".join(pieces)))
    return records

def write_fasta(records, path, wrap_width=None):
    """Write ``(header, sequence)`` records to ``path`` in FASTA layout.

    Args:
        records: Iterable of ``(header, sequence)`` pairs. The header is
            written as-is followed by a newline, so it should already start
            with ``>``.
        path: Output file path; the file is opened in ``"w"`` mode.
        wrap_width: When truthy and greater than zero, each sequence is split
            into lines of at most this many characters. Otherwise (``None``,
            ``0`` or negative) each sequence is written on a single line.

    Notes:
        With wrapping enabled an empty sequence produces no sequence line.
        Without wrapping it produces one empty line.
    """
    with open(path, "w") as sink:
        for title, residues in records:
            sink.write(f"{title}\n")
            if not wrap_width or not wrap_width > 0:
                # Unwrapped: the whole sequence goes on one line.
                sink.write(residues + "\n")
                continue
            # Wrapped: emit consecutive slices of wrap_width characters.
            for offset in range(0, len(residues), wrap_width):
                sink.write(residues[offset:offset + wrap_width] + "\n")

# Load the records, prepend PREFIX to every sequence, and write the result.
records = read_fasta(INPUT_FASTA)
prefixed = [(hdr, PREFIX + body) for hdr, body in records]
# wrap_width=0 keeps each sequence on a single line (set to 80 for wrapped lines).
write_fasta(prefixed, OUTPUT_FASTA, wrap_width=0)

print(f"Prefixed {len(prefixed)} sequences.")
print(f"Wrote: {OUTPUT_FASTA}")

# Quick peek at the first record: header plus at most 80 characters of sequence.
if prefixed:
    first_header, first_sequence = prefixed[0]
    preview = first_sequence[:80]
    if len(first_sequence) > 80:
        preview = preview + "..."
    print("Example:")
    print(first_header)
    print(preview)


Prefixed 500 sequences.
Wrote: generated_sequences_beta_prefixed.fasta
Example:
>seq0 | beta=0.500
TNLCPFGEVFNATRFASVYQWNRQRVSNCVADYSVLYNSVFFSTFKCYGVSNTKLNDLCFLSVYADSFVIPGDEVRQIAP...


# Beta = 3 only: larger sample (1000 sequences, 10 chains)

In [54]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Seed NumPy's global RNG (optional).
# NOTE(review): this only makes the sampling repeatable if gen_artif_data
# draws its randomness through np.random — confirm.
np.random.seed(42)

# Load the model parameters from CSV, then give every entry of the raw
# concentration vector (one per element of KD_VECTORS) the same value, -12.0.
model = load_escape_map_from_csv("../fitness/params_sigmoid_diff.csv")
model.raw_concentrations = np.full(len(KD_VECTORS), -12.0, dtype=np.float64)

# Reference factor: total_beta = coeff_beta / exp(raw_beta).
base_beta_factor = 1.0 / np.exp(model.raw_beta)

# A single beta value here, with a larger sample and more chains.
BETA_LIST = [3]
N_SEQUENCES_PER_BETA = 1000
N_CHAINS = 10
WARMING_STEPS = 1000
STEPS_BETWEEN = 100

# Keyword arguments handed to gen_artif_data; identical for every beta.
# (An init_seq=INIT_SEQ argument could be added here; it is left disabled.)
SAMPLER_KWARGS = dict(
    n_sequences=N_SEQUENCES_PER_BETA,
    n_chains=N_CHAINS,
    warming_steps=WARMING_STEPS,
    steps_between_sampling=STEPS_BETWEEN,
)

# Accumulators: FASTA text chunks, score records, and a running index used
# to build unique sequence ids.
fasta_lines = []
score_rows = []
seq_counter = 0

for coeff_beta in BETA_LIST:
    beta_value = float(coeff_beta)

    # The same model object is reused; only its total_beta scalar changes.
    model.total_beta = base_beta_factor * beta_value

    # Sample a batch of sequences at the current beta.
    seqs = gen_artif_data(model, **SAMPLER_KWARGS)

    # Scores are divided by the current total_beta before being stored.
    scores = score_seq_batch(model, seqs) / model.total_beta

    # Convert the batch to strings and record one FASTA entry + one score row each.
    seqs_str = Proteins_utils.num2seq(seqs)
    for seq_str, sc in zip(seqs_str, scores):
        seq_id = f"seq{seq_counter}"
        seq_counter += 1
        fasta_lines.append(f">{seq_id} | beta={coeff_beta:.3f}\n{seq_str}\n")
        score_rows.append({
            "seq_id": seq_id,
            "score": float(sc),
            "coeff_beta": beta_value,
        })

# One FASTA file holding every generated sequence.
with open("generated_sequences_beta3.fasta", "w") as fasta_file:
    fasta_file.write("".join(fasta_lines))
print("All sequences saved to 'generated_sequences_beta3.fasta'")

# One CSV holding seq_id, score and coeff_beta for every sequence.
scores_df = pd.DataFrame(score_rows, columns=["seq_id", "score", "coeff_beta"])
scores_df.to_csv("generated_sequences_scores_beta3.csv", index=False)
print("Scores saved to 'generated_sequences_scores_beta3.csv'")

# Quick sanity checks
print("Total sequences:", len(scores_df))
print(scores_df.head())


Generating sequences...


100%|██████████| 10/10 [06:43<00:00, 40.37s/it]


output shape: (1000, 178)
All sequences saved to 'generated_sequences_beta3.fasta'
Scores saved to 'generated_sequences_scores_beta3.csv'
Total sequences: 1000
  seq_id      score  coeff_beta
0   seq0  61.636061         3.0
1   seq1  61.646540         3.0
2   seq2  61.879691         3.0
3   seq3  61.879691         3.0
4   seq4  61.858077         3.0
