In [1]:
import random
import os

In [3]:
def train_markov_chain(data, k=1):
    """Build an order-k transition table from a collection of sequences.

    Args:
        data: Iterable of sequences (strings in this notebook). Each one is
            scanned with a sliding window of width ``k``.
        k: Length of the context (k-mer) that precedes each recorded symbol.

    Returns:
        dict mapping every k-mer seen in ``data`` to the list of symbols that
        followed it, one entry per occurrence. Duplicates are kept, so drawing
        uniformly from a list reproduces the observed successor frequencies.
    """
    transitions = {}
    for seq in data:
        # Sequences shorter than or equal to k yield an empty range and are skipped.
        for start in range(len(seq) - k):
            end = start + k
            transitions.setdefault(seq[start:end], []).append(seq[end])
    return transitions

def generate_sequence(markov_model, length=100):
    """Sample a sequence of ``length`` symbols from a trained Markov model.

    A k-mer picked at random from the model seeds the sequence. Each further
    symbol is drawn uniformly from the successor list stored for the current
    k-mer, and the context window then slides forward by one symbol.

    Args:
        markov_model: dict mapping k-mers to lists of observed next symbols
            (the structure returned by ``train_markov_chain``). Must be
            non-empty.
        length: Number of symbols in the returned sequence.

    Returns:
        A string of exactly ``length`` symbols (empty when ``length`` <= 0).

    Raises:
        IndexError: if ``markov_model`` is empty, since there is no k-mer to
            seed the sequence with.
    """
    # Build the key list once; the model is not modified while sampling.
    kmers = list(markov_model)
    current_kmer = random.choice(kmers)
    k = len(current_kmer)

    if length <= k:
        # The seed alone already covers the request. Trim it so the result is
        # never longer than what the caller asked for.
        return current_kmer[:max(length, 0)]

    pieces = [current_kmer]
    for _ in range(length - k):
        if current_kmer not in markov_model:
            # Dead end: this context was never followed by anything in the
            # training data (e.g. it only occurred at the end of a sequence),
            # so continue from a random known k-mer instead.
            current_kmer = random.choice(kmers)
        next_base = random.choice(markov_model[current_kmer])
        pieces.append(next_base)
        current_kmer = current_kmer[1:] + next_base
    return "".join(pieces)

In [4]:
# Load the training set as a list of sequence strings (split on newlines).
# Alternative dataset: "../data/prom400"
data_loc = "../data/human"
train_set_loc = f"{data_loc}/train.txt"
with open(train_set_loc) as f:
    lines = f.read().split('\n')

# Fit one Markov chain per order k = 1..max_k; the dict is keyed by k.
max_k = 6
markov_models = {}
for k in range(1, max_k + 1):
    markov_models.update({k: train_markov_chain(lines, k)})

In [23]:
# Generate sequence samples: one output file per Markov order k,
# one generated sequence per line.
sample_folder = data_loc + '/markovBaseline'
batch_num = 2
batch_size = 64
sample_length = 300  # number of symbols in each generated sequence

# open(..., "w") does not create missing directories, so make sure the
# output folder exists before writing into it.
os.makedirs(sample_folder, exist_ok=True)

for k in markov_models:
    with open(os.path.join(sample_folder, f"{k}.txt"), "w") as f:
        # batch_num * batch_size sequences are written per model.
        for _ in range(batch_num * batch_size):
            generated_sequence = generate_sequence(markov_models[k], length=sample_length)
            f.write(generated_sequence + "\n")