In [56]:
!pip3 install msprime

In [8]:
import msprime
import numpy as np

In [None]:

# Define parameters
population_size = 100000  # Size used for every population in the model
outgroup_num = 1   # Single individual for the outgroup
num_individuals = 20  # Number of diploid individuals
chromosome_length = 10_000_000
mutation_rate = 1e-8
recombination_rate = 1e-8
divergence_time = 10_000_000  # Time of divergence in generations for the outgroup

# Setup demography, including an outgroup.
#
# The ingroup and the outgroup are BOTH derived from a separate "ancestral"
# population. msprime marks the ancestral population of a split as inactive
# before the split and makes the split time its default sampling time, so
# using "outgroup" itself as the ancestral population would draw the outgroup
# individual `divergence_time` generations in the past rather than at the
# present day.
demography = msprime.Demography()
demography.add_population(name="ingroup", initial_size=population_size)
demography.add_population(name="outgroup", initial_size=population_size)
demography.add_population(name="ancestral", initial_size=population_size)
demography.add_population_split(
    time=divergence_time,
    derived=["ingroup", "outgroup"],
    ancestral="ancestral",
)

# Simulate the tree sequence.
# The ingroup is listed before the outgroup on purpose: downstream code
# assumes the outgroup haplotypes are the last sample nodes.
tree_sequence = msprime.sim_ancestry(
    samples={"ingroup": num_individuals, "outgroup": outgroup_num},
    ploidy=2,
    demography=demography,
    sequence_length=chromosome_length,
    recombination_rate=recombination_rate,
    random_seed=42,
)

# Introduce mutations on the tree sequence
mutated_ts = msprime.sim_mutations(
    tree_sequence,
    rate=mutation_rate,
    random_seed=42,
)



# Set up the output VCF file, only writing the final flipped version
output_vcf = "flipped_simulated_data.vcf"

# Swapping REF with the first ALT exchanges allele indices 0 and 1 only.
# Any further ALT alleles (index >= 2) keep their position in the ALT list,
# so their genotype index must be left unchanged.
flipped_index = {0: 1, 1: 0}

with open(output_vcf, "w") as vcf_file:
    # One column per diploid individual, in sample order: ingroup first,
    # then the outgroup individual(s).
    sample_names = [f"ingroup_{i}" for i in range(num_individuals)]
    sample_names += [f"outgroup_{i}" for i in range(outgroup_num)]

    # Write the VCF header
    vcf_file.write("##fileformat=VCFv4.2\n")
    vcf_file.write(f"##contig=<ID=chr1,length={chromosome_length}>\n")
    vcf_file.write('##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">\n')
    # GT is used in every record, so declare it for downstream VCF tools.
    vcf_file.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n')
    vcf_file.write(
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t"
        + "\t".join(sample_names)
        + "\n"
    )

    # Process and flip each variant; add ancestral allele
    for variant in mutated_ts.variants():
        alleles = variant.alleles
        # Nothing to flip at a site without any alternative allele.
        if len(alleles) < 2:
            continue

        # Plain Python ints: cheaper to index and safe as dict keys.
        genotypes = variant.genotypes.tolist()

        # The allele carried by the last haplotype is taken as ancestral
        # (assumes the outgroup haplotypes are the last samples).
        ancestral_allele = alleles[genotypes[-1]]

        # Flip REF and first ALT
        new_ref = alleles[1]
        new_alts = [alleles[0], *alleles[2:]]

        # Consecutive haplotype pairs form one phased diploid genotype.
        fields = [
            "chr1",
            str(int(variant.site.position) + 1),  # VCF positions are 1-based
            ".",
            new_ref,
            ",".join(new_alts),
            ".",
            ".",
            f"AA={ancestral_allele}",
            "GT",
        ]
        for k in range(0, len(genotypes) - 1, 2):
            first = flipped_index.get(genotypes[k], genotypes[k])
            second = flipped_index.get(genotypes[k + 1], genotypes[k + 1])
            fields.append(f"{first}|{second}")

        # Write out variant line
        vcf_file.write("\t".join(fields) + "\n")

print(f"Flipped VCF written to {output_vcf}")
