In [56]:
!pip3 install msprime

In [8]:
import msprime
import numpy as np

In [72]:

# Simulate an ingroup plus an outgroup, then write a VCF by hand in which REF and
# the first ALT allele are swapped and the outgroup allele is reported as the
# ancestral allele (INFO/AA).

# Define parameters
population_size = 100000
outgroup_num = 1   # Number of outgroup individuals (diploid, because ploidy=2 below)
num_individuals = 10  # Number of diploid ingroup individuals
chromosome_length = 10000
mutation_rate = 1e-8
recombination_rate = 1e-8
divergence_time = 10000000  # Time of divergence in generations for the outgroup

# Setup demography, including an outgroup
# NOTE(review): "outgroup" is the `ancestral` side of the split. msprime then
# presumably defaults its sampling time to the split time, so the outgroup
# genomes would be sampled at `divergence_time` rather than at time 0. Confirm
# this is intended; a separate ancestral population with both groups listed as
# `derived` would give contemporary outgroup samples.
demography = msprime.Demography()
demography.add_population(name="ingroup", initial_size=population_size)
demography.add_population(name="outgroup", initial_size=population_size)
demography.add_population_split(time=divergence_time, derived=["ingroup"], ancestral="outgroup")

# Simulate the tree sequence
tree_sequence = msprime.sim_ancestry(
    samples={"ingroup": num_individuals, "outgroup": outgroup_num},
    ploidy=2,
    demography=demography,
    sequence_length=chromosome_length,
    recombination_rate=recombination_rate,
    random_seed=42
)

# Introduce mutations on the tree sequence
mutated_ts = msprime.sim_mutations(
    tree_sequence,
    rate=mutation_rate,
    random_seed=42
)

# Set up the output VCF file, only writing the final flipped version
output_vcf = "flipped_simulated_data.vcf"

# Swapping REF with the first ALT makes allele indices 0 and 1 trade places.
# Every other index (2, 3, ...) still refers to the same ALT allele and must be
# left untouched.
flipped_index = {0: 1, 1: 0}

# One column per diploid individual: ingroup first, outgroup last.
# Assumes sample nodes are ordered ingroup-then-outgroup, with the two genomes
# of each individual adjacent.
sample_names = [f"ingroup_{i}" for i in range(num_individuals)]
sample_names += [f"outgroup_{i}" for i in range(outgroup_num)]

with open(output_vcf, "w") as vcf_file:
    # Write the VCF header
    vcf_file.write("##fileformat=VCFv4.2\n")
    vcf_file.write(f"##contig=<ID=chr1,length={chromosome_length}>\n")
    vcf_file.write("##INFO=<ID=AA,Number=1,Type=String,Description=\"Ancestral Allele\">\n")
    # GT is used in every record, so it has to be declared in the header.
    vcf_file.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
    column_names = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
    vcf_file.write("\t".join(column_names + sample_names) + "\n")

    # Process and flip each variant; add ancestral allele
    for variant in mutated_ts.variants():
        original_ref = variant.alleles[0]
        original_alts = variant.alleles[1:]

        # Nothing to swap REF with at a site that has a single allele.
        if not original_alts:
            continue

        genotypes = [int(g) for g in variant.genotypes]

        # The last haplotype belongs to the outgroup; its allele is reported as ancestral.
        outgroup_allele_index = genotypes[-1]
        ancestral_allele = variant.alleles[outgroup_allele_index]

        # Flip REF and first ALT
        new_ref = original_alts[0]
        new_alts = [original_ref] + list(original_alts[1:])

        # VCF positions are 1-based, tree-sequence site positions are 0-based.
        fields = [
            "chr1",
            str(int(variant.site.position) + 1),
            ".",
            new_ref,
            ",".join(new_alts),
            ".",
            ".",
            f"AA={ancestral_allele}",
            "GT",
        ]

        # Pair consecutive haplotypes into phased diploid genotypes, remapping
        # allele indices to the flipped REF/ALT order.
        for first, second in zip(genotypes[0::2], genotypes[1::2]):
            gt1 = flipped_index.get(first, first)
            gt2 = flipped_index.get(second, second)
            fields.append(f"{gt1}|{gt2}")

        vcf_file.write("\t".join(fields) + "\n")

print(f"Flipped VCF written to {output_vcf}")


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 2 1 2 2 1 1 2 1 1 2 2 1 2 2 2 2 2 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 0 1 0 0 1 1 0 1 1 0 0 1 0 0 0 0 0 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [41]:
import msprime

# Simulate an ingroup plus an outgroup and write a VCF by hand whose REF allele
# at every site is the allele carried by one chosen reference haplotype.

# Define parameters
population_size = 100000
outgroup_size = 1  # Number of outgroup individuals (diploid, because ploidy=2 below)
num_individuals = 10  # Number of diploid individuals in ingroup
chromosome_length = 10000
mutation_rate = 1e-8
recombination_rate = 1e-8
divergence_time = 10000000  # Time of divergence in generations for the outgroup

# Setup demography, including an outgroup
# NOTE(review): "outgroup" is the `ancestral` side of the split. msprime then
# presumably defaults its sampling time to the split time, so the outgroup
# genomes would be sampled at `divergence_time` rather than at time 0. Confirm
# this is intended.
demography = msprime.Demography()
demography.add_population(name="ingroup", initial_size=population_size)
demography.add_population(name="outgroup", initial_size=population_size)
demography.add_population_split(time=divergence_time, derived=["ingroup"], ancestral="outgroup")

# Simulate the tree sequence
ts = msprime.sim_ancestry(
    samples={"ingroup": num_individuals, "outgroup": outgroup_size},
    ploidy=2,
    demography=demography,
    sequence_length=chromosome_length,
    recombination_rate=recombination_rate,
    random_seed=42
)

# Introduce mutations on the tree sequence
mutated_ts = msprime.sim_mutations(
    ts,
    rate=mutation_rate,
    random_seed=42
)

# Manually adjust the reference allele based on 'tsk_0'
# 'tsk_0' corresponds to the first haplotype of the first ingroup diploid individual
tsk_0 = 0  # index of the haplotype whose allele becomes REF at every site

output_vcf = "simulated_data.vcf"

# One column per diploid individual: ingroup first, outgroup last.
# Assumes sample nodes are ordered ingroup-then-outgroup, with the two genomes
# of each individual adjacent.
sample_names = [f"ind_{i}" for i in range(num_individuals)]
sample_names += [f"outgroup_ind_{i}" for i in range(outgroup_size)]

with open(output_vcf, "w") as vcf_file:
    print("##fileformat=VCFv4.2", file=vcf_file)
    print(f"##contig=<ID=chr1,length={chromosome_length}>", file=vcf_file)
    # GT is used in every record, so it has to be declared in the header.
    print("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">", file=vcf_file)
    column_names = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
    print("\t".join(column_names + sample_names), file=vcf_file)

    for variant in mutated_ts.variants():
        genotypes = variant.genotypes
        ref_allele_index = int(genotypes[tsk_0])
        ref_allele = variant.alleles[ref_allele_index]

        # New allele order: the reference haplotype's allele first, then all
        # remaining alleles in their original relative order.
        alt_indices = [index for index in range(len(variant.alleles)) if index != ref_allele_index]
        alt_alleles = [variant.alleles[index] for index in alt_indices]
        alt_allele_string = ",".join(alt_alleles) if alt_alleles else "."

        # Map each original allele index to its position in the new REF/ALT order.
        new_index = {old: new for new, old in enumerate([ref_allele_index] + alt_indices)}

        # VCF positions are 1-based, tree-sequence site positions are 0-based.
        fields = [
            "chr1",
            str(int(variant.site.position) + 1),
            ".",
            ref_allele,
            alt_allele_string,
            ".",
            ".",
            ".",
            "GT",
        ]

        # Pair consecutive haplotypes into phased diploid genotypes.
        for first, second in zip(genotypes[0::2], genotypes[1::2]):
            fields.append(f"{new_index[int(first)]}|{new_index[int(second)]}")

        print("\t".join(fields), file=vcf_file)

In [37]:
# Display the `variant` object currently bound in the kernel namespace.
# NOTE(review): this cell defines nothing itself; `variant` is presumably the
# loop variable left over from an earlier `for variant in ...variants()` cell,
# so it fails on a fresh kernel unless such a cell has been run first.
variant

Variant,Unnamed: 1
Site Id,1080
Site Position,9997.0
Number of Samples,22
Number of Alleles,2
Samples with Allele 'C',19 (86%)
Samples with Allele 'G',3 (14%)
Has Missing Data,False
Isolated as Missing,True


In [52]:
import msprime

# Simulate a single panmictic population and export it as a VCF using tskit's
# built-in writer.

# Define the simulation parameters
population_size = 1000  # Effective population size
chromosome_length = 10_000_000  # 10 Mb chromosome
mutation_rate = 1e-8  # Mutation rate per base per generation
recombination_rate = 1e-8  # Recombination rate per base per generation
num_individuals = 10  # Number of diploid individuals

# Simulate the ancestry with the current msprime API (the same one the other
# cells use) instead of the legacy `msprime.simulate`, and fix the seed so the
# cell is reproducible.
ancestry_ts = msprime.sim_ancestry(
    samples=num_individuals,  # Diploid individuals; ploidy=2 gives 2 genomes each
    ploidy=2,
    population_size=population_size,
    sequence_length=chromosome_length,
    recombination_rate=recombination_rate,
    random_seed=42
)

# Overlay mutations; `tree_sequence` keeps naming the final, mutated tree sequence.
tree_sequence = msprime.sim_mutations(
    ancestry_ts,
    rate=mutation_rate,
    random_seed=42
)

# Write the output to a VCF file. The tree sequence already records diploid
# individuals, so no ploidy argument is passed to group the sample genomes.
with open("simulated_data.vcf", "w") as vcf_file:
    tree_sequence.write_vcf(vcf_file)

# Note: This will use tskit's built-in VCF writer
print("Simulation complete. VCF file 'simulated_data.vcf' generated.")

Simulation complete. VCF file 'simulated_data.vcf' generated.


In [53]:
import msprime

# Simulate ten diploid ingroup individuals plus one diploid outgroup individual
# and export the mutated tree sequence as a VCF.

# Simulation settings
population_size = 1000  # effective size used for both populations
chromosome_length = 10_000_000  # simulated sequence length in bases (10 Mb)
mutation_rate = 1e-8  # per base, per generation
recombination_rate = 1e-8  # per base, per generation
num_individuals = 10  # diploid ingroup individuals
divergent_divergence_time = 1_000_000  # generations back to the ingroup/outgroup split

# Two equally sized populations joined by a single split event.
# NOTE(review): "outgroup" is the `ancestral` side of the split. msprime then
# presumably defaults its sampling time to the split time, so the outgroup
# individual would be sampled at `divergent_divergence_time` rather than at
# time 0. Confirm this is intended.
demography = msprime.Demography()
for population_name in ("ingroup", "outgroup"):
    demography.add_population(name=population_name, initial_size=population_size)
demography.add_population_split(
    time=divergent_divergence_time,
    derived=["ingroup"],
    ancestral="outgroup",
)

# Ancestry first: the ingroup individuals plus one extra outgroup individual.
sample_counts = {"ingroup": num_individuals, "outgroup": 1}
ts = msprime.sim_ancestry(
    samples=sample_counts,
    ploidy=2,
    demography=demography,
    sequence_length=chromosome_length,
    recombination_rate=recombination_rate,
    random_seed=42,
)

# Then overlay mutations on the simulated ancestry.
mutated_ts = msprime.sim_mutations(ts, rate=mutation_rate, random_seed=42)

# Export every sampled individual, outgroup included, to a VCF file.
with open("simulated_data_with_divergent_outgroup.vcf", "w") as vcf_file:
    mutated_ts.write_vcf(vcf_file)

In [54]:
import pysam

# Rewrite a VCF so that REF and the first ALT allele are swapped at every site,
# updating each sample's genotype to match the new allele order.

# Input VCF to read and flipped VCF to write
input_vcf = "simulated_data_with_divergent_outgroup.vcf"
output_vcf = "flipped_simulated_data.vcf"

# Context managers close (and flush) both files even if a record fails to
# process. The output reuses the input header unchanged.
with pysam.VariantFile(input_vcf, 'r') as vcf_in, \
        pysam.VariantFile(output_vcf, 'w', header=vcf_in.header) as vcf_out:
    for record in vcf_in:
        # Record the original REF and ALT alleles
        original_ref = record.ref
        original_alts = record.alts

        # Sanity check: there should be at least one ALT allele to flip with REF.
        # Records without any ALT are skipped and do not appear in the output.
        if not original_alts:
            continue

        # Choose new REF by selecting the first ALT allele
        new_ref = original_alts[0]

        # The new ALT alleles are REF followed by the remaining original ALT alleles, if any
        new_alts = [original_ref] + list(original_alts[1:])

        # Swap REF and ALT alleles
        record.ref = new_ref
        record.alts = new_alts

        # Flip genotypes
        for sample in record.samples.values():
            # Assigning GT through pysam does not keep the phasing flag, so
            # remember it and restore it after the assignment.
            was_phased = sample.phased

            # Indices 0 and 1 trade places; other indices (further ALT alleles)
            # and missing calls (None) are left untouched.
            genotype = sample['GT']
            new_genotype = tuple(0 if x == 1 else 1 if x == 0 else x for x in genotype)
            sample['GT'] = new_genotype
            sample.phased = was_phased

        # Write the modified record to the output file
        vcf_out.write(record)

print(f"Flipped VCF written to {output_vcf}")

Flipped VCF written to flipped_simulated_data.vcf
