In [40]:
import vcfpy
import pandas as pd

def read_vcf(vcf_file_path):
    reader = vcfpy.Reader.from_path(vcf_file_path)
    records = []

    for record in reader:
        row = {
            'CHROM': record.CHROM,
            'POS': record.POS,
            'ID': record.ID,
            'REF': record.REF,
            'ALT': ','.join([str(alt.value) for alt in record.ALT]),
            'QUAL': record.QUAL,
            'FILTER': ','.join(record.FILTER) if record.FILTER else None,
            # FORMAT and SAMPLE fields could be added here similarly
        }
        for key, value in record.INFO.items():
            if isinstance(value, list):
                value = ','.join(map(str, value))
            row[key] = value
        
        records.append(row)

    return pd.DataFrame(records)

In [84]:
from Bio import SeqIO

def read_fasta_file(fasta_file):
    sequences = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences[record.id] = str(record.seq)
    return sequences

def are_equal(fa1: dict, fa2: dict):
    if fa1.keys() != fa2.keys():
        print('different chromosomes:')
        print('fa1: ' + ' '.join(fa1.keys()))
        print('fa2: ' + ' '.join(fa2.keys()))
        return False
    for ch in fa1.keys():
        if len(fa1[ch]) != len(fa2[ch]):
            print(f'chromosome {ch} has different lenghts:')
            print(len(fa1[ch]))
            print(len(fa2[ch]))
            return False
        if fa1[ch] != fa2[ch]:
            print(f'chromosome {ch} is different')
            return False
    return True

def mutate(fa, vcf_df):
    for index, sv in vcf_df.iterrows():
        display(sv)
        seq = fa[sv.CHROM]
        if sv.SVTYPE == 'DUP':
            dup = seq[sv.POS:sv.END]
            fa[sv.CHROM] = seq[:sv.POS] + dup*int(sv.dup_num) + seq[sv.POS:]
        elif sv.SVTYPE == 'INS':
            print(len(sv.ALT), sv.SVLEN)
            fa[sv.CHROM] = seq[:sv.POS] + sv.ALT + seq[sv.POS:]
        elif sv.SVTYPE == 'DEL':
            if sv.REF != seq[sv.POS:sv.END]:
                print(sv.REF)
                print(seq[sv.POS:sv.END])
            fa[sv.CHROM] = seq[:sv.POS] + seq[sv.END:]
        else:
            print(sv.SVTYPE, 'is not a supported SV type')
            assert(False)
            return Null
    return fa

vcf_df = read_vcf('refs/t2tChrY-indel.vcf')
display(vcf_df)
fa_orig = read_fasta_file('refs/t2tChrY.fa')
fa_SURVIVOR = read_fasta_file('refs/t2tChrY-indel.fasta')
fa_SV = mutate(fa_orig, vcf_df)
are_equal(fa_orig, fa_SURVIVOR)
are_equal(fa_SV, fa_SURVIVOR)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,PRECISE,SVTYPE,SVMETHOD,CHR2,END,SVLEN
0,NC_060948.1,47547671,[DEL0SURVIVOR],TGTAATGGAATGGAGTGCAGTGCAATGGAATGGAATGGAATTGAAT...,T,,PASS,True,DEL,SURVIVOR_sim,NC_060948.1,47548136,465
1,NC_060948.1,54779236,[DEL1SURVIVOR],AATATAACTAACATACATAATATTATAATATAGATTATATTGTGTT...,A,,PASS,True,DEL,SURVIVOR_sim,NC_060948.1,54779731,495


CHROM                                             NC_060948.1
POS                                                  47547671
ID                                             [DEL0SURVIVOR]
REF         TGTAATGGAATGGAGTGCAGTGCAATGGAATGGAATGGAATTGAAT...
ALT                                                         T
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    DEL
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                  47548136
SVLEN                                                     465
Name: 0, dtype: object

CHROM                                             NC_060948.1
POS                                                  54779236
ID                                             [DEL1SURVIVOR]
REF         AATATAACTAACATACATAATATTATAATATAGATTATATTGTGTT...
ALT                                                         A
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    DEL
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                  54779731
SVLEN                                                     495
Name: 1, dtype: object

AATATAACTAACATACATAATATTATAATATAGATTATATTGTGTTATATTAAATATCATATATATCATGTATCATATATTCTATAATATATATTACACATTATATCATATGTATGTTGTCTATTATGTGATATATATAACTCTATATATAATTATATTTTTATACATATAACTAGATAAACATGTAATTCTACATATTTTCATTTATAATATGTAGAATTATATAATCATATATAATTATTTATATTCTATAAAAAATTTTATCATTATATAAATTATAATATATAAAAATTATAATATGTACTACAAATATATATATTATATATCATATATGATATAGTACCTTTGTTATATATCATAATACATATAAATGTGTATTATGTTATATATAATTATATAATTTCATATATAAGATGTATAATATGTATCATATATTATATATGTTATGTAATATATACAGTATATATAAGATGACACAGGATAAATATTATATAC
ATAAGATGACACAGGATAAATATTATATACTATGACATATAAAATATATGAGGTTATATGTTACATATAAGGCATAGCACATAACATGTAATATATATCATATATAATTTTTTTTTAGACAGAATCTTGTCCTGTTGCACAAGGTGGGGTACAATGGCGCCATCTTTGCTCACTGCAACTTCTGCCTCACGGGTCCAAGCGATTGTCCTCCCTCAGCCTCCCAGGTAGCTGGGACTACACCACACTGGGACTACACCAGCTGCCACCATGCCTAGCTAATTTTTTGGATTTTTAGTAGTGACAGCGTTTCACTGTATTGGCCAGGATGGTCTTGATCTCTTCACCTTGTGATCCCCTTGCCTTGGCCTCCAAATTTGCTGGGATTACAGGCCTGAGCCAAGATCCATATTTTTTAAATGAAAAAAAATTTCAAAGGTACTCTGCTTGGTACAATAATCAAATGTATAAACTGAGGAATAAAACATAACCATGAAA
chromoso

False