In [5]:
import vcfpy
import pandas as pd
import itertools
from Bio import SeqIO
from Bio.Seq import Seq
from collections import OrderedDict

import warnings
from vcfpy.exceptions import FieldInfoNotFound

warnings.filterwarnings('ignore', '.*dup_num.*', category=FieldInfoNotFound)

def read_vcf(vcf_file_path):
    reader = vcfpy.Reader.from_path(vcf_file_path)
    records = []

    for record in reader:
        row = {
            'CHROM': record.CHROM,
            'POS': record.POS,
            'ID': record.ID,
            'REF': record.REF,
            'ALT': ','.join([str(alt.value) for alt in record.ALT]),
            'QUAL': record.QUAL,
            'FILTER': ','.join(record.FILTER) if record.FILTER else None,
            # FORMAT and SAMPLE fields could be added here similarly
        }
        for key, value in record.INFO.items():
            if isinstance(value, list):
                value = ','.join(map(str, value))
            row[key] = value
        
        records.append(row)

    return pd.DataFrame(records)

In [21]:
def read_fasta_file(fasta_file):
    sequences = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences[record.id] = str(record.seq)
    return sequences

def are_equal(fa1: dict, fa2: dict):
    if fa1.keys() != fa2.keys():
        print('different chromosomes:')
        print('fa1: ' + ' '.join(fa1.keys()))
        print('fa2: ' + ' '.join(fa2.keys()))
        return False
    for ch in fa1.keys():
        if len(fa1[ch]) != len(fa2[ch]):
            print(f'chromosome {ch} has different lenghts:')
            print(len(fa1[ch]))
            print(len(fa2[ch]))
            return False
        if fa1[ch] != fa2[ch]:
            print(f'chromosome {ch} is different')
            return False
    return True

def mutate(fa, vcf_df):
    fa = OrderedDict(sorted(fa.items()))
    res_fa = {}
    vcf_df = vcf_df.sort_values(by=['CHROM', 'POS'])
    display(vcf_df)

    iter_fa = iter(fa.items())
    fa_chrom, fa_seq = next(iter_fa)
    curr_pos = 0
    for index, sv in itertools.chain(vcf_df.iterrows(), [('!!fake_index!!', pd.Series({'CHROM': '!!fake_chrom!!'}))]):
        try:
            display(sv)
            while sv.empty or fa_chrom != sv.CHROM:
                if fa_chrom not in res_fa:
                    res_fa[fa_chrom] = ''
                print(f'1. add {curr_pos}: of len {len(fa_seq[curr_pos:])}')
                res_fa[fa_chrom] += fa_seq[curr_pos:]
                fa_chrom, fa_seq = next(iter_fa)
                curr_pos = 0
            assert(fa_chrom == sv.CHROM)
            if sv.CHROM not in res_fa:
                res_fa[sv.CHROM] = ''
            print(f'2. add {curr_pos}:{sv.POS} of len {len(fa_seq[curr_pos:sv.POS])}')
            assert(curr_pos <= sv.POS)
            res_fa[sv.CHROM] += fa_seq[curr_pos:sv.POS]
            curr_pos = sv.POS
            if sv.SVTYPE == 'DUP':
                segm = fa_seq[sv.POS:sv.END]
                print(f'3. dup at {sv.POS} {len(segm)}*{int(sv.dup_num)} of len {len(segm)*int(sv.dup_num)}')
                res_fa[sv.CHROM] += segm*(int(sv.dup_num))
                curr_pos = sv.POS
            elif sv.SVTYPE == 'INS':
                print(f'4. ins of len {len(sv.ALT)}')
                res_fa[sv.CHROM] += sv.ALT
                curr_pos = curr_pos  # nothing changes
            elif sv.SVTYPE == 'DEL':
                if sv.REF != fa_seq[sv.POS:sv.END]:
                    print(sv.REF)
                    print(fa_seq[sv.POS:sv.END])
                    assert(False)
                print(f'5. delete {sv.POS}:{sv.END} of len {len(fa_seq[sv.POS:sv.END])}')
                curr_pos = sv.END
            elif sv.SVTYPE == 'INV':
                segm = Seq(fa_seq[sv.POS:sv.END])
                print(f'6. inv {sv.POS}:{sv.END} of len {len(segm)}')
                res_fa[sv.CHROM] += str(segm.reverse_complement())
                curr_pos = sv.END
            else:
                print(sv.SVTYPE, 'is not a supported SV type')
                assert(False)
                return Null
        except StopIteration:
            break
    assert(len(fa) == len(res_fa))
    return res_fa

vcf_df = read_vcf('refs/t2tChrY-SVs.vcf')
fa_orig = read_fasta_file('refs/t2tChrY.fa')
fa_SURVIVOR = read_fasta_file('refs/t2tChrY-SVs.fasta')
fa_SV = mutate(fa_orig, vcf_df)
#are_equal(fa_orig, fa_SURVIVOR)
are_equal(fa_SV, fa_SURVIVOR)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,PRECISE,SVTYPE,SVMETHOD,CHR2,END,SVLEN,dup_num
4,NC_060948.1,3430510,[INS4SURVIVOR],A,GGAGCTTAGGAGGTAAAAGGAACTTGTGAAGGTGAACTATCCCTAC...,,PASS,True,INS,SURVIVOR_sim,NC_060948.1,3430745,235,
2,NC_060948.1,14398344,[DUP2SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,14398656,312,5.0
5,NC_060948.1,15092668,[DEL5SURVIVOR],TTAGACTTCCTAAATATATAAAGCAAATATTAATGGACATAAAGGG...,T,,PASS,True,DEL,SURVIVOR_sim,NC_060948.1,15092864,196,
8,NC_060948.1,16086589,[INV8SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,16087210,621,
1,NC_060948.1,27847166,[DUP1SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,27847906,740,6.0
0,NC_060948.1,35790329,[DUP0SURVIVOR],N,DUP,,PASS,True,DUP,SURVIVOR_sim,NC_060948.1,35790540,211,2.0
7,NC_060948.1,42134928,[INV7SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,42135682,754,
3,NC_060948.1,47095939,[INS3SURVIVOR],G,AGAACCCCCCTTTAATATGAGCGAAATGCCTCTACCCTGGACCACG...,,PASS,True,INS,SURVIVOR_sim,NC_060948.1,47096432,493,
6,NC_060948.1,56548921,[INV6SURVIVOR],N,INV,,PASS,True,INV,SURVIVOR_sim,NC_060948.1,56549555,634,


CHROM                                             NC_060948.1
POS                                                   3430510
ID                                             [INS4SURVIVOR]
REF                                                         A
ALT         GGAGCTTAGGAGGTAAAAGGAACTTGTGAAGGTGAACTATCCCTAC...
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    INS
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                   3430745
SVLEN                                                     235
dup_num                                                   NaN
Name: 4, dtype: object

2. add 0:3430510 of len 3430510
4. ins of len 235


CHROM          NC_060948.1
POS               14398344
ID          [DUP2SURVIVOR]
REF                      N
ALT                    DUP
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 DUP
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               14398656
SVLEN                  312
dup_num                  5
Name: 2, dtype: object

2. add 3430510:14398344 of len 10967834
3. dup at 14398344 312*5 of len 1560


CHROM                                             NC_060948.1
POS                                                  15092668
ID                                             [DEL5SURVIVOR]
REF         TTAGACTTCCTAAATATATAAAGCAAATATTAATGGACATAAAGGG...
ALT                                                         T
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    DEL
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                  15092864
SVLEN                                                     196
dup_num                                                   NaN
Name: 5, dtype: object

2. add 14398344:15092668 of len 694324
5. delete 15092668:15092864 of len 196


CHROM          NC_060948.1
POS               16086589
ID          [INV8SURVIVOR]
REF                      N
ALT                    INV
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 INV
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               16087210
SVLEN                  621
dup_num                NaN
Name: 8, dtype: object

2. add 15092864:16086589 of len 993725
6. inv 16086589:16087210 of len 621


CHROM          NC_060948.1
POS               27847166
ID          [DUP1SURVIVOR]
REF                      N
ALT                    DUP
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 DUP
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               27847906
SVLEN                  740
dup_num                  6
Name: 1, dtype: object

2. add 16087210:27847166 of len 11759956
3. dup at 27847166 740*6 of len 4440


CHROM          NC_060948.1
POS               35790329
ID          [DUP0SURVIVOR]
REF                      N
ALT                    DUP
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 DUP
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               35790540
SVLEN                  211
dup_num                  2
Name: 0, dtype: object

2. add 27847166:35790329 of len 7943163
3. dup at 35790329 211*2 of len 422


CHROM          NC_060948.1
POS               42134928
ID          [INV7SURVIVOR]
REF                      N
ALT                    INV
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 INV
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               42135682
SVLEN                  754
dup_num                NaN
Name: 7, dtype: object

2. add 35790329:42134928 of len 6344599
6. inv 42134928:42135682 of len 754


CHROM                                             NC_060948.1
POS                                                  47095939
ID                                             [INS3SURVIVOR]
REF                                                         G
ALT         AGAACCCCCCTTTAATATGAGCGAAATGCCTCTACCCTGGACCACG...
QUAL                                                     None
FILTER                                                   PASS
PRECISE                                                  True
SVTYPE                                                    INS
SVMETHOD                                         SURVIVOR_sim
CHR2                                              NC_060948.1
END                                                  47096432
SVLEN                                                     493
dup_num                                                   NaN
Name: 3, dtype: object

2. add 42135682:47095939 of len 4960257
4. ins of len 493


CHROM          NC_060948.1
POS               56548921
ID          [INV6SURVIVOR]
REF                      N
ALT                    INV
QUAL                  None
FILTER                PASS
PRECISE               True
SVTYPE                 INV
SVMETHOD      SURVIVOR_sim
CHR2           NC_060948.1
END               56549555
SVLEN                  634
dup_num                NaN
Name: 6, dtype: object

2. add 47095939:56548921 of len 9452982
6. inv 56548921:56549555 of len 634


CHROM    !!fake_chrom!!
dtype: object

1. add 56549555: of len 5910474


True