# Package initialization

In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
import shutil
import random
import pprint
import itertools
import functools
import collections
import subprocess
import tempfile

import Bio.Seq
import Bio.SeqRecord
import Bio.Align
import Bio.AlignIO
import Bio.Align.AlignInfo
import pysam
import pyranges as pr
import numpy as np

import sys
# Add the in-house package directory to the import path before the
# `handygenome` imports in the following cell.
# NOTE(review): presumably this is where `handygenome` lives -- confirm.
sys.path.append('/home/users/pjh/scripts/python_genome_packages')
# Change the process-wide working directory. The temporary directories in
# the bwa commands logged further down appear under this path.
os.chdir('/home/users/pjh/scripts/')

In [44]:
from handygenome import common
from handygenome.common import ChromDict, Vcfspec, Interval
from handygenome.variantplus.breakends import Breakends
from handygenome.variantplus.variantplus import VariantPlus, VariantPlusList
from handygenome.variantplus.vcfplus import VcfPlus
from handygenome import igvhandle

from handygenome.variantplus.breakends import get_bnds_from_vr

import handygenome.readplus.readhandler
import handygenome.align.msa
import handygenome.align.bwa

In [3]:
# Paths of the two reference FASTA files used by this notebook.
FASTA_PATH_HG19 = "/home/users/data/01_reference/human_g1k_v37/human_g1k_v37.fasta"
FASTA_PATH_HG38 = "/home/users/data/01_reference/human_g1k_v38/Homo_sapiens_assembly38.fasta"

# Open FASTA handles and per-reference ChromDict objects built from the same
# paths. The handles are kept open for the lifetime of the kernel.
# NOTE(review): only the HG19 objects are used in the cells visible here.
FASTA_HG19 = pysam.FastaFile(FASTA_PATH_HG19)
FASTA_HG38 = pysam.FastaFile(FASTA_PATH_HG38)
CHROMDICT_HG19 = ChromDict(fasta_path=FASTA_PATH_HG19)
CHROMDICT_HG38 = ChromDict(fasta_path=FASTA_PATH_HG38)

In [4]:
import time

def timer(func):
    """Decorator that prints the elapsed time of every call to ``func``.

    The elapsed time is measured with ``time.perf_counter`` and printed as
    ``elapsed time: <seconds>`` after ``func`` returns. If ``func`` raises,
    the exception propagates and nothing is printed.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result unchanged. ``functools.wraps`` keeps
        the wrapped function's ``__name__``, ``__doc__`` and ``__wrapped__``
        so introspection (``help()``, ``obj?``) still shows the original.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        ts1 = time.perf_counter()
        result = func(*args, **kwargs)
        ts2 = time.perf_counter()
        print(f'elapsed time: {ts2 - ts1}')
        return result
    return wrapper

In [5]:
# Handle used by later cells to drive IGV (igv.cmd / igv.load / igv.goto).
# NOTE(review): 60387 is presumably the port of a running IGV session --
# confirm, and consider naming it as a constant in the configuration cell.
igv = igvhandle.IGVHandle(60387)

# Helper functions

In [42]:
def get_softclip_collection(bam, chrom, start, end):
    """Group the soft-clip specs of fetched reads by clip site and direction.

    Every read yielded by ``bam.fetch(chrom, start, end)`` is passed to
    ``handygenome.readplus.readhandler.get_softclip_specs`` and each returned
    spec is filed under the key ``(spec.start1, spec.is_forward)``.

    Args:
        bam: Object providing ``fetch(chrom, start, end)``.
        chrom: Contig name forwarded to ``bam.fetch``.
        start: Region start forwarded to ``bam.fetch``.
        end: Region end forwarded to ``bam.fetch``.

    Returns:
        dict mapping ``(start1, is_forward)`` to the list of clip specs that
        share that key, in the order they were encountered. Empty when no
        read carries a clip spec.
    """
    grouped = {}
    for aln in bam.fetch(chrom, start, end):
        for spec in handygenome.readplus.readhandler.get_softclip_specs(aln):
            # setdefault creates the bucket on first sight of a clip site.
            grouped.setdefault((spec.start1, spec.is_forward), []).append(spec)

    return grouped

In [43]:
def check_identical(seqlist):
    """Tell whether all sequences agree with each other from their start.

    The sequences are ordered by length and each one is compared with the
    leading part of the next longer one. The check therefore passes when every
    sequence is a prefix of all longer ones; the sequences do not need to
    have equal lengths.

    Args:
        seqlist: Iterable of sliceable sequences (e.g. strings).

    Returns:
        True when every sequence is a prefix of the next longer one (also for
        an empty or single-element input), otherwise False.
    """
    by_length = sorted(seqlist, key=len)
    return all(
        longer[:len(shorter)] == shorter
        for shorter, longer in zip(by_length, by_length[1:])
    )
        

# Scratch work

In [13]:
# Scratch run: align one hard-coded sequence (150 bp according to the bwa log
# below) with run_bwa against the reference selected by the 'hg19' key.
# NOTE(review): handygenome.align.bwa is already imported in the import cell
# near the top of the notebook, so this re-import is redundant.
import handygenome.align.bwa
seq = 'CAAGGGCAGGTTCACTGTCCTGCACCTTCCACCCTCCTGCTGAGTCAGTCCGCAGCTGTCCCTCCTCCATGATGTGTTTCTATCCAAAGATAACATGCTGACTCCTTTTGTGCTTCTCTATGCTGACCCCTTATAGGGCTCACATCCTCA'
readlist = handygenome.align.bwa.run_bwa(seq, 'hg19')

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1 sequences (150 bp)...
[M::mem_process_seqs] Processed 1 reads in 0.002 CPU sec, 0.001 real sec
[main] Version: 0.7.17-r1188
[main] CMD: /home/users/pjh/scripts/python_genome_packages/utils/bwa mem -Y -M -t 1 -o /home/users/pjh/scripts/tmp64n6z67g/output.sam /home/users/data/01_reference/human_g1k_v37/human_g1k_v37.fasta /home/users/pjh/scripts/tmp64n6z67g/input.fasta
[main] Real time: 3.581 sec; CPU: 3.528 sec


In [14]:
print(readlist[0].to_string())

query	0	10	5202381	60	150M	*	0	0	CAAGGGCAGGTTCACTGTCCTGCACCTTCCACCCTCCTGCTGAGTCAGTCCGCAGCTGTCCCTCCTCCATGATGTGTTTCTATCCAAAGATAACATGCTGACTCCTTTTGTGCTTCTCTATGCTGACCCCTTATAGGGCTCACATCCTCA	*	NM:i:0	MD:Z:150	AS:i:150	XS:i:62


# LU-ALK05 hard-to-resolve complex SV

In [52]:
# Input files for sample LU-ALK05.
# NOTE(review): judging by the '-T' / '-N' file names, tbam/nbam are
# presumably the tumor and matched-normal BAMs -- confirm.
tbam_path = '/home/users/hspark/Projects/11_fusion_samples/bam/wgs_smc/LU-ALK05-T.s.md.ir.br.bam'
nbam_path = '/home/users/hspark/Projects/11_fusion_samples/bam/wgs_smc/LU-ALK05-N.s.md.ir.br.bam'
svaba_vcf_path = '/home/users/hspark/Projects/11_fusion_samples/sv/svaba/LU-ALK05.svaba.unfiltered.somatic.sv.vcf'

# Open the files; the handles stay open for the rest of the notebook.
tbam = pysam.AlignmentFile(tbam_path)
nbam = pysam.AlignmentFile(nbam_path)
svaba_vcf = pysam.VariantFile(svaba_vcf_path)

In [37]:
igv.cmd('new')
igv.load([tbam_path, nbam_path])

OK
OK
OK


In [38]:
# Three hand-entered breakend pairs, all on chromosome '3', built against the
# HG19 FASTA handle. Their pos_bnd1 / pos_bnd2 attributes are used by the
# SvABA filter cell.
# NOTE(review): the arguments look like (chrom1, pos1, flag1, chrom2, pos2,
# flag2, fasta); the meaning of the boolean flags is not visible in this
# notebook -- confirm against the Breakends constructor.
bnds1 = Breakends('3', 107_594_794, True, '3', 168_666_451, False, FASTA_HG19)
bnds2 = Breakends('3', 107_594_834, False, '3', 107_598_244, False, FASTA_HG19)
bnds3 = Breakends('3', 107_598_191, True, '3', 174_990_270, False, FASTA_HG19)

In [53]:
# Collect the SvABA records whose two breakends both lie on chromosome '3' and
# where at least one breakend position falls inside a window around one of
# the six hand-entered breakend positions.
distance = 10
relevant_vrs = []
poslist = [
    bndpos
    for manual_bnds in (bnds1, bnds2, bnds3)
    for bndpos in (manual_bnds.pos_bnd1, manual_bnds.pos_bnd2)
]
# One window per position, built once instead of once per VCF record.
# NOTE(review): range() makes the window half-open,
# [bndpos - distance, bndpos + distance), so bndpos + distance itself is
# excluded while bndpos - distance is included -- confirm this is intended.
bndpos_windows = [range(bndpos - distance, bndpos + distance) for bndpos in poslist]

for vr in svaba_vcf.fetch():
    bnds_vr = get_bnds_from_vr(vr, FASTA_HG19, CHROMDICT_HG19)
    if not (bnds_vr.chrom_bnd1 == '3' and bnds_vr.chrom_bnd2 == '3'):
        continue
    if (any(bnds_vr.pos_bnd1 in window for window in bndpos_windows)
            or any(bnds_vr.pos_bnd2 in window for window in bndpos_windows)):
        relevant_vrs.append(vr)

In [40]:
for vr in relevant_vrs:
    print(vr)

3	107594794	1006650674:1	A	]3:168666451]A	0	LOWAS	DISC_MAPQ=255;EVDNC=TSI_L;HOMSEQ=A;MAPQ=60;MATEID=1006650674:2;MATENM=0;NM=0;NUMPARTS=4;REPSEQ=TATCTATC;SCTG=c_3_107579501_107604501_34C;SPAN=61071657;SVTYPE=BND	GT:AD:DP:GQ:PL:SR:DR:LR:LO	0/0:0:15:4.1:0,4.1,27.9:0:0:4.216:0	0/1:16:114:0.4:0.4,0,148.1:16:0:0.04474:12.75

3	107594833	100183109:1	T	T]3:107598243]	15	LOWAS	DISC_MAPQ=255;EVDNC=TSI_L;HOMSEQ=A;MAPQ=60;MATEID=100183109:2;MATENM=0;NM=0;NUMPARTS=4;REPSEQ=CC;SCTG=c_3_107579501_107604501_34C;SPAN=3410;SVTYPE=BND	GT:AD:DP:GQ:PL:SR:DR:LR:LO	0/0:0:38:10.5:0,10.5,115.5:0:0:10.68:0	0/1:20:181:15.1:15.1,0,440.9:20:0:-15.14:39.36

3	107598190	148116658:1	A	]3:174990215]A	0	LOWMAPQDISC	DISC_MAPQ=60;EVDNC=DSCRD;IMPRECISE;MAPQ=60;MATEID=148116658:2;MATENM=-1;NM=-1;NUMPARTS=0;SCTG=3:107598190(-)-3:174990215(+)__3_107579501_107604501D;SPAN=67392025;SVTYPE=BND	GT:AD:DP:GQ:PL:SR:DR:LR:LO	0/0:0:21:5.7:0,5.7,62.7:0:0:5.903:0	0/0:4:72:6.9:0,6.9,194.7:0:4:7.035:6.63

3	107598243	100183109:2	A	A]3:1

In [73]:
# Point IGV at the three breakend pairs, then send three display commands
# (pair view, grouping by pair orientation, sorting by base). Each call
# reports 'OK' in the recorded output.
igv.goto([bnds1, bnds2, bnds3])
igv.cmd('viewaspairs')
igv.cmd('group PAIR_ORIENTATION')
igv.cmd('sort base')

OK
OK
OK
OK


In [74]:
# Query window handed to get_softclip_collection (and from there to
# bam.fetch) in the next cell.
# NOTE(review): start = pos - 1 / end = pos presumably expresses the single
# 1-based position `pos` as a 0-based half-open interval, which is what
# pysam's fetch expects -- confirm.
chrom = '3'
pos = 168_666_450
start = pos - 1
end = pos

In [75]:
softclip_col = get_softclip_collection(tbam, chrom, start, end)

In [76]:
softclip_col.keys()

dict_keys([(168666452, True), (168666503, True), (168666583, True), (168666446, False)])

In [78]:
softclip_col[(168666452, True)]

[Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTC', qual=[34, 34, 33, 33, 34, 32, 29, 28, 29, 28], qname='A01057:8:H3GMYDSXY:2:2302:16613:32925'),
 Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTCACACA', qual=[33, 33, 33, 26, 34, 34, 34, 33, 34, 25, 32, 28, 23, 27, 28], qname='A01057:8:H3GMYDSXY:2:2469:21630:35196'),
 Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTCACACACCAATTGAAACAAAGAGTTT', qual=[31, 31, 31, 31, 32, 31, 31, 32, 33, 32, 23, 31, 32, 31, 8, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 33, 30, 29, 27, 27, 26], qname='A01057:8:H3GMYDSXY:2:1212:6361:24972'),
 Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTCACACACCAATTGAAACAAAGAGTTTATG', qual=[31, 31, 31, 32, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 30, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 32, 30, 31, 31, 29, 29, 29, 29], qname='A01057:8:H3GMYDSXY:2:2466:11713:35462'),
 Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTCACACACCAATTGAAACAAAG

In [79]:
# Pick one clip group (one of the keys listed by softclip_col.keys()) and
# build two parallel lists: the clipped sequences and the read names they
# came from. Both lists are passed to align_with_muscle in the next cell.
key = (168666452, True)
seqlist = [x.seq for x in softclip_col[key]]
seqnamelist = [x.qname for x in softclip_col[key]]

In [80]:
msa = handygenome.align.msa.align_with_muscle(seqlist, seqnamelist, biopython=False)

In [82]:
print(msa)

 0    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTG    A00155:287:H3HKGDSXY:2:1425:21667:13275
 1    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTC--------------------------    A01057:8:H3GMYDSXY:2:2157:28754:23109
 2    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATG------------------------------------------------------    A01057:8:H3GMYDSXY:2:2466:11713:35462
 3    TATCTATCTCACACA-----------------------------------------------------------------------------    A01057:8:H3GMYDSXY:2:2469:21630:35196
 4    TATCTATCTC----------------------------------------------------------------------------------    A01057:8:H3GMYDSXY:2:2302:16613:32925
 5    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTT----------------------------------------------------    A01057:8:H3GMYDSXY:2:1353:30996:1125
 6    TATCTATCTCACACACCAATTGAAACAAAGAGTTAATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACAT------------------    A01057:8:H3GMYDSXY:2:2156:2654:5400
 7    TATCTATCTCACACA

In [83]:
msa.get_consensus()

'TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTX'

In [84]:
# The return value is not captured. The consensus recorded before this call
# ends in 'X' and the one recorded after it ends in 'G', so the call appears
# to modify `msa` in place; re-running the cell may therefore not be
# idempotent.
# NOTE(review): the meaning of the 0.5 argument is not visible here --
# confirm against modify_msa_gapclose.
handygenome.align.msa.modify_msa_gapclose(msa, 0.5)

In [85]:
print(msa)

 0    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTG    A00155:287:H3HKGDSXY:2:1425:21667:13275
 1    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTC--------------------------    A01057:8:H3GMYDSXY:2:2157:28754:23109
 2    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATG------------------------------------------------------    A01057:8:H3GMYDSXY:2:2466:11713:35462
 3    TATCTATCTCACACA-----------------------------------------------------------------------------    A01057:8:H3GMYDSXY:2:2469:21630:35196
 4    TATCTATCTC----------------------------------------------------------------------------------    A01057:8:H3GMYDSXY:2:2302:16613:32925
 5    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTT----------------------------------------------------    A01057:8:H3GMYDSXY:2:1353:30996:1125
 6    TATCTATCTCACACACCAATTGAAACAAAGAGTTAATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACAT------------------    A01057:8:H3GMYDSXY:2:2156:2654:5400
 7    TATCTATCTCACACA

In [86]:
msa.get_consensus()

'TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTG'

In [69]:
consensus_seq = msa.get_consensus()

In [87]:
readlist = handygenome.align.bwa.run_bwa(consensus_seq, 'hg19')

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1 sequences (92 bp)...
[M::mem_process_seqs] Processed 1 reads in 0.001 CPU sec, 0.002 real sec
[main] Version: 0.7.17-r1188
[main] CMD: /home/users/pjh/scripts/python_genome_packages/utils/bwa mem -Y -M -t 1 -o /home/users/pjh/scripts/tmpqh5402pd/output.sam /home/users/data/01_reference/human_g1k_v37/human_g1k_v37.fasta /home/users/pjh/scripts/tmpqh5402pd/input.fasta
[main] Real time: 5.855 sec; CPU: 5.607 sec


In [88]:
for read in readlist:
    print(read.to_string())

query	16	3	107598191	60	53M39S	*	0	0	CAACAACAACTAACATTAATGTCACTGAGTGCTAGGCACACTGTTCATGAGGAACATAAACTCTTTGTTTCAATTGGTGTGTGAGATAGATA	*	NM:i:0	MD:Z:53	AS:i:53	XS:i:0	SA:Z:3,107594795,+,39M53S,60,0;
query	256	3	107594795	60	39M53S	*	0	0	TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTG	*	NM:i:0	MD:Z:39	AS:i:39	XS:i:20	SA:Z:3,107598191,-,53M39S,60,0;


# jhkim sample

In [15]:
# BAM file of the second sample analysed in this notebook; the handle stays
# open for the remaining cells.
bam_path = '/home/users/jhkim/Project/05_WGS/24.DSD_WGS/22DGS004_20220208-971-2201_DGS-2_M-ready.bam'
bam = pysam.AlignmentFile(bam_path)

In [16]:
# Load the BAM into IGV and jump to the locus of interest.
# NOTE(review): the locus is written with contig name '10', whereas the BAM
# is later queried with 'chr10' -- confirm which naming each tool expects.
igv.load([bam_path])
igv.goto(['10:5199609-5210138'])

OK
OK


In [45]:
# Query window handed to get_softclip_collection (and from there to
# bam.fetch) in the next cell. These assignments overwrite the chrom / pos /
# start / end values set for the LU-ALK05 sample earlier in the notebook.
# NOTE(review): this BAM is addressed with a 'chr'-prefixed contig name,
# unlike the '3' used for the LU-ALK05 BAMs and the '10' used in the IGV
# locus above -- confirm the reference build / naming of this BAM.
chrom = 'chr10'
pos = 5_202_144
start = pos - 1
end = pos

In [46]:
softclip_col = get_softclip_collection(bam, chrom, start, end)

In [47]:
softclip_col.keys()

dict_keys([(5202145, True), (5202142, False), (5202238, True)])

In [48]:
softclip_col[(5202142, False)]

[Clipspec(start1=5202142, is_forward=False, seq='ATATACGTATCGTTCCATTACGTA', qual=[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 4, 30, 30, 30], qname='E100031906L1C001R00603681898'),
 Clipspec(start1=5202142, is_forward=False, seq='ATATACGTATCGTTCCATTACGTAATCCTTTAACATATAAGTTCACTATTTTG', qual=[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30], qname='E100031906L1C012R03202397776'),
 Clipspec(start1=5202142, is_forward=False, seq='ATATACGTATCGTTCCATTACGTAATCCTTTAACATATAAGTTCACT', qual=[30, 30, 30, 30, 30, 30, 20, 30, 30, 30, 30, 30, 30, 5, 30, 30, 30, 20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30], qname='E100031906L1C013R01801566726'),
 Clipspec(start1=5202142, is_forward=False, seq='ATATACGTATC', qual=[30, 30, 30, 30, 30,

In [26]:
# Pick one clip group (one of the keys listed by softclip_col.keys()) and
# build two parallel lists: the clipped sequences and the read names they
# came from. Both lists are passed to align_with_muscle in the next cell.
key = (5202142, False)
seqlist = [x.seq for x in softclip_col[key]]
seqnamelist = [x.qname for x in softclip_col[key]]

In [28]:
msa = handygenome.align.msa.align_with_muscle(seqlist, seqnamelist, biopython=False)

In [31]:
handygenome.align.msa.modify_msa_gapclose(msa, 0.5)

In [33]:
consensus_seq = msa.get_consensus()

In [34]:
# Align the soft-clip consensus with run_bwa using the 'hg19' key; the bwa
# command logged below uses the human_g1k_v37 FASTA.
# NOTE(review): the only record printed afterwards is unmapped (flag 4).
# This BAM is queried with 'chr'-prefixed contig names, so confirm that
# 'hg19' is the intended reference for this sample before interpreting the
# unmapped result.
readlist = handygenome.align.bwa.run_bwa(consensus_seq, 'hg19')

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1 sequences (129 bp)...
[M::mem_process_seqs] Processed 1 reads in 0.001 CPU sec, 0.001 real sec
[main] Version: 0.7.17-r1188
[main] CMD: /home/users/pjh/scripts/python_genome_packages/utils/bwa mem -Y -M -t 1 -o /home/users/pjh/scripts/tmpo74uwacr/output.sam /home/users/data/01_reference/human_g1k_v37/human_g1k_v37.fasta /home/users/pjh/scripts/tmpo74uwacr/input.fasta
[main] Real time: 3.489 sec; CPU: 3.462 sec


In [35]:
for read in readlist:
    print(read.to_string())

query	4	*	0	0	*	*	0	0	ATATACGTATCGTTCCATTACGTAATCCTTTAACATATAAGTTCACTATTTTGACAAATTAAGTCAGATTATAAATATATACAATTTTGTCTAAAGGTCGTAAATGTTTCAGTTTGCGGTTAACTTTTC	*	AS:i:0	XS:i:0
