# Package initialization

In [1]:
%load_ext autoreload
%autoreload 2

import os

# Limit each numerical backend to a single thread.
# Source: https://stackoverflow.com/questions/30791550/limit-number-of-threads-in-numpy
# These assignments come before the `import numpy` further down in this cell;
# presumably the backends read the variables when they are first loaded, so
# keep this ordering.
os.environ["OMP_NUM_THREADS"] = "1" # shell equivalent: export OMP_NUM_THREADS=1
os.environ["OPENBLAS_NUM_THREADS"] = "1" # shell equivalent: export OPENBLAS_NUM_THREADS=1
os.environ["MKL_NUM_THREADS"] = "1" # shell equivalent: export MKL_NUM_THREADS=1
os.environ["VECLIB_MAXIMUM_THREADS"] = "1" # shell equivalent: export VECLIB_MAXIMUM_THREADS=1
os.environ["NUMEXPR_NUM_THREADS"] = "1" # shell equivalent: export NUMEXPR_NUM_THREADS=1

import re
import shutil
import random
import pprint
import itertools
import functools
import collections

import pysam
# import pyranges as pr
import numpy as np
import Bio.Align
import Bio.Align.AlignInfo
import Bio.SeqRecord
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# import scipy.stats
# import sklearn.cluster
# import sklearn.mixture

import sys
sys.path.append('/home/users/pjh/scripts/python_genome_package_dev')

In [2]:
from handygenome import common
from handygenome.common import ChromDict, Interval
from handygenome.variant.vcfspec import Vcfspec
from handygenome.igvhandle import IGVHandle

from handygenome.read.readplus import ReadPlusPairList
from handygenome.sv.breakends import Breakends
from handygenome.variant.variantplus import VariantPlus, VariantPlusList
from handygenome.igvhandle import IGVHandle

import handygenome.variant.filter as libfilter
import handygenome.variant.variantplus as libvp

import handygenome.align.alignhandler as alignhandler
import handygenome.align.msa as libmsa
import handygenome.align.toolwrapper

import handygenome.sv.svcall as svcall

In [3]:
# Per-reference-version defaults looked up from handygenome.common.
# Paths to the reference FASTA files:
FASTA_PATH_HG19 = common.DEFAULT_FASTA_PATHS['hg19']
FASTA_PATH_HG38 = common.DEFAULT_FASTA_PATHS['hg38']

# Reference objects passed to handygenome calls later in the notebook
# (FASTA_HG19 goes to Breakends(...), CHROMDICT_HG19 to get_bnds_from_vr(...)).
# Presumably opened FASTA handles and chromosome-length mappings -- TODO confirm.
FASTA_HG19 = common.DEFAULT_FASTAS['hg19']
FASTA_HG38 = common.DEFAULT_FASTAS['hg38']
CHROMDICT_HG19 = common.DEFAULT_CHROMDICTS['hg19']
CHROMDICT_HG38 = common.DEFAULT_CHROMDICTS['hg38']

In [4]:
# Handle used by the igv.load / igv.goto / igv.cmd calls in later cells.
# NOTE(review): 60387 is presumably the port of an already-running IGV
# session -- confirm, and consider a named constant in a config cell.
igv = IGVHandle(60387)

# Helper functions

In [5]:
def get_softclip_collection(bam, chrom, start, end):
    """Group the soft-clip specs of the reads fetched from a region.

    Reads come from ``bam.fetch(chrom, start, end)``. Every spec returned by
    ``get_softclip_specs`` for a read is filed under the key
    ``(clipspec.start1, clipspec.is_forward)``.

    Returns:
        A plain dict mapping each key to the list of specs filed under it,
        in the order they were encountered.
    """
    grouped = {}
    for aligned_read in bam.fetch(chrom, start, end):
        specs = handygenome.readplus.readhandler.get_softclip_specs(aligned_read)
        for spec in specs:
            # One bucket per (clip position, orientation) pair.
            grouped.setdefault((spec.start1, spec.is_forward), []).append(spec)

    return grouped

In [6]:
def check_identical(seqlist):
    """Report whether all sequences agree from their first position onward.

    The sequences are ordered by length (stable sort) and each one must be a
    prefix of the next longer one. Empty and single-element inputs give True.
    """
    by_length = sorted(seqlist, key=len)
    return all(
        longer[:len(shorter)] == shorter
        for shorter, longer in zip(by_length, by_length[1:])
    )
        

# Scratch work

In [7]:
# BAM inspected in the scratch cells below (a normal sample, judging by the file name).
bam_path = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/02_BAM/FF18/LU-FF18.normal.bam'
# Opened with pysam's default arguments; `bam` is rebound to another file in a later section.
bam = pysam.AlignmentFile(bam_path)

In [8]:
# Locus examined by the cells that follow.
target_chrom = '5'
target_pos0 = 21_573_431

# Width-one span derived from the position above (end is start + 1).
# The `0` suffix presumably marks 0-based coordinates -- TODO confirm.
target_start0 = target_pos0
target_end0 = target_start0 + 1

In [9]:
# Read pairs over the width-one target span; target_start0 / target_end0 hold
# the same values as target_pos0 and target_pos0 + 1.
rpplist = ReadPlusPairList.from_bam(
    bam, target_chrom, target_start0, target_end0
)

In [15]:
# Run the breakend caller on the width-one target span with reference version 'hg19'.
# The call returns two objects; both are pretty-printed in the next two cells.
SA_region_info, consensus_result = svcall.call_breakends(bam, target_chrom, target_start0, target_end0, 'hg19')

In [16]:
pprint.pprint(SA_region_info)

{('5', 21573312, False, 'TATACACGA'): {'confident': [], 'unconfident': []},
 ('5', 21573318, True, 'TATAT'): {'confident': [], 'unconfident': []},
 ('5', 21573342, False, 'AGA'): {'confident': [],
                                 'unconfident': [{'chrom': '6',
                                                  'end0': 57575975,
                                                  'is_reversed': True,
                                                  'start0': 57575807}]},
 ('5', 21573431, True, 'TTTTTGAAACTGGCTGTCTAATCTCACCCTTCAAATCCCTACCCCATTTCCAATCTCCAACTGGCCCTCCTC'): {'confident': [{'chrom': '6',
                                                                                                                    'end0': 57575988,
                                                                                                                    'is_reversed': True,
                                                                                                                    'start0': 

In [17]:
pprint.pprint(consensus_result)

{('5', 21573312, False): [{'consensus': 'AGCACATAT',
                           'msa': <<class 'Bio.Align.MultipleSeqAlignment'> instance (1 records of length 9) at 2b69f1f40b20>,
                           'target': 'AGCACATAT',
                           'true_consensus': 'TATACACGA'}],
 ('5', 21573318, True): [{'consensus': 'TATAT',
                          'msa': <<class 'Bio.Align.MultipleSeqAlignment'> instance (1 records of length 5) at 2b69f1f40e50>,
                          'target': 'TATAT',
                          'true_consensus': 'TATAT'}],
 ('5', 21573342, False): [{'consensus': 'AGA',
                           'msa': <<class 'Bio.Align.MultipleSeqAlignment'> instance (2 records of length 3) at 2b69f1f409a0>,
                           'target': 'AGA',
                           'true_consensus': 'AGA'}],
 ('5', 21573431, True): [{'consensus': 'TTTTTGAAACTGGCTGTCTAATCTCACCCTTCAAATCCCTACCCCATTTCCAATCTCCAACTGGCCCTCCTC',
                          'msa': <<class 'Bio.Ali

In [631]:
target = 'ACCTTGCGA'
query = 'CCTGCG'

In [632]:
# NOTE(review): `aligner` is not defined in any cell visible in this notebook --
# presumably a Bio.Align.PairwiseAligner configured in a since-removed cell.
# It has to be re-created (with the same scoring) before this cell can run on
# a fresh kernel.
alns = aligner.align(target, query)

In [641]:
for x in alns:
    print(x)
    print(x.score)
    print()

target            0 ACCTTGCGA 9
                  0 -||-||||- 9
query             0 -CC-TGCG- 6

15.0

target            0 ACCTTGCGA 9
                  0 -|||-|||- 9
query             0 -CCT-GCG- 6

15.0



In [523]:
# Index every (readplus, clipspec) pair produced by rpplist under the readplus uid.
rp_dict = {}
clipspec_dict = {}
for rp, clipspec in rpplist.iter_clipspecs():
    rp_dict[rp.uid], clipspec_dict[rp.uid] = rp, clipspec
    

In [536]:
# For each read that carries SA info, print the read's own CIGAR tuples and
# then every SA entry, followed by a blank separator line.
for ruid, rp in rp_dict.items():
    if rp.SAinfo:  # reads with empty/falsy SAinfo are skipped
        print(rp.read.cigartuples)
        for x in rp.SAinfo:
            print(x)
        print()

[(0, 4), (2, 2), (0, 90), (4, 57)]
{'chrom': '6', 'pos': 57575866, 'is_forward': True, 'MQ': 60, 'NM': 0, 'cigarstring': '54M97S', 'cigartuples': [(0, 54), (4, 97)]}

[(0, 18), (2, 2), (0, 90), (4, 43)]
{'chrom': '6', 'pos': 57575880, 'is_forward': False, 'MQ': 60, 'NM': 0, 'cigarstring': '40M111S', 'cigartuples': [(0, 40), (4, 111)]}

[(0, 13), (2, 2), (0, 90), (4, 48)]
{'chrom': '6', 'pos': 57575875, 'is_forward': False, 'MQ': 60, 'NM': 0, 'cigarstring': '45M106S', 'cigartuples': [(0, 45), (4, 106)]}

[(0, 7), (2, 2), (0, 90), (4, 54)]
{'chrom': '6', 'pos': 57575869, 'is_forward': False, 'MQ': 60, 'NM': 0, 'cigarstring': '51M100S', 'cigartuples': [(0, 51), (4, 100)]}

[(4, 2), (0, 90), (4, 59)]
{'chrom': '6', 'pos': 57575864, 'is_forward': False, 'MQ': 60, 'NM': 0, 'cigarstring': '56M95S', 'cigartuples': [(0, 56), (4, 95)]}

[(4, 3), (0, 90), (4, 58)]
{'chrom': '6', 'pos': 57575865, 'is_forward': False, 'MQ': 60, 'NM': 0, 'cigarstring': '55M96S', 'cigartuples': [(0, 55), (4, 96)]}

[

In [588]:
import pyranges as pr
gr1 = pr.PyRanges(chromosomes=['1'], starts=[10], ends=[20])
gr2 = pr.PyRanges(chromosomes=['1', '2'], starts=[15, 0], ends=[25, 11])

In [589]:
gr1

Unnamed: 0,Chromosome,Start,End
0,1,10,20


In [590]:
gr2

Unnamed: 0,Chromosome,Start,End
0,1,15,25
1,2,0,11


In [591]:
gr1.join(gr2, report_overlap=True)

Unnamed: 0,Chromosome,Start,End,Start_b,End_b,Overlap
0,1,10,20,15,25,5


In [501]:
# `clipspec_info` was never assigned in any visible cell (leftover kernel state).
# Rebuild it from the (readplus, clipspec) pairs that rpplist.iter_clipspecs()
# yields; that pair order matches the [0] / [1] indexing below.
# NOTE(review): presumably this is how it was originally built -- confirm.
clipspec_info = list(rpplist.iter_clipspecs())
clipspec_list = [pair[1] for pair in clipspec_info]
rp_list = [pair[0] for pair in clipspec_info]

In [524]:
# NOTE(review): `result` is only assigned further down, by
# `result = svcall.get_consensus_from_clipspecs(clipspec_list)`; on a fresh
# top-to-bottom run this line raises NameError unless that cell is moved above.
msa = result[('5', 21573431, True)][0]['msa']

In [529]:
ruids = [x.annotations['clipspec'].readuid for x in msa]

In [531]:
relevant_rps = [rp_dict[x] for x in ruids]

In [533]:
SAinfo = list(itertools.chain.from_iterable(rp.SAinfo for rp in relevant_rps))

In [None]:
# One width-one range per SA entry, ending at the entry's 'pos'.
# The original line had no `for` clause, so it depended on a loop variable `x`
# leaked from an earlier cell and produced a single range.
rngs = [range(x['pos'] - 1, x['pos']) for x in SAinfo]

In [None]:
# FIXME(review): unfinished cell -- the statement has no trailing colon and no
# body, so it is a SyntaxError as written. Complete it or delete the cell.
for ruid, rp in rp_list

In [534]:
SAinfo

[{'chrom': '6',
  'pos': 57575851,
  'is_forward': False,
  'MQ': 60,
  'NM': 1,
  'cigarstring': '66M85S',
  'cigartuples': [(0, 66), (4, 85)]},
 {'chrom': '6',
  'pos': 57575864,
  'is_forward': False,
  'MQ': 60,
  'NM': 0,
  'cigarstring': '56M95S',
  'cigartuples': [(0, 56), (4, 95)]},
 {'chrom': '6',
  'pos': 57575865,
  'is_forward': False,
  'MQ': 60,
  'NM': 0,
  'cigarstring': '55M96S',
  'cigartuples': [(0, 55), (4, 96)]},
 {'chrom': '6',
  'pos': 57575866,
  'is_forward': True,
  'MQ': 60,
  'NM': 0,
  'cigarstring': '54M97S',
  'cigartuples': [(0, 54), (4, 97)]},
 {'chrom': '6',
  'pos': 57575869,
  'is_forward': False,
  'MQ': 60,
  'NM': 0,
  'cigarstring': '51M100S',
  'cigartuples': [(0, 51), (4, 100)]},
 {'chrom': '6',
  'pos': 57575875,
  'is_forward': False,
  'MQ': 60,
  'NM': 0,
  'cigarstring': '45M106S',
  'cigartuples': [(0, 45), (4, 106)]},
 {'chrom': '6',
  'pos': 57575880,
  'is_forward': False,
  'MQ': 60,
  'NM': 0,
  'cigarstring': '40M111S',
  'cigartupl

In [511]:
result = svcall.get_consensus_from_clipspecs(clipspec_list)

In [512]:
pprint.pprint(result)

{('5', 21573312, False): [{'consensus': 'AGCACATAT',
                           'msa': <<class 'Bio.Align.MultipleSeqAlignment'> instance (1 records of length 9) at 2b20cbf01660>,
                           'target': 'AGCACATAT',
                           'true_consensus': 'TATACACGA'}],
 ('5', 21573318, True): [{'consensus': 'TATAT',
                          'msa': <<class 'Bio.Align.MultipleSeqAlignment'> instance (1 records of length 5) at 2b20cbf00b20>,
                          'target': 'TATAT',
                          'true_consensus': 'TATAT'}],
 ('5', 21573342, False): [{'consensus': 'AGA',
                           'msa': <<class 'Bio.Align.MultipleSeqAlignment'> instance (2 records of length 3) at 2b20cbf015a0>,
                           'target': 'AGA',
                           'true_consensus': 'AGA'}],
 ('5', 21573431, True): [{'consensus': 'TTTTTGAAACTGGCTGTCTAATCTCACCCTTCAAATCCCTACCCCATTTCCAATCTCCAACTGGCCCTCCTC',
                          'msa': <<class 'Bio.Ali

In [522]:
print(result[('5', 21573431, True)][0]['msa'][0].annotations['clipspec'].readuid)

ReadUID(qname='ST-E00181:657:HNYC7CCXY:1:2119:25875:40020', flag=129, chrom='5', pos0=21573353)


In [499]:
print(result[('5', 21573431, True)][0]['true_consensus'])

TTTTTGAAACTGGCTGTCTAATCTCACCCTTCAAATCCCTACCCCATTTCCAATCTCCAACTGGCCCTCCTC


In [596]:
for x in np.eye(3).astype(bool):
    print(x)

[ True False False]
[False  True False]
[False False  True]


In [600]:
gr

Unnamed: 0,Chromosome,Start,End
0,1,0,100
1,1,10,150


In [601]:
gr[np.eye(2).astype(bool)[1]]

Unnamed: 0,Chromosome,Start,End
1,1,10,150


In [487]:
igv.load([bam_path])

OK


In [488]:
igv.goto([(target_chrom, target_pos0, target_pos0)])

OK


In [13]:
import handygenome.align.bwa
seq = 'CAAGGGCAGGTTCACTGTCCTGCACCTTCCACCCTCCTGCTGAGTCAGTCCGCAGCTGTCCCTCCTCCATGATGTGTTTCTATCCAAAGATAACATGCTGACTCCTTTTGTGCTTCTCTATGCTGACCCCTTATAGGGCTCACATCCTCA'
readlist = handygenome.align.bwa.run_bwa(seq, 'hg19')

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1 sequences (150 bp)...
[M::mem_process_seqs] Processed 1 reads in 0.002 CPU sec, 0.001 real sec
[main] Version: 0.7.17-r1188
[main] CMD: /home/users/pjh/scripts/python_genome_packages/utils/bwa mem -Y -M -t 1 -o /home/users/pjh/scripts/tmp64n6z67g/output.sam /home/users/data/01_reference/human_g1k_v37/human_g1k_v37.fasta /home/users/pjh/scripts/tmp64n6z67g/input.fasta
[main] Real time: 3.581 sec; CPU: 3.528 sec


In [14]:
print(readlist[0].to_string())

query	0	10	5202381	60	150M	*	0	0	CAAGGGCAGGTTCACTGTCCTGCACCTTCCACCCTCCTGCTGAGTCAGTCCGCAGCTGTCCCTCCTCCATGATGTGTTTCTATCCAAAGATAACATGCTGACTCCTTTTGTGCTTCTCTATGCTGACCCCTTATAGGGCTCACATCCTCA	*	NM:i:0	MD:Z:150	AS:i:150	XS:i:62


# Examples

### Common germline SV(?)

In [None]:
vcfspec = Vcfspec('7', 158254698, 'G', ('C',), refver='hg19')

### LU-ALK05 hard-to-resolve complex SV

In [52]:
# Inputs for LU-ALK05. Judging by the file names: tumor (-T) and normal (-N)
# BAMs, plus svaba's unfiltered somatic SV calls.
tbam_path = '/home/users/hspark/Projects/11_fusion_samples/bam/wgs_smc/LU-ALK05-T.s.md.ir.br.bam'
nbam_path = '/home/users/hspark/Projects/11_fusion_samples/bam/wgs_smc/LU-ALK05-N.s.md.ir.br.bam'
svaba_vcf_path = '/home/users/hspark/Projects/11_fusion_samples/sv/svaba/LU-ALK05.svaba.unfiltered.somatic.sv.vcf'

# Open all three with pysam defaults; `tbam` and `svaba_vcf` are read by later cells.
tbam = pysam.AlignmentFile(tbam_path)
nbam = pysam.AlignmentFile(nbam_path)
svaba_vcf = pysam.VariantFile(svaba_vcf_path)

In [37]:
igv.cmd('new')
igv.load([tbam_path, nbam_path])

OK
OK
OK


In [38]:
# Three hand-entered chromosome-3 breakend pairs, all built against FASTA_HG19.
# NOTE(review): the arguments look like (chrom1, pos1, flag1, chrom2, pos2, flag2, fasta);
# confirm against the Breakends signature what the two booleans encode.
bnds1 = Breakends('3', 107_594_794, True, '3', 168_666_451, False, FASTA_HG19)
bnds2 = Breakends('3', 107_594_834, False, '3', 107_598_244, False, FASTA_HG19)
bnds3 = Breakends('3', 107_598_191, True, '3', 174_990_270, False, FASTA_HG19)

In [53]:
# Collect the svaba records whose two breakends both sit on chromosome '3' and
# where at least one breakend position falls in a window around one of the six
# hand-entered breakend positions.
# NOTE(review): `get_bnds_from_vr` is not imported in any visible cell.
distance = 10
relevant_vrs = []
poslist = [
    bnds1.pos_bnd1, bnds1.pos_bnd2,
    bnds2.pos_bnd1, bnds2.pos_bnd2,
    bnds3.pos_bnd1, bnds3.pos_bnd2,
]
for vr in svaba_vcf.fetch():
    bnds_vr = get_bnds_from_vr(vr, FASTA_HG19, CHROMDICT_HG19)
    if bnds_vr.chrom_bnd1 != '3' or bnds_vr.chrom_bnd2 != '3':
        continue
    # Each window is range(bndpos - distance, bndpos + distance): the lower
    # bound is included, the upper bound is not.
    if any(
        bnds_vr.pos_bnd1 in range(bndpos - distance, bndpos + distance)
        or bnds_vr.pos_bnd2 in range(bndpos - distance, bndpos + distance)
        for bndpos in poslist
    ):
        relevant_vrs.append(vr)

In [40]:
for vr in relevant_vrs:
    print(vr)

3	107594794	1006650674:1	A	]3:168666451]A	0	LOWAS	DISC_MAPQ=255;EVDNC=TSI_L;HOMSEQ=A;MAPQ=60;MATEID=1006650674:2;MATENM=0;NM=0;NUMPARTS=4;REPSEQ=TATCTATC;SCTG=c_3_107579501_107604501_34C;SPAN=61071657;SVTYPE=BND	GT:AD:DP:GQ:PL:SR:DR:LR:LO	0/0:0:15:4.1:0,4.1,27.9:0:0:4.216:0	0/1:16:114:0.4:0.4,0,148.1:16:0:0.04474:12.75

3	107594833	100183109:1	T	T]3:107598243]	15	LOWAS	DISC_MAPQ=255;EVDNC=TSI_L;HOMSEQ=A;MAPQ=60;MATEID=100183109:2;MATENM=0;NM=0;NUMPARTS=4;REPSEQ=CC;SCTG=c_3_107579501_107604501_34C;SPAN=3410;SVTYPE=BND	GT:AD:DP:GQ:PL:SR:DR:LR:LO	0/0:0:38:10.5:0,10.5,115.5:0:0:10.68:0	0/1:20:181:15.1:15.1,0,440.9:20:0:-15.14:39.36

3	107598190	148116658:1	A	]3:174990215]A	0	LOWMAPQDISC	DISC_MAPQ=60;EVDNC=DSCRD;IMPRECISE;MAPQ=60;MATEID=148116658:2;MATENM=-1;NM=-1;NUMPARTS=0;SCTG=3:107598190(-)-3:174990215(+)__3_107579501_107604501D;SPAN=67392025;SVTYPE=BND	GT:AD:DP:GQ:PL:SR:DR:LR:LO	0/0:0:21:5.7:0,5.7,62.7:0:0:5.903:0	0/0:4:72:6.9:0,6.9,194.7:0:4:7.035:6.63

3	107598243	100183109:2	A	A]3:1

In [73]:
igv.goto([bnds1, bnds2, bnds3])
igv.cmd('viewaspairs')
igv.cmd('group PAIR_ORIENTATION')
igv.cmd('sort base')

OK
OK
OK
OK


In [74]:
# Locus scanned for soft clips in the next cell; (start, end) is the
# width-one span that ends at `pos`.
chrom = '3'
pos = 168_666_450
start = pos - 1
end = pos

In [75]:
softclip_col = get_softclip_collection(tbam, chrom, start, end)

In [76]:
softclip_col.keys()

dict_keys([(168666452, True), (168666503, True), (168666583, True), (168666446, False)])

In [78]:
softclip_col[(168666452, True)]

[Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTC', qual=[34, 34, 33, 33, 34, 32, 29, 28, 29, 28], qname='A01057:8:H3GMYDSXY:2:2302:16613:32925'),
 Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTCACACA', qual=[33, 33, 33, 26, 34, 34, 34, 33, 34, 25, 32, 28, 23, 27, 28], qname='A01057:8:H3GMYDSXY:2:2469:21630:35196'),
 Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTCACACACCAATTGAAACAAAGAGTTT', qual=[31, 31, 31, 31, 32, 31, 31, 32, 33, 32, 23, 31, 32, 31, 8, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 33, 30, 29, 27, 27, 26], qname='A01057:8:H3GMYDSXY:2:1212:6361:24972'),
 Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTCACACACCAATTGAAACAAAGAGTTTATG', qual=[31, 31, 31, 32, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 30, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 32, 30, 31, 31, 29, 29, 29, 29], qname='A01057:8:H3GMYDSXY:2:2466:11713:35462'),
 Clipspec(start1=168666452, is_forward=True, seq='TATCTATCTCACACACCAATTGAAACAAAG

In [79]:
key = (168666452, True)
seqlist = [x.seq for x in softclip_col[key]]
seqnamelist = [x.qname for x in softclip_col[key]]

In [80]:
msa = handygenome.align.msa.align_with_muscle(seqlist, seqnamelist, biopython=False)

In [82]:
print(msa)

 0    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTG    A00155:287:H3HKGDSXY:2:1425:21667:13275
 1    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTC--------------------------    A01057:8:H3GMYDSXY:2:2157:28754:23109
 2    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATG------------------------------------------------------    A01057:8:H3GMYDSXY:2:2466:11713:35462
 3    TATCTATCTCACACA-----------------------------------------------------------------------------    A01057:8:H3GMYDSXY:2:2469:21630:35196
 4    TATCTATCTC----------------------------------------------------------------------------------    A01057:8:H3GMYDSXY:2:2302:16613:32925
 5    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTT----------------------------------------------------    A01057:8:H3GMYDSXY:2:1353:30996:1125
 6    TATCTATCTCACACACCAATTGAAACAAAGAGTTAATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACAT------------------    A01057:8:H3GMYDSXY:2:2156:2654:5400
 7    TATCTATCTCACACA

In [83]:
msa.get_consensus()

'TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTX'

In [84]:
handygenome.align.msa.modify_msa_gapclose(msa, 0.5)

In [85]:
print(msa)

 0    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTG    A00155:287:H3HKGDSXY:2:1425:21667:13275
 1    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTC--------------------------    A01057:8:H3GMYDSXY:2:2157:28754:23109
 2    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATG------------------------------------------------------    A01057:8:H3GMYDSXY:2:2466:11713:35462
 3    TATCTATCTCACACA-----------------------------------------------------------------------------    A01057:8:H3GMYDSXY:2:2469:21630:35196
 4    TATCTATCTC----------------------------------------------------------------------------------    A01057:8:H3GMYDSXY:2:2302:16613:32925
 5    TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTT----------------------------------------------------    A01057:8:H3GMYDSXY:2:1353:30996:1125
 6    TATCTATCTCACACACCAATTGAAACAAAGAGTTAATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACAT------------------    A01057:8:H3GMYDSXY:2:2156:2654:5400
 7    TATCTATCTCACACA

In [86]:
msa.get_consensus()

'TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTG'

In [69]:
consensus_seq = msa.get_consensus()

In [87]:
readlist = handygenome.align.bwa.run_bwa(consensus_seq, 'hg19')

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1 sequences (92 bp)...
[M::mem_process_seqs] Processed 1 reads in 0.001 CPU sec, 0.002 real sec
[main] Version: 0.7.17-r1188
[main] CMD: /home/users/pjh/scripts/python_genome_packages/utils/bwa mem -Y -M -t 1 -o /home/users/pjh/scripts/tmpqh5402pd/output.sam /home/users/data/01_reference/human_g1k_v37/human_g1k_v37.fasta /home/users/pjh/scripts/tmpqh5402pd/input.fasta
[main] Real time: 5.855 sec; CPU: 5.607 sec


In [88]:
for read in readlist:
    print(read.to_string())

query	16	3	107598191	60	53M39S	*	0	0	CAACAACAACTAACATTAATGTCACTGAGTGCTAGGCACACTGTTCATGAGGAACATAAACTCTTTGTTTCAATTGGTGTGTGAGATAGATA	*	NM:i:0	MD:Z:53	AS:i:53	XS:i:0	SA:Z:3,107594795,+,39M53S,60,0;
query	256	3	107594795	60	39M53S	*	0	0	TATCTATCTCACACACCAATTGAAACAAAGAGTTTATGTTCCTCATGAACAGTGTGCCTAGCACTCAGTGACATTAATGTTAGTTGTTGTTG	*	NM:i:0	MD:Z:39	AS:i:39	XS:i:20	SA:Z:3,107598191,-,53M39S,60,0;


### jhkim sample

In [15]:
bam_path = '/home/users/jhkim/Project/05_WGS/24.DSD_WGS/22DGS004_20220208-971-2201_DGS-2_M-ready.bam'
bam = pysam.AlignmentFile(bam_path)

In [16]:
igv.load([bam_path])
igv.goto(['10:5199609-5210138'])

OK
OK


In [45]:
# Locus scanned for soft clips in the next cell; (start, end) is the
# width-one span that ends at `pos`.
# NOTE(review): the contig is spelled 'chr10' here, while the igv.goto call
# above uses '10' -- confirm which naming each consumer expects.
chrom = 'chr10'
pos = 5_202_144
start = pos - 1
end = pos

In [46]:
softclip_col = get_softclip_collection(bam, chrom, start, end)

In [47]:
softclip_col.keys()

dict_keys([(5202145, True), (5202142, False), (5202238, True)])

In [48]:
softclip_col[(5202142, False)]

[Clipspec(start1=5202142, is_forward=False, seq='ATATACGTATCGTTCCATTACGTA', qual=[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 4, 30, 30, 30], qname='E100031906L1C001R00603681898'),
 Clipspec(start1=5202142, is_forward=False, seq='ATATACGTATCGTTCCATTACGTAATCCTTTAACATATAAGTTCACTATTTTG', qual=[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30], qname='E100031906L1C012R03202397776'),
 Clipspec(start1=5202142, is_forward=False, seq='ATATACGTATCGTTCCATTACGTAATCCTTTAACATATAAGTTCACT', qual=[30, 30, 30, 30, 30, 30, 20, 30, 30, 30, 30, 30, 30, 5, 30, 30, 30, 20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30], qname='E100031906L1C013R01801566726'),
 Clipspec(start1=5202142, is_forward=False, seq='ATATACGTATC', qual=[30, 30, 30, 30, 30,

In [26]:
key = (5202142, False)
seqlist = [x.seq for x in softclip_col[key]]
seqnamelist = [x.qname for x in softclip_col[key]]

In [28]:
msa = handygenome.align.msa.align_with_muscle(seqlist, seqnamelist, biopython=False)

In [31]:
handygenome.align.msa.modify_msa_gapclose(msa, 0.5)

In [33]:
consensus_seq = msa.get_consensus()

In [34]:
readlist = handygenome.align.bwa.run_bwa(consensus_seq, 'hg19')

[M::bwa_idx_load_from_disk] read 0 ALT contigs
[M::process] read 1 sequences (129 bp)...
[M::mem_process_seqs] Processed 1 reads in 0.001 CPU sec, 0.001 real sec
[main] Version: 0.7.17-r1188
[main] CMD: /home/users/pjh/scripts/python_genome_packages/utils/bwa mem -Y -M -t 1 -o /home/users/pjh/scripts/tmpo74uwacr/output.sam /home/users/data/01_reference/human_g1k_v37/human_g1k_v37.fasta /home/users/pjh/scripts/tmpo74uwacr/input.fasta
[main] Real time: 3.489 sec; CPU: 3.462 sec


In [35]:
for read in readlist:
    print(read.to_string())

query	4	*	0	0	*	*	0	0	ATATACGTATCGTTCCATTACGTAATCCTTTAACATATAAGTTCACTATTTTGACAAATTAAGTCAGATTATAAATATATACAATTTTGTCTAAAGGTCGTAAATGTTTCAGTTTGCGGTTAACTTTTC	*	AS:i:0	XS:i:0
