In [38]:
import pysam
import pandas as pd
import re

In [60]:
def read_gold(fn, d):
    """Collect the expected ("gold") placement of every read mate.

    Each primary record flagged as read 1 or read 2 is stored in ``d`` under
    ``<query_name>_1`` or ``<query_name>_2`` as
    ``[reference_name, reference_start]``.

    Secondary and supplementary records are skipped: they share the key of
    their primary record and would otherwise overwrite it depending on file
    order. Records flagged as neither read 1 nor read 2 are ignored.

    Args:
        fn: Path of the alignment file to read.
        d: Dictionary to fill; it is updated in place.

    Returns:
        The same dictionary object ``d``.
    """
    # The context manager closes the file even when iteration raises.
    with pysam.AlignmentFile(fn, 'r') as f_gold:
        for r in f_gold:
            if r.is_secondary or r.is_supplementary:
                continue
            if r.is_read1:
                suffix = '_1'
            elif r.is_read2:
                suffix = '_2'
            else:
                continue
            d[r.query_name + suffix] = [r.reference_name, r.reference_start]
    return d
    
def read_query(fn, d):
    """Append the aligner's result for each read mate to the gold table.

    For every primary record flagged as read 1 or read 2, the entry of ``d``
    keyed ``<query_name>_1`` / ``<query_name>_2`` is extended in place with
    ``[flag, reference_name, reference_start, mapping_quality, AS tag,
    NM tag, cigarstring, pos_noclip]``. A missing AS or NM tag is recorded
    as ``None``.

    ``pos_noclip`` is ``reference_start`` minus the length of a leading soft
    clip. Leading hard clips are stepped over but not subtracted. Records
    without a CIGAR keep ``reference_start`` unchanged.

    Secondary and supplementary records are skipped so that each mate
    contributes exactly one set of fields; extending a row twice would no
    longer fit the fixed column layout.

    Args:
        fn: Path of the alignment file to read.
        d: Mapping produced by ``read_gold`` (key -> ``[rname, pos]`` list);
            its lists are extended in place.

    Returns:
        A DataFrame indexed by the keys of ``d`` with columns ``gold_rname,
        gold_pos, flag, rname, pos, mapq, score, hdist, cigar, pos_noclip``.
        Keys of ``d`` with no matching query record get ``None`` in the
        query columns.

    Raises:
        KeyError: If a kept query record has no entry in ``d``.
    """
    columns = ['gold_rname', 'gold_pos', 'flag', 'rname', 'pos', 'mapq', 'score', 'hdist', 'cigar', 'pos_noclip']
    # One CIGAR element is a length followed by an operation code; the class
    # lists every operation of the SAM format so no element is mis-split.
    cigar_element = re.compile(r'(\d+)([MIDNSHP=X])')

    # The context manager closes the file even when iteration raises.
    with pysam.AlignmentFile(fn, 'r') as f:
        for r in f:
            if r.is_secondary or r.is_supplementary:
                continue
            if r.is_read1:
                key = r.query_name + '_1'
            elif r.is_read2:
                key = r.query_name + '_2'
            else:
                continue

            declip_pos = r.reference_start
            # cigarstring is None when the record has no CIGAR.
            for length, op in cigar_element.findall(r.cigarstring or ''):
                if op == 'H':
                    # A hard clip may precede the soft clip; keep looking.
                    continue
                if op == 'S':
                    declip_pos -= int(length)
                # Only the first non-hard-clip element can be a leading clip.
                break

            info = [
                r.flag,
                r.reference_name,
                r.reference_start,
                r.mapping_quality,
                r.get_tag("AS") if r.has_tag("AS") else None,
                r.get_tag("NM") if r.has_tag("NM") else None,
                r.cigarstring,
                declip_pos,
            ]
            d[key].extend(info)

    # Pad rows that received no query record so every row matches the column
    # list even when none of the reads was found in the query file.
    width = len(columns)
    rows = {key: row + [None] * (width - len(row)) for key, row in d.items()}
    return pd.DataFrame.from_dict(rows, orient='index', columns=columns)

In [61]:
# Build the gold lookup from chr21-per.bam, append the alignments found in
# bwa-chm13_to_grch38-final.bam, and save the combined table as a TSV file.
# read_query already returns the finished DataFrame, so no separate
# pd.DataFrame.from_dict step is needed here.
d = read_gold('chr21-per.bam', {})
df = read_query('bwa-chm13_to_grch38-final.bam', d)
df.to_csv('NA12878-bwa-lev.tsv', sep='\t')


In [62]:
# Same comparison as the previous run, but for
# bwa-chm13_to_grch38-final-ns.bam; the gold lookup is rebuilt from scratch
# because read_query extends its rows in place.
d = read_gold('chr21-per.bam', {})
df = read_query('bwa-chm13_to_grch38-final-ns.bam', d)

# Tab-separated output, one row per read mate key.
df.to_csv('NA12878-bwa-lev-ns.tsv', sep='\t')


In [63]:
# Disabled: comparison run for 'bwa-chm13_to_grch38-final-lm.bam', which
# would write 'NA12878-bwa-lev-lm.tsv'. Uncomment the three lines below to
# re-enable it.
# d = read_gold('chr21-per.bam', {})
# df = read_query('bwa-chm13_to_grch38-final-lm.bam', d)
# df.to_csv('NA12878-bwa-lev-lm.tsv', sep='\t')


In [64]:
# Compare the alignments in bwa-grch38.bam against the gold positions from
# chr21-per.bam; the gold lookup is rebuilt because read_query extends its
# rows in place.
d = read_gold('chr21-per.bam', {})
df = read_query('bwa-grch38.bam', d)

# Tab-separated output, one row per read mate key.
df.to_csv('NA12878-bwa-grc.tsv', sep='\t')

In [32]:
# NOTE(review): in the cells shown here `r` is only bound as a loop variable
# inside read_gold/read_query, never at notebook level, so this inspection
# cell appears to depend on a leftover kernel variable and would raise
# NameError after Restart & Run All unless `r` is defined elsewhere.
r.cigarstring

'100M'

In [49]:
# Scratch check of how the clip-detection regex tokenises a CIGAR string.
# Note that '+' inside the brackets is a literal plus sign, so the class
# matches only the characters S, M, I, D and '+'.
re_cigar = re.compile('[SMID+]')

# Soft clip at the start: the first op is 'S' and the first number is its length.
print(re_cigar.findall('5S80M1D15M'))
print(re_cigar.split('5S80M1D15M'))

# Soft clip at the end: the first op is 'M', so the alignment starts unclipped.
print(re_cigar.findall('80M1D15M5S'))
print(re_cigar.split('80M1D15M5S'))

['S', 'M', 'D', 'M']
['5', '80', '1', '15', '']
['M', 'D', 'M', 'S']
['80', '1', '15', '5', '']
