In [264]:
import pysam
from collections import *
from tools.intervals import *
from tools.transcripts import *
from tools.fileOps import *
from tools.bio import *
from tools.psl import *
from tools.procOps import *
from orderedset import OrderedSet
import itertools

In [265]:
# Path to the combined, sorted FASTQ that is read when building the
# read-name -> barcode map.
fq_with_barcodes = (
    '/hive/users/ifiddes/notch2nl_berkeley_data/'
    'E2del19N_E2del68_combined_longranger/'
    'E2del68_E2del19N_combined/'
    'E2del68_E2del19N_combined.sorted.fastq'
)

In [266]:
# Build a map of read name -> second whitespace-separated token of the FASTQ
# header line (used downstream as the read's barcode).
bcode_map = {}
with open(fq_with_barcodes) as fastq:
    # A FASTQ record is four lines long, so only every fourth line is a header.
    # Testing each line for a leading '@' is not sufficient: a quality string
    # may legitimately start with '@' and would be mistaken for a header.
    for header in itertools.islice(fastq, 0, None, 4):
        if not header.strip():
            continue  # tolerate blank padding lines (e.g. at end of file)
        if not header.startswith('@'):
            raise ValueError('malformed FASTQ header line: {!r}'.format(header))
        # The header must consist of exactly two whitespace-separated fields.
        name, barcode = header.split()
        # Drop the leading '@' and the last two characters of the name
        # (presumably a '/1' or '/2' mate suffix -- TODO confirm).
        bcode_map[name[1:-2]] = barcode

In [269]:
# for each bam, organize reads by barcode
# Resulting layout: reads_by_bcode[sample][barcode] -> list of mapped records,
# in the order they are read from the sample's BAM.
reads_by_bcode = defaultdict(lambda: defaultdict(list))
for n in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'D1', 'D2', 'N1', 'N2']:
    # Open through a context manager so each BAM handle is closed as soon as
    # its records have been collected instead of being left open.
    with pysam.Samfile(os.path.join(n, n + '.binreads.bam')) as bam_fh:
        for rec in bam_fh:
            if rec.is_unmapped:
                continue
            # A read name absent from bcode_map raises KeyError here; that is
            # left loud on purpose rather than silently dropping reads.
            bcode = bcode_map[rec.qname]
            reads_by_bcode[n][bcode].append(rec)

In [270]:
# for each barcode, create a separate bam with a color wheel
# construct a track hub with this

# Parent (composite) stanza written once per sample; {} is the sample name.
composite_template = '''track {}_linked_bams
compositeTrack on
allButtonPair on
type bam
bamColorMode tag 
indelQueryInsert on
visibility hide
shortLabel Barcode Reads
longLabel Barcode Reads

'''

# Child stanza written once per barcode:
#   {0} sample name, {1} track number (also used as priority),
#   {2} BAM path relative to the sample directory, {3} barcode (used as label).
# shortLabel/longLabel are included because the UCSC hub trackDb format lists
# them as required for every track, not only for the composite parent.
track_template = '''    track {0}_{1}
    priority {1}
    bigDataUrl {2}
    parent {0}_linked_bams
    type bam
    bamColorMode tag 
    indelQueryInsert on
    visibility hide
    shortLabel {3}
    longLabel {3}

'''

color_cycle = ['138,23,15', '138,115,15', '69,138,15', '15,138,53', '15,131,138', '15,39,138', '84,15,138', '138,15,100']
# A single cycling iterator is shared by all samples, so colors continue from
# where the previous sample stopped.
color_iter = itertools.cycle(color_cycle)
for n in reads_by_bcode:
    cmd = ['mkdir', '-p', '{}/linked_bams'.format(n)]
    run_proc(cmd)
    in_bam = os.path.join(n, n + '.binreads.bam')
    # The sample BAM is only needed as the header template for the per-barcode
    # output files, so it is opened once per sample rather than once per barcode.
    with open('{}/linked_bams.txt'.format(n), 'w') as track_db, pysam.Samfile(in_bam) as fh:
        track_db.write(composite_template.format(n))
        for i, (bcode, recs) in enumerate(reads_by_bcode[n].items()):
            # Draw the color only when there is a barcode to pair it with, so
            # no color is consumed and thrown away at the end of a sample.
            color = next(color_iter)
            out_bam = '{}/linked_bams/{}.bam'.format(n, bcode)
            # trackDb paths are relative to the sample directory: strip '<n>/'.
            trackdb_path = out_bam.split('/', 1)[1]
            track_db.write(track_template.format(n, i + 1000, trackdb_path, bcode))
            with pysam.Samfile(out_bam, 'wb', template=fh) as outf:
                for rec in recs:
                    # Every read of a barcode gets the same 'r,g,b' string in
                    # its YC tag (the stanzas above set bamColorMode tag).
                    rec.set_tag('YC', color)
                    outf.write(rec)

In [271]:
# construct a bigBed of the transcript locations
# also construct the required 2bit
base_dir = '/hive/users/ifiddes/notch2nl_berkeley_data/E2del19N_E2del68_combined_longranger/E2del68_E2del19N_combined/new-assembly/hg38_scaffolded_contigs'

transcripts = Fasta(os.path.join(base_dir, 'transcripts.fa'))

for n in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'D1', 'D2', 'N1', 'N2']:
    fa = os.path.join(base_dir, '{}.fa'.format(n))
    two_bit = '{0}/{0}.2bit'.format(n)
    sizes = '{0}/{0}.chrom.sizes'.format(n)
    # Sequence file for the hub genome, then its chromosome sizes.
    run_proc(['faToTwoBit', fa, two_bit])
    run_proc(['twoBitInfo', two_bit, sizes])
    # N1/N2 look up 'NOTCH2-1'/'NOTCH2-2'; every other sample joins its
    # characters with '-', e.g. 'A1' -> 'NOTCH2NL-A-1'.
    if n in ['N1', 'N2']:
        tx_name = 'NOTCH2-{}'.format(n[-1])
    else:
        tx_name = 'NOTCH2NL-' + '-'.join(n)
    tx = transcripts[tx_name]
    with TemporaryFilePath() as tmp_fa, TemporaryFilePath() as tmp_psl, TemporaryFilePath() as tmp_bed:
        write_fasta(tmp_fa, tx_name, str(tx))
        # Align the transcript to this sample's contigs and convert to BED.
        run_proc(['blat', '-noHead', fa, tmp_fa, tmp_psl])
        run_proc(['pslToBed', tmp_psl, tmp_bed])
        # Rank the alignments by len() and keep the last (largest) one.
        txs = sorted(transcript_iterator(tmp_bed), key=len)
        best = txs[-1]
        # NOTE(review): print_row is handed the temp BED *path* here; confirm
        # the helper accepts a path (rather than an open handle) and replaces
        # the file's contents with this single row.
        print_row(tmp_bed, best.get_bed())
        run_proc(['bedToBigBed', tmp_bed, sizes, '{}/transcript.bb'.format(n)])

In [272]:
# construct the coverage tracks
# One bamCoverage run per sample: <n>/<n>.binreads.bam -> <n>/<n>.binreads.bw
for n in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'D1', 'D2', 'N1', 'N2']:
    stem = os.path.join(n, n + '.binreads')
    bam, bw = stem + '.bam', stem + '.bw'
    run_proc([
        'bamCoverage',
        '-b', bam,
        '--normalizeUsingRPKM',
        '--numberOfProcessors', '4',
        '-o', bw,
    ])

In [273]:
# construct the hub.txt and the genomes.txt file
# Both files are written to the current working directory.

# NOTE(review): the UCSC hub documentation describes the 'hub' value as a
# single-word name; 'H9 ESC' contains a space -- confirm this is accepted.
hub_str = '''hub H9 ESC
shortLabel H9 ESC NOTCH2NL Assembly
longLabel H9 ESC NOTCH2NL Assembly
genomesFile genomes.txt
email NoEmail

'''

# One stanza per sample; every {0} is replaced by the sample name.
genomes_template = '''genome {0}
twoBitPath {0}/{0}.2bit
trackDb {0}/trackDb.txt
organism {0}
description {0}
scientificName {0}
defaultPos {0}:1-100000

'''

with open('hub.txt', 'w') as outf:
    outf.write(hub_str)

with open('genomes.txt', 'w') as outf:
    outf.writelines(
        genomes_template.format(sample)
        for sample in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'D1', 'D2', 'N1', 'N2']
    )

In [274]:
# construct the trackDb.txt entries
# Every template takes the sample name as {0}; all bigDataUrl values are
# relative to the sample directory that holds trackDb.txt.

# bigBed track for the transcript alignment.
transcript_template = '''track {0}_tx
type bigBed 12
visibility pack
priority 1
bigDataUrl transcript.bb
shortLabel NOTCH2NL-{0}
longLabel NOTCH2NL-{0}

'''

# BAM track of all bin reads for the sample.
bam_template = '''track {0}_binreads
type bam
visibility hide
priority 2
bigDataUrl {0}.binreads.bam
indelQueryInsert on
pairEndsByName .
shortLabel 10x bin reads
longLabel 10x bin reads

'''

# bigWig coverage track.
bw_template = '''track {0}_bw
type bigWig
visibility full
shortLabel 10x bin reads coverage
longLabel 10x bin reads coverage
priority 3
bigDataUrl {0}.binreads.bw
alwaysZero on
smoothingWindow 10
windowingFunction mean
autoScale on

'''


for n in ['A1', 'A2', 'B1', 'B2', 'C1', 'C2', 'D1', 'D2', 'N1', 'N2']:
    # The include line pulls in the per-barcode stanzas from linked_bams.txt;
    # the three per-sample stanzas follow in a fixed order.
    stanzas = ['include linked_bams.txt\n']
    stanzas.extend(
        template.format(n)
        for template in (transcript_template, bam_template, bw_template)
    )
    with open(os.path.join(n, 'trackDb.txt'), 'w') as outf:
        outf.write(''.join(stanzas))

In [275]:
# index the bams
# Shell escape: find every path under the current directory whose name ends in
# 'bam' and run `samtools index` on each one, a single file per invocation
# (-n 1) with up to 20 invocations running in parallel (-P 20).
! find . -name '*bam' | xargs -n 1 -P 20 samtools index