In [1]:
## Bring in needed mods
import numpy as np, pandas as pd

In [2]:
## Path to the checksum listing for the sequencing run
## (parsed in the next cell into a checksum column and a file-path column)
checkpath = '/home/croth/SELFFILM/FASTQ/Sun_6673_210202B6.checksum'

In [3]:
## Read the checksum listing; splitting on a single space can leave
## all-empty columns, which are dropped before the columns are named
checksums = (
    pd.read_csv(checkpath, header=None, sep=' ')
      .dropna(axis=1, how='all')
)
checksums.columns = ['Checksum', 'Filename']

## Bare file name: the last '/'-separated piece of the listed path
checksums['Fastq'] = checksums.Filename.map(lambda path: path.split('/')[-1])
checksums.head()

Unnamed: 0,Checksum,Filename,Fastq
0,21f0e2cb074b218f6f5c1ec1810fe6a8,Sun_6673_210202B6/SS_20201123_A34_S50_L001_R2_...,SS_20201123_A34_S50_L001_R2_001.fastq.gz
1,5b7bb09edca67e158dae83f9147f300c,Sun_6673_210202B6/SS_20201123_B09_S69_L001_R2_...,SS_20201123_B09_S69_L001_R2_001.fastq.gz
2,d3d2a8db32a4151b1ea880064ed0ee1d,Sun_6673_210202B6/SS_20201123_A36_S52_L001_R2_...,SS_20201123_A36_S52_L001_R2_001.fastq.gz
3,f6c98a47477bab400387ef1a7c604ee8,Sun_6673_210202B6/SEC_20201123_01_S1_L001_R2_0...,SEC_20201123_01_S1_L001_R2_001.fastq.gz
4,9e777f5c9260f5f8af8329da2cc438ab,Sun_6673_210202B6/SS_20201123_A12_S28_L001_R1_...,SS_20201123_A12_S28_L001_R1_001.fastq.gz


In [4]:
## The R1/R2 fastq pair that carries "H99" in its name but is kept out of the
## H99 list built below (a later cell relabels its 'H99' field as 'B20')
B20s = ['SS_20201123_H99_03_S9_L001_R%s_001.fastq.gz' % read for read in (1, 2)]

In [5]:
## Sorted fastq names containing "H99" exactly once, excluding the B20 pair
H99s = sorted(
    name for name in checksums.Fastq
    if name.count("H99") == 1 and name not in B20s
)
len(H99s)

18

In [6]:
## Keep fastqs whose first '_'-separated field is 'SS', minus the H99 files
fastqs = [name for name in checksums.Fastq
          if name.partition('_')[0] == 'SS' and name not in H99s]

## Half the file count (one R1 and one R2 file are expected per sample)
len(fastqs)/2

57.0

In [7]:
## Peek at the first six retained fastq names in sorted order
sorted(fastqs)[:6]

['SS_20201123_A01_S17_L001_R1_001.fastq.gz',
 'SS_20201123_A01_S17_L001_R2_001.fastq.gz',
 'SS_20201123_A02_S18_L001_R1_001.fastq.gz',
 'SS_20201123_A02_S18_L001_R2_001.fastq.gz',
 'SS_20201123_A03_S19_L001_R1_001.fastq.gz',
 'SS_20201123_A03_S19_L001_R2_001.fastq.gz']

In [8]:
## Unique file-name prefixes (the text before the first '_R'),
## listed in reverse sorted order
prefixes = [name.partition('_R')[0] for name in fastqs]
samples = list(np.unique(prefixes))[::-1]
len(samples)

57

In [9]:
## Show the first five sample prefixes
samples[:5]

['SS_20201123_H99_03_S9_L001',
 'SS_20201123_B12_S72_L001',
 'SS_20201123_B11_S71_L001',
 'SS_20201123_B10_S70_L001',
 'SS_20201123_B09_S69_L001']

In [10]:
## Reference genome fasta (path on the analysis machine)
refpath = '/analysis/CROTH/Self-Filamentation_B3502_progeny/REF/FungiDB-48_CneoformansJEC21_Genome.fasta'

## Directory prefix used below for the fastq inputs
datapath = '/analysis/CROTH/SELFFILAM/'

## Directory prefix used below for the sorted bam outputs
sampath = '%sSAM/' % datapath

## Separate bam directory is currently disabled
#bampath = datapath+'BAM/'

## bamaddrg executable together with its -b flag
addrgpath = '/home/croth/bin/./bamaddrg -b'

## Alignment pipeline template; the four %s slots are filled, in order, with
## reference, first fastq, second fastq, and output bam
command = ' | '.join((
    'bwa mem -a -M %s %s %s',
    'samtools view -F 4 -b',
    'samtools sort -o %s',
))

In [11]:
## Unique read-end suffixes: whatever follows the last '_R' in each fastq name
read_suffixes = [name.rsplit('_R', 1)[-1] for name in fastqs]
ends = np.unique(read_suffixes)

## Paired-end data, so exactly two distinct suffixes are expected
assert ends.size == 2

In [12]:
## One output bam path per sample prefix, placed under sampath
bam_suffix = '-sm.bam'
bams = [sampath + prefix + bam_suffix for prefix in samples]

## Every sample must map to a distinct bam path
assert len(set(bams)) == len(samples)

In [13]:
## Build one alignment command per sample, in the same order as `samples`;
## each command ends with a blank line so the scripts stay readable
bwa = np.array([
    command % (refpath,
               datapath + prefix + '_R' + ends[0],
               datapath + prefix + '_R' + ends[1],
               bam + '\n\n')
    for prefix, bam in zip(samples, bams)])

In [14]:
## View the first command to sanity-check the reference, fastq and bam paths
bwa[0]

'bwa mem -a -M /analysis/CROTH/Self-Filamentation_B3502_progeny/REF/FungiDB-48_CneoformansJEC21_Genome.fasta /analysis/CROTH/SELFFILAM/SS_20201123_H99_03_S9_L001_R1_001.fastq.gz /analysis/CROTH/SELFFILAM/SS_20201123_H99_03_S9_L001_R2_001.fastq.gz | samtools view -F 4 -b | samtools sort -o /analysis/CROTH/SELFFILAM/SAM/SS_20201123_H99_03_S9_L001-sm.bam\n\n'

In [15]:
## Split the bwa commands round-robin across `nfiles` shell scripts
## (script i receives commands i, i+nfiles, i+2*nfiles, ...)
nfiles = 4

filenames = './run%s_bwa.sh'
for i in range(nfiles):
    filename = filenames%i

    ## the context manager flushes and closes each script before the next one
    ## is opened, instead of leaving the handle to the garbage collector
    with open(filename,'w') as script:
        script.writelines(bwa[i::nfiles])

In [None]:
## Gather unique sample names
## NOTE: this cell rebinds `samples` (previously the fastq prefixes) to the
## short sample names; the cells that build `bams` and `bwa` from `samples`
## must not be re-run after it without first re-creating `samples`.

## Parse the file name only, so that an underscore anywhere in the directory
## part of a bam path cannot shift the '_'-separated fields
bam_names = [s.split('/')[-1] for s in bams]

## Third field of the file name is the sample; 'H99' is relabelled as 'B20'
samples_temp = [s.split('_')[2] for s in bam_names]
samples = [s if s != 'H99' else 'B20' for s in samples_temp]

## Second-to-last field of the file name (the S-number)
samplen = [s.split('_')[-2] for s in bam_names]

## Check work
assert len(np.unique(samples)) == len(samples)
assert len(np.unique(samplen)) == len(samplen)
assert len(np.unique(samples)) == len(np.unique(samplen))

## Print first few
samples[:5],samplen[:5]

In [41]:
## Build one bamaddrg command per sorted bam:
##   -s gets the sample name, -r gets "<S-number>.<position in list>",
##   output goes to the same path with '-rg' inserted before the extension.
## The original bam is gzipped afterwards (to be deleted eventually).
add_commands = [
    '%s %s -s %s -r %s > %s\ngzip %s\n\n' % (
        addrgpath,
        bam,
        name,
        '%s.%s' % (tag, idx),
        bam.replace('.', '-rg.'),
        bam)
    for idx, (bam, name, tag) in enumerate(zip(bams, samples, samplen))]

## Every command must be distinct
assert len(set(add_commands)) == len(add_commands)

## Show the first few commands
for cmd in add_commands[:5]:
    print(cmd)

/home/croth/bin/./bamaddrg -b /analysis/CROTH/SELFFILAM/SAM/SS_20201123_H99_03_S9_L001-sm.bam -s B20 -r S9.0 > /analysis/CROTH/SELFFILAM/SAM/SS_20201123_H99_03_S9_L001-sm-rg.bam
gzip /analysis/CROTH/SELFFILAM/SAM/SS_20201123_H99_03_S9_L001-sm.bam


/home/croth/bin/./bamaddrg -b /analysis/CROTH/SELFFILAM/SAM/SS_20201123_B12_S72_L001-sm.bam -s B12 -r S72.1 > /analysis/CROTH/SELFFILAM/SAM/SS_20201123_B12_S72_L001-sm-rg.bam
gzip /analysis/CROTH/SELFFILAM/SAM/SS_20201123_B12_S72_L001-sm.bam


/home/croth/bin/./bamaddrg -b /analysis/CROTH/SELFFILAM/SAM/SS_20201123_B11_S71_L001-sm.bam -s B11 -r S71.2 > /analysis/CROTH/SELFFILAM/SAM/SS_20201123_B11_S71_L001-sm-rg.bam
gzip /analysis/CROTH/SELFFILAM/SAM/SS_20201123_B11_S71_L001-sm.bam


/home/croth/bin/./bamaddrg -b /analysis/CROTH/SELFFILAM/SAM/SS_20201123_B10_S70_L001-sm.bam -s B10 -r S70.3 > /analysis/CROTH/SELFFILAM/SAM/SS_20201123_B10_S70_L001-sm-rg.bam
gzip /analysis/CROTH/SELFFILAM/SAM/SS_20201123_B10_S70_L001-sm.bam


/home/croth/bin/./b

In [43]:
## Write the bamaddrg commands round-robin across `nfiles` shell scripts,
## one script per parallel job (same split as the bwa scripts)
run_addrg = './run%s_addrg.sh'
for i in range(nfiles):
    ## close each script explicitly so its contents are flushed to disk
    with open(run_addrg%i,'w') as script:
        script.writelines(add_commands[i::nfiles])

In [16]:
## NOTE(review): always-false assertion. It looks like a deliberate stop so that
## "Run All" halts here, before the un-executed cells below -- confirm.
assert 1 == 0

AssertionError: 

In [None]:
## Build the unique merged-bam paths: bampath + (entry of `samples` up to '_L00') + '.bam'
## NOTE(review): the only `bampath` assignment visible in this notebook is
## commented out in the path-setup cell, so this line raises NameError unless
## `bampath` is defined elsewhere -- confirm before running.
## NOTE(review): expects entries of `samples` to still contain the '_L00' lane
## suffix; the sample-name cell above rebinds `samples` to short names -- confirm.
merged_bams = np.unique([bampath+s.split('_L00')[0]+'.bam' for s in samples])

In [None]:
## Unique bam path prefixes: everything before the first '_L00' lane marker
sams = np.unique([path.partition('_L00')[0] for path in bams])

In [None]:
## Build one samtools merge command per prefix, combining its L001 and L002
## bams into the merged bam at the same position in `merged_bams`
merge_template = 'samtools merge %s %s %s\n\n'
merged = [
    merge_template % (merged_bams[idx],
                      prefix + '_L001-sm.bam',
                      prefix + '_L002-sm.bam')
    for idx, prefix in enumerate(sams)]

In [None]:
## View the first five merge commands
merged[:5]

In [None]:
## How many merged files are there?
## Two entries of `samples` are expected per merge command
assert 2*len(merged) == len(samples)

len(merged)

In [None]:
## Write the merge commands round-robin across `nfiles` shell scripts
run_merge = '../MERGE/run%s_merge.sh'
for i in range(nfiles):
    ## close each script explicitly so its contents are flushed to disk
    with open(run_merge%i,'w') as script:
        script.writelines(merged[i::nfiles])

In [None]:
## Build one bamaddrg command per merged bam:
##   -s gets the file name up to its first '_',
##   -r gets the last '_' field (cut at '.ba') plus ".<position in list>",
##   output goes to the same path with '-rg' inserted before the extension.
## The merged bam is gzipped afterwards (to be deleted eventually).
add_commands = [
    '%s %s -s %s -r %s > %s\ngzip %s\n\n' % (
        addrgpath,
        bam,
        bam.split('/')[-1].split('_')[0],
        '%s.%s' % (bam.split('_')[-1].split('.ba')[0], idx),
        bam.replace('.', '-rg.'),
        bam)
    for idx, bam in enumerate(merged_bams)]

## Every command must be distinct
assert len(set(add_commands)) == len(add_commands)

## Show the first few commands
add_commands[:3]

In [None]:
## What are the unique two digits after PMY?
## Takes characters 3-4 of the text before the first '_' of each entry.
## NOTE(review): assumes entries of `samples` start with 'PMY' here -- confirm.
np.unique([s.split('_')[0][3:5] for s in samples])

In [None]:
## Write the bamaddrg commands round-robin across twice as many scripts as
## `nfiles` (8 scripts when nfiles is 4) so they can be run in parallel
run_addrg = '../ADDRG/run%s_addrg.sh'
for i in range(nfiles*2):
    ## close each script explicitly so its contents are flushed to disk
    with open(run_addrg%i,'w') as script:
        script.writelines(add_commands[i::2*nfiles])

In [None]:
## Read-group bam paths: each merged bam path with '-rg' inserted before every '.'
bamrg = [path.replace('.', '-rg.') for path in merged_bams]
bamrg[:4]

In [None]:
## Write one `samtools index` command per read-group bam into a single script
samix = ['samtools index %s\n'%b for b in bamrg]

## close the script explicitly so its contents are flushed to disk
with open('../SAMIX/run_samtools_ix.sh','w') as script:
    script.writelines(samix)

In [None]:
## Five samples seemed to fail because samtools index was called incorrectly.
## Build a rerun script that, for just those samples, removes the old
## read-group bam, unzips the source bam, re-adds read groups and re-indexes.
failed = ['PMY2557','PMY2601','PMY2701','PMY2801','PMY2901']

## Read-group bams of the failed samples, selected once and reused below
## (sample name = file name up to its first '_')
failed_rg = [a for a in bamrg if a.split('/')[-1].split('_')[0] in failed]
assert len(failed_rg) == len(failed)

## Remove commands to destroy the old read-group bams
failed_bamrg = ['rm %s\n'%a for a in failed_rg]
assert len(failed_bamrg) == len(failed)

## Unzip the gzipped source bams (path before '-rg', plus '.bam.gz')
failed_unzip = ['gunzip %s.bam.gz\n'%a.split('-rg')[0] for a in failed_rg]
assert len(failed_unzip) == len(failed)

## Gather the add-read-group commands; the fifth space-separated token of a
## command is the value passed to -s
failed_addrg = [a for a in add_commands if a.split(' ')[4] in failed]
assert len(failed_addrg) == len(failed)

## Reindex
failed_samix = ['samtools index %s\n'%a for a in failed_rg]
assert len(failed_samix) == len(failed)

## Write to file, in execution order; the handle is closed explicitly so the
## script is fully flushed to disk
with open('../FAILED/reruns.sh','w') as script:
    script.writelines(failed_bamrg+failed_unzip+failed_addrg+failed_samix)

In [None]:
## Write out the list of read-group bams, one path per line
## (no trailing newline after the last path).
## A single joined string is written with write(); writelines() would iterate
## over it character by character. The handle is closed explicitly.
with open('../listofbams.txt','w') as handle:
    handle.write('\n'.join(bamrg))

### NOTES
The generated .sh files are not executable by default: run `chmod +x *.sh` in the directory that contains them before launching the scripts.