# Construct scripts for alignment to the XL280 Illumina reference

# Construct variables using data on local machine

## Collect local variable names
### These include local dir and file names

In [1]:
## Import needed modules. 
from __future__ import print_function
import os
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from itertools import cycle

In [2]:
## Directory the generated shell scripts are written into (the current working directory)
scriptsdir = os.getcwd()
## Local directory that is listed to discover the raw fastq.gz files
datapath = '/Users/croth/Downloads/QTL_RAW/'

## Names of fastq files

In [3]:
## Keep only the entries of datapath whose names end with the gzipped-fastq suffix
fileend = '.fastq.gz'
files = list(listdir(datapath))
datadir = [fname for fname in files if fname.endswith(fileend)]

In [4]:
## How many unique paired samples?
len(datadir)/2.0

129.0

In [5]:
datadir[:5]

['RBB127_SS-B407_ATGTCA_L002_R1_001.fastq.gz',
 'RBB127_SS-B407_ATGTCA_L002_R2_001.fastq.gz',
 'RBB128_SS-B410_CCGTCC_L002_R1_001.fastq.gz',
 'RBB128_SS-B410_CCGTCC_L002_R2_001.fastq.gz',
 'RBB129_SS-B411_GTCCGC_L002_R1_001.fastq.gz']

In [6]:
## Unique (sorted) file-name prefixes: everything before the first 'L00' in each fastq name
seg_files = np.unique([fastq.partition('L00')[0] for fastq in datadir])

#### patch June 2018

In [7]:
## Drop every prefix that contains 'XL280'
seg_files = [prefix for prefix in seg_files if 'XL280' not in prefix]

In [8]:
len(seg_files)

127

In [9]:
def get_sample(seg):
    """Return the sample label encoded in a fastq file-name prefix.

    The text after the last 'SS' in ``seg`` is split on '_'; the first two
    fields decide the label:

    * first field starts with '-' and the second does not contain
      'Correction'  -> ``'SS' + first`` (e.g. ``'SS-B407'``)
    * second field contains 'XL280'  -> the second field itself
    * first field starts with '-' and the second contains 'Correction'
      -> ``'SS' + first + '_Correction'``

    Raises:
        ValueError: if ``seg`` has fewer than two '_'-separated fields after
            'SS', or matches none of the patterns above.
    """
    temp = seg.split('SS')[-1].split('_')
    if len(temp) < 2:
        raise ValueError('Cannot parse a sample name from %r' % (seg,))
    head, tag = temp[0], temp[1]
    ## startswith() is safe on an empty first field (e.g. '..._SS_XL280_...'),
    ## where indexing head[0] would raise IndexError.
    dashed = head.startswith('-')
    if dashed and 'Correction' not in tag:
        return 'SS' + head
    if 'XL280' in tag:
        return tag
    if dashed:
        ## Only reachable when 'Correction' is in tag.
        return 'SS' + head + '_Correction'
    raise ValueError('Unrecognised sample name pattern: %r' % (seg,))

In [10]:
## One sample label per entry of seg_files, in the same order as seg_files
segs = [get_sample(seg) for seg in seg_files]

In [11]:
## Distinct values of the third-from-last and second-from-last '_' fields of the fastq names
name_fields = [fastq.split('_') for fastq in datadir]
lanes = sorted({fields[-3] for fields in name_fields})
replicates = sorted({fields[-2] for fields in name_fields})

In [12]:
lanes

['L001', 'L002', 'L003']

In [13]:
replicates

['R1', 'R2']

## Files that have not been aligned, for god knows what reason

## Write variables 
### These variables will include paths to files and executables on Haraka

In [14]:
## Names of the generated shell scripts. Each starts with '/' because it is
## concatenated directly onto scriptsdir when the script file is opened.
my_illumina_align = '/Crypto-DNX-Illumina-align-sam.sh'
my_sam_to_bam = '/Crypto-DNX-Illumina-sam-to-sorted-bam.sh'
my_bamaddrg = '/Crypto-DNX-Illumina-bamaddrg.sh'
## List of read-group-tagged bams; no leading '/' because it is joined with '/'
## locally and appended to duck_SCRIPTS for the freebayes -L argument.
mybamsrg = 'Crypto-list-sort-bams-rg-Illumina.txt'
## One freebayes script per entry; regions are distributed across them in turn.
myfreebay = ['/Crypto-DNX-Illumina-freebayes1.sh',
             '/Crypto-DNX-Illumina-freebayes2.sh',
             '/Crypto-DNX-Illumina-freebayes3.sh']
## Remote script directory and VCF output directory
duck_SCRIPTS = '/bigscratch0/croth/CRYPTO_D/SCRIPTS/'
duck_VCF = '/bigscratch0/croth/CRYPTO_D/VCFs/ILLUM/'
## Shebang line written at the top of the alignment script
duck_SHEBANG = '#!/bin/bash\n'
## Reference genome (given to bwa and freebayes) and remote directory of the raw fastq files
duck_REF = '/bigscratch0/croth/CRYPTO_D/REF/xl280genome.fasta'
duck_QTL_RAW = '/bigscratch0/croth/CRYPTO_D/QTL/QTL_RAW/'
## Output directories for the Illumina sams and bams
duck_SAMS = '/bigscratch0/croth/CRYPTO_D/SAMS/ILLUM/'
duck_BAMS = '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/'
## Command prefixes: bwa mem against the reference, and the cd line that opens
## the bamaddrg script (its commands are invoked as ./bamaddrg)
duck_BWA = 'bwa mem -v 0 ' + duck_REF
cdbamaddrg = 'cd /bigscratch0/croth/bamaddrg\n'

###### Upload these files to the remote machine and make them executable via chmod, e.g. "chmod +x Crypto-DNX-Pac_Bio_align.sh"

## Align

In [15]:
len(seg_files)

127

In [16]:
len(np.unique(seg_files))

127

In [17]:
## Write the alignment script: a shebang followed by one `bwa mem` command per
## sample prefix, and collect the sam path each command writes to in `sams`.
## NOTE(review): the '*R1*' / '*R2*' globs are expanded by the shell when the
## script runs; a prefix sequenced on several lanes would match several files
## per glob -- confirm that is intended.
ref_name = duck_REF.split('/')[-1].split('.')[0] ## reference file name up to its first '.'
sams = []
with open(scriptsdir+my_illumina_align,'w') as f: ## closed even if a sample name fails to parse
    print(duck_SHEBANG,file=f) ## print the shebang
    for seg in seg_files:
        seg_s = get_sample(seg)
        sam = duck_SAMS+seg_s+'-'+ref_name+'-aln-pe.sam'
        sams.append(sam)
        print(duck_BWA, ## the bwa command and reference genome
              duck_QTL_RAW + seg+'*R1*.fastq.gz', ## the first read in pair file
              duck_QTL_RAW + seg+'*R2*.fastq.gz','>', ## the second
              sam+'\n', ## the final sam file
              file=f)

## Sam to Bam

In [18]:
len(sams)

127

In [19]:
sams[:5]

['/bigscratch0/croth/CRYPTO_D/SAMS/ILLUM/SS-B407-xl280genome-aln-pe.sam',
 '/bigscratch0/croth/CRYPTO_D/SAMS/ILLUM/SS-B410-xl280genome-aln-pe.sam',
 '/bigscratch0/croth/CRYPTO_D/SAMS/ILLUM/SS-B411-xl280genome-aln-pe.sam',
 '/bigscratch0/croth/CRYPTO_D/SAMS/ILLUM/SS-C026-xl280genome-aln-pe.sam',
 '/bigscratch0/croth/CRYPTO_D/SAMS/ILLUM/SS-C029-xl280genome-aln-pe.sam']

In [20]:
## Write the sam -> sorted bam script: one samtools pipeline per sample, and
## collect the resulting bam paths in `bams`.
## NOTE(review): `samtools sort - <prefix>` is the legacy output-prefix usage;
## newer samtools releases expect `-o` -- confirm against the installed version.
if len(sams) != len(segs):
    ## sams and segs are used as parallel lists; refuse to pair them up if they drifted apart
    raise ValueError('sams (%d) and segs (%d) differ in length'%(len(sams),len(segs)))
ref_name = duck_REF.split('/')[-1].split('.')[0] ## reference file name up to its first '.'
bams = []
with open(scriptsdir+my_sam_to_bam,'w') as f: ## closed automatically on exit
    for sam,seg in zip(sams,segs):
        bam = duck_BAMS+seg+'-'+ref_name+'-aln-pe-sorted' ## output prefix, without '.bam'
        bams.append(bam+'.bam')
        print('samtools view -bS %s | samtools sort - %s'%(sam,bam+'\n'),file=f)

## Bam to Bamaddrg

#### Check bams

In [21]:
bams[:5]

['/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-B407-xl280genome-aln-pe-sorted.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-B410-xl280genome-aln-pe-sorted.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-B411-xl280genome-aln-pe-sorted.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-C026-xl280genome-aln-pe-sorted.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-C029-xl280genome-aln-pe-sorted.bam']

In [22]:
bams[-5:]

['/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-C036-xl280genome-aln-pe-sorted.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-C039-xl280genome-aln-pe-sorted.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-A837-xl280genome-aln-pe-sorted.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-A853-xl280genome-aln-pe-sorted.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-B830-xl280genome-aln-pe-sorted.bam']

### Write bamaddrg commands

In [23]:
## Write the bamaddrg script: cd into the bamaddrg directory, then for every bam
## add a sample (-s) and read-group (-r) tag, writing a '-rg.bam' file, and
## remove the untagged bam. The tagged paths are collected in `bamrgs`.
## NOTE(review): the `rm` line is emitted unconditionally, so the original bam
## is removed even if the bamaddrg command before it failed.
bamrgs = []
with open(scriptsdir+my_bamaddrg,'w') as f: ## closed automatically on exit
    print(cdbamaddrg,file=f)
    for i,bam in enumerate(bams):
        ## Strip only the final extension; splitting on the first '.' would
        ## truncate any path that has a dot in a directory or sample name.
        bamrg = os.path.splitext(bam)[0]+'-rg.bam'
        bamrgs.append(bamrg)
        sample_tag = bam.split('-xl280genome')[0].split('/')[-1]+'-'+str(i)
        group_tag = bam.split('genome')[0].split('/')[-1]+'-'+str(i)
        print('./bamaddrg -b %s -s %s -r %s > %s\n'%(
            bam,
            sample_tag,
            group_tag,
            bamrg
        ),file=f)
        print('rm %s\n'%(bam),file=f)

## Index bam files with samtools

In [24]:
## Write the indexing script: one `samtools index <bam> <bam>.bai` line per tagged bam
with open(scriptsdir+'/Crypto-DNX-Illum-Bam-Index.sh','w') as f: ## closed automatically on exit
    for bam in bamrgs:
        print('samtools index %s %s'%(bam,bam+'.bai\n'),file=f)

## FREEBAYES!!!!!

### Write list of bams with read group info added

In [25]:
bamrgs[:5]

['/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-B407-xl280genome-aln-pe-sorted-rg.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-B410-xl280genome-aln-pe-sorted-rg.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-B411-xl280genome-aln-pe-sorted-rg.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-C026-xl280genome-aln-pe-sorted-rg.bam',
 '/bigscratch0/croth/CRYPTO_D/BAMS/ILLUM/SS-C029-xl280genome-aln-pe-sorted-rg.bam']

In [26]:
len(bamrgs)

127

In [27]:
## Write the list of read-group-tagged bams, one path per line
with open(scriptsdir+'/'+mybamsrg,'w') as f: ## closed automatically on exit
    for bam in bamrgs:
        print(bam,file=f)

In [28]:
mybamsrg

'Crypto-list-sort-bams-rg-Illumina.txt'

### Gather chromosome names from the XL280 FASTA

In [29]:
## Collect the header lines (those starting with '>') of the local copy of the
## reference fasta and sort them; each header, minus '>' and the newline, is
## later used as a freebayes region.
## NOTE(review): presumably this local file matches duck_REF on the remote machine -- confirm.
tigs_keep = []
with open('../FASTA/xl280genome.fasta') as illum: ## closed automatically on exit
    for line in illum:
        if line.startswith('>'):
            ## rstrip keeps the full name even when the last line has no trailing newline
            tigs_keep.append(line[1:].rstrip('\n'))
chrom_map = sorted(tigs_keep)

In [30]:
## Write the freebayes scripts: the regions in chrom_map are dealt out in turn
## to the scripts named in myfreebay. Each region gets a freebayes command
## writing DNX-<region>-<reference>-<number of bams>.vcf, a gzip of that vcf,
## and a separator line.
ref_name = duck_REF.split('.fasta')[0].split('/')[-1] ## reference file name without '.fasta'
handles = [open(scriptsdir+name,'w') for name in myfreebay]
try:
    clrf = cycle(handles)
    for region in chrom_map:
        f = next(clrf) ## built-in next() works on Python 2.6+ and 3; .next() is Python 2 only
        print(region)
        new_vcf = duck_VCF+'DNX-'+region+'-'+ref_name+'-'+str(len(bamrgs))+'.vcf'
        freebayes = '/usr/local/bin/freebayes -f %s -p %s -r %s -L %s -Z -= > %s'%(
            duck_REF,
            str(1),
            region,
            duck_SCRIPTS+mybamsrg,
            new_vcf)
        print(freebayes,file=f)
        print('gzip %s'%new_vcf,file=f)
        print(' ',file=f)
finally:
    ## Close every script file even if writing one of the commands failed
    for f in handles:
        f.close()

Chr01
Chr02
Chr03
Chr04
Chr05
Chr06
Chr07
Chr08
Chr09
Chr10
Chr11
Chr12
Chr13
Chr14
