Extract amplicon sequence data from a set of genomes using SeekDeep

In [1]:
import pandas as pd
import os

In [2]:
# --- inputs ---
# CSV describing the amplicon panel (one row per target, including primer sequences)
# NOTE(review): PANEL sits two directory levels up while the other paths are three levels up - confirm this is intended
PANEL: str = '../../data/panel_extended_info.csv'
# directory holding the reference genomes
GENOME_DIR: str = '../../../data/genome_vobs/'
# tab-separated primers file, written next to the reference genomes
PRIMERS: str = os.path.join(GENOME_DIR, 'primers.txt')

# --- outputs ---
# directory receiving the extraction results
OUT_DIR: str = '../../../data/phylo_ampl/vobs_genomes'

# --- extraction parameters ---
# read length, kept as a string because it is interpolated into a shell command
READLEN: str = '150'

## Prepare files for extraction

In [3]:
# primers file from panel
# ambiguous chars and gaps are acceptable
p = pd.read_csv(PANEL, dtype=str)[['Primary_ID', 'F', 'R']]
p.columns = ['target','forward','reverse']
p.to_csv(PRIMERS, sep='\t', index=False)
! head {PRIMERS}

target	forward	reverse
0	TGTSTACGGTCTGAAGAACATc	TTATCCGGCTCCAAGTTAAGG
1	GAGCGtGCGGCcAAGATG	ACAgACCGACGTTAATGGC
2	CAGTCAAATTTCCAGACAATCT	CGGAAGTGCATTTGAAGG-AAaA
3	GaTATAAATTGTCGATCACACAAACT	TGCATTTATCGTAGTACAATCTCA
4	ATGcTBGTCATgATGATGATCT	CCGATCCACGATAAGGAGTAC
5	GCTGGCGCATAATTATCaCAAA	tTTCCACTTCATCGCTCGC
6	GCAAAATTTCCGTCCCATTA	TGTAATTAGCTGTGTCTTGTG
7	GTcTCgGAGCACATYGTG	TCGTACTTCATTATTCTTTGGACTG
8	AGTGRCTCCAGACGGTgTT	CCAAGGATTTGCTACTACCAcT


In [10]:
# reference genomes have to be with fasta extension
# naming of reference genome is then used for species inference
! ls {GENOME_DIR} | grep '.fasta'

anopheles-albimanus-steclachromosomesaalbs2.fasta
anopheles-aquasalis-A_aquasalis_v1.0-scaffolds.fasta
anopheles-arabiensis-dongolascaffoldsaarad1.fasta
anopheles-arabiensis-sharakhov.fasta
anopheles-atroparvus-ebrochromosomesaatre3.fasta
anopheles-christyi-achkn1017scaffoldsachra1.fasta
anopheles-coluzzii-mali-nihscaffoldsacolm1.fasta
anopheles-cracens-ASM209184v1-scaffolds.fasta
anopheles-culicifacies-37scaffoldsacula1.fasta
anopheles-darlingi-coariscaffoldsadarc3.fasta
anopheles-dirus-wrair2scaffoldsadirw1.fasta
anopheles-epiroticus-epiroticus2scaffoldsaepie1.fasta
anopheles-farauti-far1scaffoldsafarf2.fasta
anopheles-funestus-fumozchromosomesafunf3.fasta
anopheles-funestus-fumozscaffoldsafunf1.fasta
anopheles-gambiae-pestchromosomesagamp4.fasta
anopheles-gambiae-pimperenascaffoldsagams1.fasta
anopheles-koliensis-akwgs3contigsjxxb01.fasta
anopheles-maculatus-maculatus3scaffoldsamacm1.fasta
anopheles-melas-cm1001059ascaffoldsamelc2.fasta
anopheles-merus-mafscaffol

## Run extraction

In [4]:
# print the installed SeekDeep version together with the list of available programs
! SeekDeep --version

Version 2.6.4
Programs
Use SeekDeep [PROGRAM] --help to see more details about each program
Commands are not case sensitive
SeekDeep
1) extractor
2) extractorPairedEnd
3) makeSampleDirectories
4) processClusters
5) qluster
SeekDeepServer
1) genProjectConfig
2) popClusteringViewer
SeekDeepUtils
1) dryRunQualityFiltering
2) genTargetInfoFromGenomes
3) rBind
4) replaceUnderscores
5) runMultipleCommands
6) setupTarAmpAnalysis


In [6]:
# this takes quite some time
! rm -rf {OUT_DIR}
! SeekDeep genTargetInfoFromGenomes \
        --genomeDir {GENOME_DIR} \
        --primers {PRIMERS} \
        --numThreads 2 \
        --pairedEndLength {READLEN} \
        --dout {OUT_DIR} 



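No sequences were extracted for the Plasmodium primers - that's a good sign, as these primers are not expected to match the Anopheles reference genomes listed above.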

## Output folder structure
- `{target}` dir:
    - `{target}`.fasta - per-genome sequences; identical sequences shared by multiple genomes are reflected in the header
    - `{target}`_primersRemoved.fasta - same, without primers
    - `extractionCounts.tab.txt` - correct counts for primer hits and extraction counts
    - `genomeLocations/{genome}.bed` - per-genome amplicon coordinates
    - `refAlignments/{genome}_[forward|reverse]Primer.sorted.bam` - bowtie2 alignments of primers
    - `[forward|reverse]Primer.fasta` - all primers sequences resulting from ambiguous bases
- `forSeekDeep` dir - additional input files for SeekDeep:
    - `lenCutOffs.txt`: target minlen maxlen - from alignments to reference genomes
    - `overlapStatuses.txt`: target overlap status
    - `refSeqs/{target}.fasta`: same as `{target}/{target}.fasta`
- `locationsByGenome/{genome}.bed` - all amplicon locations for each genome

In [8]:
# targets with unexpected overlap
# NoOverLap is driven by one or more sequences being longer than 2 * READ_LEN
! grep -i -v R1EndsInR2 {OUT_DIR}/forSeekDeep/overlapStatuses.txt

target	status
16	NoOverLap
28	NoOverLap
