In [None]:
# default_exp utils

In [None]:
%load_ext autoreload
%autoreload 2

# create references

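> Utility helpers: command-line argument parsing, locating reference files on the Ensembl FTP server, and laying out local reference/index paths.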

In [None]:
#hide
from nbdev.showdoc import *
import random

In [None]:
# export
import os
import sys
import re
import argparse
import ftplib
from create_reference import defaults

In [None]:
# export
def get_args(argv=None):
    """Parse the ``fetchr`` command-line options.

    Parameters
    ----------
    argv : list of str, optional
        Argument strings to parse. ``None`` (the default) lets argparse read
        ``sys.argv[1:]``, which is the behaviour of the original zero-argument
        call. Passing an explicit list makes the parser usable from a notebook
        or a test.

    Returns
    -------
    argparse.Namespace
        Attributes: ``species``, ``indexs``, ``reference_version``, ``outdir``,
        ``thread`` and the tool paths ``bwa``, ``bowtie``, ``bowtie2``,
        ``hisat2``, ``picard``.
    """
    parser = argparse.ArgumentParser(prog='fetchr',
                                     description='Fetch and Generate references for bioinformatics analysis',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter
                                    )
    ana_parser = parser.add_argument_group('Analysis parameters')
    ana_parser.add_argument('--species',nargs='+',choices=defaults.species,default=['homo_sapiens','mus_musculus'],
                       help='Reference or references from which species')
    # nargs='+' yields a list when the option is given, so the default must be
    # a list too; a bare string would make `args.indexs` a str in one case and
    # a list in the other (and iterate character by character).
    ana_parser.add_argument('--indexs',nargs='+',choices=defaults.softwares,default=['samtools'],
                       help='Using which software(s) to create indexes')
    ana_parser.add_argument('--reference-version','-rv',default=99,type=int,
                       help='For homo_sapiens, version=75 is the last version of Grch37 reference, '
                            'you can check the version in ftp://ftp.ensembl.org/pub/')
    ana_parser.add_argument('--outdir','-o',default='./',
                       help='Reference and indexes generated directory')
    ana_parser.add_argument('--thread','-t',default=4,type=int,
                       help='Thread number')
    conf_parser=parser.add_argument_group('Config parameters')
    conf_parser.add_argument('--bwa',default='bwa',type=str,
                            help='bwa execute path')
    conf_parser.add_argument('--bowtie',default='bowtie',type=str,
                            help='bowtie execute path')
    conf_parser.add_argument('--bowtie2',default='bowtie2',type=str,
                            help='bowtie2 execute path')
    conf_parser.add_argument('--hisat2',default='hisat2',type=str,
                            help='hisat2 execute path')
    conf_parser.add_argument('--picard',default='picard',type=str,
                            help='picard execute path, hint: this picard was install by conda')
    return parser.parse_args(argv)


In [None]:
# args=get_args('-h')

In [None]:
# export 

def file_exists(f):
    """Return True when the path `f` exists on disk, False otherwise.

    Thin wrapper around `os.path.exists`, so directories count as existing
    as well as regular files.
    """
    found = os.path.exists(f)
    return found

In [None]:
# Quick check of file_exists against a path relative to the current working directory.
file_exists('./00_utils.ipynb')


True

In [None]:
# export

def get_ffp(species):
    """Compile the case-insensitive FASTA filename regex for `species`.

    The regex text comes from `defaults.fasta_file_pattern`, with its
    `{species}` placeholder filled in.
    """
    regex_text = defaults.fasta_file_pattern.format(species=species)
    return re.compile(regex_text, re.IGNORECASE)

def get_gfp(species,version):
    """Compile the case-insensitive GTF filename regex for `species` and `version`.

    The regex text comes from `defaults.gtf_file_pattern`, with its
    `{species}` and `{version}` placeholders filled in.
    """
    regex_text = defaults.gtf_file_pattern.format(species=species, version=version)
    return re.compile(regex_text, re.IGNORECASE)

In [None]:
# export

def get_likely_file_from_ftp(ftp,ftp_server,version,species,ftype,dtype,pattern):
    """Return the FTP URL of the best-matching reference file for one species.

    Parameters
    ----------
    ftp : object with ``cwd(path)`` and ``nlst()`` (e.g. a logged-in ``ftplib.FTP``)
    ftp_server : str
        Host name used to build the returned ``ftp://`` URL.
    version : int or str
        Release number, substituted into ``/pub/release-{version}/``.
    species : str
        Species directory name on the server.
    ftype : {'fasta', 'gtf'}
        Kind of file to look for; selects the remote directory layout.
    dtype : str or None
        Sub-directory below the species for ``ftype == 'fasta'``; unused for gtf.
    pattern : compiled regular expression
        Only file names matched by ``pattern`` are considered.

    Returns
    -------
    str
        ``ftp://{ftp_server}{directory}{filename}``. Preference order:
        whole-genome primary assembly, then toplevel, then a ``.gtf.gz`` file.
        When several files fall in the same category, the last one listed wins.

    Raises
    ------
    ValueError
        If ``ftype`` is not supported, or no listed file qualifies.
    SystemExit
        On any ``ftplib`` error while listing the directory (the error is
        printed first).
    """
    if ftype == 'fasta':
        direction='/pub/release-{version}/{ftype}/{species}/{dtype}/'.format(
            version=version,
            species=species,
            ftype=ftype,
            dtype=dtype)
    elif ftype == 'gtf':
        direction='/pub/release-{version}/{ftype}/{species}/'.format(
            version=version,
            species=species,
            ftype=ftype)
    else:
        # Previously fell through with `direction` unbound (UnboundLocalError).
        raise ValueError("Unsupported ftype: {!r} (expected 'fasta' or 'gtf')".format(ftype))

    try:
        ftp.cwd(direction)
        files = ftp.nlst()
    except ftplib.all_errors as e:
        print(e)
        sys.exit(1)

    first_p = ''
    second_p = ''
    gtf_f = ''
    for f in files:
        if not pattern.search(f):
            continue
        if '.toplevel.' in f:
            second_p = f
        # Require the extension right after the tag: per-region files such as
        # `...dna.primary_assembly.LG9.fa.gz` also contain `.primary_assembly.`
        # and must not be mistaken for the whole-genome file.
        if '.primary_assembly.fa' in f:
            first_p = f
        if '.gtf.gz' in f:
            gtf_f = f

    # First non-empty candidate in priority order.
    chosen = first_p or second_p or gtf_f
    if chosen == '':
        raise ValueError('No fit ' + ftype +' file in ftp://'+ftp_server+direction)
    return 'ftp://'+ftp_server+direction+chosen
        

In [None]:
# Smoke test: show the compiled GTF pattern, then resolve FASTA and GTF URLs
# over a real FTP connection to `defaults.ensembl_ftp` (requires network access).
print(get_gfp('homo_sapiens',99))
with ftplib.FTP(defaults.ensembl_ftp) as ftp:
    ftp.login()
    # Two species are drawn at random without a seed, so the printed URLs
    # differ from run to run.
    for s in random.sample(defaults.species,2):
        print(s)
        print(get_likely_file_from_ftp(ftp,defaults.ensembl_ftp,'99',s,'fasta','dna',get_ffp(s)))
        print(get_likely_file_from_ftp(ftp,defaults.ensembl_ftp,'99',s,'gtf',None,get_gfp(s,99)))


re.compile('homo_sapiens.+?99.gtf.gz', re.IGNORECASE)
oreochromis_niloticus
ftp://ftp.ensembl.org/pub/release-99/fasta/oreochromis_niloticus/dna/Oreochromis_niloticus.O_niloticus_UMD_NMBU.dna.primary_assembly.LG9.fa.gz
ftp://ftp.ensembl.org/pub/release-99/gtf/oreochromis_niloticus/Oreochromis_niloticus.O_niloticus_UMD_NMBU.99.gtf.gz
gallus_gallus
ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz
ftp://ftp.ensembl.org/pub/release-99/gtf/gallus_gallus/Gallus_gallus.GRCg6a.99.gtf.gz


In [None]:
# export

def get_local_files(outdir,species,version):
    """Build the local path layout for one species/version and create its directory.

    Parameters
    ----------
    outdir : str
        Root output directory.
    species : str
    version : int or str

    Returns
    -------
    dict
        Maps each name to a string path (plus the three inputs, echoed back):
        ``outdir``, ``species``, ``version``, ``sample_outdir``,
        ``local_genome_fasta``, ``local_transcriptome_gtf``, ``bwa_idx``,
        ``samtools_idx``, ``picard_idx``, ``local_genome_fasta_gz``,
        ``local_transcriptome_gtf_gz``, ``bowtie_idx``, ``bowtie2_idx``,
        ``hisat2_idx``.

    Side effects
    ------------
    Creates ``{outdir}/{species}/{version}`` if it does not exist.
    """
    sample_outdir='{outdir}/{species}/{version}'.format(outdir=outdir,species=species,version=version)
    local_genome_fasta = sample_outdir + '/genome.fa'
    local_transcriptome_gtf = sample_outdir + '/transcriptome.gtf'
    os.makedirs(sample_outdir,exist_ok=True)
    # Explicit dict instead of `locals()`: the returned keys no longer depend
    # on which temporaries happen to exist in the function body.
    return {
        'outdir': outdir,
        'species': species,
        'version': version,
        'sample_outdir': sample_outdir,
        'local_genome_fasta': local_genome_fasta,
        'local_transcriptome_gtf': local_transcriptome_gtf,
        # Plain string like every other entry; a stray trailing comma used to
        # turn this into a 1-tuple.
        'bwa_idx': local_genome_fasta,
        'samtools_idx': local_genome_fasta + '.fai',
        # Swap only the final extension; str.replace('.fa', ...) would also
        # rewrite any '.fa' occurring inside `outdir` or `species`.
        'picard_idx': os.path.splitext(local_genome_fasta)[0] + '.dict',
        'local_genome_fasta_gz': local_genome_fasta + '.gz',
        'local_transcriptome_gtf_gz': local_transcriptome_gtf + '.gz',
        'bowtie_idx': sample_outdir + '/bowtie_idx/genome',
        'bowtie2_idx': sample_outdir + '/bowtie2_idx/genome',
        'hisat2_idx': sample_outdir + '/hisat2_idx/genome',
    }

In [None]:

# Example layout for a toy species/version. Note that get_local_files calls
# os.makedirs, so this also creates `outdir/human/99` under the working directory.
para=get_local_files('outdir','human',99)
para

{'outdir': 'outdir',
 'species': 'human',
 'version': 99,
 'sample_outdir': 'outdir/human/99',
 'local_genome_fasta': 'outdir/human/99/genome.fa',
 'local_transcriptome_gtf': 'outdir/human/99/transcriptome.gtf',
 'bwa_idx': ('outdir/human/99/genome.fa',),
 'samtools_idx': 'outdir/human/99/genome.fa.fai',
 'picard_idx': 'outdir/human/99/genome.dict',
 'local_genome_fasta_gz': 'outdir/human/99/genome.fa.gz',
 'local_transcriptome_gtf_gz': 'outdir/human/99/transcriptome.gtf.gz',
 'bowtie_idx': 'outdir/human/99/bowtie_idx/genome',
 'bowtie2_idx': 'outdir/human/99/bowtie2_idx/genome',
 'hisat2_idx': 'outdir/human/99/hisat2_idx/genome'}

In [None]:
# export

def get_paras(args,ftp,ftp_server):
    """Assemble one parameter dict per species listed in `args.species`.

    For every species, the genome FASTA URL and the GTF URL are resolved via
    `get_likely_file_from_ftp` (using `args.reference_version` as the release),
    and the local path layout comes from `get_local_files` under `args.outdir`.

    Returns a list of dicts with the keys `species`, `version`,
    `link_genome_fasta`, `link_transcriptome_gtf` and `local_files`.
    """
    collected = []
    for sp in args.species:
        release = args.reference_version
        # Remote lookups happen before the local layout is built, FASTA first.
        genome_url = get_likely_file_from_ftp(
            ftp, ftp_server, release, sp, 'fasta', 'dna', get_ffp(sp))
        gtf_url = get_likely_file_from_ftp(
            ftp, ftp_server, release, sp, 'gtf', None, get_gfp(sp, release))
        local_layout = get_local_files(args.outdir, sp, release)
        collected.append({
            'species': sp,
            'version': release,
            'link_genome_fasta': genome_url,
            'link_transcriptome_gtf': gtf_url,
            'local_files': local_layout,
        })
    return collected


In [None]:
# args=get_args()

In [None]:
# hide
from collections import namedtuple
# Minimal stand-in for the parsed CLI arguments: only the three attributes
# that get_paras reads. Build a real instance rather than assigning
# attributes on the namedtuple class itself.
Args=namedtuple('args',['species','reference_version','outdir'])
args=Args(species=['homo_sapiens'],
          reference_version='99',
          outdir='/tmp/outdir')
# Requires network access: lists a remote directory, then resolves the
# reference URLs for the species above.
with ftplib.FTP(defaults.ensembl_ftp) as ftp:
    ftp.login()
    ftp.cwd('/pub/release-99/fasta/homo_sapiens')
    print(ftp.nlst())
    print(get_paras(args,ftp,defaults.ensembl_ftp))

['cdna', 'cds', 'dna', 'dna_index', 'ncrna', 'pep']
[{'species': 'homo_sapiens', 'version': '99', 'link_genome_fasta': 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz', 'link_transcriptome_gtf': 'ftp://ftp.ensembl.org/pub/release-99/gtf/homo_sapiens/Homo_sapiens.GRCh38.99.gtf.gz', 'local_files': {'outdir': '/tmp/outdir', 'species': 'homo_sapiens', 'version': '99', 'sample_outdir': '/tmp/outdir/homo_sapiens/99', 'local_genome_fasta': '/tmp/outdir/homo_sapiens/99/genome.fa', 'local_transcriptome_gtf': '/tmp/outdir/homo_sapiens/99/transcriptome.gtf', 'bwa_idx': ('/tmp/outdir/homo_sapiens/99/genome.fa',), 'samtools_idx': '/tmp/outdir/homo_sapiens/99/genome.fa.fai', 'picard_idx': '/tmp/outdir/homo_sapiens/99/genome.dict', 'local_genome_fasta_gz': '/tmp/outdir/homo_sapiens/99/genome.fa.gz', 'local_transcriptome_gtf_gz': '/tmp/outdir/homo_sapiens/99/transcriptome.gtf.gz', 'bowtie_idx': '/tmp/outdir/homo_sapiens/99/bowtie_idx/genome',

In [None]:
#hide
# Run nbdev's notebook-to-script export; it prints one "Converted ..." line per notebook.
from nbdev.export import notebook2script
notebook2script()

Converted 00_utils.ipynb.
Converted 01_defaults.ipynb.
Converted 02_tasks.ipynb.
Converted 03_commands.ipynb.
Converted 04_pipelines.ipynb.
Converted 05_recipes.ipynb.
Converted index.ipynb.
