# Notebook 2: Download shotgun SRA data

This notebook lists the samples, their source accessions, and the code used to download the shotgun sequence data that will be mapped to the reference genome.

In [1]:
# conda install -c bioconda sra-tools entrez-direct

# git clone --single-branch --branch hotfix https://github.com/dereneaton/ipyrad.git
# cd ipyrad/
# pip install -e .

In [2]:
import pandas as pd
import numpy as np
import ipyrad.analysis as ipa

### Parallel Client

In [4]:
# Connect to the ipyparallel cluster; this client is handed to sra.run()
# in the download cells below.
# NOTE(review): ipp.Client() with no arguments presumably expects a
# controller to be running already (e.g. started with `ipcluster start`) — confirm.
import ipyparallel as ipp
ipyclient = ipp.Client()

            Controller appears to be listening on localhost, but not on this machine.
            If this is true, you should specify Client(...,sshserver='you@oud')
            or instruct your controller to listen on an external IP.


### Sampling table

In [5]:
# load in table and print
df = pd.read_csv("./data/SRA-table.csv")
df[["Species", "Group", "BioSample", "Sample", "Study", "PRJ"]]


Unnamed: 0,Species,Group,BioSample,Sample,Study,PRJ
0,Macaca mulatta (reference),mulatta,SAMN07305139,SRP103539,SRP103539,PRJNA382404
1,Macaca mulatta (Chinese),mulatta,SAMN05883701,SRS1762028,SRP092140,PRJNA345528
2,Macaca fascicularis,mulatta,SAMN00811240,SRS300124,SRP011089,PRJNA20409
3,Macaca fuscata,mulatta,SAMD00011919,DRS001583,DRP000620,PRJDB2459
4,Macaca thibethana,sinica,SAMN02390221,SRS498543,SRP032525,PRJNA226187
5,Macaca assamensis,sinica,SAMN04316321,SRS1196892,SRP067118,PRJNA305009
6,Macaca arctoides,sinica,SAMN04316319,SRS1196879,SRP067118,PRJNA305009
7,Macaca nemestrina,nemestrina,SAMN07503410,SRS2436621,SRP115775,PRJNA398198
8,Macaca tonkeana,nemestrina,SAMN07503429,SRS2441459,SRP115775,PRJNA398198
9,Macaca nigra,nemestrina,SAMN07503430,SRS2436633,SRP115775,PRJNA398198


# Query NCBI and download run files
For each Study, we query NCBI for its run table, select the best Sample (e.g., the run with the most spots), and note its run ID (SRR).

### *Macaca mulatta* (Chinese)

In [5]:
# Query NCBI for every run in study SRP092140 and show the five runs with
# the most spots. The nine requested field indices line up with the nine
# columns displayed in the output (Run ... Sex).
sra = ipa.sratools("SRP092140")
df = sra.fetch_runinfo([1, 4, 6, 7, 25, 28, 29, 30, 35])
df.sort_values("spots", ascending=False).head(5)

Fetching project data...

Unnamed: 0,Run,spots,spots_with_mates,avgLength,Sample,TaxID,ScientificName,SampleName,Sex
20,SRR4454031,91947333,91947333,200,SRS1762020,9544,Macaca mulatta,M13,missing
6,SRR4454029,83943423,83943423,200,SRS1762018,9544,Macaca mulatta,M11,missing
14,SRR4454030,68556063,68556063,200,SRS1762019,9544,Macaca mulatta,M12,missing
21,SRR5009874,57066114,57066114,200,SRS1762020,9544,Macaca mulatta,M13,missing
13,SRR5009873,46149740,46149740,200,SRS1762019,9544,Macaca mulatta,M12,missing


In [None]:
# Download SRR4454031, the run with the most spots in the run table for
# study SRP092140.
# NOTE(review): name_fields=(29, 30, 1) presumably builds the output file
# names from ScientificName, SampleName and Run joined by "-" — confirm
# against the ipyrad sratools documentation.
sra = ipa.sratools("SRR4454031")
sra.run(
    name_fields=(29, 30, 1),
    name_separator="-",
    ipyclient=ipyclient,
)

### Macaca fascicularis
The run metadata for this project incorrectly reports the data as unpaired (`spots_with_mates` is 0), but the reads are in fact paired, so we pass `split_pairs=True` to `.run()` below.

In [6]:
# Query NCBI for every run in study SRP011089 and show the five runs with
# the most spots. The nine requested field indices line up with the nine
# columns displayed in the output (Run ... Sex).
sra = ipa.sratools("SRP011089")
df = sra.fetch_runinfo([1, 4, 6, 7, 25, 28, 29, 30, 35])
df.sort_values("spots", ascending=False).head(5)

Fetching project data...

Unnamed: 0,Run,spots,spots_with_mates,avgLength,Sample,TaxID,ScientificName,SampleName,Sex
50,SRR445695,80398858,0,200,SRS300124,9541,Macaca fascicularis,125200,
49,SRR445694,80321376,0,200,SRS300124,9541,Macaca fascicularis,125200,
10,SRR445630,36095170,0,200,SRS300124,9541,Macaca fascicularis,125200,
22,SRR445666,36003689,0,200,SRS300124,9541,Macaca fascicularis,125200,
18,SRR445662,35992819,0,200,SRS300124,9541,Macaca fascicularis,125200,


In [None]:
# Download SRR445695, the run with the most spots in the run table for
# study SRP011089. split_pairs=True is passed because this study's
# metadata reports zero spots_with_mates even though the reads are paired.
# NOTE(review): name_fields=(29, 30, 1) presumably builds the output file
# names from ScientificName, SampleName and Run joined by "-" — confirm
# against the ipyrad sratools documentation.
sra = ipa.sratools("SRR445695")
sra.run(
    name_fields=(29, 30, 1),
    name_separator="-",
    split_pairs=True,
    ipyclient=ipyclient,
)

### Macaca thibetana

In [6]:
# Macaca thibetana: query NCBI for the runs in study SRP032525 and display
# the full run table. The nine requested field indices line up with the
# nine columns displayed in the output (Run ... Sex).
sra = ipa.sratools("SRP032525")
df = sra.fetch_runinfo([1, 4, 6, 7, 25, 28, 29, 30, 35])
df

Fetching project data...

Unnamed: 0,Run,spots,spots_with_mates,avgLength,Sample,TaxID,ScientificName,SampleName,Sex
0,SRR1024051,637506195,637506195,180,SRS498543,54602,Macaca thibetana,Tibetan macaque NO. 3,


In [None]:
# Download SRR1024051, the only run listed for study SRP032525.
# NOTE(review): name_fields=(29, 30, 1) presumably builds the output file
# names from ScientificName, SampleName and Run joined by "-" — confirm
# against the ipyrad sratools documentation.
sra = ipa.sratools("SRR1024051")
sra.run(
    ipyclient=ipyclient,
    name_fields=(29, 30, 1),
    name_separator="-",
)

### Macaca assamensis AND arctoides

In [17]:
# Study SRP067118 holds the runs for both Macaca arctoides and
# Macaca assamensis; query NCBI and display the full run table. The nine
# requested field indices line up with the nine displayed columns.
sra = ipa.sratools("SRP067118")
df = sra.fetch_runinfo([1, 4, 6, 7, 25, 28, 29, 30, 35])
df

Fetching project data...

Unnamed: 0,Run,spots,spots_with_mates,avgLength,Sample,TaxID,ScientificName,SampleName,Sex
0,SRR2981139,500517130,500517130,200,SRS1196879,9540,Macaca arctoides,SM1,female
1,SRR2981140,235902683,235902683,250,SRS1196878,9540,Macaca arctoides,SM2,female
2,SRR2981114,615827332,615827332,250,SRS1196892,9551,Macaca assamensis,XH1,male


In [20]:
# Macaca nigra, M. tonkeana and M. nemestrina all come from study
# SRP115775; query NCBI and display the full run table. The nine requested
# field indices line up with the nine displayed columns.
sra = ipa.sratools("SRP115775")
df = sra.fetch_runinfo([1, 4, 6, 7, 25, 28, 29, 30, 35])
df

Fetching project data...

Unnamed: 0,Run,spots,spots_with_mates,avgLength,Sample,TaxID,ScientificName,SampleName,Sex
0,SRR5947294,449692098,449692098,302,SRS2436633,54600,Macaca nigra,PF660,female
1,SRR5947293,448474934,448474934,302,SRS2441459,40843,Macaca tonkeana,PM592,male
2,SRR5947292,465782866,465782866,302,SRS2436621,90388,Macaca nemestrina nemestrina,PM664,male


### Get just the best run for each species...