# Python notebook: search SRA for a list of putative SRP projects, then fetch their expanded metadata

Using the pysradb package (https://pypi.org/project/pysradb/, https://bio.tools/pysradb), we searched the Sequence Read Archive (SRA) of the US NIH (https://www.ncbi.nlm.nih.gov/sra) to determine whether the samples contained within each project are relevant to our coronovirus-nf experiment.

The experiment, as outlined, seeks to confirm or refute the initial finding that the virus disrupts cell-cell junctions. Such disruption could be the means by which the virus gains access to the cell and causes injury, and it may lay bare a range of susceptibilities among individuals.


## Installation

Following the quick-start directions at https://saket-choudhary.me/pysradb/ (specifically https://saket-choudhary.me/pysradb/quickstart.html), we installed the package and verified its version.

Note that although the package can be installed via `conda install`, doing so results in an older version than installing it through `pip install`.


In [126]:
pip install -U pysradb

Requirement already up-to-date: pysradb in /opt/conda/lib/python3.7/site-packages (0.10.4)
Note: you may need to restart the kernel to use updated packages.


In [127]:
!pysradb  --version


  from pandas import Panel
  from pandas import Panel
pysradb 0.10.4


In [9]:
# Search for Calu-3 cell-line RNA-seq records and save the result table to a
# text file; not all of the hits may be infection experiments.
!pysradb search '"calu-3" "rna-seq"' > calu-3_rna-seq.txt

  from pandas import Panel
  from pandas import Panel


In [10]:
!awk -F " " '{print $1}' calu-3_rna-seq.txt  > calu-3_rna-seq_srp.txt

In [11]:
!head calu-3_rna-seq_srp.txt

study_accession
SRP253951
SRP253951
SRP253951
SRP253951
SRP253951
SRP253951
SRP170549
SRP170549
SRP170549


In [12]:
# Make the list non-redundant: sort it and drop duplicate lines.
# The "study_accession" header line is still present in the result.
!sort -u calu-3_rna-seq_srp.txt > s.calu-3_rna-seq_srp.txt

In [14]:
# Count the lines in the de-duplicated Calu-3 list.
!wc -l s.calu-3_rna-seq_srp.txt

5 s.calu-3_rna-seq_srp.txt


In [15]:
!more s.calu-3_rna-seq_srp.txt

SRP049988
SRP056612
SRP170549
SRP253951
study_accession


In [7]:
# Search more specifically for SARS infection (human RNA-seq records) and
# save the result table to a text file.
!pysradb search '"sars" "rna-seq" "homo sapiens"' > sars_rna-seq_homo-sapiens.txt

  from pandas import Panel
  from pandas import Panel


In [8]:
!awk -F " " '{print $1}' sars_rna-seq_homo-sapiens.txt > sars_rna-seq_homo-sapiens_srp.txt

In [9]:
# Make the list unique: sort it and drop duplicate lines.
!sort -u sars_rna-seq_homo-sapiens_srp.txt > s.sars_rna-seq_homo-sapiens_srp.txt

In [10]:
# Count the lines in the de-duplicated SARS list.
!wc -l s.sars_rna-seq_homo-sapiens_srp.txt

17 s.sars_rna-seq_homo-sapiens_srp.txt


In [11]:
# Look for MERS (human RNA-seq records) and save the result table to a text file.
!pysradb search '"mers" "rna-seq" "homo sapiens"' > mers_rna-seq_homo-sapiens.txt

  from pandas import Panel
  from pandas import Panel


In [14]:
!awk -F " " '{print $1}' mers_rna-seq_homo-sapiens.txt > mers_rna-seq_homo-sapiens_srp.txt

In [15]:
# Make the list unique: sort it and drop duplicate lines.
!sort -u mers_rna-seq_homo-sapiens_srp.txt > s.mers_rna-seq_homo-sapiens_srp.txt

In [16]:
# Count the lines in the de-duplicated MERS list.
!wc -l s.mers_rna-seq_homo-sapiens_srp.txt

7 s.mers_rna-seq_homo-sapiens_srp.txt


In [17]:
# Look for H1N1 (human RNA-seq records) and save the result table to a text file.
!pysradb search '"h1n1" "rna-seq" "homo sapiens"' > h1n1_rna-seq_homo-sapiens.txt

  from pandas import Panel
  from pandas import Panel


In [19]:
!awk -F " " '{print $1}' h1n1_rna-seq_homo-sapiens.txt > h1n1_rna-seq_homo-sapiens_srp.txt

In [20]:
# Make the list unique: sort it and drop duplicate lines.
!sort -u h1n1_rna-seq_homo-sapiens_srp.txt > s.h1n1_rna-seq_homo-sapiens_srp.txt

In [21]:
# make the list unique
!wc -l s.h1n1_rna-seq_homo-sapiens.txt

23 s.h1n1_rna-seq_homo-sapiens.txt


In [22]:
# List every file in the working directory that matches s.*txt.
!ls s.*txt

s.calu-3_rna-seq_srp.txt	     s.mers_rna-seq_homo-sapiens.txt
s.h1n1_rna-seq_homo-sapiens.txt      s.sars_rna-seq_homo-sapiens_srp.txt
s.mers_rna-seq_homo-sapiens_srp.txt


In [24]:
# Rename the H1N1 list so it carries the same _srp suffix as the other lists.
# NOTE(review): no visible cell writes s.h1n1_rna-seq_homo-sapiens.txt (the
# sort cell already writes the _srp name), so this rename looks like a
# leftover from an earlier run -- confirm it is still needed.
!mv s.h1n1_rna-seq_homo-sapiens.txt s.h1n1_rna-seq_homo-sapiens_srp.txt

In [27]:
# Long listing of every file matching s.*txt, to check names and sizes.
!ls -l s.*txt

-rw-r--r-- 1 jovyan users  56 Apr 16 18:21 s.calu-3_rna-seq_srp.txt
-rw-r--r-- 1 jovyan users 236 Apr 16 18:44 s.h1n1_rna-seq_homo-sapiens_srp.txt
-rw-r--r-- 1 jovyan users  76 Apr 16 18:40 s.mers_rna-seq_homo-sapiens_srp.txt
-rw-r--r-- 1 jovyan users 176 Apr 16 18:36 s.sars_rna-seq_homo-sapiens_srp.txt


In [28]:
# make a single list
!cat s.*txt > calu-3_h1n1_mers_sars_rna-seq_srp.txt

In [31]:
# Clean it up: drop every line containing the text 'study_accession'
# (the header line carried over from each search table).
!grep -v 'study_accession' calu-3_h1n1_mers_sars_rna-seq_srp.txt > srp.txt

In [32]:
 # Count the lines in the combined list after the header lines were removed.
 !wc -l srp.txt

48 srp.txt


In [35]:
# Final list: sort the combined accessions and drop duplicate lines.
!sort -u srp.txt > s.srp.txt

In [37]:
# Count the lines in the final de-duplicated list.
!wc -l s.srp.txt

43 s.srp.txt


In [38]:
# Print the final de-duplicated list.
!cat s.srp.txt

ERP105150
ERP114921
SRP032367
SRP040070
SRP040661
SRP049988
SRP056027
SRP056612
SRP059219
SRP060334
SRP065794
SRP076102
SRP077920
SRP082170
SRP091886
SRP093448
SRP097673
SRP103821
SRP113333
SRP118721
SRP136329
SRP148144
SRP162136
SRP170549
SRP182839
SRP189350
SRP192740
SRP198374
SRP227272
SRP230823
SRP239551
SRP242169
SRP248092
SRP249613
SRP250446
SRP250653
SRP251618
SRP252988
SRP253640
SRP253951
SRP254080
SRP254688
SRP255769


In [58]:
# Search for RNA-seq records on respiratory cell lines; the result table is
# printed in the cell output rather than saved to a file.
!pysradb search '"rna-seq" "respiratory" "cell-lines"'

  from pandas import Panel
  from pandas import Panel
study_accession experiment_accession experiment_title                              experiment_desc                               organism_taxid  organism_name library_strategy library_source  library_selection sample_accession sample_title instrument           total_spots total_size  run_accession run_total_spots run_total_bases
SRP139919       SRX3932496                  GSM3097024: P3; Homo sapiens; RNA-Seq         GSM3097024: P3; Homo sapiens; RNA-Seq  9606            Homo sapiens  RNA-Seq          TRANSCRIPTOMIC  cDNA              SRS3165478                    Illumina HiSeq 2500  24497335    1360950284  SRR6998005    24497335        2415907957    
SRP139919       SRX3932495                  GSM3097023: P2; Homo sapiens; RNA-Seq         GSM3097023: P2; Homo sapiens; RNA-Seq  9606            Homo sapiens  RNA-Seq          TRANSCRIPTOMIC  cDNA              SRS3165479                    Illumina HiSeq 2500  34328839    1931534618  

In [1]:
!pysradb metadata SRP139919 --detailed --expand > ../data/SRP139919_GSM_expanded.txt
!pysradb metadata SRP170549 --detailed --expand > ../data/SRP170549_GSM_expanded.txt
!pysradb metadata SRP040070 --detailed --expand > ../data/SRP040070_GSM_expanded.txt
!pysradb metadata SRP049988 --detailed --expand > ../data/SRP049988_GSM_expanded.txt
!pysradb metadata SRP056612 --detailed --expand > ../data/SRP056612_GSM_expanded.txt
!pysradb metadata SRP091886 --detailed --expand > ../data/SRP091886_GSM_expanded.txt
!pysradb metadata SRP097673 --detailed --expand > ../data/SRP097673_GSM_expanded.txt
!pysradb metadata SRP118721 --detailed --expand > ../data/SRP118721_GSM_expanded.txt
!pysradb metadata SRP060334 --detailed --expand > ../data/SRP060334_GSM_expanded.txt
!pysradb metadata SRP076102 --detailed --expand > ../data/SRP076102_GSM_expanded.txt
!pysradb metadata SRP189350 --detailed --expand > ../data/SRP189350_GSM_expanded.txt
!pysradb metadata SRP227272 --detailed --expand > ../data/SRP227272_GSM_expanded.txt
!pysradb metadata SRP230823 --detailed --expand > ../data/SRP230823_GSM_expanded.txt
!pysradb metadata SRP253951 --detailed --expand > ../data/SRP253951_GSM_expanded.txt
!pysradb metadata SRP254688 --detailed --expand > ../data/SRP254688_GSM_expanded.txt
!pysradb metadata SRP248940 --detailed --expand > ../data/SRP248940_GSM_expanded.txt


  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
  from pandas import Panel
Traceback (most recent call last):
  File "/opt/conda/bin/pysradb", line 8, in <module>
    sys.exit(parse_args())
  File "/opt/conda/lib/python3.7/site-packages/pysradb/cli.py", line 1048, in parse_args
    args.saveto,
  File "/opt/conda/lib/python3.7/site-packages/pysradb/cli.py", line 97, in metadata
    expand_sample_attributes=expand,
  File "/opt/conda/lib/python3.7/site-packages/pysradb/sraweb.py", line 508, in sra_metadata
    ena_results = self.fetch_ena_fastq(srp)
  File "/opt/conda/lib/python3.7/s