In [1]:
import pandas as pd
import altair as alt
from datetime import date
# Date on which the notebook is executed; it is formatted into the
# "(Updated ...)" subtitle of the heatmap further down.
today = date.today()

In [2]:
# Load the gzipped, tab-separated ENA metadata table directly from Dropbox.
# low_memory=False makes pandas parse the file in a single pass instead of
# in chunks, so each column gets one consistently inferred dtype.
sra = pd.read_csv(
    "https://www.dropbox.com/s/sy2c1ds5gbi16x2/ena.tsv.gz?dl=1",
    sep="\t",
    compression="gzip",
    low_memory=False,
)

In [3]:
# Number of rows in the freshly loaded table.
sra.shape[0]

6187417

In [4]:
# Peek at five randomly drawn rows (no seed is set, so the rows differ per run).
sra.sample(n=5)

Unnamed: 0,study_accession,base_count,accession,collection_date,country,culture_collection,description,sample_collection,sample_title,sequencing_method,...,library_name,library_construction_protocol,library_layout,instrument_model,instrument_platform,isolation_source,isolate,investigation_type,collection_date_submitted,center_name
2646888,PRJEB47340,110403990.0,SAMEA13299239,2022-02-20,Portugal,,NextSeq 550 sequencing; Portugal_PT28865_2022,,Portugal_PT28865_2022,,...,Portugal_PT28865_2022,RANDOM,PAIRED,NextSeq 550,ILLUMINA,,Portugal_PT28865_2022,,2022-02-20,NATIONAL INSTITUTE OF HEALTH DR. RICARDO JORGE
941734,PRJEB37886,553473296.0,SAMEA9596368,2021-07-27,United Kingdom,,Illumina NovaSeq 6000 sequencing; Illumina Nov...,,COG-UK/QEUH-1A37012,,...,NT1681963S / HT-116555:E12,,PAIRED,Illumina NovaSeq 6000,ILLUMINA,,,,2021-07-27,SC
4878797,PRJNA731152,351028409.0,SAMN26494479,2022-02-22,USA: California,,Illumina NovaSeq 6000 sequencing,,CDC Sars CoV2 Sequencing Baseline Constellation,,...,CDC Flu SC2,Fulgent COVIDSeq v5,PAIRED,Illumina NovaSeq 6000,ILLUMINA,nasal swab,SARS-CoV-2/Human/USA/CA-CDC-FG-285994/2022,,2022-02-22,
1223753,PRJEB37886,170955425.0,SAMEA10133356,2021-09-07,United Kingdom,,Illumina NovaSeq 6000 sequencing; Illumina Nov...,,COG-UK/ALDP-1DDEE3E,,...,NT1694981O / HT-119424:E2,,PAIRED,Illumina NovaSeq 6000,ILLUMINA,,,,2021-09-07,SC
2114871,PRJEB37886,489829419.0,SAMEA12286485,2022-01-05,United Kingdom,,Illumina NovaSeq 6000 sequencing; Illumina Nov...,,COG-UK/MILK-3128518,,...,NT1717049L / HT-128842:H6,,PAIRED,Illumina NovaSeq 6000,ILLUMINA,,not provided,,2022-01-05,SC


In [5]:
# Print every column name, one per line.
# A descriptive loop variable is used instead of `_`: IPython keeps the
# previous cell's output in `_`, and assigning to that name at top level
# makes IPython stop updating it for the rest of the session.
for column_name in sra.columns:
    print(column_name)

study_accession
base_count
accession
collection_date
country
culture_collection
description
sample_collection
sample_title
sequencing_method
sample_material
sample_description
sample_accession
sample_capture_status
sample_alias
library_selection
location
run_accession
read_count
project_name
library_source
library_strategy
library_name
library_construction_protocol
library_layout
instrument_model
instrument_platform
isolation_source
isolate
investigation_type
collection_date_submitted
center_name


In [6]:
# Narrow the table down to the six columns of interest.
sra = sra.loc[
    :,
    [
        "study_accession",
        "run_accession",
        "collection_date",
        "instrument_platform",
        "library_strategy",
        "library_construction_protocol",
    ],
]

In [7]:
# Replace the collection_date column with its parsed datetime equivalent;
# all other columns are carried over unchanged.
sra = sra.assign(
    collection_date=lambda frame: pd.to_datetime(frame["collection_date"])
)

In [8]:
# Keep only rows collected on or after 1 January 2020.
sra = sra.loc[lambda frame: frame["collection_date"] >= "2020-01-01"]

In [9]:
# Most recent collection date present once the 2020 lower bound is applied.
sra.collection_date.max()

Timestamp('2023-01-23 00:00:00')

In [10]:
# Discard rows dated after 13 February 2023.
# NOTE(review): the cutoff is hard-coded — presumably the date this analysis
# was last run; confirm before re-running against newer data.
sra = sra.loc[lambda frame: frame["collection_date"] <= "2023-02-13"]

In [11]:
# Most recent collection date remaining after the upper-bound filter.
sra.collection_date.max()

Timestamp('2023-01-23 00:00:00')

In [12]:
# Count the distinct run accessions that survived the date filters;
# this figure is interpolated into the heatmap title.
dataset_count = sra["run_accession"].nunique()
dataset_count

6164922

In [13]:
# Count distinct runs for every (platform, strategy) pair.
# reset_index turns the two group keys back into ordinary columns, giving a
# long-form table with columns instrument_platform, library_strategy and
# run_accession (the count).
heatmap_2d = (
    sra
    .groupby(["instrument_platform", "library_strategy"])["run_accession"]
    .nunique()
    .reset_index()
)

In [14]:
# Spot-check five random rows of the aggregated table.
heatmap_2d.sample(n=5)

Unnamed: 0,instrument_platform,library_strategy,run_accession
19,ION_TORRENT,RNA-Seq,75
26,OXFORD_NANOPORE,WGS,13670
18,ION_TORRENT,AMPLICON,108617
5,CAPILLARY,AMPLICON,3
8,DNBSEQ,RNA-Seq,64


In [61]:
# Heatmap of run counts: one rectangle per (instrument platform, library
# strategy) pair in heatmap_2d, coloured by the number of unique runs.
grid1 = alt.Chart(heatmap_2d).mark_rect(opacity=1).encode(
    x=alt.X(
        "instrument_platform:N",
        title="Instrument"
    ),
    y=alt.Y(
        "library_strategy:N",
        title="Strategy",
        # Put the strategy labels on the right-hand side of the grid.
        axis=alt.Axis(orient='right')
    ),
    color=alt.Color(
        "run_accession:Q",
        # The encoded field is the count of unique run accessions, so the
        # legend is labelled as runs, matching the "Number of runs" tooltip.
        title="# Runs",
        scale=alt.Scale(
            scheme="goldred",
            # Log scale for the colour ramp.
            type="log"
        ),
    ),
    tooltip=[
        alt.Tooltip(
            "instrument_platform:N",
            title="Machine"
        ),
        alt.Tooltip(
            "run_accession:Q",
            title="Number of runs"
        ),
        alt.Tooltip(
            "library_strategy:N",
            title="Protocol"
        )
    ]
).properties(
    width=500,
    height=150,
    title={
        # A list of strings renders as a two-line title.
        "text": [
            "Breakdown of {:,} datasets (unique accessions) from ENA".format(dataset_count),
            "by Platform and Library Strategy",
        ],
        # Stamp the chart with the date the notebook was executed.
        "subtitle": "(Updated {})".format(today.strftime("%B %d, %Y"))
    }
)

grid1

In [62]:
# Text overlay derived from grid1: same data and x/y encodings, but each
# cell shows its run count as bold, centred text.  Counts above 200 are
# drawn in white and the rest in black.
text1 = grid1.mark_text(
    fontWeight="bold",
    fontSize=12,
    baseline="middle",
    align="center",
).encode(
    color=alt.condition(
        alt.datum.run_accession > 200,
        alt.value("white"),
        alt.value("black"),
    ),
    text=alt.Text("run_accession:Q", format=",.0f"),
)

text1

In [63]:
# Stack the text labels on top of the coloured grid.
alt.layer(grid1, text1)

Unnamed: 0,instrument_platform,library_strategy,run_accession
0,BGISEQ,AMPLICON,21
1,BGISEQ,OTHER,1067
2,BGISEQ,RNA-Seq,64
3,BGISEQ,Targeted-Capture,38
4,BGISEQ,WGA,1
5,CAPILLARY,AMPLICON,3
6,DNBSEQ,AMPLICON,325
7,DNBSEQ,OTHER,5
8,DNBSEQ,RNA-Seq,64
9,ILLUMINA,AMPLICON,4833110
