In [1]:
import pandas as pd
import altair as alt
import tabulate
from datetime import date
# Capture the run date once; it is interpolated into the heatmap subtitle ("Updated ...").
today = date.today()

In [20]:
# Load the gzip-compressed, tab-separated ENA subsample directly from GitHub.
ena_subsample_url = "https://github.com/nekrut/BMMB554/raw/master/2023/data/ena_subsample.tsv.gz"

sra = pd.read_csv(
    ena_subsample_url,
    sep="\t",
    compression='gzip',
    # Parse the file in a single pass rather than in internal chunks.
    low_memory=False,
)

In [21]:
# Number of rows (one per record) in the loaded table.
sra.shape[0]

99630

In [22]:
# Peek at five random rows. The fixed seed keeps this preview identical across
# Restart & Run All, so the displayed table does not change on every execution.
sra.sample(5, random_state=42)

Unnamed: 0.1,Unnamed: 0,study_accession,run_accession,collection_date,instrument_platform,library_strategy,library_construction_protocol
68904,5860135,PRJNA720050,SRR21631708,2022-09-07,ILLUMINA,AMPLICON,Helix Hybrid-Capture Test
65568,2500510,PRJEB37886,ERR8648404,2022-02-08,ILLUMINA,AMPLICON,
56355,1928331,PRJEB37886,ERR7762020,2021-12-07,ILLUMINA,AMPLICON,
10166,508036,PRJEB37886,ERR5473708,2021-02-24,ILLUMINA,AMPLICON,Nextera XT|Internal Nextera XT (reduced volume)
80053,2035026,PRJEB37886,ERR7883989,2021-12-17,ILLUMINA,AMPLICON,


In [23]:
# List the column names, one per line.
# A named loop variable is used instead of `_`, which IPython reserves for the
# previous cell's output and which conventionally marks an unused value.
for column_name in sra.columns:
    print(column_name)

Unnamed: 0
study_accession
run_accession
collection_date
instrument_platform
library_strategy
library_construction_protocol


In [6]:
# Keep only the metadata fields used in the rest of the analysis; any other
# column (such as the leftover "Unnamed: 0" index column) is dropped.
kept_columns = [
    'study_accession',
    'run_accession',
    'collection_date',
    'instrument_platform',
    'library_strategy',
    'library_construction_protocol',
]
sra = sra.loc[:, kept_columns]

In [8]:
# Convert the collection dates to pandas datetimes so they can be compared
# and aggregated chronologically.
parsed_collection_dates = pd.to_datetime(sra["collection_date"])
sra = sra.assign(collection_date=parsed_collection_dates)

In [11]:
# Report the span of collection dates currently present in the table.
collection_dates = sra['collection_date']
print('Earliest entry:', collection_dates.min())
print('Latest entry:', collection_dates.max())

Earliest entry: 2020-01-01 00:00:00
Latest entry: 2023-01-20 00:00:00


In [10]:
# Restrict the table to collection dates inside the window 2020-01-01 .. 2023-02-16.
# `between` is inclusive on both ends by default, matching a `>=` / `<=` pair.
within_window = sra['collection_date'].between('2020-01-01', '2023-02-16')
sra = sra[within_window]

In [11]:
# Re-check the date span to confirm what the window filter left in place.
earliest_date, latest_date = sra['collection_date'].min(), sra['collection_date'].max()
print('Earliest entry:', earliest_date)
print('Latest entry:', latest_date)

Earliest entry: 2020-01-01 00:00:00
Latest entry: 2023-01-20 00:00:00


In [12]:
# Count the distinct run accessions collected on the most recent date.
most_recent_date = sra['collection_date'].max()
collected_on_most_recent = sra['collection_date'] == most_recent_date
sra.loc[collected_on_most_recent, 'run_accession'].nunique()

1

In [13]:
# Count distinct run accessions for every (platform, strategy) combination and
# flatten the result into a plain table with one row per combination.
heatmap_2d = (
    sra
    .groupby(['instrument_platform', 'library_strategy'])['run_accession']
    .nunique()
    .reset_index()
)

In [14]:
# Display the aggregated table: one row per platform/strategy pair with its unique-run count.
heatmap_2d

Unnamed: 0,instrument_platform,library_strategy,run_accession
0,BGISEQ,OTHER,18
1,BGISEQ,RNA-Seq,3
2,DNBSEQ,AMPLICON,5
3,ILLUMINA,AMPLICON,77943
4,ILLUMINA,OTHER,7
5,ILLUMINA,RNA-Seq,556
6,ILLUMINA,Targeted-Capture,239
7,ILLUMINA,WCS,3
8,ILLUMINA,WGA,1511
9,ILLUMINA,WGS,946


In [15]:
# Background layer of the heatmap: one coloured rectangle per
# platform/strategy pair, shaded by the number of unique runs.
heatmap_encodings = dict(
    x=alt.X("instrument_platform:N", title="Instrument"),
    y=alt.Y(
        "library_strategy:N",
        title="Strategy",
        axis=alt.Axis(orient='right'),
    ),
    color=alt.Color(
        "run_accession:Q",
        title="# Samples",
        # Log colour scale, since the counts differ by orders of magnitude.
        scale=alt.Scale(scheme="goldred", type="log"),
    ),
    tooltip=[
        alt.Tooltip("instrument_platform:N", title="Machine"),
        alt.Tooltip("run_accession:Q", title="Number of runs"),
        alt.Tooltip("library_strategy:N", title="Protocol"),
    ],
)

# Two-line title plus a subtitle stamped with the date the notebook was run.
heatmap_title = {
    "text": [
        "Breakdown of datasets (unique accessions) from ENA",
        "by Platform and Library Strategy",
    ],
    "subtitle": "(Updated {})".format(today.strftime("%B %d, %Y")),
}

back = (
    alt.Chart(heatmap_2d)
    .mark_rect(opacity=1)
    .encode(**heatmap_encodings)
    .properties(width=500, height=150, title=heatmap_title)
)

back

In [17]:
# Foreground layer: reuse the heatmap chart but draw the run count as a
# centred, bold text label in each cell.
label_style = dict(
    align="center",
    baseline="middle",
    fontSize=12,
    fontWeight="bold",
)

# White text where the count exceeds 200, black text otherwise.
label_color = alt.condition(
    alt.datum.run_accession > 200,
    alt.value("white"),
    alt.value("black"),
)

front = back.mark_text(**label_style).encode(
    text=alt.Text("run_accession:Q", format=",.0f"),
    color=label_color,
)

front

In [18]:
# Overlay the text labels on the coloured rectangles to form the final annotated heatmap.
alt.layer(back, front)