In [None]:
import pandas as pd
import altair as alt
import tabulate
from datetime import date
# Today's date; it is formatted into the chart subtitle ("Updated ...").
today = date.today()

In [None]:
# Load the gzip-compressed, tab-separated ENA subsample directly from GitHub.
# NOTE(review): low_memory=False is presumably here to avoid mixed-dtype
# inference on wide columns -- confirm against the pandas read_csv docs.
sra = pd.read_csv(
    "https://github.com/nekrut/BMMB554/raw/master/2023/data/ena_subsample.tsv.gz",
    sep="\t",
    compression="gzip",
    low_memory=False,
)

In [None]:
# Number of rows in the freshly loaded table.
sra.shape[0]

In [None]:
# Preview five random rows. The fixed seed keeps the preview identical every
# time the notebook is re-run from a fresh kernel.
sra.sample(5, random_state=42)

In [None]:
# Print every column name, one per line. A descriptive loop variable is used
# instead of `_` so that IPython's last-output variable `_` is not rebound in
# the notebook namespace.
for column_name in sra.columns:
    print(column_name)

In [None]:
# Restrict the table to the six metadata columns listed below (all rows kept).
sra = sra.loc[
    :,
    [
        "study_accession",
        "run_accession",
        "collection_date",
        "instrument_platform",
        "library_strategy",
        "library_construction_protocol",
    ],
]

In [None]:
# Replace collection_date with its pandas-datetime parse; assign() returns a
# new frame rather than modifying the existing one in place.
# NOTE(review): values pandas cannot parse will presumably raise here -- confirm
# the input column is clean.
sra = sra.assign(
    collection_date=lambda frame: pd.to_datetime(frame["collection_date"])
)

In [None]:
# Report the minimum and maximum collection_date currently in the table.
print("Earliest entry:", sra.collection_date.min())
print("Latest entry:", sra.collection_date.max())

In [None]:
# Keep rows whose collection_date lies in [2020-01-01, 2023-02-16]; between()
# includes both endpoints, and rows with a missing date are dropped.
sra = sra[sra["collection_date"].between("2020-01-01", "2023-02-16")]

In [None]:
# Report the minimum and maximum collection_date currently in the table.
print("Earliest entry:", sra.collection_date.min())
print("Latest entry:", sra.collection_date.max())

In [None]:
# Count distinct run accessions among rows carrying the latest collection_date.
sra.loc[
    lambda frame: frame["collection_date"] == frame["collection_date"].max(),
    "run_accession",
].nunique()

In [None]:
# Count distinct run accessions for every (instrument_platform,
# library_strategy) pair, then turn the group keys back into ordinary columns.
heatmap_2d = (
    sra.groupby(["instrument_platform", "library_strategy"])["run_accession"]
    .nunique()
    .reset_index()
)

In [None]:
# Display the aggregated table that is passed to alt.Chart for the heatmap.
heatmap_2d

In [None]:
# Heatmap layer: one rectangle per row of heatmap_2d, positioned by platform
# (x) and library strategy (y, axis drawn on the right) and coloured by the
# run_accession count on a log-scaled "goldred" scheme.
back = (
    alt.Chart(heatmap_2d)
    .mark_rect(opacity=1)
    .encode(
        x=alt.X("instrument_platform:N", title="Instrument"),
        y=alt.Y(
            "library_strategy:N",
            title="Strategy",
            axis=alt.Axis(orient="right"),
        ),
        color=alt.Color(
            "run_accession:Q",
            title="# Samples",
            scale=alt.Scale(scheme="goldred", type="log"),
        ),
        tooltip=[
            alt.Tooltip("instrument_platform:N", title="Machine"),
            alt.Tooltip("run_accession:Q", title="Number of runs"),
            alt.Tooltip("library_strategy:N", title="Protocol"),
        ],
    )
    .properties(
        width=500,
        height=150,
        title={
            "text": [
                "Breakdown of datasets (unique accessions) from ENA",
                "by Platform and Library Strategy",
            ],
            # The subtitle is stamped with the date held in `today`.
            "subtitle": "(Updated {})".format(today.strftime("%B %d, %Y")),
        },
    )
)

back

In [None]:
# Text layer derived from `back`: the mark becomes a bold, centred label showing
# the run_accession count with thousands separators. The label is white when
# the count exceeds 200 and black otherwise.
front = back.mark_text(
    fontWeight="bold",
    fontSize=12,
    baseline="middle",
    align="center",
).encode(
    color=alt.condition(
        alt.datum.run_accession > 200,
        alt.value("white"),
        alt.value("black"),
    ),
    text=alt.Text("run_accession:Q", format=",.0f"),
)

front

In [None]:
# Overlay the count labels on top of the heatmap rectangles.
alt.layer(back, front)