# Plot counts per lineage through time

## Imports

In [1]:
import altair as alt
import pandas as pd

## Configuration

In [2]:
# Lower bound on collection date, written as a YYYY-MM-DD string.
# It is compared as text against the string-valued "date" column, so it must
# keep this zero-padded form for the comparison to order dates correctly.
min_date = "2019-10-01"

## Load metadata

In [3]:
def load_lineage_dates(metadata_file, lineage):
    """Load unambiguous collection dates and regions for one lineage.

    Reads a tab-separated metadata file, discards records whose ``date`` is
    missing or contains an ``X`` placeholder character, and labels every
    remaining record with ``lineage``.

    Args:
        metadata_file: Path (or anything ``pandas.read_csv`` accepts) of a
            TSV file with at least ``date`` and ``region`` columns.
        lineage: Label stored in the ``lineage`` column of every returned row.

    Returns:
        A new DataFrame with columns ``lineage``, ``date`` (as strings) and
        ``region``. Rows keep the labels they had in the input file.
    """
    df = pd.read_csv(
        metadata_file,
        sep="\t",
        # Only these two columns are used downstream; skip parsing the rest.
        usecols=["date", "region"],
        # Force text: if every date were year-only (e.g. 2018) pandas would
        # infer an integer column and the string filter below would raise.
        dtype={"date": str},
    ).dropna(subset=["date"])
    df["lineage"] = lineage

    # "X" is matched literally, not as a regular expression.
    has_placeholder = df["date"].str.contains("X", regex=False)

    # Copy so the caller receives a frame independent of the temporary ``df``.
    return df.loc[~has_placeholder, ["lineage", "date", "region"]].copy()

In [4]:
h3n2_dates = load_lineage_dates("data/h3n2/metadata.tsv", "H3N2")

In [5]:
h1n1pdm_dates = load_lineage_dates("data/h1n1pdm/metadata.tsv", "H1N1pdm")

In [6]:
vic_dates = load_lineage_dates("data/vic/metadata.tsv", "Vic")

In [7]:
yam_dates = load_lineage_dates("data/yam/metadata.tsv", "Yam")

In [8]:
# Stack the per-lineage tables into one long table; every row already carries
# its own "lineage" label from load_lineage_dates.
dates = pd.concat([h3n2_dates, h1n1pdm_dates, vic_dates, yam_dates])

In [9]:
# Drop records carrying the malformed date string '36-09-05'. Dates are still
# plain strings at this point and this value sorts after "2019-10-01" as text,
# so it would otherwise pass the string comparison against min_date.
dates = dates.query("date != '36-09-05'").copy()

In [10]:
dates.head()

Unnamed: 0,lineage,date,region
0,H3N2,2018-03-13,?
1,H3N2,2022-04-06,Europe
2,H3N2,2018-03-12,Europe
3,H3N2,2018-02-27,?
4,H3N2,2022-04-19,Europe


In [11]:
dates["date"].max()

'2022-07-19'

In [12]:
dates.shape

(192299, 3)

In [13]:
# Keep samples dated after min_date. This is a text comparison (the column is
# only parsed into datetimes afterwards) and it is strict, so a sample dated
# exactly min_date is excluded.
# NOTE(review): the name "min_date" suggests an inclusive bound — confirm
# whether ">=" was intended.
recent_dates = dates[dates["date"] > min_date].copy()

In [14]:
# Parse the date strings into datetimes so the column can serve as the
# time index for the monthly resampling done later.
recent_dates["date"] = pd.to_datetime(recent_dates["date"])

In [15]:
recent_dates.shape

(37408, 3)

In [34]:
# Sorted list of the distinct region names among recent samples, leaving out
# the "?" value that some records carry. Used as the shared colour-scale
# domain of the per-region charts.
regions = [
    name
    for name in sorted(recent_dates["region"].drop_duplicates().values)
    if name != "?"
]

In [16]:
# Monthly sample counts per lineage.
# resample("1MS") bins by calendar month (labelled with the month's first
# day) and count() tallies non-null values per column; the tally taken from
# the "lineage" column is renamed to "samples".
# NOTE(review): this relies on the grouping column being carried through
# groupby().resample().count(), as the recorded output shows — confirm the
# behaviour on the pandas version in use.
binned_counts = (
    recent_dates.set_index("date")
    .groupby("lineage")
    .resample("1MS")
    .count()
    .rename(columns={"lineage": "samples"})
    .reset_index()
)

In [17]:
binned_counts

Unnamed: 0,lineage,date,samples,region
0,H1N1pdm,2019-10-01,395,395
1,H1N1pdm,2019-11-01,752,752
2,H1N1pdm,2019-12-01,1553,1553
3,H1N1pdm,2020-01-01,2399,2399
4,H1N1pdm,2020-02-01,1632,1632
...,...,...,...,...
102,Yam,2019-11-01,46,46
103,Yam,2019-12-01,59,59
104,Yam,2020-01-01,43,43
105,Yam,2020-02-01,23,23


In [18]:
# Monthly sample counts drawn as one line per lineage, with a point marker
# (size 100) on every monthly value.
(
    alt.Chart(binned_counts)
    .mark_line(point=alt.OverlayMarkDef(size=100))
    .encode(
        x=alt.X("yearmonth(date):T", title="Date"),
        y=alt.Y("samples:Q", title="Number of samples"),
        # Explicit lineage ordering for the colour encoding.
        color=alt.Color(
            "lineage:N",
            title="Lineage",
            sort=["H3N2", "H1N1pdm", "Vic", "Yam"],
        ),
        tooltip=["lineage:N", "date:T", "samples:Q"],
    )
    .properties(width=900, height=400)
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_legend(labelFontSize=14, titleFontSize=14)
)

In [19]:
# Monthly H1N1pdm sample counts per region, from January 2021 onwards.
# count() tallies non-null values per column; the tally taken from the
# "region" column is renamed to "samples".
binned_counts_h1n1pdm = (
    recent_dates.set_index("date")
    .query("(lineage == 'H1N1pdm') & (date >= '2021-01-01')")
    .groupby("region")
    .resample("1MS")
    .count()
    .rename(columns={"region": "samples"})
    .reset_index()
)

In [20]:
binned_counts_h1n1pdm

Unnamed: 0,region,date,lineage,samples
0,Africa,2021-01-01,42,42
1,Africa,2021-02-01,48,48
2,Africa,2021-03-01,48,48
3,Africa,2021-04-01,7,7
4,Africa,2021-05-01,16,16
...,...,...,...,...
86,West Asia,2022-02-01,8,8
87,West Asia,2022-03-01,15,15
88,West Asia,2022-04-01,12,12
89,West Asia,2022-05-01,10,10


In [35]:
# Monthly H1N1pdm sample counts drawn as one line per region.
(
    alt.Chart(binned_counts_h1n1pdm)
    .mark_line(point=alt.OverlayMarkDef(size=100))
    .encode(
        x=alt.X("yearmonth(date):T", title="Date"),
        y=alt.Y("samples:Q", title="Number of samples"),
        # The colour domain is the shared `regions` list, so a region maps to
        # the same colour in every per-region chart that uses it.
        color=alt.Color(
            "region:N",
            title="Region",
            scale=alt.Scale(domain=regions, scheme="category10"),
        ),
        tooltip=["region:N", "date:T", "samples:Q"],
    )
    .properties(width=900, height=300)
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_legend(labelFontSize=14, titleFontSize=14)
)

In [37]:
# Monthly H3N2 sample counts per region, from November 2019 onwards.
# count() tallies non-null values per column; the tally taken from the
# "region" column is renamed to "samples". Months without samples inside a
# region's date span appear with a count of 0.
binned_counts_h3n2 = (
    recent_dates.set_index("date")
    .query("(lineage == 'H3N2') & (date >= '2019-11-01')")
    .groupby("region")
    .resample("1MS")
    .count()
    .rename(columns={"region": "samples"})
    .reset_index()
)

In [40]:
binned_counts_h3n2.query("region == 'China'")

Unnamed: 0,region,date,lineage,samples
32,China,2019-11-01,61,61
33,China,2019-12-01,102,102
34,China,2020-01-01,29,29
35,China,2020-02-01,9,9
36,China,2020-03-01,9,9
37,China,2020-04-01,1,1
38,China,2020-05-01,1,1
39,China,2020-06-01,1,1
40,China,2020-07-01,7,7
41,China,2020-08-01,0,0


In [38]:
# Monthly H3N2 sample counts drawn as one line per region.
(
    alt.Chart(binned_counts_h3n2)
    .mark_line(point=alt.OverlayMarkDef(size=100))
    .encode(
        x=alt.X("yearmonth(date):T", title="Date"),
        y=alt.Y("samples:Q", title="Number of samples"),
        # The colour domain is the shared `regions` list, so a region maps to
        # the same colour in every per-region chart that uses it.
        color=alt.Color(
            "region:N",
            title="Region",
            scale=alt.Scale(domain=regions, scheme="category10"),
        ),
        tooltip=["region:N", "date:T", "samples:Q"],
    )
    .properties(width=900, height=300)
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_legend(labelFontSize=14, titleFontSize=14)
)

In [41]:
# Monthly Vic sample counts per region, from January 2021 onwards.
# count() tallies non-null values per column; the tally taken from the
# "region" column is renamed to "samples".
binned_counts_vic = (
    recent_dates.set_index("date")
    .query("(lineage == 'Vic') & (date >= '2021-01-01')")
    .groupby("region")
    .resample("1MS")
    .count()
    .rename(columns={"region": "samples"})
    .reset_index()
)

In [42]:
# Monthly Vic sample counts drawn as one line per region.
(
    alt.Chart(binned_counts_vic)
    .mark_line(point=alt.OverlayMarkDef(size=100))
    .encode(
        x=alt.X("yearmonth(date):T", title="Date"),
        y=alt.Y("samples:Q", title="Number of samples"),
        # The colour domain is the shared `regions` list, so a region maps to
        # the same colour in every per-region chart that uses it.
        color=alt.Color(
            "region:N",
            title="Region",
            scale=alt.Scale(domain=regions, scheme="category10"),
        ),
        tooltip=["region:N", "date:T", "samples:Q"],
    )
    .properties(width=900, height=300)
    .configure_axis(labelFontSize=14, titleFontSize=14)
    .configure_legend(labelFontSize=14, titleFontSize=14)
)