In [1]:
# Import packages.
import os

import allel
import dask.array as da
import intake
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar as progress
%matplotlib inline

In [2]:
# Open the Ag1000G phase 3 data catalog; every later cell reads its tables
# through this `cat` handle.
catalog_url = "https://malariagen.github.io/intake/gcs.yml"
cat = intake.open_catalog(catalog_url)

In [3]:
# Fetch the table of available sample sets; its `sample_set` column drives
# the per-set loading loops in the cells below.
sample_sets_source = cat.ag3.sample_sets
df_sample_sets = sample_sets_source.read()

  import pandas.util.testing as tm


In [4]:
# Read the per-sample table of every sample set, then stack them into a
# single frame indexed by sample ID.
sample_frames = [
    cat.ag3.samples(sample_set=set_name).read()
    for set_name in df_sample_sets.sample_set
]
df_sample_general = pd.concat(sample_frames, axis=0).set_index("sample_id")

df_sample_general.head()

Unnamed: 0_level_0,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AR0047-C,LUA047,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F
AR0049-C,LUA049,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F
AR0051-C,LUA051,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F
AR0061-C,LUA061,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F
AR0078-C,LUA078,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F


In [5]:
# Read the AIM species calls of every sample set. Passing a dict to concat
# keys each frame by its sample set name, so the set can be turned into a
# regular column afterwards.
species_call_frames = {
    set_name: cat.ag3.species_calls_20200422_aim(sample_set=set_name).read()
    for set_name in df_sample_sets.sample_set
}

df_sample_species_aim = (
    pd.concat(species_call_frames, axis=0)
    .droplevel(1)               # drop each frame's own row index
    .rename_axis("sample_set")  # name the remaining (dict key) level
    .reset_index()              # sample_set becomes an ordinary column
    .set_index("sample_id")
)
df_sample_species_aim.head()

Unnamed: 0_level_0,sample_set,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AR0047-C,AG1000G-AO,0.945,0.001,gamb_colu,coluzzii
AR0049-C,AG1000G-AO,0.933,0.001,gamb_colu,coluzzii
AR0051-C,AG1000G-AO,0.937,0.002,gamb_colu,coluzzii
AR0061-C,AG1000G-AO,0.938,0.002,gamb_colu,coluzzii
AR0078-C,AG1000G-AO,0.926,0.001,gamb_colu,coluzzii


In [6]:
# Left join on the sample_id index: every sample in the general metadata is
# kept, and species-call rows with no matching sample are discarded.
df_sample_metadata = df_sample_general.join(
    df_sample_species_aim,
    how="left",
)

In [7]:
# Preview the first rows of the joined metadata table.
df_sample_metadata.head(n=5)

Unnamed: 0_level_0,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,sample_set,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AR0047-C,LUA047,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.945,0.001,gamb_colu,coluzzii
AR0049-C,LUA049,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.933,0.001,gamb_colu,coluzzii
AR0051-C,LUA051,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.937,0.002,gamb_colu,coluzzii
AR0061-C,LUA061,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.938,0.002,gamb_colu,coluzzii
AR0078-C,LUA078,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.926,0.001,gamb_colu,coluzzii


In [8]:
# Number of samples for each (sample set, country, species call) combination.
group_keys = ["sample_set", "country", "species_gambcolu_arabiensis"]
summ = df_sample_metadata.groupby(group_keys).size()
# Name the series so the counts land in a "count" column after reset_index().
summ.name = "count"

In [9]:
# Reshape the counts into a wide table: one row per (sample set, country),
# one column per species call, 0 where a combination has no samples.
out_summary = pd.pivot_table(
    summ.reset_index(),
    values="count",
    index=["sample_set", "country"],
    columns=["species_gambcolu_arabiensis"],
    # The default aggfunc is "mean", which is the wrong operation for counts;
    # summing is correct even if a key combination were ever repeated.
    aggfunc="sum",
    fill_value=0,
# Pivoting introduces NaN before fill_value is applied, which can leave the
# counts as floats depending on the pandas version; force whole numbers.
).astype("int64")

In [10]:
# Write the summary table to disk. Create the output directory first so the
# notebook also runs on a fresh checkout where "tables/" does not exist yet
# (to_csv does not create missing parent directories).
summary_csv_path = "tables/sample_set_summary.csv"
os.makedirs(os.path.dirname(summary_csv_path), exist_ok=True)
out_summary.to_csv(summary_csv_path)