## Overview

Using `bokeh` to define populations for later structure investigation.

Broadly, in theory a population is a freely breeding set of individuals. In practice, it is a set of individuals from the same location which are members of the same species. The idea of location is woolly, but broadly means sites within 50km of each other, with an absence of PCA structure within the region.

In [1]:
import gcsfs
import numpy as np
import allel
import yaml
import zarr
import pandas as pd

In [2]:
# let's try bokeh...
# imports required for `bokeh`
from bokeh.plotting import figure, show, ColumnDataSource, output_file
import matplotlib as mpl
import seaborn as sns
from bokeh.layouts import gridplot

In [3]:
from ag3 import release_data
v3 = release_data()

In [4]:
# Data storage, uses about 34 MB
# Path template of the PCA output Zarr store; `{}` is filled in with the
# species-group name when the path is formatted.
pca_cloud_zarr_path_template = 'vo_agam_production/ag3_data_paper/{}.pca_output.zarr'
# Writing the PCA data to the cloud will require the appropriate authentication and authorization.

# UNCOMMENT THIS TO AUTHENTICATE. YOU ONLY NEED TO RUN THIS ONCE.
# After running this once, your authentication token should then be cached in `~/.gcs_tokens`
# Once you have authenticated, you should comment this out again to avoid re-authenticating.
# gcs_browser = gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='browser')

# Filesystem handle that authenticates with the cached token (`token='cache'`).
# Use `cache_timeout=0` to prevent object list cache, to avoid recreating map for Zarr consolidated metadata
auth_fs = gcsfs.GCSFileSystem(project='malariagen-jupyterhub', token='cache', cache_timeout=0)

In [5]:
pca_cloud_zarr_path = pca_cloud_zarr_path_template.format('gamb_colu')
pca_cloud_zarr_path

'vo_agam_production/ag3_data_paper/gamb_colu.pca_output.zarr'

In [6]:
# Open the existing PCA output read-only. `zarr.group(store)` initialises a new,
# empty group whenever the path does not hold one yet, so a mistyped path would
# write to the bucket and only surface later as a KeyError on "sample_names".
# With `mode="r"` a missing group fails immediately and nothing can be modified.
# NOTE(review): the cells visible here only read from `zarr_group` — confirm no
# later cell needs write access.
# Keep the zarr_store for zarr.consolidate_metadata(zarr_store)
zarr_store = auth_fs.get_mapper(pca_cloud_zarr_path)
zarr_group = zarr.open_group(zarr_store, mode="r")

In [7]:
# The identifiers are read from the store as bytes; decode each one to `str`
# so they can be used as labels into the metadata index.
sample_names = [raw_name.decode() for raw_name in zarr_group["sample_names"][:]]

In [8]:
pca_coordinates = zarr_group["coords"]
pca_components = zarr_group["components"]
pca_pve = zarr_group["explained_variance_ratio"]

In [9]:
def plot_pca_coords(coords, pve, pc1, pc2, fig, df, group_var, col_dict="auto", query=None):
    """Draw one PCA scatter panel on `fig`, coloured by `group_var`.

    Parameters
    ----------
    coords : 2-D indexable
        PCA coordinates, indexed as ``coords[:, component]``. Rows must line
        up with the rows of `df`, because the selected columns are attached
        to a copy of `df` positionally.
    pve : indexable
        Per-component explained-variance ratio; only used for axis labels.
    pc1, pc2 : int
        Zero-based component indices for the x and y axes (labels show the
        one-based number).
    fig : bokeh figure
        Figure to draw on; it is modified and returned.
    df : pandas.DataFrame
        Sample metadata. Its index is turned into an ordinary column so it is
        available to the figure's data source alongside the other columns.
    group_var : str
        Column to group by; each group is drawn with its own fill colour.
    col_dict : dict or "auto"
        Mapping of group label -> colour accepted by `mpl.colors.rgb2hex`.
        With "auto" a seaborn palette with one colour per group is used.
    query : str, optional
        `DataFrame.query` expression applied before grouping.

    Returns
    -------
    The same `fig` that was passed in.
    """
    x_values = coords[:, pc1]
    y_values = coords[:, pc2]

    # Work on a copy so the caller's frame never gains the x/y columns.
    points = df.reset_index().copy()
    points["x"] = x_values
    points["y"] = y_values

    if query is not None:
        points = points.query(query)

    grouped = points.groupby(group_var)

    if col_dict == "auto":
        # One palette entry per group that survived the query.
        palette = sns.color_palette(n_colors=len(grouped))
        col_dict = dict(zip(grouped.groups, palette))

    for group_label, group_frame in grouped:
        fig.circle(
            'x', 'y',
            source=ColumnDataSource(data=group_frame),
            line_color='black',
            line_width=0.5,
            size=6,
            fill_color=mpl.colors.rgb2hex(col_dict[group_label]))

    # Label each axis with its one-based PC number and percent variance explained.
    for axis, component in ((fig.xaxis, pc1), (fig.yaxis, pc2)):
        axis.axis_label = 'PC {0} ({1:.2f}%)'.format(
            component + 1, 100 * pve[component])

    return fig

In [10]:
all_meta = v3.load_sample_set_metadata(v3.all_wild_sample_sets)

  return op(a, b)
  return op(a, b)


In [11]:
df_species_samples = all_meta.loc[sample_names]
df_species_samples.head()

Unnamed: 0_level_0,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,sample_set,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii,is_arabiensis,is_gamb_colu,is_gambiae,is_coluzzii
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AR0047-C,LUA047,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.945,0.001,gamb_colu,coluzzii,False,True,False,True
AR0049-C,LUA049,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.933,0.001,gamb_colu,coluzzii,False,True,False,True
AR0051-C,LUA051,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.937,0.002,gamb_colu,coluzzii,False,True,False,True
AR0061-C,LUA061,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.938,0.002,gamb_colu,coluzzii,False,True,False,True
AR0078-C,LUA078,Joao Pinto,Angola,Luanda,2009,4,-8.884,13.302,F,AG1000G-AO,0.926,0.001,gamb_colu,coluzzii,False,True,False,True


In [12]:
# This defines what is displayed when the mouse hovers over a point.
# The @ values correspond to values in the table.
TOOLTIPS = [
    ("ox_code", "@sample_id"),
    ("country", "@country"),
    ("location", "@location"),
    ("collection year", "@year"),
    ("species", "@species_gambcolu_arabiensis"),
    ("species_gamcol", "@species_gambiae_coluzzii"),
    ("(x, y)", "($x, $y)"),]

In [13]:
# Principal-component index pairs for a 2x2 grid of panels:
# [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]
components = np.arange(8).reshape(2, 2, 2).tolist()

In [14]:
# qlabel = pd.Series(data="", index=cs.index, name="qlabel")
# qlabel

# i = 0
# for q, z in cs.groupby(0).groups.items():
#     qlabel.loc[z] = f"group{i}"
#     i += 1

In [15]:
def pca_figure_wrapper(filename, qstring):
    """Render the grid of PCA panels for the samples selected by `qstring`.

    One panel is drawn per component pair in the module-level `components`
    grid, with points coloured by sampling "location". The function relies on
    the notebook globals `components`, `pca_coordinates`, `pca_pve`,
    `df_species_samples` and `TOOLTIPS`.

    Parameters
    ----------
    filename : str
        Passed to bokeh's `output_file` before plotting.
    qstring : str or None
        `DataFrame.query` expression forwarded to `plot_pca_coords` to choose
        which samples are shown.

    Returns
    -------
    None; the grid is displayed with bokeh's `show`.
    """
    output_file(filename)

    def _panel(pc_x, pc_y):
        # A fresh 400x400 figure per component pair, with hover tooltips.
        canvas = figure(plot_width=400, plot_height=400, tooltips=TOOLTIPS)
        return plot_pca_coords(
            pca_coordinates,
            pca_pve,
            pc_x,
            pc_y,
            canvas,
            df_species_samples,
            "location",
            col_dict="auto",
            query=qstring)

    panels = [[_panel(c1, c2) for (c1, c2) in row] for row in components]

    show(gridplot(panels))

## Population definitions

Begin defining populations: NB populations must be disjoint!

In [16]:
!pip install geopy



In [17]:
import geopy.distance
from itertools import combinations
def lat_lon_distances(frame):
    """
    Report the distance in km between every pair of sampling locations.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must provide "location", "latitude" and "longitude". Each location is
        represented by the first latitude/longitude found for it.

    Returns
    -------
    pandas.Series (float64)
        One entry per unordered pair of locations, labelled "A -> B", holding
        the distance computed by `geopy.distance.distance` in kilometres.
        Empty (but still float64) when fewer than two locations are present.
    """
    site_coords = frame.groupby("location").first()[["latitude", "longitude"]]

    pairwise_km = {
        f"{a} -> {b}": geopy.distance.distance(
            site_coords.loc[a].values.tolist(),
            site_coords.loc[b].values.tolist()).km
        for a, b in combinations(site_coords.index.tolist(), 2)
    }

    # Fix the dtype explicitly: with no pairs, `pd.Series({})` would otherwise
    # come back as object dtype (with a deprecation warning on older pandas)
    # instead of the float64 produced for every non-empty result.
    return pd.Series(pairwise_km, dtype="float64")

In [18]:
# for separation of populations
df_species_samples["PC1"] = pca_coordinates[:, 0]
df_species_samples["PC2"] = pca_coordinates[:, 1]

In [19]:
definition_file = "../content/population_definitions.yml"
population_definitions = {}

## 1. Gulf West Africa

ie. The Gambia and Guinea Bissau.

8 sampling locations in an approximate square. 5 in The Gambia, 3 in GB.

Broadly there is an intermediate cluster, a gambiae cluster (small), a coluzzii cluster.

Intermediate sample sites: 
1. Sare Samba Sowe
2. Njabakunda
3. Antula GB
4. Safim GB

Very interestingly, the IM cluster gets pulled apart by PC6. The 2 GB populations are distinct from the Gambia ones. 
As the intermediate species call isn't reliable, we use the PCA coordinates to define this grouping.

Coluzzii sample sites: (all Gambia)
1. Tankular
2. Wali Kunda
3. Kalataba
4. Njabakunda
5. Sare Samba Sowe

Gambiae sample sites:
1. Leibala - the most easterly of the GB sites. The other 2 are close, but one is more urban.
2. Wali Kunda - the most easterly of the Gambia sites. Not quite as easterly as Leibala. (only 2 samples here)

Populations:
 - All Coluzzii clearly belong in one population. Doesn't split by PCA.
 - Leibala forms its own cluster.
 - IM: Question here is whether to make 2 or 1 cluster. Given 250km and crossing 2 major rivers. We should consider as 2.

Gambia Coluzzii
Gambia Intermediate
Guinea-Bissau Intermediate
Guinea-Bissau Leibala Gambiae

The 2 Gambiae in Wali Kunda can't be considered a population and are not included in the population level analyses.

In [20]:
lat_lon_distances(
    df_species_samples.query("country == 'Gambia, The'"))

Kalataba -> Njabakunda            30.632172
Kalataba -> Sare Samba Sowe       30.846880
Kalataba -> Tankular              47.383297
Kalataba -> Wali Kunda            75.789248
Njabakunda -> Sare Samba Sowe      3.650968
Njabakunda -> Tankular            20.588217
Njabakunda -> Wali Kunda         106.413569
Sare Samba Sowe -> Tankular       23.337082
Sare Samba Sowe -> Wali Kunda    106.404325
Tankular -> Wali Kunda           121.960253
dtype: float64

In [21]:
df_species_samples.query("country == 'Guinea'").drop_duplicates(subset=["location"])

Unnamed: 0_level_0,partner_sample_id,contributor,country,location,year,month,latitude,longitude,sex_call,sample_set,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii,is_arabiensis,is_gamb_colu,is_gambiae,is_coluzzii,PC1,PC2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AV0043-C,KD306,Ken Vernick,Guinea,Koundara,2012,10,8.48,-9.53,F,AG1000G-GN-A,0.03,0.003,gamb_colu,gambiae,False,True,True,False,7.356262,-26.575178
AV0025-C,KB281,Ken Vernick,Guinea,Koraboh,2012,10,9.28,-10.03,F,AG1000G-GN-A,0.022,0.002,gamb_colu,gambiae,False,True,True,False,3.768453,-26.496941


In [22]:
lat_lon_distances(
    df_species_samples.query("country in ('Guinea', 'Guinea-Bissau')").drop_duplicates(subset=["location"]))

Antula -> Koraboh      672.683733
Antula -> Koundara     762.869596
Antula -> Leibala      153.944479
Antula -> Safim         10.323438
Koraboh -> Koundara    104.179810
Koraboh -> Leibala     565.398469
Koraboh -> Safim       682.386161
Koundara -> Leibala    663.198791
Koundara -> Safim      772.801147
Leibala -> Safim       159.198283
dtype: float64

In [23]:
lat_lon_distances(
    df_species_samples.query("country == 'Guinea-Bissau'"))

Antula -> Leibala    153.944479
Antula -> Safim       10.323438
Leibala -> Safim     159.198283
dtype: float64

In [24]:
gulf_wa_q = 'country in ("Gambia, The", "Guinea-Bissau")'

In [25]:
lat_lon_distances(
    df_species_samples.query(gulf_wa_q))

Antula -> Kalataba               183.571546
Antula -> Leibala                153.944479
Antula -> Njabakunda             186.753122
Antula -> Safim                   10.323438
Antula -> Sare Samba Sowe        190.341904
Antula -> Tankular               175.782946
Antula -> Wali Kunda             198.980532
Kalataba -> Leibala              207.141136
Kalataba -> Njabakunda            30.632172
Kalataba -> Safim                176.265411
Kalataba -> Sare Samba Sowe       30.846880
Kalataba -> Tankular              47.383297
Kalataba -> Wali Kunda            75.789248
Leibala -> Njabakunda            230.540965
Leibala -> Safim                 159.198283
Leibala -> Sare Samba Sowe       232.788344
Leibala -> Tankular              233.859793
Leibala -> Wali Kunda            161.904637
Njabakunda -> Safim              178.326307
Njabakunda -> Sare Samba Sowe      3.650968
Njabakunda -> Tankular            20.588217
Njabakunda -> Wali Kunda         106.413569
Safim -> Sare Samba Sowe        

In terms of distance alone, there is an argument to include intermediate samples from 2 countries in same population, but given the fact we see PC6 separation, elect to keep separate.

In [26]:
pca_figure_wrapper("gulf_west_africa.html", gulf_wa_q)

In [27]:
population_definitions["GM_col"] = df_species_samples.query(
    "country == 'Gambia, The'").query(
        "species_gambiae_coluzzii == 'coluzzii'").index.values.tolist()

In [28]:
population_definitions["GM_im"] = df_species_samples.query(
    "country == 'Gambia, The'").query(
        "species_gambiae_coluzzii != 'coluzzii'").query(
            "PC1 < 0").index.values.tolist()

In [29]:
population_definitions["GW_im"] = df_species_samples.query(
    "country == 'Guinea-Bissau'").query(
        "PC1 < 0").index.values.tolist()

In [30]:
population_definitions["GW_gam"] = df_species_samples.query(
    "country == 'Guinea-Bissau'").query(
        "PC1 > 0").index.values.tolist()

## 2. East Africa

No coluzzii present in East Africa.

--

Mozambique gambiae show some similarity to the KE samples from 2000. The Mozambique gambiae all form one cluster, and appear to lie on some kind of continuum with KE. 
However, owing to the large geographic distance between the two, Mozambique is treated as a single population of its own.

Mayotte gambiae, island population in a clear cluster, clear population.

In [31]:
east_gambiae_q = 'country in ("Mozambique", "Kenya")'

pca_figure_wrapper("east_africa_gambiae.html", east_gambiae_q)

In [32]:
lat_lon_distances(df_species_samples.query(east_gambiae_q))

Furvela -> Kilifi    2289.92165
dtype: float64

In [33]:
population_definitions["MZ_gam"] = df_species_samples.query(
    "country == 'Mozambique'").query(
        "species_gambiae_coluzzii == 'gambiae'").index.values.tolist()

In [34]:
population_definitions["MY_gam"] = df_species_samples.query(
    "country == 'Mayotte'").index.values.tolist()

df_species_samples.loc[population_definitions["MY_gam"]].species_gambiae_coluzzii.value_counts()

gambiae    23
Name: species_gambiae_coluzzii, dtype: int64

Tanzania is interesting: there are 2 clear gambiae populations, separated by a large distance: Muheza, and Muleba.

There is also an intermediate population: made up of individuals from Muheza, with one Muleba individual (possible mix-up)?

This intermediate population is very close to KE. And is only 190km away. However, PC4 suggests they are distinct.

The Kenyan Kilifi intermediate population was sampled from the same location as the gambiae population, but 12 years apart. 

We use the year to define the KE populations, but PCs to define the TZ. In both cases the intermediate species group is inconsistent with the PCA.

In [35]:
coastal_africa_q = 'country in ("Tanzania", "Kenya")'

In [36]:
pca_figure_wrapper("coastal_east_africa.html", coastal_africa_q)

In [37]:
lat_lon_distances(df_species_samples.query(coastal_africa_q))

Kilifi -> Muheza    190.662252
Kilifi -> Muleba    934.044431
Muheza -> Muleba    875.066948
dtype: float64

In [38]:
df_species_samples.query(
    "country == 'Kenya'").query(
        "year == 2000").species_gambiae_coluzzii.value_counts()

gambiae    19
Name: species_gambiae_coluzzii, dtype: int64

In [39]:
population_definitions["KE_gam"] = df_species_samples.query(
    "country == 'Kenya'").query(
        "year == 2000").index.values.tolist()

In [40]:
population_definitions["KE_im"] = df_species_samples.query(
    "country == 'Kenya'").query(
        "year == 2012").index.values.tolist()

In [41]:
population_definitions["TZ_gam_1"] = df_species_samples.query(
    "country == 'Tanzania'").query(
    "location == 'Muleba'").query(
    "PC2 < 10").index.values.tolist()

In [42]:
population_definitions["TZ_gam_2"] = df_species_samples.query(
    "country == 'Tanzania'").query(
    "location == 'Muheza'").query(
    "PC1 > 65").index.values.tolist()

In [43]:
population_definitions["TZ_im"] = df_species_samples.query(
    "country == 'Tanzania'").query(
    "PC1 < 55").query(
    "PC2 > 40").index.values.tolist()

## 3. Central-ish Africa

Clear separation of gambiae/coluzzii with no intermediates.

Angola and CAR form 2 distinct coluzzii populations.

In [44]:
central_q = 'country in ("Angola", "Uganda", "Gabon", "Democratic Republic of Congo", "Central African Republic")'

pca_figure_wrapper("central_africa.html", central_q)

In [45]:
population_definitions["AO_col"] = df_species_samples.query(
    "country == 'Angola'").query(
        "species_gambiae_coluzzii == 'coluzzii'").index.values.tolist()

df_species_samples.loc[population_definitions["AO_col"]].species_gambiae_coluzzii.value_counts()

coluzzii    81
Name: species_gambiae_coluzzii, dtype: int64

In [46]:
population_definitions["CA_col"] = df_species_samples.query(
    "country == 'Central African Republic'").query(
        "species_gambiae_coluzzii == 'coluzzii'").index.values.tolist()

df_species_samples.loc[population_definitions["CA_col"]].species_gambiae_coluzzii.value_counts()

coluzzii    18
Name: species_gambiae_coluzzii, dtype: int64

CAR gambiae also form a single group with DRC gambiae. 

They are not separated by any PC and are only 270km apart, but given the time between the collections (1993 vs 2015) it is better to keep them separate.

In [47]:
lat_lon_distances(
    df_species_samples.query('country in ("Democratic Republic of Congo", "Central African Republic")'))

Bangui -> Gbadolite    270.344685
dtype: float64

In [48]:
df_species_samples.query('country in ("Democratic Republic of Congo", "Central African Republic")').groupby("location").year.first()

location
Bangui       1993
Gbadolite    2015
Name: year, dtype: int64

In [49]:
population_definitions["CA_gam"] = df_species_samples.query(
    "country == 'Central African Republic'").query(
        "species_gambiae_coluzzii == 'gambiae'").index.values.tolist()

df_species_samples.loc[population_definitions["CA_gam"]].species_gambiae_coluzzii.value_counts()

gambiae    55
Name: species_gambiae_coluzzii, dtype: int64

In [50]:
population_definitions["DC_gam"] = df_species_samples.query(
    "country == 'Democratic Republic of Congo'").query(
        "species_gambiae_coluzzii == 'gambiae'").index.values.tolist()

df_species_samples.loc[population_definitions["DC_gam"]].species_gambiae_coluzzii.value_counts()

gambiae    76
Name: species_gambiae_coluzzii, dtype: int64

In [51]:
lat_lon_distances(
    df_species_samples.query("country in ('Uganda', 'Gabon')"))

Kihihi -> Libreville       2257.217793
Kihihi -> Nagongera         509.973365
Libreville -> Nagongera    2735.419128
dtype: float64

The two Uganda locations are 500km apart, so reasonable to split.
Gabon splits out from Uganda and other Gambiae populations, so falls on its own.

In [52]:
population_definitions["UG_gam_1"] = df_species_samples.query(
    "location == 'Nagongera'").index.values.tolist()

df_species_samples.loc[population_definitions["UG_gam_1"]].species_gambiae_coluzzii.value_counts()

gambiae    112
Name: species_gambiae_coluzzii, dtype: int64

In [53]:
population_definitions["UG_gam_2"] = df_species_samples.query(
    "location == 'Kihihi'").index.values.tolist()

df_species_samples.loc[population_definitions["UG_gam_2"]].species_gambiae_coluzzii.value_counts()

gambiae    95
Name: species_gambiae_coluzzii, dtype: int64

In [54]:
population_definitions["GA_gam"] = df_species_samples.query(
    "location == 'Libreville'").index.values.tolist()

df_species_samples.loc[population_definitions["GA_gam"]].species_gambiae_coluzzii.value_counts()

gambiae    69
Name: species_gambiae_coluzzii, dtype: int64

## 4. Cameroon and Bioko

This is more complex, as CMS has many more sampling locations than other sites.

There are some coluzzii here, so let's look at those first against the backdrop of some CI.

In [55]:
cms_colu_q = "(country in ('Cameroon', 'Equatorial Guinea')) & (species_gambiae_coluzzii == 'coluzzii')"

In [56]:
cms_colu_df = df_species_samples.query(cms_colu_q)

In [57]:
cms_colu_df.location.value_counts()

Yaounde           7
Nkolondom         5
Douala            2
Campo             2
Tibati            2
Lagdo             1
Gakle             1
Badankali         1
Gamba             1
Palama            1
Gouna             1
Wouro Andre       1
Carrefour Poli    1
Name: location, dtype: int64

In [58]:
pca_figure_wrapper("cameroon_coluzzii.html", cms_colu_q)

In [59]:
cms_south = df_species_samples.query("location in ('Yaounde', 'Campo', 'Nkolondom', 'Douala')")
lat_lon_distances(cms_south)

Campo -> Douala         186.960093
Campo -> Nkolondom      259.148722
Campo -> Yaounde        251.466303
Douala -> Nkolondom     199.542848
Douala -> Yaounde       199.174450
Nkolondom -> Yaounde     10.233754
dtype: float64

These "south" samples form a reasonable cluster, I think it's ok to include all 4 in a cameroon south population. 

If we are being cautious, we could restrict to Yaounde and Nkolondom, which are just 10km apart. Only lose 4 samples / 21.

In [60]:
cms_south.location.value_counts()

Nkolondom    10
Yaounde       7
Douala        2
Campo         2
Name: location, dtype: int64

In [61]:
population_definitions["CM_col"] = df_species_samples.query(
    "location in ('Yaounde', 'Nkolondom')").query(
    "species_gambiae_coluzzii == 'coluzzii'").index.tolist()
population_definitions["CM_col"]

['AN0348-C',
 'AN0402-C',
 'AN0344-C',
 'AN0601-CW',
 'AN0616-CW',
 'AN0619-CW',
 'AN0603-CW',
 'AN0604-CW',
 'AN0607-CW',
 'AN0621-CW',
 'AN0622-CW',
 'AN0623-CW']

In [62]:
cms_gam_q = "(country in ('Cameroon', 'Equatorial Guinea')) & (species_gambiae_coluzzii == 'gambiae')"

In [63]:
pca_figure_wrapper("cameroon_gambiae.html", cms_gam_q)

There is very little structure in _gambiae_, but Bioko should be considered separately if only due to its status as an island.

In [64]:
cms_gam_df = df_species_samples.query(cms_gam_q)

In [65]:
cms_gam_df.location.value_counts()

Mayos                 110
Daiguene               96
Gado Badzere           73
Zembe Borongo          24
Manda                  11
                     ... 
Palama                  1
Makabay (Djarengo)      1
Séboré                  1
Carrefour Nari          1
Banda                   1
Name: location, Length: 66, dtype: int64

66 unique locations. Considering sampling sites with >= 10 samples only gives:

In [66]:
n_samples_by_site = cms_gam_df.groupby("location").size()
n_samples_by_site.loc[n_samples_by_site >= 10]

location
Bioko             10
Daiguene          96
Gado Badzere      73
Manda             11
Mayos            110
Zembe Borongo     24
dtype: int64

In [67]:
include_locs = n_samples_by_site.loc[n_samples_by_site >= 10].index.tolist()
include_locs.remove("Bioko")

In [68]:
cms_gam_df.query("location in @include_locs").groupby("location").first()

Unnamed: 0_level_0,partner_sample_id,contributor,country,year,month,latitude,longitude,sex_call,sample_set,aim_fraction_colu,aim_fraction_arab,species_gambcolu_arabiensis,species_gambiae_coluzzii,is_arabiensis,is_gamb_colu,is_gambiae,is_coluzzii,PC1,PC2
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Daiguene,CM0901912,Nora Besansky,Cameroon,2009,9,4.777,13.844,M,AG1000G-CM-A,0.039,0.002,gamb_colu,gambiae,False,True,True,False,7.97475,-19.970627
Gado Badzere,CM0902285,Nora Besansky,Cameroon,2009,9,5.747,14.442,M,AG1000G-CM-A,0.022,0.002,gamb_colu,gambiae,False,True,True,False,8.355163,-24.784523
Manda,1253,Brad White,Cameroon,2013,10,5.726,10.868,F,AG1000G-CM-C,0.026,0.002,gamb_colu,gambiae,False,True,True,False,7.557045,-27.504055
Mayos,CM0901778,Nora Besansky,Cameroon,2009,9,4.341,13.558,F,AG1000G-CM-A,0.027,0.002,gamb_colu,gambiae,False,True,True,False,8.435379,-22.509165
Zembe Borongo,CM0901968,Nora Besansky,Cameroon,2009,9,5.747,14.442,F,AG1000G-CM-A,0.022,0.002,gamb_colu,gambiae,False,True,True,False,10.258307,-21.264961


In [69]:
lat_lon_distances(cms_gam_df.query("location in @include_locs"))

Daiguene -> Gado Badzere         126.096513
Daiguene -> Manda                346.190845
Daiguene -> Mayos                 57.721646
Daiguene -> Zembe Borongo        126.096513
Gado Badzere -> Manda            395.882843
Gado Badzere -> Mayos            183.801179
Gado Badzere -> Zembe Borongo      0.000000
Manda -> Mayos                   335.316105
Manda -> Zembe Borongo           395.882843
Mayos -> Zembe Borongo           183.801179
dtype: float64

Logically, I think there should be 4 distinct CMS gambiae populations. 

The transect has 126 / 183 km between sites, but there are ~ 100 samples in each.

Manda is a good distance west of the other sites- so could be important.

Manda is the same site as Mfelap, so that is included. As is 

In [70]:
r = lat_lon_distances(cms_gam_df)

In [71]:
x = [y for y in r.index if "Manda" in y]

r.loc[x].sort_values()[:20]

Manda -> Mfelap             0.000000
Manchoutvi -> Manda        31.755058
Manda -> Mangoum           40.551759
Manda -> Mgbandji          50.961557
Bamendi -> Manda           53.099264
Manda -> Otibili          175.899406
Manda -> Obala            187.700105
Manda -> Nkolondom        206.850485
Manda -> Tibati           211.539909
Essos -> Manda            218.320351
Ahala -> Manda            223.942975
Manda -> Mbalmayo         254.246644
Avebe -> Manda            270.465571
Beka Goto -> Manda        272.874486
Manda -> Tekel            281.742827
Mabarangal'L -> Manda     282.913315
Birsok -> Manda           287.318924
Lougga Tapadi -> Manda    300.649096
Foulassi I -> Manda       321.762586
Dombé -> Manda            324.358391
dtype: float64

In [72]:
n_samples_by_site.loc["Mfelap"], n_samples_by_site.loc["Manchoutvi"], n_samples_by_site.loc["Mangoum"]

(1, 2, 1)

In [73]:
population_definitions["CM_gam_1"] = cms_gam_df.query("location in ('Gado Badzere', 'Zembe Borongo')").index.tolist()

In [74]:
population_definitions["CM_gam_2"] = cms_gam_df.query("location == 'Daiguene'").index.tolist()

In [75]:
population_definitions["CM_gam_3"] = cms_gam_df.query("location == 'Mayos'").index.tolist()

In [76]:
population_definitions["CM_gam_4"] = cms_gam_df.query("location in ('Manda', 'Mfelap')").index.tolist()

In [77]:
population_definitions["BI_gam"] = cms_gam_df.query("country == 'Equatorial Guinea'").index.tolist()

## 5. West Africa

Initial plotting shows clear separation of gambiae and coluzzii. 

The exceptions being 2 Mali samples that are outliers from all groups. One is classed as "intermediate" AV0331-C. Doesn't cluster exactly with the gulf west africa samples. The other is AV0255-C, which looks coluzzii, but is distinct from all other samples. Appears to be really out on its own.

Initially we look at coluzzii samples, excluding AV0255-C.

In [78]:
west_colu_q = "(country in ('Ghana', 'Cote d\\'Ivoire', 'Guinea', 'Mali', 'Burkina Faso')  & (species_gambiae_coluzzii == 'coluzzii'))"

In [79]:
pca_figure_wrapper("west_africa_coluzzii.html", west_colu_q)

There is broad separation between Ghana/Cote d'Ivoire vs Burkina, Mali, and Guinea.

The question is then how many separate populations Ghana/CI should be split into.

In [80]:
df_species_samples.query(west_colu_q).groupby(["country", "location"]).size()

country        location      
Burkina Faso   Bana              89
               Pala              11
               Souroukoudinga    35
Cote d'Ivoire  Tiassale          80
Ghana          Koforidua          1
               Madina            14
               Takoradi          24
               Twifo Praso       25
Guinea         Koundara          11
Mali           Douna             19
               Fanzana            6
               Kababougou        12
               Moribobougou       7
               N'Gabakoro         4
               Ouassorola         9
               Takan             26
               Tieneguebougou     6
               Toumani Oulena     2
dtype: int64

In [81]:
lat_lon_distances(
    df_species_samples.query("country in ('Ghana', 'Cote d\\'Ivoire')"))

Koforidua -> Madina          47.338640
Koforidua -> Takoradi       212.585535
Koforidua -> Tiassale       505.542827
Koforidua -> Twifo Praso    152.387366
Madina -> Takoradi          191.572299
Madina -> Tiassale          510.556093
Madina -> Twifo Praso       147.487682
Takoradi -> Tiassale        355.065674
Takoradi -> Twifo Praso      81.011919
Tiassale -> Twifo Praso     364.040881
dtype: float64

In [82]:
population_definitions["CI_col"] = df_species_samples.query(west_colu_q).query("location == 'Tiassale'").index.tolist()

population_definitions["GH_col"] = df_species_samples.query(west_colu_q).query("country == 'Ghana'").index.tolist()

Given Tiassale is at least 355km from the nearest Ghanaian site, and the maximum distance between Ghanaian sites is 212km, we group the Ghanaian sites as a single population.

Now look at Guinea, Burkina and Mali.

In [83]:
lat_lon_distances(
    df_species_samples.query(west_colu_q).query("country in ('Guinea', 'Burkina Faso')"))

Bana -> Koundara              632.812044
Bana -> Pala                   27.464332
Bana -> Souroukoudinga          6.883185
Koundara -> Pala              651.571790
Koundara -> Souroukoudinga    626.869978
Pala -> Souroukoudinga         34.087197
dtype: float64

Guinean samples are a long way from BF. All BF sample sites are close together. 

The only remaining question is about Mali: we decide to take Takan and Douna as separate populations.

Takan to be included in gambiae of WA.

In [84]:
population_definitions["GN_col"] = df_species_samples.query(west_colu_q).query("country == 'Guinea'").index.tolist()
population_definitions["BF_col"] = df_species_samples.query(west_colu_q).query("country == 'Burkina Faso'").index.tolist()

In [85]:
df_species_samples.query(west_colu_q).query("country=='Mali'").groupby("location").size()

location
Douna             19
Fanzana            6
Kababougou        12
Moribobougou       7
N'Gabakoro         4
Ouassorola         9
Takan             26
Tieneguebougou     6
Toumani Oulena     2
dtype: int64

In [86]:
df_species_samples.query("country == 'Mali'").groupby(["contributor", "location", "year", "species_gambiae_coluzzii" ]).size()

contributor    location        year  species_gambiae_coluzzii
Austin Burt    Kababougou      2014  coluzzii                    12
                                     gambiae                     28
               Ouassorola      2014  coluzzii                     9
                                     gambiae                      4
               Tieneguebougou  2014  coluzzii                     6
                                     gambiae                      1
Ken Vernick    Takan           2012  coluzzii                    26
                                     gambiae                      5
               Toumani Oulena  2012  coluzzii                     2
                                     gambiae                     60
                                     intermediate                 1
Nora Besansky  Bancoumana      2004  gambiae                      9
               Douna           2004  coluzzii                    19
               Fanzana         2004  coluzzii         

In [87]:
# Austin's sites are very close together
lat_lon_distances(
    df_species_samples.query("country == 'Mali'").query("contributor == 'Austin Burt'").groupby(["contributor", "location"]).first()
)

Kababougou -> Ouassorola         1.549765
Kababougou -> Tieneguebougou    11.664698
Ouassorola -> Tieneguebougou    13.211463
dtype: float64

In [88]:
lat_lon_distances(
    df_species_samples.query("country == 'Mali'").query("contributor == 'Nora Besansky'").groupby(["contributor", "location"]).first()
)

Bancoumana -> Douna           273.652173
Bancoumana -> Fanzana         250.570138
Bancoumana -> Kela             44.655586
Bancoumana -> Moribobougou     65.004420
Bancoumana -> N'Gabakoro       65.966647
Douna -> Fanzana               24.955389
Douna -> Kela                 313.760344
Douna -> Moribobougou         221.362109
Douna -> N'Gabakoro           218.518693
Fanzana -> Kela               291.367390
Fanzana -> Moribobougou       197.053970
Fanzana -> N'Gabakoro         194.265503
Kela -> Moribobougou          109.591508
Kela -> N'Gabakoro            110.616089
Moribobougou -> N'Gabakoro      3.441266
dtype: float64

In [89]:
# Pairwise distances between Ken Vernick's Mali sampling locations.
lat_lon_distances(
    df_species_samples
    .query("country == 'Mali'")
    .query("contributor == 'Ken Vernick'")
    .groupby(["contributor", "location"])
    .first()
)

Takan -> Toumani Oulena    90.763862
dtype: float64

In [90]:
# Mali coluzzii: three single-location populations (Douna, Takan, Kababougou).
population_definitions["MA_col_1"] = (
    df_species_samples
    .query(west_colu_q)
    .query("location == 'Douna'")
    .index
    .tolist()
)

population_definitions["MA_col_2"] = (
    df_species_samples
    .query(west_colu_q)
    .query("location == 'Takan'")
    .index
    .tolist()
)
# NOTE(review): AV0255-C is dropped from the Takan set; the reason is not
# recorded in this cell — presumably an outlier, confirm. list.remove raises
# ValueError if the sample is not present.
population_definitions["MA_col_2"].remove("AV0255-C")

population_definitions["MA_col_3"] = (
    df_species_samples
    .query(west_colu_q)
    .query("location == 'Kababougou'")
    .index
    .tolist()
)

Move on to gambiae

In [91]:
# Query selecting gambiae samples from the five West African countries listed.
# The doubled backslash leaves a literal \' in the query text, so the
# apostrophe in Cote d'Ivoire stays inside its single-quoted string.
west_gam_q = (
    "(country in ('Ghana', 'Cote d\\'Ivoire', 'Guinea', 'Mali', 'Burkina Faso')"
    "  & (species_gambiae_coluzzii == 'gambiae'))"
)

In [92]:
# Call the PCA figure helper for the West African gambiae selection.
# NOTE(review): presumably this writes/shows an interactive PCA plot at
# "west_africa_gambiae.html" — confirm against pca_figure_wrapper's definition.
pca_figure_wrapper("west_africa_gambiae.html", west_gam_q)

Almost a complete absence of structure in these West African gambiae samples.

In [93]:
# Distances between the Ghanaian gambiae sampling locations.
lat_lon_distances(
    df_species_samples
    .query(west_gam_q)
    .query("country == 'Ghana'")
)

Koforidua -> Madina    47.33864
dtype: float64

In [94]:
# Ghana: all West African gambiae samples from the country form one population.
population_definitions["GH_gam"] = (
    df_species_samples
    .query(west_gam_q)
    .query("country == 'Ghana'")
    .index
    .tolist()
)

In [95]:
# Guinea: all West African gambiae samples from the country form one population.
population_definitions["GN_gam"] = (
    df_species_samples
    .query(west_gam_q)
    .query("country == 'Guinea'")
    .index
    .tolist()
)

In [96]:
# Burkina Faso: pool Bana, Pala and Souroukoudinga; Monomtenga is excluded.
population_definitions["BF_gam"] = (
    df_species_samples
    .query(west_gam_q)
    .query("location in ('Bana', 'Pala', 'Souroukoudinga')")
    .index
    .tolist()
)

In [97]:
# Distances between the Burkina Faso gambiae sampling locations.
lat_lon_distances(
    df_species_samples
    .query(west_gam_q)
    .query("country == 'Burkina Faso'")
)

Bana -> Monomtenga              371.492961
Bana -> Pala                     27.464332
Bana -> Souroukoudinga            6.883185
Monomtenga -> Pala              349.087863
Monomtenga -> Souroukoudinga    378.099973
Pala -> Souroukoudinga           34.087197
dtype: float64

In [98]:
# again Mali is the most complex... we take Toumani Oulena, Austin Burt's combined sites, and Kela as separate populations

In [99]:
# Count West African gambiae samples per country, contributor, location and year.
(
    df_species_samples
    .query(west_gam_q)
    .groupby(["country", "contributor", "location", "year"])
    .size()
)

country       contributor    location        year
Burkina Faso  Austin Burt    Bana            2012    22
                                             2014    15
                             Pala            2012    48
                                             2014    16
                             Souroukoudinga  2012    28
                                             2014    15
              Nora Besansky  Monomtenga      2004    13
Ghana         David Weetman  Koforidua       2012    23
                             Madina          2012    13
Guinea        Ken Vernick    Koraboh         2012    60
                             Koundara        2012    63
Mali          Austin Burt    Kababougou      2014    28
                             Ouassorola      2014     4
                             Tieneguebougou  2014     1
              Ken Vernick    Takan           2012     5
                             Toumani Oulena  2012    60
              Nora Besansky  Bancoumana      2004     

In [100]:
# Distances between all Mali gambiae sampling locations.
lat_lon_distances(
    df_species_samples
    .query(west_gam_q)
    .query("country == 'Mali'")
)

Bancoumana -> Fanzana               250.570138
Bancoumana -> Kababougou             76.525587
Bancoumana -> Kela                   44.655586
Bancoumana -> Ouassorola             77.560629
Bancoumana -> Takan                  81.986408
Bancoumana -> Tieneguebougou         68.731084
Bancoumana -> Toumani Oulena        157.406280
Fanzana -> Kababougou               221.766673
Fanzana -> Kela                     291.367390
Fanzana -> Ouassorola               222.666394
Fanzana -> Takan                    306.397274
Fanzana -> Tieneguebougou           215.895257
Fanzana -> Toumani Oulena           319.687080
Kababougou -> Kela                  116.396892
Kababougou -> Ouassorola              1.549765
Kababougou -> Takan                 158.302420
Kababougou -> Tieneguebougou         11.664698
Kababougou -> Toumani Oulena        230.871116
Kela -> Ouassorola                  117.161723
Kela -> Takan                        47.203588
Kela -> Tieneguebougou              110.471462
Kela -> Touma

In [101]:
# Ken Vernick: the Toumani Oulena gambiae samples form their own population.
population_definitions["MA_gam_1"] = (
    df_species_samples
    .query(west_gam_q)
    .query("location == 'Toumani Oulena'")
    .index
    .tolist()
)

In [102]:
# Austin's sites can be combined: all of Austin Burt's Mali gambiae samples
# go into a single population.
population_definitions["MA_gam_2"] = (
    df_species_samples
    .query(west_gam_q)
    .query("country == 'Mali'")
    .query("contributor == 'Austin Burt'")
    .index
    .tolist()
)

In [103]:
# Count the MA_gam_2 samples per collection year.
(
    df_species_samples
    .loc[population_definitions["MA_gam_2"]]
    .groupby("year")
    .size()
)

year
2014    33
dtype: int64

In [104]:
# Nora Besansky: the Kela gambiae samples form their own population.
population_definitions["MA_gam_3"] = (
    df_species_samples
    .query(west_gam_q)
    .query("location == 'Kela'")
    .index
    .tolist()
)

## Make a table of samples not in a "population"

Mainly Cameroon and Mali, odd samples elsewhere

In [105]:
# Flatten every population's sample list into one set of assigned sample IDs.
included_samples = {
    sample_id
    for members in population_definitions.values()
    for sample_id in members
}

In [106]:
# Boolean mask over the rows of df_species_samples: True where the sample ID
# (index label) has not been assigned to any population. Index.isin does the
# membership test in one vectorised call instead of invoking a Python lambda
# per label, and returns a plain boolean array suitable for .loc masking.
excl_samples = ~df_species_samples.index.isin(included_samples)

In [107]:
# Allow up to 100 rows to be displayed before pandas truncates the output.
pd.options.display.max_rows = 100

In [108]:
# Tally the samples that belong to no population, by country, location,
# year and species call.
(
    df_species_samples
    .loc[excl_samples]
    .groupby(["country", "location", "year", "species_gambiae_coluzzii"])
    .size()
)

country       location            year  species_gambiae_coluzzii
Burkina Faso  Bana                2012  intermediate                 1
              Monomtenga          2004  gambiae                     13
Cameroon      Afan-Essokye        2013  gambiae                      1
              Ahala               2005  gambiae                      4
              Avebe               2005  gambiae                      1
              Badankali           2005  coluzzii                     1
              Badjawa             2005  gambiae                      3
              Balda Bouri         2005  gambiae                      2
              Bamendi             2005  gambiae                      3
              Banda               2005  gambiae                      1
              Beka Goto           2005  gambiae                      2
              Bini                2005  gambiae                      1
              Birsok              2005  gambiae                      1
            

## Validate and write

In [109]:
# Every sample may belong to at most one population, so each pair of
# population definitions must have no sample IDs in common.
for a, b in combinations(population_definitions, 2):
    set1, set2 = set(population_definitions[a]), set(population_definitions[b])
    assert set1.isdisjoint(set2), f"{a} and {b} are not disjoint. Share {set1.intersection(set2)}"

In [110]:
# Persist the population definitions (name -> list of sample IDs) as YAML.
# Dumping straight into the open handle avoids building the whole document as
# a string first, and drops the extra blank line that print() appended after
# yaml.dump's own trailing newline. The parsed content is unchanged.
with open(definition_file, mode="w") as wr:
    yaml.dump(population_definitions, wr)