In [None]:
from collections import OrderedDict
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import itertools
import scipy
import allel
import zarr
import sys
import os
allel.__version__, zarr.__version__

In [None]:
# Input locations, all given relative to the notebook's working directory.

# Root of the data subset; opened as a zarr store further down and also
# used as the base directory for the sample metadata file.
SUBSET_DIR = '../../../data/phylo_ampl_ag1k/phase2/AR1'
# Tab-delimited sample metadata located under the subset directory.
SAMPLE_META = os.path.join(SUBSET_DIR, 'samples/samples.meta.txt')
# CSV with the extended panel information (loaded into `panel` below).
PANEL = '../../data/panel_extended_info.csv'

In [None]:
# relative path of the diversity CSV table
# NOTE(review): presumably produced/consumed later in this notebook - not visible here, confirm
DIV_FILE = 'data/diversity.csv'

In [None]:
# Load the panel table from the CSV at PANEL.
panel = pd.read_csv(PANEL)
# Drop every row whose Primary_ID begins with 'P' (the plasmodium primers),
# then index the remaining rows by Primary_ID.
panel_mosquito = (
    panel
    .loc[lambda df: ~df['Primary_ID'].str.startswith('P')]
    .set_index('Primary_ID')
)

In [None]:
# read ag1000g variation data
# the zarr hierarchy rooted at SUBSET_DIR is opened with mode='r' (read-only),
# so nothing in this notebook can modify the stored data through `callset`
callset = zarr.open(SUBSET_DIR, mode='r')

In [None]:
# Load the per-sample metadata table (tab-separated text file).
samples = pd.read_csv(SAMPLE_META, sep='\t')
# Show, per country, how many samples have a missing m_s value.
samples[samples['m_s'].isna()]['country'].value_counts()

In [None]:
# Missing M/S values (found in the Kenya and Gambia samples) are filled
# with 'S', i.e. treated as gambiae; a separate "unknown" species
# category is no longer used.
samples['m_s'] = samples['m_s'].fillna(value='S')
# Show the resulting number of samples per m_s value.
samples['m_s'].value_counts()

In [None]:
# Mapping from short population code to the long label used for display.
# Insertion order is significant: it determines the order of `populations`.
pop_labels = OrderedDict()

# coluzzii populations
pop_labels['AOcol'] = 'Angola $coluzzii$'
pop_labels['BFcol'] = 'Burkina Faso $coluzzii$'
pop_labels['GHcol'] = 'Ghana $coluzzii$'
pop_labels['CIcol'] = "Côte d'Ivoire $coluzzii$"
pop_labels['GNcol'] = 'Guinea $coluzzii$'

# populations labelled by country only
pop_labels['GW'] = 'Guinea-Bissau'
pop_labels['GM'] = 'The Gambia'

# gambiae populations
pop_labels['GNgam'] = 'Guinea $gambiae$'
pop_labels['BFgam'] = 'Burkina Faso $gambiae$'
pop_labels['GHgam'] = 'Ghana $gambiae$'
pop_labels['CMgam'] = 'Cameroon $gambiae$'
pop_labels['UGgam'] = 'Uganda $gambiae$'
pop_labels['GAgam'] = 'Gabon $gambiae$'
pop_labels['GQgam'] = 'Bioko $gambiae$'
pop_labels['FRgam'] = 'Mayotte $gambiae$'

pop_labels['KE'] = 'Kenya'

# ANO_SPP runs and the reference genome
pop_labels['run1'] = 'ANO_SPP Gabon $coluzzii$'
pop_labels['run2'] = 'ANO_SPP Gabon $gambiae$'
pop_labels['ref'] = 'Reference genome'

# the ('colony', 'colony') entry is currently disabled

In [None]:
# Attach the long display label to every sample by translating its
# population code through pop_labels.
samples['pop_label'] = samples['population'].replace(pop_labels)
# Population codes, in the insertion order of pop_labels.
populations = [*pop_labels]
populations

In [None]:
# Colors for PCA and umap plots.
# One seaborn palette per group of populations: (palette name, number of shades).
reds, blues, greens, browns, purples, greys = (
    sns.color_palette(palette_name, n_shades)
    for palette_name, n_shades in (
        ('Reds', 5),
        ('Blues', 4),
        ('Greens', 2),
        ('YlOrBr', 4),
        ('Purples', 2),
        ('Greys', 4),
    )
)

# Long population label -> plot color.
pop_colors = {
    # reds
    'Angola $coluzzii$': reds[4],
    'Ghana $coluzzii$': reds[3],
    'Burkina Faso $coluzzii$': reds[2],
    "Côte d'Ivoire $coluzzii$": reds[1],
    'Guinea $coluzzii$': reds[0],
    # blues
    'Cameroon $gambiae$': blues[3],
    'Ghana $gambiae$': blues[2],
    'Burkina Faso $gambiae$': blues[1],
    'Guinea $gambiae$': blues[0],
    # browns
    'Guinea-Bissau': browns[1],
    'The Gambia': browns[2],
    # greens
    'Gabon $gambiae$': greens[1],
    'Uganda $gambiae$': greens[0],
    # purples
    'Mayotte $gambiae$': purples[1],
    'Bioko $gambiae$': purples[0],
    # browns (continued)
    'Kenya': browns[3],
    # greys
    # NOTE(review): both ANO_SPP entries use greys[3], so they get the same
    # color in plots - confirm this is intended.
    'ANO_SPP Gabon $coluzzii$': greys[3],
    'ANO_SPP Gabon $gambiae$': greys[3],
    'Reference genome': greys[2],
}