Common libs and constants for Ag1000g data analysis

In [1]:
from scipy.spatial.distance import pdist, cdist
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import itertools
import scipy
import allel
import zarr
import sys
import os
allel.__version__, zarr.__version__

('1.2.1', '2.3.2')

In [2]:
# ag1k data files
# newer zarr does not have sample metadata
# HAPS_ZARR = '../../../data/ag1k/ag1000g.phase2.ar1.haplotypes/'
HAPS_ZARR = '../../../data/ag1k/haplotypes/'
HAPS_X_ZARR = '../../../data/ag1k/haplotypes_X/'
SAMPLE_META = '../../../data/ag1k/phase2_samples.meta.txt'
# panel
PANEL = '../../data/panel_extended_info.csv'
# mosquito amplicons
AMPLS = [str(i) for i in range(62)]
# outputs
WD = '../../../data/phylo_ampl_ag1k_hap/'
AMPL_HAP_ZARR = os.path.join(WD, 'ampl_hap')
WSP_VAR_FILE = 'data/1_wsp_var.csv'

In [3]:
! mkdir -p {AMPL_HAP_ZARR}
! mkdir -p 'data'

In [4]:
# read amplicon data
panel = pd.read_csv(PANEL)
# get insert coordinates - variation in primers is not picked up by sequencing
panel['start_insert'] = panel.start + panel.F.str.len()
panel['end_insert'] = panel.end - panel.R.str.len()
# remove plasmodium primers from consideration
panel_mosquito = panel[panel['Primary_ID'].isin(AMPLS)].set_index('Primary_ID')
panel_mosquito.columns

Index(['ID', 'SeekDeep_ID', 'SNP', 'F', 'R', 'chr', 'start', 'end', 'F_deg',
       'R_deg', 'identical_seq', 'idenified_lineages', 'aligned_spp',
       'unaligned_spp', 'aligned_ampl_length', 'agam_ampl_length',
       'aligned_insert_length', 'agam_insert_length', 'total_snvs',
       'total_indels', 'insert_snvs', 'insert_indels', 'prop_id_lineages',
       'exon', 'gene', 'intron', 'mRNA', 'repeat', 'utr', 'AaegL5_identity',
       'AaegL5_q_span', 'AaegL5_s_span', 'AgamP3_identity', 'AgamP3_q_span',
       'AgamP3_s_span', 'BDGP6_identity', 'BDGP6_q_span', 'BDGP6_s_span',
       'CpipJ2_identity', 'CpipJ2_q_span', 'CpipJ2_s_span', 'Comments',
       'Outgroup ampl', 'start_insert', 'end_insert'],
      dtype='object')

In [5]:
# read samples metadata
samples = pd.read_csv(SAMPLE_META, delimiter='\t')
samples.columns

Index(['ox_code', 'src_code', 'population', 'country', 'location', 'site',
       'contributor', 'contact', 'year', 'm_s', 'sex', 'n_sequences',
       'mean_coverage', 'ebi_sample_acc', 'latitude', 'longitude'],
      dtype='object')

In [6]:
# species
ms_sp = {
 'M':'coluzzii',
 'S':'gambiae',
 'M/S':'hybrid'}
samples['species'] = samples.m_s.replace(ms_sp)
# add unknown species for missing M/S values - applicable to Kenya
samples['species'] = samples.species.fillna('unknown')
samples.species.value_counts()

gabmiae     654
coluzzii    283
unknown     204
hybrid        1
Name: species, dtype: int64

In [7]:
# populations
populations = {
    'AOcol': 'Angola coluzzii',
    'GHcol': 'Ghana coluzzii',
    'BFcol': 'Burkina coluzzii',
    'CIcol': 'Côte d\'Ivoire coluzzii',
    'GNcol': 'Guinea colizzii',
    'CMgam': 'Cameroon coluzzii',
    'GHgam': 'Ghana gambiae',
    'BFgam': 'Burkina gambiae',
    'GNgam': 'Guinea gambiae',
    'GW': 'Guinea-Bissau',
    'GM': 'Gambia',
    'GAgam': 'Gabon gambiae',
    'UGgam': 'Uganda gambiae',
    'FRgam': 'Mayotte gambiae',
    'GQgam': 'Equatorial Guinea gambiae',
    'KE': 'Kenya',
}
samples['pop_long'] = samples.population.replace(populations)
samples.pop_long.value_counts()

Cameroon coluzzii            297
Uganda gambiae               112
Burkina gambiae               92
Guinea-Bissau                 91
Angola coluzzii               78
Burkina coluzzii              75
Côte d'Ivoire coluzzii        71
Gabon gambiae                 69
Gambia                        65
Ghana coluzzii                55
Kenya                         48
Guinea gambiae                40
Mayotte gambiae               24
Ghana gambiae                 12
Equatorial Guinea gambiae      9
Guinea colizzii                4
Name: pop_long, dtype: int64

In [8]:
# population colours
reds = sns.color_palette('Reds', 5)
blues = sns.color_palette('Blues', 4)
greens = sns.color_palette('Greens', 2)
browns = sns.color_palette('YlOrBr', 4)
purples = sns.color_palette('Purples', 2)
greys = sns.color_palette('Greys', 3)
pop_colors = {
    'Angola coluzzii': reds[4],
    'Ghana coluzzii': reds[3],
    'Burkina coluzzii': reds[2],
    'Côte d\'Ivoire coluzzii': reds[1],
    'Guinea coluzzii': reds[0],
    'Cameroon colizzii': blues[3],
    'Ghana gambiae': blues[2],
    'Burkina gambiae': blues[1],
    'Guinea gambiae': blues[0],
    'Guinea-Bissau': browns[1],
    'Gambia': browns[2],
    'Gabon gambiae': greens[1],
    'Uganda gambiae': greens[0],
    'Mayotte gambiae': purples[1],
    'Equatorial Guinea gambiae': purples[0],
    'Kenya': greys[1],
}