# Common setup

In [None]:
import zarr
from pyprojroot import here
import pandas as pd
import numpy as np
import allel
import yaml
import matplotlib.pyplot as plt
import functools
import seaborn as sns
import dask.array as da
import scipy.interpolate
import scipy.stats
import petl as etl
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
# Location of the phased haplotype callset, resolved relative to the project
# root returned by pyprojroot's here().
callset_haps_path = here() / 'data/external/ag1000g/phase2/AR1/haplotypes/main/zarr/ag1000g.phase2.ar1.haplotypes'
# Open the Zarr store through its consolidated metadata; the path is passed
# as a plain string.
callset_haps = zarr.open_consolidated(str(callset_haps_path))

In [None]:
# Haplotype metadata for the autosomes: a tab-delimited file whose first
# column is used as the DataFrame index.
df_haps_a = pd.read_csv(here() / 'data/external/ag1000g/phase2/AR1/haplotypes/main/haplotypes.autosomes.meta.txt',
                        sep='\t', index_col=0)
# Preview the first rows as the cell output.
df_haps_a.head()

In [None]:
# Haplotype metadata for the X chromosome: a tab-delimited file whose first
# column is used as the DataFrame index.
df_haps_x = pd.read_csv(here() / 'data/external/ag1000g/phase2/AR1/haplotypes/main/haplotypes.X.meta.txt',
                        sep='\t', index_col=0)
# Preview the first rows as the cell output.
df_haps_x.head()

In [None]:
# Load the population definitions from 'pop_defs.yml', resolved against the
# current working directory. The encoding is given explicitly so that
# decoding does not depend on the platform's locale default.
with open('pop_defs.yml', mode='r', encoding='utf-8') as f:
    # safe_load only constructs plain Python objects (no arbitrary tags).
    pop_defs = yaml.safe_load(f)

In [None]:
import pyfasta
# Reference genome FASTA, resolved relative to the project root.
genome_path = here() / 'data/external/vectorbase/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
# key_fn keeps only the first whitespace-delimited token of each FASTA header,
# so sequences are looked up by that token (e.g. genome['2R'] in build_gmap).
genome = pyfasta.Fasta(str(genome_path), key_fn=lambda x: x.split()[0])

In [None]:
# Table of named chromatin regions; the first row is the header.
# build_gmap() slices with [start - 1:end], i.e. it treats 'start'/'end' as
# 1-based inclusive coordinates on the chromosome arm given in 'chrom'.
# NOTE(review): region-name prefixes presumably stand for CH = compact
# heterochromatin, PEU = pericentromeric euchromatin, IH = intercalary
# heterochromatin - confirm against the source of these coordinates.
tbl_chromatin = [
    ('name', 'chrom', 'start', 'end'),
    ('CHX', 'X', 20009764, 24393108),
    ('CH2R', '2R', 58984778, 61545105),
    ('CH2L', '2L', 1, 2431617),
    ('PEU2L', '2L', 2487770, 5042389),
    ('IH2L', '2L', 5078962, 5788875),
    ('IH3R', '3R', 38988757, 41860198),
    ('CH3R', '3R', 52161877, 53200684),
    ('CH3L', '3L', 1, 1815119),
    ('PEU3L', '3L', 1896830, 4235209),
    ('IH3L', '3L', 4264713, 5031692)
]

In [None]:
# Sequence identifiers (chromosome arms plus X) used as keys into the genome
# and as the keys of the genetic map built by build_gmap().
seq_ids = '2R', '2L', '3R', '3L', 'X'

In [None]:
def build_gmap():
    """Build a crude per-base genetic map keyed by sequence id.

    Recombination rates are looked up from the regions in ``tbl_chromatin``
    and expressed in cM / bp: regions whose name contains 'H' get 0.5e-6,
    every other position gets 2e-6 (i.e. 2 cM / Mbp).

    Returns
    -------
    dict
        For each id in ``seq_ids``, a float64 array with one entry per base
        holding the cumulative sum of the per-base recombination rates.
        Two extra keys, '2' and '3', hold the R arm's map followed by the
        L arm's map shifted by the final value of the R arm.
    """
    base_rate = 2e-6   # rate applied outside 'H' regions, cM / bp
    slow_rate = .5e-6  # rate applied inside regions whose name contains 'H'

    def rate_for(rec):
        return slow_rate if 'H' in rec.name else base_rate

    # Region table with a recombination rate attached to every row.
    regions = etl.wrap(tbl_chromatin)
    # Extend heterochromatin on 2L - this is empirical, based on making the
    # vgsc peaks symmetrical.
    regions = regions.update('end', 2840000, where=lambda rec: rec.name == 'CH2L')
    regions = regions.update('start', 2840001, where=lambda rec: rec.name == 'PEU2L')
    regions = regions.addfield('rr', rate_for)

    # One rate per base, initialised to the default and then overwritten
    # region by region.
    per_base = {}
    for chrom in seq_ids:
        per_base[chrom] = np.full(len(genome[chrom]), fill_value=base_rate, dtype='f8')
    for rec in regions.records():
        arm_rates = per_base[rec.chrom]
        # Slice start is shifted by one: table 'start' is treated as 1-based.
        arm_rates[rec.start - 1:rec.end] = rec.rr

    # Genetic map = running total of the per-base rates along each sequence.
    gmap = {}
    for chrom in seq_ids:
        gmap[chrom] = np.cumsum(per_base[chrom])

    # Whole-chromosome maps: R arm first, then the L arm continuing from
    # where the R arm ended.
    for chrom in ('2', '3'):
        right_arm = gmap[chrom + 'R']
        left_arm = gmap[chrom + 'L']
        gmap[chrom] = np.concatenate([right_arm, left_arm + right_arm[-1]])

    return gmap

# Build the genetic map once and keep it as a notebook-level global.
gmap = build_gmap()