In [None]:
from Bio.Phylo.TreeConstruction import DistanceCalculator
from Bio import AlignIO
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import itertools
import ete3
import sys
import os 

In [None]:
# keep text editable in PDF exports
import matplotlib as mpl
# font type 42 = TrueType, so text is embedded as fonts rather than Type 3 glyphs
mpl.rcParams.update({'pdf.fonttype': 42})

In [None]:
# File locations. All paths are relative, so they resolve against the
# directory the notebook is run from.

# amplicon panel description table (CSV)
PANEL = '../../data/panel_extended_info.csv'

# key data files
SEQ_FILE = 'data/0_haplotypes.csv'
SEQ_META_FILE = 'data/0_samples.csv'
TAXONOMY_FILE = 'data/0_taxonomy.csv'
REF_FILE = '../4_ref_extraction/data/refs.csv'
REF_META_FILE = '../4_ref_extraction/data/samples_ref.csv'

# sample and sequencing metadata
COMB_SEQ_META = 'data/1_sample_seq.csv'

# thresholds
GC_THRESHOLDS = '../6_ag1k_extraction/data/1_wsp_var.csv'
COMB_THRESHOLDS = 'data/2_thresholds.csv'

# filters
BAD_SAMPLES = 'data/2_bad_samples.csv'
OUTLIER_GENOTYPES = 'data/3_outlier_genotypes.csv'
SPLIT_ALLELES = 'data/3_split_alleles.csv'

# clustering
SPP_PRED = 'data/4_spp_predictions.csv'

# species tree (Newick files plus the PDF figure)
GENE_TREES = 'data/5_gene_trees.nwk'
SPECIES_TREE = 'data/5_species_tree.nwk'
SPECIES_TREE_FIG = 'data/6_species_tree.pdf'
# sample tree (Newick file plus the PDF figure)
SAMPLE_TREE = 'data/5_sample_tree.nwk'
SAMPLE_TREE_FIG = 'data/6_sample_tree.pdf'


# whole-genome tree (Newick file plus the PNG figure)
WG_TREE = 'src/neafsey2015_fig1.nwk'
WG_TREE_FIG = 'data/7_wg_tree.png'

# working files
# Root directory for intermediate files; every path below is built from it.
WD = '../../../data/phylo_ampl_dada2/phylo_reduced'
# '{}' is a format placeholder (presumably filled per amplicon - confirm at use sites)
ALN_ALL = os.path.join(WD, 'aln_all/{}.fa')
COMB_SEQ = os.path.join(WD, 'comb_seq.csv')
COMB_META = os.path.join(WD, 'comb_meta.csv')
CLUSTERING = os.path.join(WD, 'clustering.csv')
SP_TREE_DIR = os.path.join(WD, 'species_tree')
MAPPING = os.path.join(SP_TREE_DIR, 'sp-sample_map.txt')
# The file name must not start with '/': os.path.join drops every earlier
# component when a later one is absolute, which would place this file at the
# filesystem root instead of inside WD.
SAMPLE_MAPPING = os.path.join(WD, '5-sample-hap-map.txt')

# '{}' is a format placeholder, as in ALN_ALL
ALN_SP_TREE = os.path.join(SP_TREE_DIR, '{}.fa')


# params
# amplicon identifiers as strings, '0' through '61'
AMPLS = list(map(str, range(62)))

In [None]:
# Create the directory part of ALN_ALL, including missing parents; it is not
# an error if it already exists. Done in Python rather than via a shell
# command so it needs no POSIX shell and is safe for paths containing spaces.
os.makedirs(os.path.dirname(ALN_ALL), exist_ok=True)

In [None]:
# Create SP_TREE_DIR, including missing parents; it is not an error if it
# already exists. Done in Python rather than via a shell command so it needs
# no POSIX shell and is safe for paths containing spaces.
os.makedirs(SP_TREE_DIR, exist_ok=True)

In [None]:
# Load the panel table, keep only rows whose Primary_ID is listed in AMPLS
# (this drops the non-mosquito amplicons), then index the table by Primary_ID.
# NOTE(review): AMPLS holds strings, so the filter assumes Primary_ID is read
# as text rather than as integers - confirm against the CSV.
panel_data = (
    pd.read_csv(PANEL)
    .loc[lambda df: df['Primary_ID'].isin(AMPLS)]
    .set_index('Primary_ID')
)
# show the remaining column names as the cell output
panel_data.columns