# Genomic analysis of a parasite invasion: colonization of the New World by the blood fluke, Schistosoma mansoni 

Roy Nelson Platt II*, Frédéric D. Chevalier*, Winka Le Clec'h, Marina McDew-White, Philip T. LoVerde, Rafael Ramiro de Assis, Guilherme Oliveira, Safari Kinunghi, Anouk Gouvras, Bonnie Webster, Joanne Webster, Aidan Emery, David Rollinson, Timothy J. Anderson

# Explore impacts of invasion/introduction on mitochondrial diversity (not phylogeny)

Use the `sch_man_nwinvasion-mito_diversity` conda env.

In [8]:
import os
#import shutil
import allel
#import math
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
#from scipy import stats
#import itertools
#import scipy.spatial
#import random
#from tqdm import tqdm

# Root of the project tree; the other paths in this notebook are built from it.
proj_dir = "/master/nplatt/sch_man_nwinvasion"

# Directory that holds the analysis outputs (<proj_dir>/results).
results_dir = "{}/results".format(proj_dir)

In [2]:
# Work from inside <results_dir>/mito_diversity so that files written with a
# relative path by later cells end up in that directory.
os.chdir(proj_dir)

mito_diversity_dir = "{}/mito_diversity".format(results_dir)

# makedirs(exist_ok=True) replaces the "check, then mkdir" pair: it also
# creates a missing results_dir, and it does nothing when the directory is
# already there (no window between the existence check and the creation).
os.makedirs(mito_diversity_dir, exist_ok=True)

os.chdir(mito_diversity_dir)

In [3]:
# Load the sample -> population assignment table. Later cells look samples up
# in it by name (pop_assign[sample]).
pop_assign_path = '{}/data/pop_assign.yml'.format(proj_dir)

with open(pop_assign_path) as pop_assign_handle:
    pop_assign = yaml.load(pop_assign_handle, Loader=yaml.FullLoader)

In [72]:
# Build a per-contig mask of "accessible" bases, i.e. bases covered by a bait
# interval in the BED file.
#   chrom_length:     contig name -> contig length (from the FASTA index)
#   accessible_bases: contig name -> boolean NumPy array with one element per
#                     base; True where the base falls inside a bait interval
accessible_bases = {}
chrom_length = {}

# Start with every base of every contig marked as not accessible. Only the
# first two columns of each .fai record (contig name, length) are needed.
with open('{}/data/genomes/Smansoni_v7.fa.fai'.format(proj_dir), 'r') as fai:
    for entry in fai:
        chrom, length = entry.rstrip().split("\t")[:2]
        chrom_length[chrom] = int(length)
        # One byte per base; a Python list of bools costs a pointer per base
        # and has to be filled element by element.
        accessible_bases[chrom] = np.zeros(int(length), dtype=bool)

# Now read the BED file and switch on the bases inside every interval.
# NOTE(review): intervals are applied as [start - 1, stop), i.e. `start` is
# treated as 1-based. Standard BED starts are 0-based; if this file follows
# that convention every interval is extended one base to the left. Confirm
# the coordinate convention of this file before changing it.
with open('{}/data/renamed-sma_agilent_baits.v7.0.chr_reorderd.bed'.format(proj_dir), 'r') as in_bed_file:
    for bed_entry in in_bed_file:
        fields = bed_entry.rstrip().split("\t")
        if len(fields) < 3:
            # blank lines and track/browser header lines carry no interval
            continue
        chrom, start, stop = fields[:3]  # tolerate extra BED columns

        # Clamp at 0: a start of 0 would otherwise become index -1 and flag
        # the last base of the contig instead.
        first = max(int(start) - 1, 0)
        last = int(stop)

        # Slice assignment silently clips at the array end, so keep the loud
        # failure for intervals that run past the end of the contig.
        if last > chrom_length[chrom]:
            raise IndexError(
                "bait interval {}:{}-{} extends past the contig end ({})".format(
                    chrom, start, stop, chrom_length[chrom]
                )
            )

        accessible_bases[chrom][first:last] = True

In [80]:
%%bash
# Filter the mitochondrial VCF with vcftools (--maf 0.05), keeping all INFO
# fields, and write the recoded VCF to mito_maf05.vcf in the current directory.
in_vcf=/master/nplatt/sch_man_nwinvasion/results/mito_network/mito.vcf

vcftools --vcf "${in_vcf}" --maf 0.05 --recode --recode-INFO-all --stdout >mito_maf05.vcf


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf /master/nplatt/sch_man_nwinvasion/results/mito_network/mito.vcf
	--recode-INFO-all
	--maf 0.05
	--recode
	--stdout

After filtering, kept 142 out of 142 Individuals
Outputting VCF file...
After filtering, kept 298 out of a possible 815 Sites
Run Time = 0.00 seconds


In [81]:
#-------------------------------------------------------------------------------
# Per-population genotype bookkeeping for the mitochondrial callset

# NOTE(review): this reads mito_network/mito.vcf, not the mito_maf05.vcf
# written by the vcftools cell, even though the variable is called
# `filtered_callset` -- confirm which file is intended.
filtered_callset = allel.read_vcf('{}/mito_network/mito.vcf'.format(results_dir))

samples = filtered_callset["samples"]

# population label -> positions of its samples in the callset's sample list
pop_idxs = defaultdict(list)
for sample_idx, sample in enumerate(samples):
    pop_idxs[pop_assign[sample]].append(sample_idx)

pops = list(pop_idxs)

# Keep only index 0 of the last genotype axis, giving one allele call per
# sample and variant.
h = allel.HaplotypeArray(filtered_callset['calldata/GT'][:, :, 0])

# Allele counts over all samples, then separately for each population.
ac = h.count_alleles()
pop_ac = {pop: h.count_alleles(subpop=pop_idxs[pop]) for pop in pops}

In [82]:
# Haplotype diversity, reported once per population.
print("haplotype diversity")
for pop in ("senegal", "niger", "brazil", "tanzania"):
    # columns of h that belong to this population's samples
    pop_haplotypes = h[:, pop_idxs[pop]]
    print("{}: {}".format(pop, allel.haplotype_diversity(pop_haplotypes)))

haplotype diversity
senegal: 0.9966666666666666
niger: 0.9777777777777779
brazil: 0.9838383838383838
tanzania: 1.0


In [83]:
# Mitochondrial nucleotide diversity (pi), reported once per population.
# Signature used: allel.sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None)
# NOTE(review): start/stop are left at their defaults, so the region pi is
# averaged over is whatever scikit-allel picks in that case -- confirm that
# this is the intended span of SM_V7_MITO.
pos_s = filtered_callset['variants/POS']
mito_accessible = accessible_bases["SM_V7_MITO"]

for pop in ("senegal", "niger", "brazil", "tanzania"):
    pi = allel.sequence_diversity(pos_s, pop_ac[pop], is_accessible=mito_accessible)
    print("{}: {}".format(pop, pi))

senegal: 0.004721205528336565
niger: 0.005889746547185994
brazil: 0.00021015453066303578
tanzania: 0.0072544713041787865
