# calculate genome-wide pi (and eventually replot fig X)

## sch_man_nwinvasion


will need to run in the sch_man_nwinvasion-postproc env.

In [2]:
#run in sch_man_nwinvasion-jupyter environment

import os
import shutil
import allel
import math
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from scipy import stats
import itertools
import scipy.spatial
import random
from tqdm.notebook import tqdm

In [40]:
# Work from the project root so that the relative paths used in later cells resolve.
os.chdir("/master/nplatt/sch_man_nwinvasion")

# Output directory for the windowed pi/Fst results.
# makedirs(exist_ok=True) also creates a missing "results" parent and is safe
# to re-run, unlike the check-then-mkdir pattern it replaces.
os.makedirs("results/windowed_pi_and_fst", exist_ok=True)

Calculate the cumulative length of each chromosome for downstream plotting

# Read in and process data

 Get population assignments

In [4]:
# Load the population assignments; later cells look samples up in this
# mapping by sample name (pop_assign[sample]).
# NOTE(review): if this file only holds plain scalars/mappings, yaml.safe_load
# would be sufficient here -- confirm before switching loaders.
with open('data/pop_assign.yml') as pop_assign_fh:
    pop_assign = yaml.load(pop_assign_fh, Loader=yaml.FullLoader)

read in the vcf file and get pop specific allele counts

In [5]:
#-------------------------------------------------------------------------------
# per-population genotype information

# load the filtered VCF
filtered_callset = allel.read_vcf('results/variant_filtration/smv7_ex_autosomes.vcf')

# sample names; each sample's position in this array is used below as its
# index into the genotype array
samples = filtered_callset["samples"]

# population -> list of sample indices belonging to that population
pop_idxs = defaultdict(list)
for sample_idx, sample in enumerate(samples):
    pop_idxs[pop_assign[sample]].append(sample_idx)

pops = list(pop_idxs)

# genotype calls from the callset
gt = allel.GenotypeArray(filtered_callset['calldata/GT'])

# allele counts over every sample ...
ac = gt.count_alleles()

# ... and restricted to the samples of each population in turn
pop_ac = {pop: gt.count_alleles(subpop=pop_idxs[pop]) for pop in pops}

Find all of the accessible bases (only the regions targeted by the capture probes, since probes were used) and record the length of each chromosome

In [6]:
# Build, for every contig in the .fai index, a boolean mask with one entry per
# base (initialised to False) and record the contig length.
accessible_bases = {}
chrom_length = {}

with open('data/genomes/Smansoni_v7.fa.fai', 'r') as fai:
    for entry in fai:
        # only the first two .fai columns (name, length) are needed
        chrom, length, *offset = entry.rstrip().split("\t")
        chrom_length[chrom] = int(length)
        # a numpy bool array stores one byte per base; a Python list of bools
        # stores a full object pointer per base, which is prohibitive for
        # chromosome-sized contigs
        accessible_bases[chrom] = np.zeros(chrom_length[chrom], dtype=bool)


# Now read the bait BED file and flag every base covered by an interval.
with open('data/renamed-sma_agilent_baits.v7.0.chr_reorderd.bed', 'r') as in_bed_file:
    for bed_entry in in_bed_file:
        if not bed_entry.strip():
            continue  # tolerate blank lines (e.g. a trailing newline)

        # extra BED columns (name, score, ...) are ignored
        chrom, start, stop, *extra = bed_entry.rstrip().split("\t")

        # NOTE(review): "start - 1" treats the start column as 1-based. Standard
        # BED starts are 0-based, in which case this flags one extra base
        # upstream of every interval -- confirm the convention of this file.
        # Clamp at 0 so a start of 0 cannot wrap around to the last base of
        # the contig via negative indexing.
        first = max(int(start) - 1, 0)
        last = int(stop)

        # numpy slices clip silently, so keep an explicit error for intervals
        # that run past the end of the contig
        if last > chrom_length[chrom]:
            raise IndexError(
                "BED interval {}:{}-{} extends past contig length {}".format(
                    chrom, start, stop, chrom_length[chrom]
                )
            )

        # one slice assignment instead of a Python-level loop over every base
        accessible_bases[chrom][first:last] = True


In [7]:
# Read contig lengths from the .fai index, then lay chromosomes SM_V7_1..7 end
# to end: cumul_start[chrom] is the summed length of the chromosomes placed
# before it, and scanned_size is the total length of all seven.
chr_length = {}
with open('/master/nplatt/sch_man_nwinvasion/data/genomes/Smansoni_v7.fa.fai', 'r') as fai:
    for fai_line in fai:
        contig, n_bases, *_ = fai_line.rstrip().split("\t")
        chr_length[contig] = int(n_bases)

cumul_start = {}
running_total = 0
for chrom_name in ('SM_V7_1', 'SM_V7_2', 'SM_V7_3', 'SM_V7_4',
                   'SM_V7_5', 'SM_V7_6', 'SM_V7_7'):
    cumul_start[chrom_name] = running_total
    running_total += chr_length[chrom_name]

scanned_size = running_total

# Calculate Pi and Fst in same windows

In [None]:
# Windowed nucleotide diversity (pi) and Weir & Cockerham Fst, computed over
# the same windows on each chromosome, for each target population. The result
# for the population processed last is left in `df`, one row per window.

#set window size
window_size = 100_000

# Populations and chromosomes to process.
# NOTE(review): currently restricted to a single population and two
# chromosomes. The full sets previously noted in this cell were
#   ["niger", "senegal", "brazil", "tanzania"]
#   np.unique(filtered_callset["variants/CHROM"])
target_pops = ["tanzania"]
target_chroms = ["SM_V7_6", "SM_V7_7"]

#get chrom and pos values from vcf file
pos = filtered_callset["variants/POS"]
chroms = filtered_callset["variants/CHROM"]

#get idxs from samples in all major pops
major_pop_idxs = pop_idxs["niger"] + pop_idxs["brazil"] + pop_idxs["senegal"] + pop_idxs["tanzania"]

for pop in target_pops:

    # Fst contrasts the samples of this population (query) against the samples
    # of every other major population (subject); sorted for a stable order
    query_idxs = pop_idxs[pop]
    subject_idxs = sorted(set(major_pop_idxs) - set(query_idxs))

    # one DataFrame per chromosome, concatenated once at the end
    chrom_frames = []

    for chrom in target_chroms:
        # subset positions, allele counts and genotypes to this chromosome so
        # that no window extends over two chromosomes
        on_chrom = chroms == chrom
        chrom_pos = pos[on_chrom]
        chrom_ac = pop_ac[pop][on_chrom]
        chrom_gt = gt[on_chrom]

        #calculate pi over "window" bases
        # (the original call referenced the undefined names sub_pos / sub_ac)
        chrom_pis, chrom_windows, chrom_bases, chrom_snvs = allel.windowed_diversity(
            chrom_pos,
            chrom_ac,
            size=window_size,
            is_accessible=accessible_bases[chrom],
        )

        #calculate fst in each pi window
        # Passing all windows in one call avoids handing the whole
        # chromosome's genotypes to the function once per window; element 0
        # of the returned tuple holds the per-window Fst values.
        chrom_fsts = allel.windowed_weir_cockerham_fst(
            chrom_pos,
            chrom_gt,
            [query_idxs, subject_idxs],
            windows=chrom_windows,
        )[0]

        chrom_frames.append(
            pd.DataFrame(
                {
                    "chrom": [chrom] * len(chrom_pis),
                    "start": chrom_windows[:, 0],
                    "stop": chrom_windows[:, 1],
                    "pi": chrom_pis,
                    "fst": chrom_fsts,
                    "n_access_bases": chrom_bases,
                    "n_vars": chrom_snvs,
                }
            )
        )

    #add values to a data frame
    # (building per-chromosome frames keeps the integer columns as integers and
    #  avoids the list.extend() misuse that set the fst list to None)
    df = pd.concat(chrom_frames, ignore_index=True)[
        ["chrom", "start", "stop", "pi", "fst", "n_access_bases", "n_vars"]
    ]

    #save the dataframe to a csv
    #csv_file = "./results/windowed_pi_and_fst/{}_windowed_pi_and_fst.csv".format(pop)
    #df.to_csv(csv_file, index=False, header=True, mode='w')

In [None]:
# display the per-window pi/Fst table assigned to `df` in the cell above
# (it holds the result for the last population processed by that loop)
df