# Calculate genome-wide pi (and eventually replot fig X)

## sch_man_nwinvasion


will need to run in the sch_man_nwinvasion-postproc env.

In [1]:
#run in sch_man_nwinvasion-jupyter environment

import os
import shutil
import allel
import math
import yaml
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from scipy import stats
import itertools
import scipy.spatial
import random
from tqdm.notebook import tqdm

In [145]:
# Work from the project root so the relative paths used in this notebook resolve.
os.chdir("/master/nplatt/sch_man_nwinvasion")

# Create the output directory for the windowed pi tables. makedirs with
# exist_ok=True is safe to re-run and also creates "results" if it is missing,
# whereas a bare os.mkdir would fail in that case.
os.makedirs("results/windowed_pi", exist_ok=True)

Calculate the cumulative length of each chromosome for downstream plotting

# Read in and process data

 Get population assignments

In [3]:
# Load the population-assignment YAML into `pop_assign`; elsewhere in the
# notebook it is indexed by sample name to obtain a population label.
with open('data/pop_assign.yml', 'r') as pop_assign_fh:
    pop_assign = yaml.load(pop_assign_fh, Loader=yaml.FullLoader)

Read in the VCF file and get population-specific allele counts

In [4]:
#-------------------------------------------------------------------------------
# get genotype info per population

# read in the filtered vcf
filtered_callset = allel.read_vcf('results/variant_filtration/smv7_ex_autosomes.vcf')

# map each population label to the indices of its samples, in the order the
# samples appear in the callset
samples = filtered_callset["samples"]

pop_idxs = defaultdict(list)
for sample_idx, sample in enumerate(samples):
    # raises KeyError if a sample in the vcf has no entry in pop_assign
    pop_idxs[pop_assign[sample]].append(sample_idx)

pops = list(pop_idxs.keys())

# get genotypes
gt = allel.GenotypeArray(filtered_callset['calldata/GT'])

# allele counts over all samples
ac = gt.count_alleles()

# allele counts restricted to the samples of each population
pop_ac = {pop: gt.count_alleles(subpop=pop_idxs[pop]) for pop in pops}

Find all of the accessible bases (only bases covered by the capture probes count as accessible) and record the length of each chromosome

In [5]:
# Build a per-contig accessibility mask: one boolean per base, initialised to
# False and switched to True for every base covered by a bait interval.
accessible_bases = {}
chrom_length = {}

# the first two tab-separated columns of the .fai index are contig name and length
with open('data/genomes/Smansoni_v7.fa.fai', 'r') as fai:
    for entry in fai:
        chrom, length, *offset = entry.rstrip().split("\t")
        chrom_length[chrom] = int(length)
        accessible_bases[chrom] = [False] * int(length)


# now read the bed and flag every base inside a bait interval
with open('data/renamed-sma_agilent_baits.v7.0.chr_reorderd.bed', 'r') as in_bed_file:
    for bed_entry in in_bed_file:
        # tolerate blank lines and any extra columns after chrom/start/stop
        if not bed_entry.strip():
            continue
        chrom, start, stop = bed_entry.rstrip().split("\t")[:3]

        # NOTE(review): the "start - 1" offset is kept from the original code.
        # If this bed file uses standard 0-based, half-open coordinates it marks
        # one extra base in front of every interval -- confirm the convention.
        # Clamp at 0 so a start of 0 cannot become index -1, which would
        # silently flag the LAST base of the contig as accessible.
        first = max(int(start) - 1, 0)
        last = int(stop)

        mask = accessible_bases[chrom]
        if last > len(mask):
            # slice assignment would silently grow the list, so keep the
            # out-of-range failure explicit
            raise IndexError(
                "bed interval {}:{}-{} extends past the contig length ({})".format(
                    chrom, start, stop, len(mask)
                )
            )
        if last > first:
            # one slice assignment instead of a Python-level loop over every base
            mask[first:last] = [True] * (last - first)


In [6]:
# Read contig lengths from the .fai index, then compute for each of the seven
# SM_V7_1 .. SM_V7_7 chromosomes the cumulative offset at which it starts when
# the chromosomes are laid end to end (used for genome-wide plotting).
# `scanned_size` is the total length of those seven chromosomes.
chr_length = {}
#genome_size = 0
# relative path: same file as the accessibility cell reads, resolved from the
# project root set by os.chdir at the top of the notebook
with open('data/genomes/Smansoni_v7.fa.fai', 'r') as fai:
    for entry in fai:
        chrom, length, *offset = entry.rstrip().split("\t")
        chr_length[chrom] = int(length)

# running total: each chromosome starts where the previous one ended
cumul_start = {}
scanned_size = 0
for chrom_number in range(1, 8):
    chrom_name = 'SM_V7_{}'.format(chrom_number)
    cumul_start[chrom_name] = scanned_size
    scanned_size += chr_length[chrom_name]

# Pi

In [151]:
# Compute windowed nucleotide diversity (pi) per population and chromosome and
# write one csv per population to results/windowed_pi/.

#set window size
window_size = 100_000

#get chrom and pos values from vcf file
pos = filtered_callset["variants/POS"]
chroms = filtered_callset["variants/CHROM"]

# the set of chromosomes is the same for every population, so compute it once
uniq_chroms = np.unique(chroms)

for pop in ["niger", "senegal", "brazil", "tanzania"]:

    #initialize empty containers for the values to be saved
    pis = []
    starts = []
    stops = []
    bases = []
    snvs = []
    chrom_labels = []

    for chrom in uniq_chroms:
        #subset the positions and allele counts for each chrom
        #  makes sure to restrict calculations along a single chrom rather
        #  than windows extending over 2 chroms
        on_chrom = chroms == chrom
        sub_pos = pos[on_chrom]
        sub_ac = pop_ac[pop][on_chrom]

        #calculate pi over "window_size" bases, counting only accessible bases
        pi, window, base, snv = allel.windowed_diversity(
            sub_pos,
            sub_ac,
            size=window_size,
            is_accessible=accessible_bases[chrom],
        )

        #append values for each stat; `window` holds one (start, stop) pair per
        #  row, so the stop coordinate is column 1 (the original code stored
        #  column 0 twice, making "stop" a copy of "start")
        pis = np.hstack((pis, pi))
        starts = np.hstack((starts, window[:, 0]))
        stops = np.hstack((stops, window[:, 1]))
        bases = np.hstack((bases, base))
        snvs = np.hstack((snvs, snv))
        chrom_labels.extend([chrom] * len(pi))

    #add values to a data frame
    pi_df = pd.DataFrame(
        {
            "chrom": chrom_labels,
            "start": starts,
            "stop": stops,
            "pi": pis,
            "n_access_bases": bases,
            "n_vars": snvs,
        }
    )

    #save the dataframe to a csv
    csv_file = "./results/windowed_pi/{}_windowed_pi.csv".format(pop)
    pi_df.to_csv(csv_file, index=False, header=True, mode='w')

# Fst

In [None]:
# Compute windowed Weir & Cockerham Fst of each major population ("query")
# against the pooled samples of the other three major populations, per
# chromosome, and write one csv per query population.

# window size, counted in consecutive variants (not base pairs)
fst_window_size = 100
pos = filtered_callset["variants/POS"]

# exist_ok makes the cell safe to re-run; makedirs also creates parents
os.makedirs("results/windowed_fst_target_vs_others", exist_ok=True)

chroms = filtered_callset["variants/CHROM"]

#get idxs from samples in all major pops
major_pop_idxs = pop_idxs["niger"] + pop_idxs["brazil"] + pop_idxs["senegal"] + pop_idxs["tanzania"]

#each pop is the "target" pop
for query_pop in ["niger", "senegal", "brazil", "tanzania"]:

    #clean vars
    labels = []
    fsts = []
    starts = []
    stops = []

    #get idxs of query pop and idxs from all other major pops
    #  (sorted so the sample order is deterministic)
    query_idxs = pop_idxs[query_pop]
    subject_idxs = sorted(set(major_pop_idxs) - set(query_idxs))

    #break up calcs into specific chroms
    for chrom in np.unique(chroms):

        #get chrom gt
        on_chrom = chroms == chrom
        chrom_gt = gt[on_chrom]
        sub_pos = pos[on_chrom]

        #calculate fst on chrom
        chrom_fsts = allel.moving_weir_cockerham_fst(chrom_gt, [query_idxs, subject_idxs], fst_window_size)

        #get start and stop coords for each fst window: the position of the
        #  first and last variant of every block of fst_window_size variants,
        #  trimmed to the number of windows actually returned. The original
        #  start slice (0:-size:size) came up one short whenever the number of
        #  variants on a chromosome was an exact multiple of the window size,
        #  which breaks the data frame assembly below.
        n_windows = len(chrom_fsts)
        chrom_starts = sub_pos[0::fst_window_size][:n_windows]
        chrom_stops = sub_pos[fst_window_size - 1::fst_window_size][:n_windows]

        #append chrom values to those from the rest of the genomes
        fsts = np.hstack((fsts, chrom_fsts))
        starts = np.hstack((starts, chrom_starts))
        stops = np.hstack((stops, chrom_stops))
        labels.extend([chrom] * n_windows)

    #add values to a data frame
    fst_df = pd.DataFrame(
        {
            "chrom": labels,
            "start": starts,
            "stop": stops,
            "fst": fsts,
        }
    )

    #all negative fsts changed to 0
    fst_df.loc[fst_df["fst"] < 0, "fst"] = float(0)

    #save to csv
    csv_file = "./results/windowed_fst_target_vs_others/{}_windowed-{}snvs_fst_vs_others_.csv".format(query_pop, fst_window_size)
    fst_df.to_csv(csv_file, index=False, header=True, mode='w')

  p = ac / an[:, np.newaxis, :]
  a = ((n_bar / n_C) *
