In [70]:
#run in sch_man_nwinvasion-jupyter environment

import os
import shutil
import allel
import math
import yaml
import pandas as pd
#import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
from scipy import stats
import itertools


In [2]:
os.chdir("/master/nplatt/sch_man_nwinvasion")

# Read in and process data

 Get population assignments

In [3]:
with open('data/pop_assign.yml') as yaml_file:
    pop_assign = yaml.load(yaml_file, Loader=yaml.FullLoader)

read in the vcf file and get pop specific allele counts

In [4]:
#-------------------------------------------------------------------------------
# get genotype info per population

#read in vcf
filtered_callset=allel.read_vcf('results/variant_filtration/smv7_ex_autosomes.vcf')

#now get an index for each sample/population
samples = filtered_callset["samples"]

i=0
pop_idxs = defaultdict(list)   
for sample in samples:  
    pop_idxs[pop_assign[sample]].append(i) 
    i=i+1

pops= list(pop_idxs.keys()) 

#get genotypes
gt=allel.GenotypeArray(filtered_callset['calldata/GT'])

#now get allele count per population
ac=gt.count_alleles()

pop_ac={}
for pop in pops:
    pop_ac[pop] = gt.count_alleles(subpop=pop_idxs[pop])

Find all of the accessible bases (since used probes) and get an idea of the length of each chr

In [5]:
#initialize an list the lenght of each contig to fale
accessible_bases = {}
chrom_length = {}

with open('data/genomes/Smansoni_v7.fa.fai', 'r') as fai:
    for entry in fai:
        chrom, length, *offset = entry.rstrip().split("\t")
        chrom_length[chrom] = int(length)
        accessible_bases[chrom]=[False] * int(length)


#now read the bed
with open('data/renamed-sma_agilent_baits.v7.0.chr_reorderd.bed', 'r') as in_bed_file:
    for bed_entry in in_bed_file:
        chrom, start, stop = bed_entry.rstrip().split("\t")
        for base in range(int(start) - 1, int(stop)):
             accessible_bases[chrom][base]=True


## How many SNPs are segregating on Chr 1 per pop

In [90]:
for pop in ["tanzania", "brazil", "senegal", "niger"]:
    seg=np.sum(pop_ac[pop].is_segregating())
    chr1_seg=np.sum(pop_ac[pop][0:162218].is_segregating())
    print("{} chr1={} total={}".format(pop, chr1_seg, seg))

tanzania chr1=29765 total=98136
brazil chr1=8947 total=33042
senegal chr1=9381 total=35593
niger chr1=6107 total=24017


# Pi, Tajima's D, Theta, Ne

In [45]:
for pop in ["rodhaini", "brazil", "niger", "senegal", "tanzania"]:
    accessible_genome_size = 0
    pi_s=[]
    td_s=[]
    theta_s=[]
    mu=8.1e-9

    #now loop through each chromosome
    for chrom in list(set(filtered_callset['variants/CHROM'])) :
        target_sites = filtered_callset['variants/CHROM'] == chrom

     
        chr_poss = filtered_callset['variants/POS'][target_sites]
        chr_acs  = pop_ac[pop][target_sites]
        chr_len  = len(accessible_bases[chrom])
        

        chr_pi    = allel.sequence_diversity(chr_poss, chr_acs, start=1, stop=chr_len, is_accessible=accessible_bases[chrom])
        chr_theta = allel.watterson_theta(chr_poss, chr_acs, is_accessible=accessible_bases[chrom])
        chr_td    = allel.tajima_d(chr_acs, pos=chr_poss, start=1, stop=chr_len, min_sites=3)


        num_accessible_bases = sum(accessible_bases[chrom])
        theta_s += [chr_theta] * num_accessible_bases
        pi_s    += [chr_pi]    * num_accessible_bases
        td_s    += [chr_td]    * num_accessible_bases

        accessible_genome_size += num_accessible_bases

    pi    = np.mean(pi_s)
    td    = np.mean(td_s)
    theta = np.mean(theta_s)
    ne = theta/(4 * mu)

    outline="{}:{}\t{}\t{}\t{}".format(pop, pi, td, theta, ne)
    print(outline)

rodhaini:0.000482784736658036	0.4786576179515533	0.0004285068495203049	13225.520046922991
brazil:0.0006229634190808082	0.22480080261256646	0.0005842378015352539	18032.03091158191
niger:0.0005282391905747682	-0.5633207858097302	0.000607062748979703	18736.50459813898
senegal:0.000452726735843687	-1.4171249923020972	0.0007125456426101054	21992.149463274858
tanzania:0.0013144180653959314	-0.7287596992860278	0.0016688696510230978	51508.32256244129


# Fst

## genome-wide Fst

In [9]:
pops = [ 'rodhaini', 'caribbean', 'tanzania',
         'senegal',  'niger',     'brazil' ]

pop_combs=list(itertools.combinations(pops, 2)) 

with open("results/fst/genome_wide_fst.csv", 'w') as out_csv:
    out_csv.write("pop1,pop2,fst,se\n")
    print("pop1\tpop2\tfst\tse")

    for pop1, pop2 in pop_combs:

        idxs=[pop_idxs[pop1], pop_idxs[pop2]]

        #allel.average_weir_cockerham_fst(g, subpops, blen, max_allele=None)
        fst, se, block_fsts, jacknife_fsts = allel.average_weir_cockerham_fst(gt, idxs, 100)

        outline="{}\t{}\t{}\t{}".format(pop1, pop2, fst, se)
        print(outline)
        
        out_csv.write(outline.replace("\t", ",")+"\n")

pop1	pop2	fst	SE
rodhaini	caribbean	0.928506200479594	0.0012228434103421389
rodhaini	tanzania	0.8439788473873892	0.001616842545372161
rodhaini	senegal	0.9370016347766583	0.001296004539097002
rodhaini	niger	0.9305794094480899	0.0011327164324128835
rodhaini	brazil	0.9189907169217962	0.001343597256193136
caribbean	tanzania	0.2789521159568736	0.0036073079664987
caribbean	senegal	0.32307932601975586	0.008473085940511469
caribbean	niger	0.23645993744180765	0.00707938775077595
caribbean	brazil	0.15369549738924515	0.006733044751663753
tanzania	senegal	0.41553169413628227	0.00321957992128926
tanzania	niger	0.3478709508995357	0.0031245927121814343
tanzania	brazil	0.3790452130775332	0.0034260999716080004
senegal	niger	0.13546469500849986	0.0042380948004846425
senegal	brazil	0.23465523710207586	0.0046503734975993725
niger	brazil	0.15190265241779138	0.0036320702629908355


## sliding window Fst-UNUSED

In [None]:


os.chdir("/master/nplatt/sch_man_nwinvasion")

if not os.path.exists("results/fst_per_window"):
    os.mkdir("results/fst_per_window")
    
with open('data/pop_assign.yml') as yaml_file:
    pop_assign = yaml.load(yaml_file, Loader=yaml.FullLoader)

#-----------------------------------
# get lengths from cumul positions
#make sure that all stops are not gt chrom length
chr_length = {}
#genome_size = 0
with open('/master/nplatt/sch_man_nwinvasion/data/genomes/Smansoni_v7.fa.fai', 'r') as fai:
    for entry in fai:
        chrom, length, *offset = entry.rstrip().split("\t")
        chr_length[chrom]=int(length)

    cumul_start={}
    cumul_start['SM_V7_1']=0
    cumul_start['SM_V7_2']= cumul_start['SM_V7_1'] + chr_length['SM_V7_1']
    cumul_start['SM_V7_3']= cumul_start['SM_V7_2'] + chr_length['SM_V7_2']
    cumul_start['SM_V7_4']= cumul_start['SM_V7_3'] + chr_length['SM_V7_3']
    cumul_start['SM_V7_5']= cumul_start['SM_V7_4'] + chr_length['SM_V7_4']
    cumul_start['SM_V7_6']= cumul_start['SM_V7_5'] + chr_length['SM_V7_5']
    cumul_start['SM_V7_7']= cumul_start['SM_V7_6'] + chr_length['SM_V7_6']
    scanned_size = cumul_start['SM_V7_7'] + chr_length['SM_V7_7']

#-------------------------------------------------------------------------------
# get genotype info per population

#read in vcf
callset=allel.read_vcf('results/variant_filtration/smv7_ex_autosomes.vcf')

#now get an index for each sample/population
samples = callset["samples"]

i=0 
pop_idxs = defaultdict(list)   
for sample in samples:  
     pop_idxs[pop_assign[sample]].append(i) 
     i=i+1 

pops= list(pop_idxs.keys()) 

#get genotypes
gt=allel.GenotypeArray(callset['calldata/GT'])

#now get allele count per population
ac=gt.count_alleles()

#for simplicity add maf info to callset data
maf=ac[:, :2].min(axis=1)/ac[:, :2].sum(axis=1)
callset['maf']=maf 

pop_ac={}
for pop in pops:
    pop_ac[pop] = gt.count_alleles(subpop=pop_idxs[pop])
    
#-------------------------------------------------------------------------------
#generate windows
window=100_000

#define an array of window start and stops
window_starts = [int(x - (window/2)) for x in callset['variants/POS']]
window_stops  = [int(x + (window/2)) for x in callset['variants/POS']]

#make sure that window starts are all gt 1
window_starts = [1 if i < 1 else i for i in window_starts]


#make sure that all stops are not gt chrom length
chr_length = {}
#genome_size = 0
with open('/master/nplatt/sch_man_nwinvasion/data/genomes/Smansoni_v7.fa.fai', 'r') as fai:
    for entry in fai:
        chrom, length, *offset = entry.rstrip().split("\t")
        chr_length[chrom]=int(length)
        #genome_size = genome_size + chr_length[chrom]
    
i=0
for stop in window_stops:
    chrom = callset['variants/CHROM'][i]
    
    if stop > chr_length[chrom]:
        window_stops[i]=chr_length[chrom]
    i=i+1
    
windows = np.column_stack((np.array(window_starts), 
                           np.array(window_stops)))

callset['windows']=windows

In [None]:
#-------------------------------------------------------------------------------
# fst calculations

pops = ["brazil", "tanzania", "niger", "senegal" ]

idx_comps = {"brazil":   [pop_idxs["brazil"],   pop_idxs["tanzania"] + pop_idxs["niger"]  + pop_idxs["senegal"] ],
             "tanzania": [pop_idxs["tanzania"], pop_idxs["brazil"]   + pop_idxs["niger"]  + pop_idxs["senegal"] ],
             "niger":    [pop_idxs["niger"],    pop_idxs["tanzania"] + pop_idxs["brazil"] + pop_idxs["senegal"] ],
             "senegal":  [pop_idxs["senegal"],  pop_idxs["tanzania"] + pop_idxs["niger"]  + pop_idxs["brazil"] ]}

#make comparisons between population
for pop in idx_comps.keys():
    print(pop)
    pop1_idx = idx_comps[pop][0]
    pop2_idx = idx_comps[pop][1]

    fst_s             = []
    fst_calc_window_s = []
    fst_count_s       = []

    #create empty dataframe to store data    
    headers = ["chrom", "pos", "fst", "smoothed_fst", "window", "num_snps", "zscore", "pvalue", "-log10(p)"]
    df=pd.DataFrame(columns=headers) 

    #now loop through each chromosome
    for chrom in list(set(callset['variants/CHROM'])) :
        target_sites = np.logical_and( callset['maf'] < 0.05, 
                                       callset['variants/CHROM'] == chrom )  

        chr_gts  = gt[target_sites]
        chr_poss = callset['variants/POS'][target_sites]
        chr_wins = callset['windows'][target_sites]

        
        fsts, fst_calc_windows, fst_counts =allel.windowed_weir_cockerham_fst(chr_poss, chr_gts, subpops=[pop1_idx, pop2_idx], windows=chr_wins )

        #get rid of nan values
        useful_values = np.logical_and( np.isfinite(fsts), fst_counts>=10) 

        fsts = fsts[useful_values]
        fst_calc_windows = fst_calc_windows[useful_values]
        fst_counts = fst_counts[useful_values]
        chr_poss = chr_poss[useful_values]

        #set negative fst values to 0
        i=0
        for fst in fsts:
            if fst <0:
                fsts[i]=0
            i=i+1        
        
        #smooth
        smoothed_fsts=signal.medfilt(fsts, kernel_size = 101)

        #add data to dataframe/table
        data = list(zip([chrom]*len(fsts), chr_poss, fsts, smoothed_fsts, fst_calc_windows, fst_counts))
        chr_df=pd.DataFrame(data, columns=headers)
        df = df.append(chr_df)

    #add cumul positions
    fig_x_pos_s=[]
    for index, row in df.iterrows(): 
        fig_x_pos_s.append(int(row["pos"]) + int(cumul_start[row['chrom']]))

    df['fig_x_pos']=fig_x_pos_s

    #save data to csv file
    csv_file = "./results/fst_per_window/{}_vs_all_windowed_fst.csv".format(pop)
    df = df.sort_values(["fig_x_pos"], ascending = True)
    df.to_csv(csv_file, index=False, header=True, mode='w')

# F3

In [11]:
#allel.average_patterson_f3(acc, aca, acb, blen, normed=True)[source]

dict_keys(['samples', 'calldata/GT', 'variants/ALT', 'variants/CHROM', 'variants/FILTER_PASS', 'variants/ID', 'variants/POS', 'variants/QUAL', 'variants/REF'])

In [7]:
allel.average_patterson_f3(pop_ac["brazil"],
                           pop_ac["rodhaini"],
                           pop_ac["tanzania"],
                           blen=100,
                           normed=True)


  x = (ac[:, 0] * ac[:, 1]) / (an * (an - 1))
  vb = T_bsum / B_bsum
  vb = T_bsum / B_bsum


(0.7642687717897145,
 0.02070652497665151,
 36.90956221053494,
 array([2.30770842, 0.38621328, 2.60482947, ..., 2.16391042, 2.00865931,
        0.5053442 ]),
 array([0.76363462, 0.76462773, 0.76375805, ..., 0.76407041, 0.76402817,
        0.76435461]))

In [8]:
allel.average_patterson_f3(pop_ac["niger"],
                           pop_ac["rodhaini"],
                           pop_ac["tanzania"],
                           blen=100,
                           normed=True)

(0.99263110166744,
 0.029885483308429644,
 33.21449050775276,
 array([1.87430761, 0.42550039, 1.06493091, ..., 1.74566325, 8.69340121,
        0.45554722]),
 array([0.99205936, 0.99310809, 0.9925732 , ..., 0.99246997, 0.99211768,
        0.9928036 ]))

In [9]:
allel.average_patterson_f3(pop_ac["senegal"],
                           pop_ac["rodhaini"],
                           pop_ac["tanzania"],
                           blen=100,
                           normed=True)

(1.2648039000192297,
 0.046112799000395034,
 27.42847815437086,
 array([2.30182821, 1.01034066, 7.12026285, ..., 1.39569625, 2.63755556,
        1.16257658]),
 array([1.26427968, 1.26501592, 1.26402984, ..., 1.26485081, 1.26458232,
        1.26490291]))

# D; ABBA BABA

## genome wide

In [22]:
#allel.average_patterson_d(aca, acb, acc, acd, blen)
allel.average_patterson_d(pop_ac["brazil"],
                          pop_ac["tanzania"],
                          pop_ac["rodhaini"],
                          pop_ac["margrebowiei"],
                          blen=1000)

(-0.03261718592569776,
 0.04040435124698111,
 -0.8072691410466545,
 array([-8.82594616e-01, -2.47966235e-01, -3.76546254e-01, -2.63829334e-01,
        -8.64685364e-01,  3.29493488e-01,  1.00000000e+00, -4.61564271e-01,
        -4.96106730e-01, -1.11290935e-01, -4.01299276e-01,  3.51693121e-01,
        -6.66369350e-01, -1.62442320e-01,  1.22037184e-02, -3.21706266e-01,
        -9.37888965e-02,  1.19647107e-01, -8.35101776e-01, -7.50273158e-01,
        -8.92564786e-01,  6.01474933e-01,  1.17053499e-01, -2.57864285e-01,
        -3.46094358e-01, -8.03869738e-02, -2.44462173e-01, -6.75453628e-01,
        -3.74648131e-01, -4.64865059e-02,  6.08967083e-01, -6.69769474e-01,
        -3.01681538e-01,  2.70398336e-01,  3.75302705e-02,  5.79912225e-01,
        -1.04846105e-01, -9.68276941e-01, -1.78855052e-01, -2.89364375e-01,
        -3.01062356e-01, -5.25293250e-01,  1.91914917e-01,  3.50985983e-01,
         6.95302506e-01,  5.83909719e-01, -2.78103353e-03,  6.80119450e-01,
         5.90558287e-

In [23]:
allel.average_patterson_d(pop_ac["niger"],
                          pop_ac["tanzania"],
                          pop_ac["rodhaini"],
                          pop_ac["margrebowiei"],
                          blen=1000)

(0.005640941265044407,
 0.04372255749596241,
 0.12901672701934974,
 array([-6.81917479e-01, -3.85718949e-01, -3.14518210e-01, -1.48157010e-01,
        -7.26049583e-01, -7.48819544e-02,  1.00000000e+00, -4.76431986e-01,
        -4.55424532e-01,  7.70122384e-02,  1.43218693e-01,  5.50476508e-01,
        -6.57157674e-01, -1.70171662e-01,  2.27459140e-01, -3.07161523e-01,
        -8.09768024e-02, -2.33823000e-03, -8.35834344e-01, -7.07126413e-01,
        -8.90220088e-01,  4.65090694e-01,  5.94384452e-02, -3.35107045e-01,
        -1.78654594e-01,  8.73872130e-02, -6.75157654e-02, -7.24413685e-01,
        -1.53511725e-01, -2.05955403e-01,  6.08967083e-01, -6.46493549e-01,
        -4.15855446e-02,  3.07765389e-01, -1.43415159e-01, -5.69425287e-01,
         2.14343956e-02, -9.68151000e-01, -2.19293184e-01, -3.19503789e-01,
        -3.64196420e-01, -7.63703966e-01,  4.48711792e-01,  5.35479304e-01,
         9.09525757e-01,  9.38213406e-01,  1.83753096e-01,  9.74067439e-01,
         6.18728912e-

In [24]:
allel.average_patterson_d(pop_ac["senegal"],
                          pop_ac["tanzania"],
                          pop_ac["rodhaini"],
                          pop_ac["margrebowiei"],
                          blen=1000)

(0.05059992689029783,
 0.047304361492924364,
 1.069667263088762,
 array([-0.89406293, -0.41463584, -0.30774139, -0.16139064, -0.75768364,
         0.44104235,  0.81960111, -0.46710038, -0.38152435,  0.1597514 ,
         0.65115171,  0.48547131, -0.65927959, -0.21034687,  0.40279166,
         0.1466188 , -0.04446779,  0.09214502, -0.82464819, -0.61686142,
        -0.91202416,  0.41720816,  0.01416276, -0.2812765 , -0.06709415,
        -0.0066766 , -0.23985181, -0.82531992, -0.13169282, -0.29448431,
        -0.19772801, -0.70027812,  0.16975618,  0.13907246, -0.21881032,
        -0.57208123,  0.0214344 , -0.94780753, -0.1966875 , -0.07814018,
        -0.43446418, -0.79269509,  0.33335328,  0.71104633,  0.89928786,
         0.95764246, -0.16487126,  0.84891221,  0.46558885, -0.77476284,
        -0.44066063, -0.03270511, -0.78116302, -0.74541501, -0.12114947,
         0.57997731,  0.35401737,  0.92859108,  0.46336566, -0.81648348,
        -0.27947332,  0.08839017,  0.66815843, -0.73577249,

## at region with high rod alleles from loter

In [53]:
#get variants from region: SM_V7_2:31,708,474-32,212,727
x=np.logical_and( filtered_callset['variants/POS'] >= 31_708_474,
                  filtered_callset['variants/POS'] <= 32_212_727)
y=np.logical_and(x,
                 filtered_callset['variants/CHROM'] == "SM_V7_2")



num, den = allel.patterson_d(pop_ac["brazil"][y],
                             pop_ac["tanzania"][y],
                             pop_ac["rodhaini"][y],
                             pop_ac["margrebowiei"][y])

In [65]:
num=num[~np.isnan(num)]
den=den[~np.isnan(den)]

In [82]:
target_d=num.sum()/den.sum()
target_d

-0.8712250892015296

In [69]:
d, se, z, d_block, n_block = allel.average_patterson_d(pop_ac["brazil"],
                                                       pop_ac["tanzania"],
                                                       pop_ac["rodhaini"],
                                                       pop_ac["margrebowiei"],
                                                       blen=1000)


In [81]:
d_block.mean()

-0.016875737747302933

In [76]:
stats.percentileofscore(d_block, target_d, kind="rank")

2.9473684210526314