In [5]:
import os
import shutil
import vcf
import re
import gzip
import pandas as pd

from IPython.display import Image
from Bio import SeqIO


import rpy2.ipython


%load_ext rpy2.ipython

os.chdir("/master/nplatt/sch_man_nwinvasion")

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


# Linkage Disequilibrium

In [153]:
#make lists for each population
if not os.path.isdir("results/ld"):
    os.mkdir("results/ld")

ld_pops=["east_africa", "west_africa", "new_world"]

for pop in ld_pops:
    list_file="results/ld/" + pop + ".list"
    with open(list_file, 'w') as filehandle:  
        filehandle.writelines("%s\n" % sample for sample in pop_ids[pop])

In [155]:
%%bash

#get pop specific VCF files
for POP in east_africa west_africa new_world; do
    if [ ! -f results/ld/schMan_v7_"$POP"_ld_dist.tbl ]; then
        rm results/ld/schMan_v7_"$POP"_ld_dist.tbl
    fi
    
    for CHR in "1" "2" "3" "4" "5" "6" "7"; do
    
        CHR="SM_V7_$CHR"
    
        #get pop specific chr vcf
   
        vcftools \
            --vcf results/variant_filtration/smv7_ex_autosomes.vcf \
            --chr $CHR \
            --keep results/ld/$POP.list \
            --recode \
            --recode-INFO-all \
            --stdout \
            >results/ld/smv7_ex_autosomes_"$POP"_"$CHR".vcf
    
        plink \
            --threads 6 \
            --vcf results/ld/smv7_ex_autosomes_"$POP"_"$CHR".vcf \
            --out results/ld/smv7_ex_autosomes_"$POP"_"$CHR" \
            --double-id\
            --recode12 \
            --allow-extra-chr

        #calculate R2 between all snps on a chr
        plink \
            --threads 6 \
            --r2 \
            --file results/ld/smv7_ex_autosomes_"$POP"_"$CHR" \
            --out results/ld/smv7_ex_autosomes_"$POP"_"$CHR" \
            --double-id \
            --allow-extra-chr \
            --ld-window-r2 0.0 \
            --ld-window 1000000 \
            --ld-window-kb 90000
            
        #calc distance and generate a single table
        awk '{print $0"\t"$5-$2}' \
            results/ld/smv7_ex_autosomes_"$POP"_"$CHR".ld \
            >>results/ld/smv7_ex_autosomes__"$POP"_ld_dist.tbl
                
        #clean up
        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".ld
        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".map
        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".ped
        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".log
        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".nosex
        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".vcf

    done
    
        #remove the header line from the table (duplicated from each chrom)
        sed -i -e '1p' -e '/CHR/d' results/ld/smv7_ex_autosomes_"$POP"_ld_dist.tbl 
        sed -i -e '1s/0/BP_DISTANCE/' results/ld/smv7_ex_autosomes_"$POP"_ld_dist.tbl
        
        #create a file with distances lt 500kb
        awk '{if ($8 <= 500000) print $0}' results/ld/smv7_ex_autosomes_"$POP"_ld_dist.tbl \
            >results/ld/smv7_ex_autosomes_"$POP"_ld_dist_lt500kb.tbl
done

PLINK v1.90b4 64-bit (20 Mar 2017)             www.cog-genomics.org/plink/1.9/
(C) 2005-2017 Shaun Purcell, Christopher Chang   GNU General Public License v3
Note: --recode12 flag deprecated.  Use 'recode 12 ...'.
Logging to results/ld/smv7_ex_autosomes_east_africa_SM_V7_1.log.
Options in effect:
  --allow-extra-chr
  --double-id
  --out results/ld/smv7_ex_autosomes_east_africa_SM_V7_1
  --recode 12
  --threads 6
  --vcf results/ld/smv7_ex_autosomes_east_africa_SM_V7_1.vcf

24158 MB RAM detected; reserving 12079 MB for main workspace.
--vcf: 1k variants complete.--vcf: 2k variants complete.--vcf: 3k variants complete.--vcf: 4k variants complete.--vcf: 5k variants complete.--vcf: 6k variants complete.--vcf: 7k variants complete.--vcf: 8k variants complete.--vcf: 9k variants complete.--vcf: 10k variants complete.--vcf: 11k variants complete.--vcf: 12k variants complete.--vcf: 13k variants complete.--vcf: 14k variants complete.--vcf: 15k variants complete.--vcf: 16k varian

rm: cannot remove `results/ld/schMan_v7_east_africa_ld_dist.tbl': No such file or directory

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf results/variant_filtration/smv7_ex_autosomes.vcf
	--chr SM_V7_1
	--keep results/ld/east_africa.list
	--recode-INFO-all
	--recode
	--stdout

Keeping individuals in 'keep' list
After filtering, kept 58 out of 156 Individuals
Outputting VCF file...
After filtering, kept 162219 out of a possible 475081 Sites
Run Time = 91.00 seconds

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf results/variant_filtration/smv7_ex_autosomes.vcf
	--chr SM_V7_2
	--keep results/ld/east_africa.list
	--recode-INFO-all
	--recode
	--stdout

Keeping individuals in 'keep' list
After filtering, kept 58 out of 156 Individuals
Outputting VCF file...
After filtering, kept 78902 out of a possible 475081 Sites
Run Time = 78.00 seconds

VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta

CalledProcessError: Command 'b'\n#get pop specific VCF files\nfor POP in east_africa west_africa new_world; do\n    if [ ! -f results/ld/schMan_v7_"$POP"_ld_dist.tbl ]; then\n        rm results/ld/schMan_v7_"$POP"_ld_dist.tbl\n    fi\n    \n    for CHR in "1" "2" "3" "4" "5" "6" "7"; do\n    \n        CHR="SM_V7_$CHR"\n    \n        #get pop specific chr vcf\n   \n        vcftools \\\n            --vcf results/variant_filtration/smv7_ex_autosomes.vcf \\\n            --chr $CHR \\\n            --keep results/ld/$POP.list \\\n            --recode \\\n            --recode-INFO-all \\\n            --stdout \\\n            >results/ld/smv7_ex_autosomes_"$POP"_"$CHR".vcf\n    \n        plink \\\n            --threads 6 \\\n            --vcf results/ld/smv7_ex_autosomes_"$POP"_"$CHR".vcf \\\n            --out results/ld/smv7_ex_autosomes_"$POP"_"$CHR" \\\n            --double-id\\\n            --recode12 \\\n            --allow-extra-chr\n\n        #calculate R2 between all snps on a chr\n        plink \\\n            --threads 6 \\\n            --r2 \\\n            --file results/ld/smv7_ex_autosomes_"$POP"_"$CHR" \\\n            --out results/ld/smv7_ex_autosomes_"$POP"_"$CHR" \\\n            --double-id \\\n            --allow-extra-chr \\\n            --ld-window-r2 0.0 \\\n            --ld-window 1000000 \\\n            --ld-window-kb 90000\n            \n        #calc distance and generate a single table\n        awk \'{print $0"\\t"$5-$2}\' \\\n            results/ld/smv7_ex_autosomes_"$POP"_"$CHR".ld \\\n            >>results/ld/smv7_ex_autosomes__"$POP"_ld_dist.tbl\n                \n        #clean up\n        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".ld\n        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".map\n        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".ped\n        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".log\n        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".nosex\n        rm results/ld/smv7_ex_autosomes_"$POP"_"$CHR".vcf\n\n    done\n    \n        #remove the header line from the table (duplicated from each chrom)\n        sed -i -e \'1p\' -e \'/CHR/d\' results/ld/smv7_ex_autosomes_"$POP"_ld_dist.tbl \n        sed -i -e \'1s/0/BP_DISTANCE/\' results/ld/smv7_ex_autosomes_"$POP"_ld_dist.tbl\n        \n        #create a file with distances lt 500kb\n        awk \'{if ($8 <= 500000) print $0}\' results/ld/smv7_ex_autosomes_"$POP"_ld_dist.tbl \\\n            >results/ld/smv7_ex_autosomes_"$POP"_ld_dist_lt500kb.tbl\ndone\n'' returned non-zero exit status 2.

In [None]:
%%R
#in R generate summary stats for bins and plot

#500bp bins for 500kb
breaks <- seq( 0, 5e5, 500)

centers   <- vector()
means     <- vector()
pops      <- vector()
smootheds <- vector()

for (pop in c("east_africa", "west_africa", "new_world",)) {
    
    #read in lt 500kb r2 table from vcftools
    ld_table <- read.table(paste("results/ld/schMan_v7_maf05_", pop, "_ld_dist_lt500kb.tbl", sep=""), 
                           header=FALSE)
    
    #bin r2 values and calculate stats
    ld_binned <- stats.bin(ld_table$V8, ld_table$V7, breaks = breaks)
    
    #created regression line
    loessMod  <- loess(ld_binned$stats["mean",] ~ ld_binned$centers, span=0.50)
    smoothed  <- predict(loessMod)
    
    #append all data to vectors
    centers   <- append(centers, ld_binned$centers)
    means     <- append(means, ld_binned$stats["mean",])
    pops      <- append(pops, rep(pop, length(ld_binned$centers)))
    smootheds <- append(smootheds, smoothed)
}    

#build the dataframe and save to csv
r2_df <- data.frame(centers, means, pops, smootheds)
write.csv(r2_df, file = paste("results/ld/ld_dist_lt500kb_binned_smoothed.csv", sep="") ,row.names=FALSE)

#subset desired populations
major_groups <- subset(r2_df, pops == "east_africa" | 
                              pops == "west_africa" |
                              pops == "new_world" ) 



In [None]:
%%R

pop_colors <- c("eafrica"   = "green",
                "oman"      = "yellow", 
                "wafrica"   = "red",
                "brazil_x"  = "purple",
                "caribbean" = "blue")

#start plotting
p <- ggplot(major_groups, aes(x     = centers, 
                              y     = means, 
                              color = pops))

#adjust colors
p <- p + scale_colour_manual(values = pop_colors)

#plot data
p <- p + geom_point(alpha = 0.0)

#smooth data with loess
p <- p + geom_smooth(span   = 0.75, 
                     method = "loess", 
                     lwd    = 1, 
                     se     = FALSE)

#modify x and y axis
p <- p + scale_x_continuous(name   = "Distance between SNPs (Kb)", 
                            breaks = seq(0,500000,100000), 
                            labels = c("0", "100", "200", "300", "400", "500"), 
                            expand = c(0,0),
                            )
p <- p + scale_y_continuous(name   = "Mean R2", 
                            expand = c(0,0), 
                            limits = c(0,1))

#change theme
p <- p + theme_bw()

#change fonts on axis elements and titles
p <- p + theme(axis.text  = element_text(size = 12),
               axis.title = element_text(size = 14,
                                         face = "bold"))

#removing gridlines
p <- p + theme(panel.grid.major = element_blank(), 
               panel.grid.minor = element_blank())

#set up plot title etc.
p <- p + ggtitle("LD Decay")
p <- p + theme(plot.title = element_text(hjust = 0.5, 
                                         vjust = 0.5, 
                                         face  = 'bold', 
                                          size  = 18))

#modify legend
p <- p + theme(legend.title         = element_text(size = 14, 
                                                   face = "bold"),
               legend.text           = element_text(size = 12),
               legend.position       = c(0.85, 0.85),
               legend.box.background = element_rect(colour = "black"))
p <- p + labs(col = "Population")
p <- p + scale_color_manual(labels = c("Brazil",
                                       "Caribbean",
                                       "E. Africa",
                                       "Oman",
                                       "W. Africa" ), 
                            values = c(pop_colors["brazil_x"], 
                                       pop_colors["caribbean"], 
                                       pop_colors["eafrica"],
                                       pop_colors["oman"],
                                       pop_colors["wafrica"] ))

#save the figure
svg_img <- "results/ld/ld_decay.svg"
png_img <- "results/ld/ld_decay.png"
ggsave(png_img, plot = p, dpi = 600)
ggsave(svg_img, plot = p)


#display in notebook
print(p)

#### Bayescan

In [4]:
import os
from os import path
import shutil

#make a clean dir
#if os.path.isdir("results/bayescan"):
#    shutil.rmtree("results/bayescan")

#os.mkdir("results/bayescan")

bayescan_pop_assignments = {
    "Sm.BR_PdV.0447.1"        : "new_world",
    "Sm.BR_PdV.1039.1"        : "new_world",
    "Sm.BR_PdV.1079.1"        : "new_world",
    "Sm.BR_PdV.1094.1"        : "new_world",
    "Sm.BR_PdV.1103.1"        : "new_world",
    "Sm.BR_PdV.1127.1"        : "new_world",
    "Sm.BR_PdV.1278.1"        : "new_world",
    "Sm.BR_PdV.1340.1"        : "new_world",
    "Sm.BR_PdV.1340.2"        : "new_world",
    "Sm.BR_PdV.1371.1"        : "new_world",
    "Sm.BR_PdV.1404.1"        : "new_world",
    "Sm.BR_PdV.1409.1"        : "new_world",
    "Sm.BR_PdV.1418.1"        : "new_world",
    "Sm.BR_PdV.1475.1"        : "new_world",
    "Sm.BR_PdV.1489.1"        : "new_world",
    "Sm.BR_PdV.2039.1"        : "new_world",
    "Sm.BR_PdV.2039.2"        : "new_world",
    "Sm.BR_PdV.2072.1"        : "new_world",
    "Sm.BR_PdV.2074.1"        : "new_world",
    "Sm.BR_PdV.2076.1"        : "new_world",
    "Sm.BR_PdV.2133.1"        : "new_world",
    "Sm.BR_PdV.2147.1"        : "new_world",
    "Sm.BR_PdV.2189.1"        : "new_world",
    "Sm.BR_PdV.2196.2"        : "new_world",
    "Sm.BR_PdV.2225.1"        : "new_world",
    "Sm.BR_PdV.2227.1"        : "new_world",
    "Sm.BR_PdV.2265.1"        : "new_world",
    "Sm.BR_PdV.2290.1"        : "new_world",
    "Sm.BR_PdV.2300.1"        : "new_world",
    "Sm.BR_PdV.2334.1"        : "new_world",
    "Sm.BR_PdV.2368.1"        : "new_world",
    "Sm.BR_PdV.2406.1"        : "new_world",
    "Sm.BR_PdV.2422.1"        : "new_world",
    "Sm.BR_PdV.2450.1"        : "new_world",
    "Sm.BR_PdV.2456.1"        : "new_world",
    "Sm.BR_PdV.2481.1"        : "new_world",
    "Sm.BR_PdV.2489.1"        : "new_world",
    "Sm.BR_PdV.2508.1"        : "new_world",
    "Sm.BR_PdV.2516.2"        : "new_world",
    "Sm.BR_PdV.2530.1"        : "new_world",
    "Sm.BR_PdV.2538.1"        : "new_world",
    "Sm.BR_PdV.2546.1"        : "new_world",
    "Sm.BR_PdV.2556.1"        : "new_world",
    "Sm.BR_PdV.2577.1"        : "new_world",
    "Sm.BR_PdV.4293.2"        : "new_world",
    "Sm.NE_Di158.1"           : "west_africa",
    "Sm.NE_Di186.1"           : "west_africa",
    "Sm.NE_Di238.1"           : "west_africa",
    "Sm.NE_Di297.1"           : "west_africa",
    "Sm.NE_Di297.2"           : "west_africa",
    "Sm.NE_Di68.2"            : "west_africa",
    "Sm.NE_Na376.2"           : "west_africa",
    "Sm.NE_Na381.1"           : "west_africa",
    "Sm.NE_Na39.1"            : "west_africa",
    "Sm.NE_Na40.1"            : "west_africa",
    "Sm.SN_Nd103.1"           : "west_africa",
    "Sm.SN_Nd109.1"           : "west_africa",
    "Sm.SN_Nd114.1"           : "west_africa",
    "Sm.SN_Nd115.1"           : "west_africa",
    "Sm.SN_Nd18.1"            : "west_africa",
    "Sm.SN_Nd22.1"            : "west_africa",
    "Sm.SN_Nd24.1"            : "west_africa",
    "Sm.SN_Nd25.1"            : "west_africa",
    "Sm.SN_Nd34.1"            : "west_africa",
    "Sm.SN_Nd43.1"            : "west_africa",
    "Sm.SN_Nd47.1"            : "west_africa",
    "Sm.SN_Nd5.1"             : "west_africa",
    "Sm.SN_Nd5.2"             : "west_africa",
    "Sm.SN_Nd50.1"            : "west_africa",
    "Sm.SN_Nd54.1"            : "west_africa",
    "Sm.SN_Nd56.1"            : "west_africa",
    "Sm.SN_Nd77.1"            : "west_africa",
    "Sm.SN_Nd79.1"            : "west_africa",
    "Sm.SN_Nd9.1"             : "west_africa",
    "Sm.SN_Nd90.1"            : "west_africa",
    "Sm.SN_Te26.1"            : "west_africa",
    "Sm.SN_Te3.1"             : "west_africa",
    "Sm.SN_Te49.1"            : "west_africa",
    "Sm.SN_Te55.1"            : "west_africa",
    "Sm.SN_Te68.1"            : "west_africa",
    "Sm.TZ_009.1.1"           : "east_africa",
    "Sm.TZ_009.10.1"          : "east_africa",
    "Sm.TZ_009.2.2"           : "east_africa",
    "Sm.TZ_009.3.1"           : "east_africa",
    "Sm.TZ_009.4.2"           : "east_africa",
    "Sm.TZ_009.5.2"           : "east_africa",
    "Sm.TZ_009.6.1"           : "east_africa",
    "Sm.TZ_009.7.1"           : "east_africa",
    "Sm.TZ_009.8.2"           : "east_africa",
    "Sm.TZ_009.9.1"           : "east_africa",
    "Sm.TZ_055.1.3"           : "east_africa",
    "Sm.TZ_055.10.1"          : "east_africa",
    "Sm.TZ_055.2.1"           : "east_africa",
    "Sm.TZ_055.3.2"           : "east_africa",
    "Sm.TZ_055.5.1"           : "east_africa",
    "Sm.TZ_055.6.1"           : "east_africa",
    "Sm.TZ_055.7.1"           : "east_africa",
    "Sm.TZ_055.8.1"           : "east_africa",
    "Sm.TZ_074N.1.2"          : "east_africa",
    "Sm.TZ_074N.10.2"         : "east_africa",
    "Sm.TZ_074N.2.2"          : "east_africa",
    "Sm.TZ_074N.3.2"          : "east_africa",
    "Sm.TZ_074N.4.1"          : "east_africa",
    "Sm.TZ_074N.6.3"          : "east_africa",
    "Sm.TZ_074N.7.2"          : "east_africa",
    "Sm.TZ_074N.8.1"          : "east_africa",
    "Sm.TZ_074N.9.1"          : "east_africa",
    "Sm.TZ_077.2.1"           : "east_africa",
    "Sm.TZ_077.3.1"           : "east_africa",
    "Sm.TZ_077.4.2"           : "east_africa",
    "Sm.TZ_077.5.1"           : "east_africa",
    "Sm.TZ_077.6.1"           : "east_africa",
    "Sm.TZ_077.7.3"           : "east_africa",
    "Sm.TZ_077.8.1"           : "east_africa",
    "Sm.TZ_077.9.2"           : "east_africa",
    "Sm.TZ_086.1.1"           : "east_africa",
    "Sm.TZ_086.2.3"           : "east_africa",
    "Sm.TZ_086.3.1"           : "east_africa",
    "Sm.TZ_086.4.2"           : "east_africa",
    "Sm.TZ_086.5.1"           : "east_africa",
    "Sm.TZ_086.6.1"           : "east_africa",
    "Sm.TZ_086.7.1"           : "east_africa",
    "Sm.TZ_086.8.1"           : "east_africa",
    "Sm.TZ_134.1.1"           : "east_africa",
    "Sm.TZ_134.2.2"           : "east_africa",
    "Sm.TZ_134.4.1"           : "east_africa",
    "Sm.TZ_134.5.1"           : "east_africa",
    "Sm.TZ_134.6.1"           : "east_africa",
    "Sm.TZ_141.1.1"           : "east_africa",
    "Sm.TZ_141.3.1"           : "east_africa",
    "Sm.TZ_141.4.2"           : "east_africa",
    "Sm.TZ_141.5.3"           : "east_africa",
    "Sm.TZ_141.6.1"           : "east_africa",
    "Sm.TZ_141.7.1"           : "east_africa",
    "Sm.TZ_141.8.1"           : "east_africa" }


#create files with lists of samples per population
for sample in bayescan_pop_assignments:
    with open("results/bayescan/" + bayescan_pop_assignments[sample] + ".list", 'a') as list_file:
        list_file.write(sample + "\n")
    with open("results/bayescan/" + bayescan_pop_assignments[sample] + ".pop", 'a') as list_file:
        list_file.write(sample + "\t" + bayescan_pop_assignments[sample] + "\n")

In [28]:
%%bash 

#get pgdspider to convert formats for bayescan
if [ ! -f bin/PGDSpider_2.1.1.5/PGDSpider2-cli.jar ]; then
    wget -P bin/ http://www.cmpg.unibe.ch/software/PGDSpider/PGDSpider_2.1.1.5.zip
    unzip bin/PGDSpider_2.1.1.5.zip -d bin
fi

if [ ! -f bin/BayeScan2.1/binaries/BayeScan2.1_linux64bits ]; then
    wget -P bin/ http://cmpg.unibe.ch/software/BayeScan/files/BayeScan2.1.zip
    unzip bin/BayeScan2.1.zip -d bin
    chmod u+x bin/BayeScan2.1/binaries/BayeScan2.1_linux64bits
fi

In [5]:
#populations comparisons to make
bayes_comparisons=[  ["new_world",   "west_africa" ],
                     ["new_world",   "east_africa" ],
                     ["west_africa", "east_africa" ]  ]


In [36]:
#prep input files and submit bayescan runs
for comparison in bayes_comparisons:
    pop1 = comparison[0]
    pop2 = comparison[1]

    #get vcf
    get_vcf = "vcftools \
    --vcf results/variant_filtration/smv7_ex_autosomes.vcf \
    --keep results/bayescan/" + pop1 + ".list \
    --keep results/bayescan/" + pop2 + ".list \
    --mac 2 \
    --min-alleles 2 \
    --max-alleles 2 \
    --stdout \
    --recode \
    >results/bayescan/" + pop1 + "_" + pop2 + ".vcf"
    os.system(get_vcf)

    #get list of sites (for plotting downstream)
    with open("results/bayescan/" + pop1 + "_" + pop2 + ".sites", 'w') as sites_file:
        outline = "chrom\tpos\n".format("chrom", "pos")
        sites_file.write(outline)

        vcf   = allel.read_vcf('results/bayescan/' + pop1 + '_' + pop2 + '.vcf')
        chrom = list(vcf['variants/CHROM'])
        pos   = list(vcf['variants/POS'])

        for i in list(range(0, len(chrom))):
            outline = "{0}\t{1}\n".format(chrom[i], pos[i])
            sites_file.write(outline)    
    #-----------------------------------------------------------------------------
    #convert with spider
 

In [6]:
#prep input files and submit bayescan runs
for comparison in bayes_comparisons:
    pop1 = comparison[0]
    pop2 = comparison[1]
    
    #create pop file
    !cat results/bayescan/{pop1}.pop results/bayescan/{pop2}.pop >results/bayescan/{pop1}_{pop2}.pop
    
    #create spid for conversion from vcf to pgd
    vcf_pgd_spid ='''# spid-file generated: NA

# VCF Parser questions
PARSER_FORMAT=VCF

# Only output SNPs with a phred-scaled quality of at least:
VCF_PARSER_QUAL_QUESTION=
# Select population definition file:
VCF_PARSER_POP_FILE_QUESTION=./results/bayescan/{pop1}_{pop2}.pop
# What is the ploidy of the data?
VCF_PARSER_PLOIDY_QUESTION=DIPLOID
# Do you want to include a file with population definitions?
VCF_PARSER_POP_QUESTION=true
# Output genotypes as missing if the phred-scale genotype quality is below:
VCF_PARSER_GTQUAL_QUESTION=
# Do you want to include non-polymorphic SNPs?
VCF_PARSER_MONOMORPHIC_QUESTION=false
# Only output following individuals (ind1, ind2, ind4, ...):
VCF_PARSER_IND_QUESTION=
# Only input following regions (refSeqName:start:end, multiple regions: whitespace separated):
VCF_PARSER_REGION_QUESTION=
# Output genotypes as missing if the read depth of a position for the sample is below:
VCF_PARSER_READ_QUESTION=
# Take most likely genotype if "PL" or "GL" is given in the genotype field?
VCF_PARSER_PL_QUESTION=false
# Do you want to exclude loci with only missing data?
VCF_PARSER_EXC_MISSING_LOCI_QUESTION=false

# PGD Writer questions
WRITER_FORMAT=PGD
'''.format(pop1 = pop1, 
           pop2 = pop2)
    
    #write to file
    with open("results/bayescan/" + pop1 + "_" + pop2 + "_vcf_pgd.spid", 'w') as vcf_pgd_spid_file:
       vcf_pgd_spid_file.write(vcf_pgd_spid)

    pgd_bayescan_spid = '''# spid-file generated: NA
    
# VCF Parser questions
PARSER_FORMAT=PGD

# Only output SNPs with a phred-scaled quality of at least:
VCF_PARSER_QUAL_QUESTION=
# Select population definition file:
VCF_PARSER_POP_FILE_QUESTION=./results/bayescan/{pop1}_{pop2}.pop
# What is the ploidy of the data?
VCF_PARSER_PLOIDY_QUESTION=DIPLOID
# Do you want to include a file with population definitions?
VCF_PARSER_POP_QUESTION=true
# Output genotypes as missing if the phred-scale genotype quality is below:
VCF_PARSER_GTQUAL_QUESTION=
# Do you want to include non-polymorphic SNPs?
VCF_PARSER_MONOMORPHIC_QUESTION=false
# Only output following individuals (ind1, ind2, ind4, ...):
VCF_PARSER_IND_QUESTION=
# Only input following regions (refSeqName:start:end, multiple regions: whitespace separated):
VCF_PARSER_REGION_QUESTION=
# Output genotypes as missing if the read depth of a position for the sample is below:
VCF_PARSER_READ_QUESTION=
# Take most likely genotype if "PL" or "GL" is given in the genotype field?
VCF_PARSER_PL_QUESTION=false
# Do you want to exclude loci with only missing data?
VCF_PARSER_EXC_MISSING_LOCI_QUESTION=false

# PGD Writer questions
WRITER_FORMAT=BAYESCAN
'''.format(pop1 = pop1, 
           pop2 = pop2)
    
    with open("results/bayescan/" + pop1 + "_" + pop2 + "_pgd_bayescan.spid", 'w') as pgd_bayescan_spid_file:
        pgd_bayescan_spid_file.write(pgd_bayescan_spid)
        
    vcf_to_pgd = "java -jar bin/PGDSpider_2.1.1.5/PGDSpider2-cli.jar \
        -inputfile results/bayescan/{pop1}_{pop2}.vcf \
        -inputformat VCF \
        -outputfile results/bayescan/{pop1}_{pop2}.pgd \
        -outputformat PGD \
        -spid results/bayescan/{pop1}_{pop2}_vcf_pgd.spid".format(pop1=pop1, pop2=pop2)

    pgd_to_bayescan = "java -jar bin/PGDSpider_2.1.1.5/PGDSpider2-cli.jar \
        -inputfile results/bayescan/{pop1}_{pop2}.pgd \
        -inputformat PGD \
        -outputfile results/bayescan/{pop1}_{pop2}.bayescan \
        -outputformat GESTE_BAYE_SCAN \
        -spid results/bayescan/{pop1}_{pop2}_pgd_bayescan.spid".format(pop1=pop1, pop2=pop2)
    
    os.system(vcf_to_pgd)
    os.system(pgd_to_bayescan)
    

In [8]:
list(range(1,2))

[1]

In [13]:
for comparison in bayes_comparisons:
    pop1 = comparison[0]
    pop2 = comparison[1]
    
    for chain in list(range(1, 2)):

        chain="{chain:02d}".format(chain=chain)
        #create a CLEAN chain specific dir
        chain_dir="results/bayescan/{pop1}_{pop2}/chain_{chain}".format(pop1=pop1, pop2=pop2, chain=chain)
        if os.path.isdir(chain_dir):
            shutil.rmtree(chain_dir)
        os.makedirs(chain_dir, exist_ok=True)
    
        #set bayescan paramters 
        job_id="{pop1}_{pop2}_chain{chain}".format(pop1=pop1, pop2=pop2, chain=chain)
        job_log="{chain_dir}/{job_id}.log".format(chain_dir=chain_dir, job_id=job_id)
        infile="results/bayescan/{pop1}_{pop2}.bayescan".format(pop1=pop1, pop2=pop2)
        threads=12
        
        #build bayescan code
        cmd="conda activate sch_man_nwinvasion; \
            bin/BayeScan2.1/binaries/BayeScan2.1_linux64bits  \
                {infile} \
                -threads {threads} \
                -od {chain_dir}/ \
                -o {out_prefix} \
                -pr_odds {prior_odds} \
                -pilot {pilot} \
                -burn {burnin} \
                -nbp {nbp} \
                -n {gens} \
                -thin {thin}".format(infile=infile,
                                     pop1=pop1, 
                                     pop2=pop2,
                                     threads=threads,
                                     chain_dir=chain_dir,
                                     out_prefix="bayescan",
                                     prior_odds=10,
                                     pilot=5_000,
                                     burnin=5_000,
                                     nbp=20,
                                     gens=50_000,
                                     thin=20)

        #print parameters for logging purposes
        with open(chain_dir + "/run.sh", 'w') as run_sh:
            run_sh.write(cmd)
        
        #build command to submit to scheduler
        qsub="echo \"{cmd}\" | qsub -V -cwd -S /bin/bash -q all.q -j y -N {job_id} -o {job_log} -pe smp {threads}".format(cmd     = cmd,
                                                                                                                          job_id  = job_id,
                                                                                                                          job_log = job_log,
                                                                                                                          threads = threads)
        
        #submit
        os.system(qsub)

In [11]:
print(qsub)

echo "conda activate sch_man_nwinvasion;             bin/BayeScan2.1/binaries/BayeScan2.1_linux64bits                  results/bayescan/west_africa_east_africa_maf05.bayescan                 -threads 12                 -od results/bayescan/west_africa_east_africa/chain_01/                 -o bayescan                 -pr_odds 10                 -pilot 5000                 -burn 5000                 -nbp 20                 -n 50000                 -thin 20" | qsub -V -cwd -S /bin/bash -q all.q -j y -N chain01 -o results/bayescan/west_africa_east_africa/chain_01/chain01.log -pe smp 12


In [None]:
%%R

# #check to ID min qvalues (see if any sig sel...quick and dirty)
# fst_files<-list.files("results/bayescan", pattern="*fst.txt", recursive=TRUE, full.names=TRUE)

# min_qs<-vector()

# for ( fst_file in fst_files){


#     fst_tbl<-read.table(fst_file, header=TRUE)
    
#     min_q <- min(fst_tbl$qval)
    
#     min_qs <- append(min_qs, min_q)
# }

#print(fst_files)
# svg("results/bayescan/min_qs.svg")
#     plot(min_qs, ylim=c(0, 0.5), pch=21, col="black", cex=1.5, bg="grey", ylab="Min. q-value", xlab="")
#     abline(h=min(min_qs), lwd=1, col="black", lty=2)
#     abline(h=0.05, lwd=1, col="red", lty=2)
#     text(x=0, y = 0.17, labels = "Brazil v Africa", col="black", cex=1, adj=0)
#     text(x=10, y = 0.425, labels = "Brazil v Niger", col="black", cex=1, adj=0)
#     text(x=20, y = 0.115, labels = "Brazil v Senegal", col="black", cex=1, adj=0)
#     text(x=29, y = 0.25, labels = "Brazil v Tanzania", col="black", cex=1, adj=0)
#     text(x=39, y = 0.18, labels = "Brazil v wAfrica", col="black", cex=1, adj=0)
#     text(x=43, y = 0.45, labels = "wAfrica v Tanzania", col="black", cex=1, adj=0)
# dev.off()

source("")
plot_bayescan("results/bayescan/brazil_africa/chain_00/bayescan_fst.csv")

In [None]:
%%bash
paste ../../brazil_africa.sites bayescan_fst.txt | sed 's/ \+/,/gi' | sed 's/\t/,/gi' | sed '2,$s/.$//' | cut --complement -f3 -d","​


In [None]:
%%R

bs<-read.csv("results/bayescan/brazil_africa/chain_00/bayescan_fst.csv", header=TRUE, sep=",")

#head(bs)
cumul_center_pos<-get_cumul_pos(bs$chrom, bs$pos)

#build dataframe
bs_df<-data.frame(bs, cumul_center_pos)

#start plotting
p <- ggplot(bs_df, 
            aes(x     = cumul_center_pos, 
                y     = alpha, 
                color = chrom))

#adjust colors
p <- p + scale_colour_manual(values = chr_colors)

#plot data
p <- p + geom_point(alpha = 1)

#add horizontal line at 99.9th percintile
# p <- p + geom_hline(yintercept = quantile(pbs_df$PBS, 0.999), 
#                     linetype   = "dashed", 
#                     color      = "red")
# p <- p + geom_hline(yintercept = quantile(pbs_df$PBS, 0.001), 
#                     linetype   = "dashed", 
#                     color      = "red")

#modify x and y axis
p <- p + scale_x_continuous(name   = "", 
                            breaks = cumul_starts, 
                            expand = c(0,0))
p <- p + scale_y_continuous(name   = "Alpha", 
                            expand = c(0,0),
                            limits = c(min(bs$alpha * 1.15), 
                                       max(bs$alpha * 1.15)),
                            #breaks = c(-0.2, 0, 0.2, 0.4, 0.6, 0.8)
                           )

#change theme
p <- p + theme_bw()

#change fonts on axis elements and titles
p <- p + theme(axis.text   = element_text(size = 12),
               axis.title  = element_text(size = 12,
                                          face = "bold"),
               axis.text.x = element_blank())

#removing gridlines
p <- p + theme(panel.grid.major  = element_blank(), 
                panel.grid.minor = element_blank())

#set up plot title etc.
#p <- p + ggtitle("PBS (Brazil; WAfrica; EAfrica)")
# p <- p + theme(plot.title = element_text(hjust = 0.5, 
#                                          vjust = 0.5, 
#                                          face  = 'bold', 
#                                          size  = 18))

# #modify legend
p <- p + theme(legend.position = "none")

#save the figure
svg_img <- "results/bayescan/b_v_a.svg"
png_img <- "results/bayescan/b_v_a.png"
ggsave(png_img, plot = p, dpi = 600)
ggsave(svg_img, plot = p)

ggsave("results/bayescan/b_v_a.COMPOSITE.png", plot = p, width = 10, height = 1.5, units = "in", dpi = 300)
# #display in notebook
print(p)