# Filtering & Plots

This notebook streamlines the filtering process that occurs after ipyrad. This way, I can rerun the notebook with different filenames instead of doing it from scratch each time.


### Naming Cell:

In [5]:
# --- Run identification -----------------------------------------------------
# Name of the ipyrad assembly; the working cell reads "<assembly_name>.vcf".
assembly_name = "parcal_003"
# Date stamp passed to the R script and used in the names of the stats files.
date = "20170903"
# Directory the working cell changes into before running anything.
dir_outfiles = "/mnt/hgfs/SHARED_FOLDER/Learn_iPyrad/PARCAL_RUN1/parcal_003_outfiles/"

# --- Filter switches (the strings "TRUE" or "FALSE", not booleans) ----------
# "TRUE" adds --min-alleles 2 --max-alleles 2 to the VCFtools call.
filter_biall = "TRUE"
# "TRUE" adds --maf .05 to the VCFtools call.
filter_MAF = "TRUE"

# --- Plot options -----------------------------------------------------------
# Appended to every plot title on its own line; leave blank for no subtitle.
plot_subtitle = ""

### Working Cell:

In [6]:
# import all necessary modules
import os
import subprocess as sp
import numpy as np
import matplotlib.pyplot as plt

# Name all intermediate files. Every name is derived from assembly_name plus
# suffixes recording which filters were switched on.
ipy_VCF = assembly_name + ".vcf" # e.g., CG_l1_25_c86_H.vcf

# Prefix handed to VCFtools through --out.
filt_VCF_out = assembly_name # e.g., CG_l1_25_c86_H
if filter_biall == "TRUE":
    filt_VCF_out += "_biall" # e.g., CG_l1_25_c86_H_biall
if filter_MAF == "TRUE":
    filt_VCF_out += "_maf" # e.g., CG_l1_25_c86_H_biall_maf
else:
    filt_VCF_out += "_NOmaf" # e.g., CG_l1_25_c86_H_biall_NOmaf
filt_VCF_out_recode = filt_VCF_out + ".recode.vcf" # e.g., CG_l1_25_c86_H_biall_maf.recode.vcf

# All one-SNP-per-tag files share one stem. The stem uses the whole prefix
# (not just the text before the first "."), so an assembly name that contains
# a dot is not truncated, and no name depends on slicing off an extension.
one_snp_stem = filt_VCF_out + "_oneSNP"
one_VCF = one_snp_stem + ".vcf" # e.g., CG_l1_25_c86_H_biall_maf_oneSNP.vcf
one_str_file = one_snp_stem + ".str" # e.g., CG_l1_25_c86_H_biall_maf_oneSNP.str
str_inames_file = one_snp_stem + "_inames.str" # e.g., CG_l1_25_c86_H_biall_maf_oneSNP_inames.str
str_inames_fpop_file = one_snp_stem + "_inames_fpop.str" # e.g., CG_l1_25_c86_H_biall_maf_oneSNP_inames_fpop.str

# Work inside the ipyrad outfiles directory.
os.chdir(dir_outfiles)

# Make the directory for new output files if it doesn't already exist.
if not os.path.isdir("new_outfiles"):
    os.makedirs("new_outfiles")

# Build the VCFtools command as an argument list (no shell involved), so file
# names never need quoting.
vcf_call = ["vcftools", "--vcf", ipy_VCF]
if filter_biall == "TRUE":
    vcf_call += ["--min-alleles", "2", "--max-alleles", "2"]
if filter_MAF == "TRUE":
    vcf_call += ["--maf", ".05"]
vcf_call += ["--out", os.path.join("new_outfiles", filt_VCF_out)]
vcf_call += ["--recode-INFO-all", "--recode"]

# Run VCFtools. check_call raises CalledProcessError on a non-zero exit status
# instead of letting the rest of the cell continue without the filtered VCF.
sp.check_call(vcf_call)

# Change into the new outfiles directory; the paths below are relative to it.
os.chdir("new_outfiles")

# Command for the oneSNP script, which filters to one SNP per RAD tag.
# It is given the recoded VCF as input and the name of the VCF to write.
one_call = ["python", "../../oneSNP.py", filt_VCF_out_recode, one_VCF]

# Run it without a shell; a non-zero exit status raises CalledProcessError
# rather than being silently ignored.
sp.check_call(one_call)

# Read the one-SNP VCF into memory; the loops below reuse these lines.
# `with` closes the handle even if reading fails. The handle name vcf_file is
# kept unchanged.
with open(one_VCF, "r") as vcf_file:
    vcf_file_lines = vcf_file.readlines()

# Map sequential names ("SNP_1", "SNP_2", ...) to "<column 1>_<column 2>" of
# each data line, in file order. len(locus_names) is the number of loci.
locus_names = {}
snp_name_count = 1

for line in vcf_file_lines:
    linelist = line.strip().split()
    # Header lines start with "#"; blank lines carry no locus. Skip both.
    if not linelist or line.startswith("#"):
        continue
    vcf_locus_name = linelist[0] + "_" + linelist[1]
    locus_names["SNP_" + str(snp_name_count)] = vcf_locus_name
    snp_name_count += 1
        
# Get the sample list, the SNP name list (used to rewrite the structure file
# header) and the per-SNP read depths.
sample_list = []     # sample names taken from the "#CHROM" header line
snp_name_list = []   # "<column 1>_<column 2>" for every data line, in file order
rdd = {}             # SNP name -> list of read depths, one per genotype column

for line in vcf_file_lines:
    if line.startswith("#CHROM"):
        # Column-header line: sample names start at the 10th column.
        # (A one-character slice was previously compared with "#C", which can
        # never match, so sample_list always stayed empty.)
        sample_list += line.strip().split()[9:]
    elif not line.startswith("#"): # ignore the other header lines
        genblocks = line.strip().split()
        if not genblocks:
            continue # blank line
        snp_name = genblocks[0] + "_" + genblocks[1] # name format e.g., locus_47_1
        snp_name_list.append(snp_name)
        depths = rdd.setdefault(snp_name, [])
        for genblock in genblocks[9:]: # genotype columns
            # Second ":"-separated field is read as the depth
            # (assumes a GT:DP:... layout -- TODO confirm against the VCF FORMAT column).
            depths.append(int(genblock.split(":")[1]))
                
                
# PGD Spider: transform VCF to STR

# PGDSpider is run from its own directory because the jar and the .spid file
# are given as relative names.
pgd_dir = "/mnt/hgfs/SHARED_FOLDER/Software/PGDSpider_2.1.1.0"
# os.path.join copes with dir_outfiles with or without a trailing slash.
new_outfiles_dir = os.path.join(dir_outfiles, "new_outfiles")

# PGD command as an argument list (no shell, so no quoting problems).
PGD_call = [
    "java", "-Xmx1024m", "-Xms512m", "-jar", "PGDSpider2-cli.jar",
    "-inputfile", os.path.join(new_outfiles_dir, one_VCF),
    "-outputfile", os.path.join(new_outfiles_dir, one_str_file),
    "-inputformat", "VCF", "-outputformat", "STRUCTURE",
    "-spid", "vcf_to_str.spid",
]

# Run PGD to make the STR file. cwd= runs only the child process in the PGD
# directory, so a failure no longer leaves the notebook in the wrong working
# directory; a non-zero exit status raises CalledProcessError.
sp.check_call(PGD_call, cwd=pgd_dir)

# Make sure the working directory is the new outfiles directory.
os.chdir(new_outfiles_dir)

# Rename the structure file header: write a copy of the STRUCTURE file whose
# first line lists the VCF-derived SNP names.
with open(one_str_file, "r") as old_str:
    old_str_lines = old_str.readlines()

# Header = two leading tabs (presumably standing in for the two non-genotype
# columns of the data rows -- confirm) followed by space-separated SNP names.
# TODO: will need to change this line when better code for one SNP/tag is available.
# join() leaves the tabs intact even when snp_name_list is empty.
new_str_header = "\t\t" + " ".join(snp_name_list)

with open(str_inames_file, "w") as new_str:
    new_str.write(new_str_header + "\n")
    # Every line after the original header is copied through unchanged.
    new_str.writelines(old_str_lines[1:])

# Add a fake population so populations can be analysed together in hierfstat.
with open(str_inames_file, "r") as str_onesnp_file:
    str_onesnp_file_lines = str_onesnp_file.readlines()

# Copy the last 6 data rows as the fake population. The header is excluded
# first, so it can never be picked up when the file has fewer than 6 rows.
fpop_lines = []
for line in str_onesnp_file_lines[1:][-6:]:
    linelist = line.strip().split()
    # New row: "FALSE" + original first field, population code "2", then the
    # fields from the third onward joined by single spaces.
    newline = "FALSE" + linelist[0] + "\t" + "2" + "\t" + " ".join(linelist[2:]) + "\n"
    fpop_lines.append(newline)

# Write the original file followed by the fake-population rows.
with open(str_inames_fpop_file, "w") as str_file_fpop:
    str_file_fpop.writelines(str_onesnp_file_lines)
    str_file_fpop.writelines(fpop_lines)

# Loci = number of names in the header line.
numloci = len(str_onesnp_file_lines[0].strip().split())
# Individuals = (data rows + fake rows) / 2 -- presumably two rows per
# individual in the STRUCTURE file. Integer division keeps this an int
# (the misplaced int() call previously left it a float such as 80.0).
numinds = (len(fpop_lines) + len(str_onesnp_file_lines) - 1) // 2
num_fake_inds = len(fpop_lines) // 2 # 3 when all 6 rows were copied

print("Number of individuals = " + str(numinds - num_fake_inds))
print("Number of individuals WITH FAKE POP = " + str(numinds))
print("Number of loci = " + str(numloci))

# Build the R call. Despite its name, string_callR is an argument list, so the
# command runs without a shell. Arguments after the script path: the new
# outfiles directory, the structure file with the fake population, the number
# of individuals, the number of loci and the date stamp.
string_callR = [
    "Rscript", "../../../../pop_gen_stats.R",
    # os.path.join works whether or not dir_outfiles ends with a slash
    os.path.join(dir_outfiles, "new_outfiles"),
    str_inames_fpop_file,
    str(numinds), str(numloci), date,
]

# Run the R script; a non-zero exit status raises CalledProcessError instead
# of being silently ignored.
sp.check_call(string_callR)

# Names of the pop gen stats files (presumably written by the R script) plus
# the names kept for plot titles.
assembly_name_plots = assembly_name
str_filename = str_inames_fpop_file

# Every stats file is "<stat>_<structure file name minus last 4 chars>_<date>.txt".
stats_stem = str_inames_fpop_file[:-4]
stats_tail = "_" + date + ".txt"
fis_filename = "Fis_" + stats_stem + stats_tail
ho_filename = "Ho_" + stats_stem + stats_tail
hs_filename = "Hs_" + stats_stem + stats_tail

# Make read depth plot: re-read the one-SNP VCF and rebuild the locus names,
# sample list, SNP name list and per-SNP read depths.
# `with` closes this handle; the earlier code closed a different, already
# closed handle and left this one open.
with open(one_VCF, "r") as vcf_file_fplot:
    vcf_file_lines = vcf_file_fplot.readlines()

locus_names = {}     # "SNP_<n>" -> "<column 1>_<column 2>"
snp_name_count = 1
sample_list = []     # sample names from the "#CHROM" header line
snp_name_list = []   # "<column 1>_<column 2>" per data line, in file order
rdd = {}             # SNP name -> list of read depths, one per genotype column

for line in vcf_file_lines:
    if line.startswith("#CHROM"):
        # Column-header line: sample names start at the 10th column.
        # (Comparing a one-character slice with "#C" could never match.)
        sample_list += line.strip().split()[9:]
        continue
    genblocks = line.strip().split()
    if not genblocks or line.startswith("#"):
        continue # blank line or other header line

    snp_name = genblocks[0] + "_" + genblocks[1] # name format e.g., locus_47_1
    locus_names["SNP_" + str(snp_name_count)] = snp_name
    snp_name_count += 1
    snp_name_list.append(snp_name)

    depths = rdd.setdefault(snp_name, [])
    for genblock in genblocks[9:]: # genotype columns
        # Second ":"-separated field is read as the depth
        # (assumes a GT:DP:... layout -- TODO confirm against the VCF FORMAT column).
        depths.append(int(genblock.split(":")[1]))

# Mean read depth per SNP, in file order.
locus_rd_avs = [np.mean(rdd[snp]) for snp in snp_name_list]

# Bins are 4 wide and centred on 0, 4, 8, ... The stop value is pushed far
# enough that the last edge is >= the largest mean; with a stop of max + 1 the
# last edge could fall below the maximum, and values beyond the last edge are
# left out of the histogram.
rd_bins = np.arange(0, max(locus_rd_avs) + 6, 4) - 2
plt.hist(locus_rd_avs, bins = rd_bins)

title = one_VCF
if plot_subtitle: # any non-blank subtitle, including a single character
    title += "\n" + plot_subtitle
plt.title(title)

plt.xlabel("Read depth")
plt.ylabel("Frequency")

plt.savefig('rd_' + filt_VCF_out + '.png')
plt.close()

# Make Fis plot

with open(fis_filename, "r") as fis_file:
    fis_file_lines = fis_file.readlines()

fis_list = []      # numeric Fis values only
fis_pls_NAs = []   # every value as read, including "NA"

# The first line is skipped as a header; the value is the second
# whitespace-separated field of each remaining line.
for line in fis_file_lines[1:]:
    fields = line.strip().split()
    if len(fields) < 2:
        continue # blank or incomplete line
    fis = fields[1]
    fis_pls_NAs.append(fis)
    if fis != "NA":
        fis_list.append(float(fis))

# 0.05-wide bins shifted by half a bin.
plt.hist(fis_list, bins = np.arange(-1.2,1.2,.05)-.025)

title = "Fis in " + filt_VCF_out
if plot_subtitle: # any non-blank subtitle, including a single character
    title += "\n" + plot_subtitle
plt.title(title)

plt.xlabel("Fis")
plt.ylabel("Frequency")

plt.savefig("Fis_" + filt_VCF_out + '.png')
plt.close()

# Make Ho plot

with open(ho_filename, "r") as ho_file:
    ho_file_lines = ho_file.readlines()

ho_list = []      # numeric Ho values only
ho_and_NAs = []   # every value as read, including "NA"

# The first line is skipped as a header; the value is the second
# whitespace-separated field of each remaining line.
for line in ho_file_lines[1:]:
    fields = line.strip().split()
    if len(fields) < 2:
        continue # blank or incomplete line
    ho = fields[1]
    ho_and_NAs.append(ho)
    if ho != "NA":
        ho_list.append(float(ho))

# 0.05-wide bins shifted by half a bin.
plt.hist(ho_list, bins = np.arange(0,1.2,.05)-.025)

title = "Ho in " + filt_VCF_out
if plot_subtitle: # any non-blank subtitle, including a single character
    title += "\n" + plot_subtitle
plt.title(title)

plt.xlabel("Ho")
plt.ylabel("Frequency")
# File name follows the "<stat>_<prefix>.png" pattern of the Fis and Hs plots
# (it previously reused the title text, giving a name with spaces).
plt.savefig("Ho_" + filt_VCF_out + '.png')
plt.close()

# Make Hs plot

with open(hs_filename, "r") as hs_file:
    hs_file_lines = hs_file.readlines()

hs_list = []      # numeric Hs values only
hs_and_NAs = []   # every value as read, including "NA"

# The first line is skipped as a header; the value is the second
# whitespace-separated field of each remaining line.
for line in hs_file_lines[1:]:
    fields = line.strip().split()
    if len(fields) < 2:
        continue # blank or incomplete line
    hs = fields[1]
    hs_and_NAs.append(hs)
    if hs != "NA":
        hs_list.append(float(hs))

# 0.025-wide bins shifted by half a bin.
plt.hist(hs_list, bins = np.arange(0,1.2,.025)-.0125)

title = "Hs in " + filt_VCF_out
if plot_subtitle: # any non-blank subtitle, including a single character
    title += "\n" + plot_subtitle
plt.title(title)

plt.xlabel("Hs")
plt.ylabel("Frequency")
plt.savefig("Hs_" + filt_VCF_out + '.png')
plt.close()

Number of individuals after adding fpop = 80.0
Number of loci = 6362


### Check file names:

In [5]:
# Print every derived file name, one per line, in pipeline order.
derived_file_names = (
    ipy_VCF,
    filt_VCF_out,
    filt_VCF_out_recode,
    one_VCF,
    one_str_file,
    str_inames_file,
    str_inames_fpop_file,
)
for derived_file_name in derived_file_names:
    print(derived_file_name)


CG_L1_L2_refgen_001.vcf
CG_L1_L2_refgen_001_biall_NOmaf
CG_L1_L2_refgen_001_biall_NOmaf.recode.vcf
CG_L1_L2_refgen_001_biall_NOmaf_oneSNP.vcf
CG_L1_L2_refgen_001_biall_NOmaf_oneSNP.str
CG_L1_L2_refgen_001_biall_NOmaf_oneSNP_inames.str
CG_L1_L2_refgen_001_biall_NOmaf_oneSNP_inames_fpop.str
