In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from tqdm import tqdm
from pathlib import Path
from matplotlib.colors import ListedColormap

In [2]:
proj_dir="/master/nplatt/sch_hae_scan"
results_dir=f"{proj_dir}/results"
os.chdir(results_dir)

In [33]:
Path("{}/candidates".format(results_dir)).mkdir(parents=True, exist_ok=True)
os.chdir("{}/candidates".format(results_dir))

In [32]:
!cat ../outlier_table/target_introgressed_loci.bed

#header
NC_067199.1	28348440	28877530
NC_067200.1	9712340	10514400


In [None]:
%%bash

conda run -n popstructure bedtools intersect -a ../../data/GCF_000699445.3_UoM_Shae.V3_genomic.gff -b ../outlier_table/target_introgressed_loci.bed  | awk '{if ($3=="gene") print $0}' | cut -f9 | cut -f1,3 -d";" | sed 's/ID=//g' | sed 's/;Name=/\t/' >genes_in_introgressed_loci.tsv

In [None]:
%%bash

conda run -n popstructure \
    vcftools \
        --vcf ../snpeff/snpeff.vcf \
        --recode \
        --recode-INFO-all \
        --out candidate_snvs \
        --bed ../outlier_table/target_introgressed_loci.bed

In [None]:
conda run -n popstructure \
    vcftools \
        --vcf candidate_snvs.recode.vcf \
        --recode \
        --recode-INFO-all \
        --out candidate_snvs_missense \
        --positions missense_variants.list

In [34]:
pca_df=pd.read_csv(f"{proj_dir}/results/pca/pca_df.csv", sep=",")
pca_df=pca_df[["sample_name", "kmeans_label"]]

sbs = pca_df.loc[pca_df["kmeans_label"] == "sb", "sample_name"]
ses = pca_df.loc[pca_df["kmeans_label"] == "sh_se", "sample_name"]
nws = pca_df.loc[pca_df["kmeans_label"] == "sh_nw", "sample_name"]

sbs.to_csv("sb.list", sep="\t", header=False, index=False)
ses.to_csv("se.list", sep="\t", header=False, index=False)
nws.to_csv("nw.list", sep="\t", header=False, index=False)

In [36]:
%%bash

conda run -n popstructure vcftools --vcf candidate_snvs.recode.vcf --keep se.list --freq --stdout >se.tsv
conda run -n popstructure vcftools --vcf candidate_snvs.recode.vcf --keep nw.list --freq --stdout >nw.tsv
conda run -n popstructure vcftools --vcf candidate_snvs.recode.vcf --keep sb.list --freq --stdout >sb.tsv


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf candidate_snvs.recode.vcf
	--keep se.list
	--freq
	--stdout

Keeping individuals in 'keep' list
After filtering, kept 59 out of 166 Individuals
Outputting Frequency Statistics...
After filtering, kept 98898 out of a possible 98898 Sites
Run Time = 3.00 seconds


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf candidate_snvs.recode.vcf
	--keep nw.list
	--freq
	--stdout

Keeping individuals in 'keep' list
After filtering, kept 82 out of 166 Individuals
Outputting Frequency Statistics...
After filtering, kept 98898 out of a possible 98898 Sites
Run Time = 3.00 seconds


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf candidate_snvs.recode.vcf
	--keep sb.list
	--freq
	--stdout

Keeping individuals in 'keep' list
After filtering, kept 21 out of 166 Individuals
Outputting Frequency Statistics...
After fil

In [37]:
col_names=["chrom", "pos", "n_allele", "count", "ref_info", "alt_info"]
se_df = pd.read_csv("se.tsv", sep="\t", index_col=None, skiprows=1, header=None, names=col_names)
sb_df = pd.read_csv("sb.tsv", sep="\t", index_col=None, skiprows=1, header=None, names=col_names)
nw_df = pd.read_csv("nw.tsv", sep="\t", index_col=None, skiprows=1, header=None, names=col_names)

# pd.concat([sb_df, nw_df[[", df3], axis=1)
df = pd.concat([sb_df, nw_df[["count", "ref_info", "alt_info"]], se_df[["count", "ref_info", "alt_info"]]], axis=1)
df.columns=["chrom", "pos", "n_alleles", "sb_count", "sb_ref", "sb_alt", "nw_count", "nw_ref", "nw_alt", "se_count", "se_ref", "se_alt"]
df

Unnamed: 0,chrom,pos,n_alleles,sb_count,sb_ref,sb_alt,nw_count,nw_ref,nw_alt,se_count,se_ref,se_alt
0,NC_067199.1,28348448,2,42,C:1,T:0,164,C:0.993902,T:0.00609756,118,C:1,T:0
1,NC_067199.1,28348453,2,42,G:0.97619,A:0.0238095,164,G:1,A:0,118,G:1,A:0
2,NC_067199.1,28348457,2,42,G:1,A:0,164,G:0.847561,A:0.152439,118,G:1,A:0
3,NC_067199.1,28348464,2,42,A:0.97619,T:0.0238095,164,A:1,T:0,118,A:1,T:0
4,NC_067199.1,28348468,2,42,G:1,A:0,164,G:1,A:0,118,G:1,A:0
...,...,...,...,...,...,...,...,...,...,...,...,...
98893,NC_067200.1,10514330,2,42,C:0.97619,T:0.0238095,164,C:0.987805,T:0.0121951,118,C:0.991525,T:0.00847458
98894,NC_067200.1,10514337,2,42,A:1,G:0,164,A:1,G:0,118,A:1,G:0
98895,NC_067200.1,10514350,2,42,G:0.119048,C:0.880952,164,G:0.77439,C:0.22561,118,G:0.991525,C:0.00847458
98896,NC_067200.1,10514356,2,42,G:0,C:1,164,G:0.670732,C:0.329268,118,G:0.991525,C:0.00847458


In [38]:
ref_alleles = df["sb_ref"].str.split(":", expand=True)[0]
alt_alleles = df["sb_alt"].str.split(":", expand=True)[0]

sb_ref_freq = df["sb_ref"].str.split(":", expand=True)[1]
sb_alt_freq = df["sb_alt"].str.split(":", expand=True)[1]

nw_ref_freq = df["nw_ref"].str.split(":", expand=True)[1]
nw_alt_freq = df["nw_alt"].str.split(":", expand=True)[1]

se_ref_freq = df["se_ref"].str.split(":", expand=True)[1]
se_alt_freq = df["se_alt"].str.split(":", expand=True)[1]

df["ref"] = ref_alleles
df["alt"] = alt_alleles

df["sb_ref_freq"] = sb_ref_freq
df["sb_alt_freq"] = sb_alt_freq

df["nw_ref_freq"] = nw_ref_freq
df["nw_alt_freq"] = nw_alt_freq

df["se_ref_freq"] = se_ref_freq
df["se_alt_freq"] = se_alt_freq

df = df.drop(["sb_ref", "sb_alt", "nw_ref", "nw_alt", "se_ref", "se_alt", "sb_alt_freq", "se_alt_freq", "nw_alt_freq"], axis=1)
df

Unnamed: 0,chrom,pos,n_alleles,sb_count,nw_count,se_count,ref,alt,sb_ref_freq,nw_ref_freq,se_ref_freq
0,NC_067199.1,28348448,2,42,164,118,C,T,1,0.993902,1
1,NC_067199.1,28348453,2,42,164,118,G,A,0.97619,1,1
2,NC_067199.1,28348457,2,42,164,118,G,A,1,0.847561,1
3,NC_067199.1,28348464,2,42,164,118,A,T,0.97619,1,1
4,NC_067199.1,28348468,2,42,164,118,G,A,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
98893,NC_067200.1,10514330,2,42,164,118,C,T,0.97619,0.987805,0.991525
98894,NC_067200.1,10514337,2,42,164,118,A,G,1,1,1
98895,NC_067200.1,10514350,2,42,164,118,G,C,0.119048,0.77439,0.991525
98896,NC_067200.1,10514356,2,42,164,118,G,C,0,0.670732,0.991525


Unnamed: 0,chrom,pos,n_alleles,sb_count,nw_count,se_count,ref,alt,sb_ref_freq,nw_ref_freq,se_ref_freq


In [44]:
ref_target_df = df.loc[ ((df["nw_ref_freq"].astype(float) > 0.95) & (df["se_ref_freq"].astype(float) < 0.05)) ]
alt_target_df = df.loc[ ((df["nw_ref_freq"].astype(float) < 0.05) & (df["se_ref_freq"].astype(float) > 0.95)) ]


ref_target_df.to_csv("candidates_fixed.csv", sep=",", header=True, index=False)
ref_target_df[["chrom", "pos"]].to_csv("candidates_fixed.pos.list", sep="\t", header=False, index=False)
ref_target_df

#df.loc[ (df["nw_ref_freq"].astype(float) < 0.05) & (df["sb_ref_freq"].astype(float) < 0.05) & (df["se_ref_freq"].astype(float) > 0.95) ]

Unnamed: 0,chrom,pos,n_alleles,sb_count,nw_count,se_count,ref,alt,sb_ref_freq,nw_ref_freq,se_ref_freq
40,NC_067199.1,28348762,2,42,164,118,G,A,0.952381,0.987805,0
72,NC_067199.1,28349130,2,42,164,118,T,C,1,0.987805,0.00847458
73,NC_067199.1,28349142,2,42,164,118,A,G,1,0.987805,0.00847458
82,NC_067199.1,28349237,2,42,164,118,A,G,1,0.987805,0.00847458
93,NC_067199.1,28349417,2,42,164,118,A,G,1,0.987805,0
...,...,...,...,...,...,...,...,...,...,...,...
56487,NC_067200.1,10169242,2,42,164,118,C,T,0.97619,0.95122,0
56637,NC_067200.1,10171336,2,42,164,118,G,T,0.952381,0.95122,0.0254237
56648,NC_067200.1,10171461,2,42,164,118,G,A,0.952381,0.95122,0.0169492
56808,NC_067200.1,10172511,2,42,164,118,G,C,0.952381,0.957317,0.0423729


In [47]:
%%bash 

conda run -n popstructure \
    vcftools \
        --vcf candidate_snvs.recode.vcf \
        --recode \
        --recode-INFO-all \
        --out candidate_snvs_fixed \
        --positions candidates_fixed.pos.list


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf candidate_snvs.recode.vcf
	--recode-INFO-all
	--out candidate_snvs_fixed
	--positions candidates_fixed.pos.list
	--recode

After filtering, kept 166 out of 166 Individuals
Outputting VCF file...
After filtering, kept 1818 out of a possible 98898 Sites
Run Time = 0.00 seconds



In [48]:
%%bash 

grep -v "#" candidate_snvs_fixed.recode.vcf | grep -i missense | cut -f1,2 >missense_variants.list
wc -l missense_variants.list

49 missense_variants.list


In [51]:
#get xpehh from these variants

sel_df = pd.read_csv("../selscan/sel_df_norm.csv", sep=",", header=0)

In [53]:
mis_df = pd.read_csv("missense_variants.list", sep="\t", header=None)
mis_df.columns=["chrom", "pos"]
mis_df

Unnamed: 0,chrom,pos
0,NC_067199.1,28396888
1,NC_067199.1,28402533
2,NC_067199.1,28403016
3,NC_067199.1,28403068
4,NC_067199.1,28455596
5,NC_067199.1,28455984
6,NC_067199.1,28456239
7,NC_067199.1,28456295
8,NC_067199.1,28467142
9,NC_067199.1,28469435


In [54]:
sel_df

Unnamed: 0,chr,pos,id,gpos,p1,ihh1,p2,ihh2,xpehh,normxpehh,crit,genomic_position,color,norm_p_value,p_value_bonferroni,fdr,nw_intro
0,NC_067196.1,147067,NC_067196.1:147067,147067.0,0.158537,3849.020,0.381356,2542.010,0.180172,1.58557,0,147067,#078d70,0.112837,1.0,0.999997,0.823171
1,NC_067196.1,147068,NC_067196.1:147068,147068.0,0.048780,3270.680,0.186441,1366.650,0.378980,2.42020,1,147068,#078d70,0.015512,1.0,0.562670,0.823171
2,NC_067196.1,147069,NC_067196.1:147069,147069.0,0.024390,3022.890,0.000000,1374.130,0.342395,2.26661,1,147069,#078d70,0.023414,1.0,0.679088,0.823171
3,NC_067196.1,147070,NC_067196.1:147070,147070.0,0.000000,3023.680,0.000000,1374.130,0.342510,2.26709,1,147070,#078d70,0.023385,1.0,0.678738,0.823171
4,NC_067196.1,147072,NC_067196.1:147072,147072.0,0.018293,3023.510,0.000000,1374.130,0.342485,2.26699,1,147072,#078d70,0.023391,1.0,0.678825,0.823171
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28585158,NC_067202.1,19477723,NC_067202.1:19477723,19477700.0,0.000000,398.509,0.000000,270.019,0.169044,1.53885,0,317910191,#078d70,0.123841,1.0,0.999997,0.018293
28585159,NC_067202.1,19477730,NC_067202.1:19477730,19477700.0,0.036585,396.811,0.000000,270.019,0.167189,1.53106,0,317910198,#078d70,0.125755,1.0,0.999997,0.018293
28585160,NC_067202.1,19477740,NC_067202.1:19477740,19477700.0,0.000000,409.008,0.000000,270.019,0.180337,1.58626,0,317910208,#078d70,0.112680,1.0,0.999997,0.018293
28585161,NC_067202.1,19477770,NC_067202.1:19477770,19477800.0,0.390244,333.042,0.245763,207.029,0.206467,1.69596,0,317910238,#078d70,0.089893,1.0,0.999997,0.018293


In [56]:
rows = []
for index, row in tqdm(mis_df.iterrows(), total=len(mis_df)):
    try:
        values = sel_df.loc[ ((sel_df["chr"] == row["chrom"]) & (sel_df["pos"] == row["pos"]))]
        rows.append(values)
    except:
        rows.append("")


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49/49 [01:47<00:00,  2.19s/it]


In [70]:
df

Unnamed: 0,chr,pos,id,gpos,p1,ihh1,p2,ihh2,xpehh,normxpehh,crit,genomic_position,color,norm_p_value,p_value_bonferroni,fdr,nw_intro


In [74]:
df = pd.concat(rows, axis=0).reset_index(drop=True)
df  = df.loc[ ((df["norm_p_value"] < 0.05) & (df["xpehh"] > 0))]
len(df)

20

In [75]:
df

Unnamed: 0,chr,pos,id,gpos,p1,ihh1,p2,ihh2,xpehh,normxpehh,crit,genomic_position,color,norm_p_value,p_value_bonferroni,fdr,nw_intro
8,NC_067199.1,28467142,NC_067199.1:28467142,28467100.0,0.02439,2589.41,1.0,1087.53,0.376759,2.41087,1,224969683,#078d70,0.015915,1.0,0.569497,0.987805
9,NC_067199.1,28469435,NC_067199.1:28469435,28469400.0,0.012195,2377.27,1.0,1042.41,0.358042,2.3323,1,224971976,#078d70,0.019685,1.0,0.629035,0.987805
12,NC_067199.1,28495283,NC_067199.1:28495283,28495300.0,0.012195,3155.26,1.0,1127.35,0.446977,2.70566,1,224997824,#078d70,0.006817,1.0,0.371166,0.987805
13,NC_067199.1,28495360,NC_067199.1:28495360,28495400.0,0.012195,3132.47,1.0,1127.35,0.443827,2.69244,1,224997901,#078d70,0.007093,1.0,0.37875,0.987805
14,NC_067199.1,28498519,NC_067199.1:28498519,28498500.0,0.012195,3204.62,0.991525,978.453,0.515237,2.99223,1,225001060,#078d70,0.002769,1.0,0.228593,0.987805
21,NC_067199.1,28516439,NC_067199.1:28516439,28516400.0,0.012195,5860.09,1.0,2515.01,0.367365,2.37144,1,225018980,#078d70,0.017719,1.0,0.598416,0.987805
22,NC_067199.1,28516469,NC_067199.1:28516469,28516500.0,0.012195,5860.09,1.0,2515.01,0.367365,2.37144,1,225019010,#078d70,0.017719,1.0,0.598416,0.987805
23,NC_067199.1,28516559,NC_067199.1:28516559,28516600.0,0.012195,5860.09,1.0,2517.78,0.366886,2.36943,1,225019100,#078d70,0.017816,1.0,0.599954,0.987805
24,NC_067199.1,28518344,NC_067199.1:28518344,28518300.0,0.012195,5537.36,1.0,2194.5,0.401968,2.51671,1,225020885,#078d70,0.011846,1.0,0.491987,0.987805
25,NC_067199.1,28518418,NC_067199.1:28518418,28518400.0,0.012195,5537.36,1.0,2181.79,0.40449,2.52729,1,225020959,#078d70,0.011495,1.0,0.484716,0.987805


In [76]:
df[["chr", "pos"]].to_csv("candidates_fixed_missense_sel.pos.list", sep="\t", header=False, index=False)

In [78]:
%%bash 

conda run -n popstructure \
    vcftools \
        --vcf candidate_snvs_fixed.recode.vcf \
        --recode \
        --recode-INFO-all \
        --out candidates_fixed_missense_sel \
        --positions candidates_fixed_missense_sel.pos.list


VCFtools - 0.1.16
(C) Adam Auton and Anthony Marcketta 2009

Parameters as interpreted:
	--vcf candidate_snvs_fixed.recode.vcf
	--recode-INFO-all
	--out candidates_fixed_missense_sel
	--positions candidates_fixed_missense_sel.pos.list
	--recode

After filtering, kept 166 out of 166 Individuals
Outputting VCF file...
After filtering, kept 20 out of a possible 1818 Sites
Run Time = 0.00 seconds

