In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from pathlib import Path


In [2]:
# Project root on the cluster file system and the directory that holds every
# analysis result; the remaining cells build their input paths from these.
proj_dir = "/master/nplatt/sch_hae_scan"
results_dir = f"{proj_dir}/results"

In [59]:
# Create the output directory for this notebook (no error if it already
# exists) and make it the working directory, because the cells below read and
# write their files with relative paths.
outlier_dir = Path(results_dir) / "outlier_table"
outlier_dir.mkdir(parents=True, exist_ok=True)
os.chdir(outlier_dir)

In [58]:
# Split the genome into 337 kb sliding windows (-w 337000) that advance by
# 168.5 kb (-s 168500), i.e. consecutive windows overlap by half their length.
# Chromosome/scaffold lengths are taken from the reference .fai index (-g) and
# the windows are written to 1p_windows.bed in the current working directory.
!conda run -n popstructure bedtools makewindows -w 337000 -s 168500 -g ../../data/GCF_000699445.3_UoM_Shae.V3_genomic.fna.fai >1p_windows.bed

In [60]:
# Count, for every window in 1p_windows.bed, how many records of the VCF
# overlap it (-c appends the count as an extra column) and store the result
# in n_snvs_per_window.bed.
!conda run -n popstructure bedtools intersect -c -a 1p_windows.bed -b ../post_phase_filtering/chrs_unrelated.vcf >n_snvs_per_window.bed

In [6]:
# Load the selscan table and keep the rows whose `p_value_bonferroni` is at or
# below the significance threshold.
sel_df = pd.read_csv(f"{results_dir}/selscan/sel_df_norm.csv")

alpha = 0.05
sig_bonf = sel_df.loc[sel_df["p_value_bonferroni"].le(alpha)]

# Write the significant sites as a headerless, tab-separated BED file.
# NOTE(review): start and end are both `pos`, so every record is a zero-length
# interval; confirm this matches the coordinate convention of `pos` and how
# bedtools is expected to treat such features.
sel_bed_df = sig_bonf.loc[:, ["chr", "pos", "pos"]]
sel_bed_df.to_csv("normxpehh_outlier.bed", sep="\t", header=False, index=False)


In [7]:
# Load the genome-wide FST table.
fst_df = pd.read_csv(f"{results_dir}/fst/genome_wide_fsts.csv")

# Convert the `sh_nw_v_sh_se` values to a fractional rank (rank divided by the
# number of finite values) and keep the rows at or above the 0.95 mark.
fst_values = fst_df["sh_nw_v_sh_se"]
n_finite = np.isfinite(fst_values).sum()
rank = fst_values.rank() / n_finite
sig_fst_df = fst_df.loc[rank >= 0.95]

# Write the outlier intervals as a headerless, tab-separated BED file.
fst_bed_df = sig_fst_df.loc[:, ["chr", "start", "stop"]]
fst_bed_df.to_csv("fst_outlier.bed", sep="\t", header=False, index=False)


In [8]:
# Load the per-window SNV counts produced by `bedtools intersect -c`; the file
# has no header row, so the four columns are labelled here.
n_snvs_df = pd.read_csv("n_snvs_per_window.bed", sep="\t", header=None).set_axis(
    ["chrom", "start", "stop", "n_snvs"], axis="columns"
)

In [9]:
# Load the RFMix summary table and keep the rows whose `perc_sh_nw` value is
# at least 0.95.
rf_df = pd.read_csv(f"{results_dir}/rfmix/rfmix_perc_bovis_genome.csv")
high_anc_df = rf_df.loc[rf_df["perc_sh_nw"].ge(0.95)]

# Write those intervals as a headerless, tab-separated BED file.
anc_bed_df = high_anc_df.loc[:, ["chrom", "s_pos", "e_pos"]]
anc_bed_df.to_csv("ancestry.bed", sep="\t", header=False, index=False)


In [10]:
# Load the Patterson's D table and keep the rows whose `z_score` exceeds 2
# (one-sided: only positive z-scores pass).
d_df = pd.read_csv(f"{results_dir}/abba_baba/patterson_d.csv")
sig_d_df = d_df.loc[d_df["z_score"].gt(2)]

# Write the outlier intervals as a headerless, tab-separated BED file.
d_bed_df = sig_d_df.loc[:, ["chrom", "start", "stop"]]
d_bed_df.to_csv("d_outlier.bed", sep="\t", header=False, index=False)


In [51]:
# # Given data
# mu = d_df["d"].std().mean()
# sigma = d_df["d"].std()  # You need to provide this value
# alpha = 0.05  # For a 95% confidence level

In [None]:
# # Two-tailed test critical Z-values
# z_critical = -norm.ppf(alpha/2)  # for a two-tailed test

In [61]:
%%bash

# For each outlier set, write a header line followed by the per-window overlap
# counts from `bedtools intersect -c` to <label>_counts.bed.
for pair in fst:fst_outlier.bed xpehh:normxpehh_outlier.bed anc:ancestry.bed d:d_outlier.bed; do
    label="${pair%%:*}"
    outlier_bed="${pair#*:}"

    printf 'chrom\tstart\tstop\tn_%s\n' "$label" >"${label}_counts.bed"
    conda run -n popstructure bedtools intersect -c -a 1p_windows.bed -b "$outlier_bed" >>"${label}_counts.bed"
done

# conda run -n popstructure bedtools multiinter -header -i fst_counts.bed xpehh_counts.bed anc_counts.bed d_counts.bed >test.bed

# Same windows as 1p_windows.bed, but with a header line so the table can be
# read with named columns.
{
    printf 'chrom\tstart\tstop\n'
    cat 1p_windows.bed
} >1p_windows.tsv

In [65]:
# Read the four per-window count tables and the window table itself; every
# file is tab-separated and starts with a header line.
n_fst_df, n_xpehh_df, n_anc_df, n_d_df, df = (
    pd.read_csv(table_path, sep="\t")
    for table_path in (
        "fst_counts.bed",
        "xpehh_counts.bed",
        "anc_counts.bed",
        "d_counts.bed",
        "1p_windows.tsv",
    )
)

In [66]:
# Attach every per-window count to the window table by joining on the window
# coordinates rather than by positional row index. Index-based assignment
# silently pairs counts with the wrong windows whenever one of the tables was
# built from a different (e.g. stale) set of windows; a keyed join makes that
# situation detectable.
window_keys = ["chrom", "start", "stop"]
count_tables = {
    "n_fst": n_fst_df,
    "n_xpehh": n_xpehh_df,
    "n_anc": n_anc_df,
    "n_d": n_d_df,
    "n_snvs": n_snvs_df,
}

# Start from the bare coordinates so re-running this cell gives the same
# result instead of accumulating suffixed duplicate columns.
annotated = df[window_keys]
for count_col, table in count_tables.items():
    annotated = annotated.merge(
        table[window_keys + [count_col]],
        on=window_keys,
        how="left",
        validate="one_to_one",  # each window must appear exactly once per table
    )

# A missing count means that table does not describe the current windows
# (for example it was loaded before 1p_windows.bed was regenerated).
unmatched = annotated[list(count_tables)].isna().sum()
if unmatched.any():
    raise ValueError(
        "Count tables do not match the current windows; "
        f"unmatched windows per column: {unmatched[unmatched > 0].to_dict()}. "
        "Re-run the cells that create and load these tables."
    )

df = annotated
df

Unnamed: 0,chrom,start,stop,n_fst,n_xpehh,n_anc,n_d,n_snvs
0,NC_067195.1,0,337000,0,0,0,0,12600
1,NC_067195.1,168500,505500,0,0,0,0,13169
2,NC_067195.1,337000,674000,0,0,0,1,13727
3,NC_067195.1,505500,842500,0,0,0,4,14062
4,NC_067195.1,674000,1011000,0,0,0,4,14469
...,...,...,...,...,...,...,...,...
2488,NW_026137017.1,0,52804,0,0,0,0,31089
2489,NW_026137018.1,0,33027,0,0,0,0,30869
2490,NW_026137003.1,0,209180,0,0,0,0,30341
2491,NW_026137003.1,168500,209180,0,0,0,0,29868


In [67]:
# Windows flagged by at least one scan (FST, XP-EHH, ancestry or D) that also
# contain more than 10 SNVs.
# `&` binds tighter than `|` in Python, so the SNV requirement has to be
# combined with the complete "any signal" mask; written inline after the last
# `|` term it would only constrain the `n_d` condition.
signal_cols = ["n_fst", "n_xpehh", "n_anc", "n_d"]
has_any_signal = df[signal_cols].gt(0).any(axis=1)
has_enough_snvs = df["n_snvs"] > 10

df.loc[has_any_signal & has_enough_snvs]

Unnamed: 0,chrom,start,stop,n_fst,n_xpehh,n_anc,n_d,n_snvs
2,NC_067195.1,337000,674000,0,0,0,1,13727
3,NC_067195.1,505500,842500,0,0,0,4,14062
4,NC_067195.1,674000,1011000,0,0,0,4,14469
5,NC_067195.1,842500,1179500,0,0,0,1,14328
8,NC_067195.1,1348000,1685000,0,0,0,2,14598
...,...,...,...,...,...,...,...,...
2324,NC_067202.1,18366500,18703500,0,13,0,0,33268
2325,NC_067202.1,18535000,18872000,0,15,0,0,33239
2326,NC_067202.1,18703500,19040500,0,2,0,0,33225
2327,NC_067202.1,18872000,19209000,0,0,0,2,33228


In [68]:
# Candidate windows: flagged by every one of the four scans and containing
# more than 10 SNVs.
signal_cols = ["n_fst", "n_xpehh", "n_anc", "n_d"]
passes_all_scans = df[signal_cols].gt(0).all(axis=1)
target_df = df.loc[passes_all_scans & df["n_snvs"].gt(10)]
target_df.head()

Unnamed: 0,chrom,start,stop,n_fst,n_xpehh,n_anc,n_d,n_snvs
1822,NC_067199.1,28476500,28813500,41,62,6,9,25584
1989,NC_067200.1,9773000,10110000,12,2,3,7,22658
1990,NC_067200.1,9941500,10278500,34,72,5,19,22503
1991,NC_067200.1,10110000,10447000,33,70,3,18,22587


In [69]:
# Display the complete table of candidate windows (not only the first rows).
target_df

Unnamed: 0,chrom,start,stop,n_fst,n_xpehh,n_anc,n_d,n_snvs
1822,NC_067199.1,28476500,28813500,41,62,6,9,25584
1989,NC_067200.1,9773000,10110000,12,2,3,7,22658
1990,NC_067200.1,9941500,10278500,34,72,5,19,22503
1991,NC_067200.1,10110000,10447000,33,70,3,18,22587


In [70]:
# Collapse the candidate windows into loci: windows on the same chromosome are
# merged when the next window starts no more than X bp after the end of the
# locus built so far (overlapping windows therefore always merge).
X = 10_000

# Sorting by chromosome and start lets a single pass over the rows suffice.
target_df = target_df.sort_values(by=['chrom', 'start']).reset_index(drop=True)
merged_loci = []

# None until the first window is seen, so an empty `target_df` yields an empty
# result instead of raising on a missing first row.
current_locus = None

for chrom, start, stop in target_df[['chrom', 'start', 'stop']].itertuples(index=False, name=None):
    if (
        current_locus is not None
        and chrom == current_locus[0]
        and start - current_locus[2] <= X
    ):
        # Extend the current locus
        current_locus[2] = max(current_locus[2], stop)
    else:
        # Save the finished locus (if any) and start a new one
        if current_locus is not None:
            merged_loci.append(current_locus)
        current_locus = [chrom, start, stop]

# Append the last locus
if current_locus is not None:
    merged_loci.append(current_locus)

# Convert the merged loci into a DataFrame and save them as a headerless,
# tab-separated BED file.
merged_df = pd.DataFrame(merged_loci, columns=['chrom', 'start', 'stop'])
merged_df.to_csv("merged_target_loci.bed", sep="\t", header=False, index=False)
merged_df

Unnamed: 0,chrom,start,stop
0,NC_067199.1,28476500,28813500
1,NC_067200.1,9773000,10447000


In [71]:
# Total length covered by the merged loci, in megabases.
locus_lengths = merged_df["stop"] - merged_df["start"]
locus_lengths.sum() / 1e6

1.011

In [None]:
%%bash

# Extract the variants located inside the merged target loci into
# merged_target_loci.vcf.

# vcftools expects the --bed file to begin with a header line, whereas
# merged_target_loci.bed is written without one; build a copy with a header so
# the first locus is not consumed as the header.
{
    printf 'chrom\tstart\tstop\n'
    cat merged_target_loci.bed
} >merged_target_loci.vcftools.bed

# Every option line must end with a backslash: without the one after --bed,
# `--stdout >merged_target_loci.vcf` runs as a separate (failing) command and
# leaves the output VCF empty.
conda run -n vcftools \
    vcftools \
        --vcf ../filter_genotypes/sorted_annotated_snps.vcf \
        --recode \
        --recode-INFO-all \
        --bed merged_target_loci.vcftools.bed \
        --stdout >merged_target_loci.vcf