# Import libraries

In [1]:
import os
import shutil
import subprocess

import numpy as np
import pandas as pd

from helpers import parse_variables

# Load simulation parameters

In [2]:
# Simulation settings parsed from the run's parameter file. The mapping is
# bound to `params` (not `dict`) so the built-in `dict` type stays usable for
# the rest of the kernel session.
params = parse_variables('geno_simulation.txt')
G = int(params['G'])
L = int(params['L'])
c = int(params['c'])
k = int(params['k'])
M = float(params['M'])
HWE = int(params['HWE'])

nr_humans = int(params['nr_humans'])
nr_snps = int(params['nr_snps'])
bottleneck_nr = int(params['bottleneck_nr'])

# Allele-frequency thresholds (lower / upper bound) of each frequency class;
# they are part of the genotype file names read below.
very_rare_threshold_L = float(params['very_rare_threshold_L'])
very_rare_threshold_H = float(params['very_rare_threshold_H'])

rare_threshold_L = float(params['rare_threshold_L'])
rare_threshold_H = float(params['rare_threshold_H'])

common_threshold_L = float(params['common_threshold_L'])
common_threshold_H = float(params['common_threshold_H'])

# True division: this is a float even when G*L is even.
number_of_snps = (G*L)/2 # one locus per chromosome
number_of_individuals = c*k*k

# All genotype pickles of this run live in one directory derived from the
# simulation parameters.
genotype_dir = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype"

very_rare = pd.read_pickle(f"{genotype_dir}/01_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare = pd.read_pickle(f"{genotype_dir}/01_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common = pd.read_pickle(f"{genotype_dir}/01_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")

# Tag every column label with its frequency class (VR / R / C) and halve the
# values. Column labels must be strings for the prefixing to work.
very_rare = very_rare.rename(columns=lambda x: 'VR' + x)/2
rare = rare.rename(columns=lambda x: 'R' + x)/2
common = common.rename(columns=lambda x: 'C' + x)/2

# Combine the three classes column-wise and rescale the halved values with
# x -> 2*x - 1.
# NOTE(review): presumably the stored genotypes are coded 0/1/2, which would
# make `complete` -1/0/1 -- confirm against the simulation step.
complete = pd.concat([common, rare, very_rare], axis=1)
complete = ((complete*2)-1)

In [3]:
# Locate the pickle whose third underscore-separated name field equals the
# configured bottleneck number, read the elapsed time encoded in its fourth
# field, and load it.
path_bottle = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/phenotype/abyss_bottleneck"

matching_files = []
for candidate in os.listdir(path_bottle):
    if int(candidate.split("_")[2]) == bottleneck_nr:
        matching_files.append(candidate)

# First match wins; an IndexError here means no file matched `bottleneck_nr`.
bottle_file = matching_files[0]

seconds_field = bottle_file.split('_')[3]
elapsed_time_bottleneck = float(seconds_field.split('seconds')[0])

bottle = pd.read_pickle(f"{path_bottle}/{bottle_file}")

In [4]:
# Show the loaded bottleneck frame (dim* columns plus a `cluster` column,
# which the per-population loading loop filters on).
bottle

Unnamed: 0,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,dim10,...,dim56,dim57,dim58,dim59,dim60,dim61,dim62,dim63,dim64,cluster
0,0.438031,0.219851,0.417390,-0.087233,0.636523,0.411987,0.439273,0.587137,0.635839,0.678626,...,0.231992,0.773297,0.311669,0.631210,0.449041,0.075500,0.492948,0.245609,0.241071,0
1,0.394187,0.238746,0.366791,-0.018373,0.628212,0.375899,0.439879,0.564070,0.630235,0.620578,...,0.290926,0.743057,0.326274,0.627469,0.428575,0.084482,0.444983,0.267797,0.252006,0
2,0.415530,0.248868,0.398593,-0.041195,0.605459,0.401057,0.404802,0.574336,0.607739,0.615653,...,0.288881,0.731947,0.327299,0.619902,0.441620,0.071646,0.458034,0.259421,0.266945,0
3,0.432265,0.208079,0.385075,-0.070840,0.659129,0.396468,0.463096,0.592612,0.659859,0.672706,...,0.248439,0.749525,0.340455,0.621179,0.461796,0.090048,0.483524,0.285094,0.255872,0
4,0.421046,0.237361,0.380843,-0.086100,0.651207,0.386901,0.412704,0.592257,0.652908,0.644078,...,0.286236,0.743728,0.356144,0.624085,0.458687,0.052115,0.462230,0.254381,0.228417,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.449468,0.280360,0.627646,-0.137756,0.334139,0.358471,0.241586,0.670462,0.349183,0.292309,...,0.317284,0.296977,0.429530,0.258882,0.482674,0.210803,0.323687,0.298608,0.331501,0
796,0.489992,0.279948,0.619469,-0.165737,0.334048,0.401841,0.184096,0.656153,0.349172,0.318653,...,0.319594,0.286011,0.473470,0.256547,0.523683,0.160437,0.362631,0.290732,0.341743,0
797,0.465740,0.267128,0.617547,-0.167902,0.359323,0.356619,0.236413,0.675095,0.374622,0.310297,...,0.313876,0.277158,0.473220,0.234429,0.509698,0.199792,0.333904,0.291245,0.294345,0
798,0.459556,0.258761,0.627874,-0.157513,0.324786,0.363663,0.232370,0.662467,0.339631,0.298652,...,0.295846,0.274634,0.446936,0.239383,0.491439,0.207320,0.334245,0.299045,0.337022,0


In [5]:
# Root directory whose entries are iterated as populations further down; each
# entry holds the pickled q2 / p2 / 2pq estimate files of that population.
# Note that the path already ends with a slash.
path_pops_estimated_lds = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/LD_blocks_estimated_mafs/"

In [6]:
# Quick check: display the product of the L and G simulation parameters.
L*G

10000

In [7]:
def _load_pop_estimates(pop_dir, kind, columns, elapsed):
    """Load all estimate files of one kind for a single population.

    Parameters
    ----------
    pop_dir : str
        Directory holding the pickled estimate files of one population.
    kind : str
        Tag expected in the 7th underscore-separated field of the file name
        ('q2', 'p2' or '2pq').
    columns : list
        Column labels, in the order the combined frame is restricted to.
    elapsed : float
        Running total of seconds accumulated so far for this kind.

    Returns
    -------
    tuple
        ``(frame, elapsed)``: the files concatenated column-wise and
        reordered to ``columns``, and the running total increased by the
        seconds encoded in each file name.

    Raises
    ------
    ValueError
        If ``pop_dir`` contains no file of the requested kind.
    """
    files = []
    for name in os.listdir(pop_dir):
        fields = name.split("_")
        # Skip anything that does not follow the naming scheme instead of
        # failing with an IndexError on a stray file.
        if len(fields) > 6 and fields[6] == kind:
            files.append(name)
    if not files:
        raise ValueError(f"No '{kind}' estimate files found in {pop_dir}")

    # Order numerically by the leading integer field of the file name.
    files.sort(key=lambda name: int(name.split("_")[0]))

    blocks = []
    for name in files:
        # The run time is encoded between '_pop_' and 'seconds' in the name.
        elapsed += float(name.split("_pop_")[1].split("seconds")[0])
        blocks.append(pd.read_pickle(os.path.join(pop_dir, name)))

    return pd.concat(blocks, axis=1)[columns], elapsed


q2s_pop = []
twopqs_pop = []
p2s_pop = []

time_q2 = 0.0
time_p2 = 0.0
time_2pq = 0.0

# Every combined frame is restricted to, and ordered like, the genotype matrix.
expected_columns = list(complete.columns)

# os.listdir returns entries in arbitrary order and also lists stray files
# (e.g. .DS_Store); keep directories only and sort them so the accumulated
# times, which end up in the output file names, are reproducible.
pop_names = sorted(
    entry for entry in os.listdir(path_pops_estimated_lds)
    if os.path.isdir(os.path.join(path_pops_estimated_lds, entry))
)
if not pop_names:
    raise ValueError(f"No population directories found in {path_pops_estimated_lds}")

for pop in pop_names:
    print(pop)
    # Rows of `bottle` assigned to this population; not used further in this
    # cell. int(pop) also enforces that directory names are cluster numbers.
    bottle_index = bottle[bottle['cluster']==int(pop)]
    path_estimated_lds = os.path.join(path_pops_estimated_lds, pop)

    q2s, time_q2 = _load_pop_estimates(path_estimated_lds, 'q2', expected_columns, time_q2)
    q2s_pop.append(q2s)

    p2s, time_p2 = _load_pop_estimates(path_estimated_lds, 'p2', expected_columns, time_p2)
    p2s_pop.append(p2s)

    twopqs, time_2pq = _load_pop_estimates(path_estimated_lds, '2pq', expected_columns, time_2pq)
    twopqs_pop.append(twopqs)


# Stack the populations row-wise and restore index order.
q2s = pd.concat(q2s_pop, axis=0).sort_index()
p2s = pd.concat(p2s_pop, axis=0).sort_index()
twopqs = pd.concat(twopqs_pop, axis=0).sort_index()

0


In [10]:
# Directory that receives the combined estimate pickles written at the end of
# the notebook. Note that the path already ends with a slash.
path_output = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/"

In [11]:
# Remove results of a previous run (every entry of the output directory whose
# name starts with "estimated") so stale files with a different elapsed-time
# suffix do not linger next to the new ones. Done in pure Python rather than
# `rm -rf` through a shell: no unquoted path interpolation, portable, and
# failures raise instead of being reported only as a return code.
if os.path.isdir(path_output):
    for entry in os.listdir(path_output):
        if not entry.startswith("estimated"):
            continue
        stale_path = os.path.join(path_output, entry)
        if os.path.isdir(stale_path) and not os.path.islink(stale_path):
            shutil.rmtree(stale_path)
        else:
            os.remove(stale_path)

0

In [12]:
# Persist the three combined frames; each file name carries the total number
# of seconds accumulated for that estimate kind.
q2s_path = f"{path_output}/estimated_q2s_via_esti_pop_{time_q2}seconds.pkl"
p2s_path = f"{path_output}/estimated_p2s_via_esti_pop_{time_p2}seconds.pkl"
twopqs_path = f"{path_output}/estimated_2pqs_via_esti_pop_{time_2pq}seconds.pkl"

q2s.to_pickle(q2s_path)
p2s.to_pickle(p2s_path)
twopqs.to_pickle(twopqs_path)