# Import libraries

In [1]:
import subprocess
import os
from helpers import parse_variables
import pandas as pd
import numpy as np

# Load simulation parameters

In [2]:
# Parse the simulation settings. The result is bound to `params` rather than
# `dict` so the built-in `dict` type stays usable in the rest of the notebook.
params = parse_variables('geno_simulation.txt')
G = int(params['G'])
L = int(params['L'])
c = int(params['c'])
k = int(params['k'])
M = float(params['M'])
HWE = int(params['HWE'])

nr_humans = int(params['nr_humans'])
nr_snps = int(params['nr_snps'])
bottleneck_nr = int(params['bottleneck_nr'])

# Allele-frequency thresholds (lower/upper bound per frequency class); they
# are only used here to rebuild the names of the genotype pickles.
very_rare_threshold_L = float(params['very_rare_threshold_L'])
very_rare_threshold_H = float(params['very_rare_threshold_H'])

rare_threshold_L = float(params['rare_threshold_L'])
rare_threshold_H = float(params['rare_threshold_H'])

common_threshold_L = float(params['common_threshold_L'])
common_threshold_H = float(params['common_threshold_H'])

number_of_snps = (G*L)/2 # one locus per chromosome
number_of_individuals = c*k*k

# All three genotype tables live in the same simulation-specific directory.
genotype_dir = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype"

# NOTE(review): pd.read_pickle can execute arbitrary code; only load files
# produced by this pipeline.
very_rare = pd.read_pickle(f"{genotype_dir}/01_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare = pd.read_pickle(f"{genotype_dir}/01_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common = pd.read_pickle(f"{genotype_dir}/01_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")

# Prefix every column label with its frequency class (VR / R / C) so labels
# stay distinguishable after the tables are joined, and halve the values.
very_rare = very_rare.rename(columns=lambda x: 'VR' + x)/2
rare = rare.rename(columns=lambda x: 'R' + x)/2
common = common.rename(columns=lambda x: 'C' + x)/2

# Column order of `complete` (common, rare, very rare) is the reference order
# that the estimated tables are re-indexed to further down.
complete = pd.concat([common, rare, very_rare], axis=1)
# Map the halved values v to 2*v - 1.
# presumably the raw genotypes are coded 0/1/2, giving -1/0/1 here - TODO confirm
complete = ((complete*2)-1)

In [3]:
# Locate the bottleneck pickle whose third "_"-separated file-name field
# equals the configured bottleneck number, then load it.
path_bottle = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/phenotype/abyss_bottleneck"
matching_bottle_files = [
    file_name
    for file_name in os.listdir(path_bottle)
    if int(file_name.split("_")[2]) == bottleneck_nr
]
# The first match is used; an IndexError here means no file matched.
bottle_file = matching_bottle_files[0]

# The fourth file-name field carries a number followed by the text "seconds".
timing_field = bottle_file.split('_')[3]
elapsed_time_bottleneck = float(timing_field.split('seconds')[0])

bottle = pd.read_pickle(f"{path_bottle}/{bottle_file}")

In [4]:
# Display the table that was loaded from the selected bottleneck pickle.
bottle

Unnamed: 0,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,dim10,...,dim56,dim57,dim58,dim59,dim60,dim61,dim62,dim63,dim64,cluster
0,0.464010,0.277659,0.184471,0.145874,0.548457,0.551377,0.601435,0.237088,0.211788,0.251649,...,0.503871,0.185923,0.470348,0.157695,0.109228,0.581360,0.740218,0.520439,0.250200,3
1,0.376607,0.282675,0.240265,0.162393,0.515220,0.512761,0.543219,0.266610,0.217731,0.293146,...,0.540333,0.183413,0.505353,0.150232,0.120217,0.548561,0.719707,0.553534,0.311406,3
2,0.393871,0.293032,0.258735,0.195829,0.520866,0.512252,0.557425,0.227262,0.259624,0.260234,...,0.516241,0.176218,0.476512,0.173226,0.146574,0.593437,0.708354,0.527019,0.303865,3
3,0.391297,0.288184,0.225150,0.160709,0.544529,0.491067,0.542282,0.231822,0.205366,0.310358,...,0.551153,0.154091,0.507891,0.172045,0.083044,0.639686,0.760948,0.562117,0.281420,3
4,0.389418,0.286322,0.222184,0.166057,0.540349,0.513667,0.540740,0.249134,0.223207,0.274226,...,0.508267,0.170535,0.497345,0.129209,0.124610,0.587048,0.701011,0.550922,0.301730,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.135528,0.217973,0.285638,0.219283,0.630403,0.264012,0.278632,0.166489,0.272766,0.138688,...,0.257950,0.169351,0.604665,0.033925,0.154283,0.564271,0.275137,0.725359,0.654545,0
796,0.152436,0.259799,0.264204,0.168112,0.665685,0.271778,0.317010,0.158502,0.236699,0.131330,...,0.219169,0.096814,0.582710,0.046724,0.094563,0.588377,0.279054,0.684996,0.649056,0
797,0.148816,0.287756,0.285579,0.167949,0.621246,0.303736,0.322631,0.228928,0.244147,0.173931,...,0.266793,0.122937,0.583326,0.036712,0.135711,0.540060,0.304811,0.672534,0.645869,0
798,0.142888,0.250974,0.280890,0.166238,0.625920,0.264907,0.305856,0.211439,0.228303,0.177745,...,0.264104,0.137100,0.577952,0.060120,0.119170,0.529136,0.294351,0.675257,0.653216,0


In [5]:
# Root directory that is listed below with os.listdir; each entry is treated
# as one population's sub-directory of estimate pickles.
path_pops_estimated_lds = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/LD_blocks_estimated_mafs/"

In [6]:
# Quick look at the product of the two simulation parameters L and G.
L*G

4000

In [7]:
def _load_population_component(pop_dir, file_names, component, columns):
    """Load one component's pickles for a single population directory.

    Parameters
    ----------
    pop_dir : str
        Directory containing the pickles of one population.
    file_names : list of str
        Names of the files inside ``pop_dir``.
    component : str
        Tag matched against the 7th "_"-separated field of each file name
        ('q2', 'p2' or '2pq').
    columns : list
        Column labels, in the order the returned frame must have.

    Returns
    -------
    pandas.DataFrame
        The matching pickles concatenated column-wise and re-ordered to
        ``columns``.

    Raises
    ------
    IndexError
        If a file name has fewer than seven "_"-separated fields.
    ValueError
        If no file matches ``component`` (nothing to concatenate).
    KeyError
        If a label in ``columns`` is missing from the loaded pickles.
    """
    frames = [
        pd.read_pickle(os.path.join(pop_dir, file_name))
        for file_name in file_names
        if file_name.split("_")[6] == component
    ]
    return pd.concat(frames, axis=1)[columns]


# Every estimate table is aligned to the column order of `complete`.
snp_columns = list(complete.columns)

q2s_pop = []
twopqs_pop = []
p2s_pop = []

for pop in os.listdir(path_pops_estimated_lds):
    print(pop)
    pop_dir = os.path.join(path_pops_estimated_lds, pop)
    # List the directory once and reuse the listing for all three components.
    pop_files = os.listdir(pop_dir)

    q2s_pop.append(_load_population_component(pop_dir, pop_files, 'q2', snp_columns))
    p2s_pop.append(_load_population_component(pop_dir, pop_files, 'p2', snp_columns))
    twopqs_pop.append(_load_population_component(pop_dir, pop_files, '2pq', snp_columns))

# Stack the per-population tables row-wise and restore index order, since
# os.listdir returns the population directories in arbitrary order.
q2s = pd.concat(q2s_pop, axis=0).sort_index()

# p2s is built from the p2 tables (it was previously overwritten with q2s).
p2s = pd.concat(p2s_pop, axis=0).sort_index()

twopqs = pd.concat(twopqs_pop, axis=0).sort_index()

0
2
1
3


In [12]:
# Inspect the q2 table after the per-population tables were concatenated
# row-wise and sorted by index.
q2s

snps,CV4_AF_0.22875,CV16_AF_0.22875,CV26_AF_0.2225,CV33_AF_0.240625,CV38_AF_0.23875,CV54_AF_0.235,CV80_AF_0.246875,CV84_AF_0.229375,CV104_AF_0.258125,CV115_AF_0.255,...,VRV3940_AF_0.03,VRV3944_AF_0.04,VRV3959_AF_0.025625,VRV3964_AF_0.031875,VRV3967_AF_0.045625,VRV3970_AF_0.024375,VRV3975_AF_0.0475,VRV3977_AF_0.015625,VRV3993_AF_0.048125,VRV3996_AF_0.044375
0,0.014271,0.014592,0.029550,0.034243,0.018173,0.046397,0.016462,0.017469,0.033872,0.027320,...,3.955295e-17,-1.686411e-15,-1.011773e-18,-4.921388e-05,2.543126e-02,1.357086e-14,-6.684624e-09,1.965009e-14,2.334227e-15,3.055620e-19
1,0.023445,0.025912,0.047383,0.061704,0.031455,0.063585,0.024968,0.031815,0.051659,0.032184,...,1.601003e-16,-1.464308e-15,-1.376567e-18,-8.203820e-04,2.208706e-02,1.381569e-14,-6.475656e-09,2.061817e-14,2.536977e-15,2.058515e-19
2,0.016011,0.017008,0.034687,0.040875,0.022955,0.048773,0.018289,0.021450,0.041303,0.031475,...,9.402818e-17,-1.596926e-15,-9.555338e-19,1.768616e-03,2.316612e-02,1.381995e-14,-6.009951e-09,2.167808e-14,2.546029e-15,-1.000362e-18
3,0.013269,0.014264,0.029788,0.033324,0.014001,0.044094,0.015770,0.019457,0.028835,0.026863,...,-3.366632e-18,-1.678477e-15,-1.018054e-18,7.430175e-04,2.496951e-02,1.328547e-14,-6.313709e-09,1.999178e-14,2.332877e-15,1.242452e-19
4,0.016413,0.017356,0.034581,0.041470,0.022007,0.049930,0.018568,0.022276,0.038899,0.028858,...,7.824965e-17,-1.597253e-15,-1.037111e-18,1.621107e-05,2.409840e-02,1.365695e-14,-6.226841e-09,2.008273e-14,2.423282e-15,-2.423554e-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.124070,0.057131,0.008094,0.014063,0.063869,0.014149,0.076561,0.081853,-0.001812,0.009269,...,-1.094148e-18,-5.828617e-03,4.400436e-03,-1.697953e-13,3.439846e-10,-7.429821e-03,-3.570586e-13,2.955785e-15,4.145032e-12,2.243222e-03
796,0.131113,0.061693,0.012852,0.013656,0.070904,0.018602,0.086037,0.089579,-0.003086,0.005345,...,4.510434e-19,-5.117491e-03,2.947031e-03,-2.005152e-13,3.333308e-10,-6.823877e-03,-3.642108e-13,2.796307e-15,4.130824e-12,2.598950e-03
797,0.129081,0.067849,0.025883,0.027595,0.085930,0.037991,0.102627,0.110668,-0.001743,0.004967,...,-2.253343e-18,-2.463982e-03,1.780010e-03,-2.066078e-13,3.178018e-10,-1.481399e-04,-3.589041e-13,2.294587e-15,3.753056e-12,3.964207e-03
798,0.135853,0.067896,0.021618,0.025640,0.086467,0.033395,0.096864,0.104987,-0.002019,0.003154,...,-6.761052e-19,-3.598383e-03,1.610521e-03,-2.043024e-13,3.244557e-10,-2.639154e-03,-3.503482e-13,2.788510e-15,4.037347e-12,4.398942e-03


In [None]:
# Directory used for the result pickles. Note that this is the same
# "genotype" directory the input genotype pickles are read from.
path_output = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/"

In [10]:
# Ensure the output directory exists without deleting anything: path_output
# is the genotype directory itself, which also holds the input genotype
# pickles and the LD_blocks_estimated_mafs tree, so it must not be wiped.
os.makedirs(path_output, exist_ok=True)

0

In [11]:
# Persist the three combined estimate tables, one pickle per table.
estimates_by_path = {
    f"{path_output}/estimated_q2s_via_esti_pop.pkl": q2s,
    f"{path_output}/estimated_p2s_via_esti_pop.pkl": p2s,
    f"{path_output}/estimated_2pqs_via_esti_pop.pkl": twopqs,
}
for pickle_path, estimate in estimates_by_path.items():
    estimate.to_pickle(pickle_path)