# Import libraries

In [7]:
import subprocess
import os
from helpers import parse_variables
import pandas as pd
import numpy as np

# Load simulation parameters

In [8]:
# Simulation settings parsed from the parameter file.
# NOTE(review): the name `dict` shadows the builtin; it is left unchanged here
# because it is a module-level name that other cells may read.
dict = parse_variables('geno_simulation.txt')

# Values that are always taken from the parameter file.
G = int(dict['G'])
L = int(dict['L'])
c = int(dict['c'])

# k and M keep a value that already exists in the global namespace and fall
# back to the parameter file otherwise (presumably so a driver can preset them).
k = k if 'k' in globals() else int(dict['k'])
M = M if 'M' in globals() else float(dict['M'])
HWE = int(dict['HWE'])

nr_humans = int(dict['nr_humans'])
nr_snps = int(dict['nr_snps'])
bottleneck_nr = int(dict['bottleneck_nr'])

# Correction tools and scenarios: keep a preset list, otherwise use the defaults.
tools = tools if 'tools' in globals() else ['PCA', 'abyss_counted', 'abyss', 'no_corr']

scenarios = scenarios if 'scenarios' in globals() else [
    'snp_effect',
    'linear_continuous',
    'non_linear_continuous',
    'discrete_global',
    'discrete_localized',
    'mix_linear_continuous',
    'mix_non_linear_continuous',
    'mix_discrete_global',
    'mix_discrete_localized',
]

# Allele-frequency thresholds: same "preset wins, file is the fallback" rule.
very_rare_threshold_L = (very_rare_threshold_L if 'very_rare_threshold_L' in globals()
                         else float(dict['very_rare_threshold_L']))
very_rare_threshold_H = (very_rare_threshold_H if 'very_rare_threshold_H' in globals()
                         else float(dict['very_rare_threshold_H']))
rare_threshold_L = (rare_threshold_L if 'rare_threshold_L' in globals()
                    else float(dict['rare_threshold_L']))
rare_threshold_H = (rare_threshold_H if 'rare_threshold_H' in globals()
                    else float(dict['rare_threshold_H']))
common_threshold_L = (common_threshold_L if 'common_threshold_L' in globals()
                      else float(dict['common_threshold_L']))
common_threshold_H = (common_threshold_H if 'common_threshold_H' in globals()
                      else float(dict['common_threshold_H']))

# Derived sizes.  Original author's note on the SNP count: one loci per chromosome.
number_of_snps = G * L / 2
number_of_individuals = c * k * k

# Genotype tables for the three allele-frequency classes.
very_rare = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/01_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/01_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/01_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")

# Column-wise union (common, rare, very rare), then the affine map x -> 2*x - 1.
complete = pd.concat([common, rare, very_rare], axis=1) * 2 - 1

In [9]:
def _accumulate_estimates(block_dir, file_names, kind, columns, elapsed):
    """Load and column-concatenate one estimate kind for a single population.

    Parameters
    ----------
    block_dir : str
        Directory that contains the population's per-block pickles.
    file_names : list of str
        Entry names found in ``block_dir``.  Each name is split on ``"_"``:
        field 0 must parse as an integer (used as the sort key) and field 6
        names the estimate kind.
    kind : str
        Estimate kind to select (``'q2'``, ``'p2'`` or ``'2pq'``).
    columns : list
        Column labels to select, in this order, from the concatenated frame.
    elapsed : float
        Running total of seconds for this kind.  The number found between
        ``"_pop_"`` and ``"seconds"`` in each selected file name is added to
        it, one file at a time, so the summation order matches a plain loop.

    Returns
    -------
    tuple
        ``(frame, elapsed)``: the concatenated frame restricted to ``columns``
        and the updated running total.

    Raises
    ------
    IndexError, ValueError
        If a file name does not follow the layout described above, or if no
        file of the requested kind exists (``pd.concat`` of an empty list).
    """
    selected = sorted(
        (name for name in file_names if name.split("_")[6] == kind),
        key=lambda name: int(name.split('_')[0]),
    )

    frames = []
    for name in selected:
        elapsed += float(name.split('_pop_')[1].split("seconds")[0])
        frames.append(pd.read_pickle(os.path.join(block_dir, name)))

    merged = pd.concat(frames, axis=1)
    return merged[columns], elapsed


def _remove_stale_estimates(directory, prefix="estimated"):
    """Delete every entry of ``directory`` whose name starts with ``prefix``.

    In-process replacement for ``rm -rf <directory>/estimated*``: no shell is
    involved, so the path is never re-interpreted, and a failed deletion raises
    ``OSError`` instead of being silently ignored.
    """
    import shutil  # local import: shutil is not among the file's visible imports

    for name in os.listdir(directory):
        if not name.startswith(prefix):
            continue
        target = os.path.join(directory, name)
        # Real directories need a recursive delete; files and symlinks
        # (including symlinks to directories) are unlinked.
        if os.path.isdir(target) and not os.path.islink(target):
            shutil.rmtree(target)
        else:
            os.remove(target)


if 'abyss' in tools:
    # Bottleneck output selected by the integer in the third "_"-separated
    # field of its file name; the fourth field carries "<seconds>seconds".
    path_bottle = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/phenotype/abyss_bottleneck"
    bottle_file = [f for f in os.listdir(path_bottle) if int(f.split("_")[2]) == bottleneck_nr][0]
    elapsed_time_bottleneck = float(bottle_file.split('_')[3].split('seconds')[0])
    bottle = pd.read_pickle(f"{path_bottle}/{bottle_file}")

    path_pops_estimated_lds = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/LD_blocks_estimated_mafs/"

    # Every per-population frame is re-ordered to the column order of `complete`.
    snp_order = list(complete.columns)

    q2s_pop = []
    twopqs_pop = []
    p2s_pop = []

    # Seconds summed over all populations and blocks, one total per kind.
    time_q2 = 0.0
    time_p2 = 0.0
    time_2pq = 0.0

    for pop in os.listdir(path_pops_estimated_lds):
        # Rows of the bottleneck table for this population.  Not used further
        # in this cell; kept because it is a module-level name and because
        # int(pop) rejects non-numeric directory entries.
        bottle_index = bottle[bottle['cluster'] == int(pop)]

        path_estimated_lds = os.path.join(path_pops_estimated_lds, pop)
        block_files = os.listdir(path_estimated_lds)  # listed once, shared by all kinds

        q2s, time_q2 = _accumulate_estimates(path_estimated_lds, block_files, 'q2', snp_order, time_q2)
        q2s_pop.append(q2s)

        p2s, time_p2 = _accumulate_estimates(path_estimated_lds, block_files, 'p2', snp_order, time_p2)
        p2s_pop.append(p2s)

        twopqs, time_2pq = _accumulate_estimates(path_estimated_lds, block_files, '2pq', snp_order, time_2pq)
        twopqs_pop.append(twopqs)

    # Stack the populations row-wise and restore index order.
    q2s = pd.concat(q2s_pop, axis=0).sort_index()
    p2s = pd.concat(p2s_pop, axis=0).sort_index()
    twopqs = pd.concat(twopqs_pop, axis=0).sort_index()

    path_output = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/"
    # Output names embed the timing totals, so results of an earlier run would
    # not be overwritten; remove them before writing the new ones.
    _remove_stale_estimates(path_output)
    q2s.to_pickle(os.path.join(path_output, f"estimated_q2s_via_esti_pop_{time_q2}seconds.pkl"))
    p2s.to_pickle(os.path.join(path_output, f"estimated_p2s_via_esti_pop_{time_p2}seconds.pkl"))
    twopqs.to_pickle(os.path.join(path_output, f"estimated_2pqs_via_esti_pop_{time_2pq}seconds.pkl"))