# Import libraries

In [1]:
import os
import logging

# Suppress TensorFlow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 0 = all messages are logged (default), 1 = INFO, 2 = WARNING, 3 = ERROR
logging.getLogger('tensorflow').setLevel(logging.ERROR)

import importlib.util
import shutil
import time  # Import the time module
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

from scipy.stats import t, entropy, stats

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm

import tensorflow as tf
from tensorflow.keras import regularizers, Input, Model, layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

from k_means_constrained import KMeansConstrained

from helpers import (
    parse_variables, get_risk_level, hi_gauss_blob_risk_fun, blob_risk_fun, 
    NW_risk_fun, square_risk_fun, map_to_color, simulate_quant_trait
)

from models import ols_regression, manhattan_linear, gc
from deep_learning_models import abyss, deep_abyss


In [2]:
dict = parse_variables('geno_simulation.txt')
G = int(dict['G'])
L = int(dict['L'])
c = int(dict['c'])
if 'k' not in globals():
    k = int(dict['k'])
    
if 'M' not in globals():
    M = float(dict['M'])
    
HWE = int(dict['HWE'])

nr_humans = int(dict['nr_humans'])
nr_snps = int(dict['nr_snps'])
bottleneck_nr = int(dict['bottleneck_nr'])

# Thresholds
very_rare_threshold_L = float(dict['very_rare_threshold_L'])
very_rare_threshold_H = float(dict['very_rare_threshold_H'])

rare_threshold_L = float(dict['rare_threshold_L'])
rare_threshold_H = float(dict['rare_threshold_H'])

common_threshold_L = float(dict['common_threshold_L'])
common_threshold_H = float(dict['common_threshold_H'])

number_of_snps = (G*L)/2 # one loci per chromosome
number_of_individuals = c*k*k

In [3]:
very_rare = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/01_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/01_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/01_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")

In [4]:
very_rare = very_rare.rename(columns=lambda x: 'VR' + x)/2
rare = rare.rename(columns=lambda x: 'R' + x)/2
common = common.rename(columns=lambda x: 'C' + x)/2
complete = pd.concat([common, rare, very_rare], axis=1)
complete = ((complete*2)-1)

In [5]:
path_bottle = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/phenotype/abyss_bottleneck"
bottle_file = [f for f in os.listdir(path_bottle) if int(f.split("_")[2]) ==  bottleneck_nr][0]
elapsed_time_bottleneck = float(bottle_file.split('_')[3].split('seconds')[0])
bottle = pd.read_pickle(f"{path_bottle}/{bottle_file}")

# Run Abyss on LD block

In [6]:
def maf_prediction(bottle_in, geno_out, epoch, patience):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(bottle_in, geno_out, test_size=0.2, random_state=42)
    
    # Regularization parameter
    l2_regularizer = 0.001
    
    # Original autoencoder model with L2 regularization
    decoder = tf.keras.Sequential([
        tf.keras.layers.Dense(int(nr_snps/2), activation='elu', input_shape=(bottle_in.shape[1],), kernel_regularizer=regularizers.l2(l2_regularizer)),  # First hidden layer with L2 regularization
        layers.BatchNormalization(),
        tf.keras.layers.Activation('elu'),
        tf.keras.layers.Dense(geno_out.shape[1], activation='linear', kernel_regularizer=regularizers.l2(l2_regularizer))  # Output layer
    ])
    
    # Compile the original model with L2 regularization
    decoder.compile(optimizer='adam',
                        loss='mean_squared_error',
                        metrics=['mean_absolute_error'])
    
    # Define Early Stopping
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    
    # Fit the original model with Early Stopping
    history = decoder.fit(X_train, y_train, epochs=epoch, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=0)
    
    return decoder, history

In [7]:
for pop in bottle['cluster'].unique():
    temp_bottle = bottle[bottle['cluster'] == pop]
    temp_bottle = temp_bottle.drop('cluster', axis=1)
    path_output = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/LD_blocks_estimated_mafs/{pop}"
    os.system(f"rm -rf {path_output}")
    os.makedirs(path_output, exist_ok = True)
    path_one_hot_genotype = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/LD_blocks_one_hot/{pop}"
    path_lds = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/LD_blocks/{pop}"
    ld_files = os.listdir(path_lds)
    epoch = 500
    patience = 50
    p2s = []
    twopqs = []
    q2s = []
    for ld_file in ld_files:
        db_minor = pd.read_pickle(f"{path_one_hot_genotype}/{ld_file.split('.pkl')[0]}_db_minor.pkl")
        db_het = pd.read_pickle(f"{path_one_hot_genotype}/{ld_file.split('.pkl')[0]}_db_het.pkl")
        db_major = pd.read_pickle(f"{path_one_hot_genotype}/{ld_file.split('.pkl')[0]}_db_major.pkl")
        
        start_time_p2 = time.time()
        decoder, history = maf_prediction(temp_bottle, db_major, epoch, patience)
        end_time_p2 = time.time()
        
        elapsed_time_p2 = np.round(end_time_p2 - start_time_p2,3)
        p2 = decoder(tf.convert_to_tensor(temp_bottle, dtype=tf.float32))
        p2 = pd.DataFrame(data=p2, columns = db_major.columns)
    
        p2.index = db_major.index
        p2s.append(p2)
        
        start_time_2pq = time.time()
        decoder, history = maf_prediction(temp_bottle, db_het, epoch, patience)
        end_time_2pq = time.time()
        
        elapsed_time_2pq = np.round(end_time_2pq - start_time_2pq,3)
        
        twopq = decoder(tf.convert_to_tensor(temp_bottle, dtype=tf.float32))
        twopq = pd.DataFrame(data=twopq, columns = db_het.columns)  
        twopq.index = db_het.index
        twopqs.append(twopq)
        
        start_time_q2 = time.time()
        decoder, history = maf_prediction(temp_bottle, db_minor, epoch, patience)
        end_time_q2 = time.time()
        
        elapsed_time_q2 = np.round(end_time_q2 - start_time_q2,3)
        q2 = decoder(tf.convert_to_tensor(temp_bottle, dtype=tf.float32))
        q2 = pd.DataFrame(data=q2, columns = db_minor.columns)
        
        q2.index = db_minor.index
        q2s.append(q2)
        
        path_output_global = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/LD_blocks_estimated_mafs/{pop}"
        p2.to_pickle(f"{path_output_global}/{ld_file}_esti_p2_via_esti_pop_{elapsed_time_p2}seconds.pkl")
        twopq.to_pickle(f"{path_output_global}/{ld_file}_esti_2pq_via_esti_pop_{elapsed_time_2pq}seconds.pkl")        
        q2.to_pickle(f"{path_output_global}/{ld_file}_esti_q2_via_esti_pop_{elapsed_time_q2}seconds.pkl")


In [8]:
"""
p2 = pd.concat(p2s, axis=1)
p2 = p2.sort_index()
p2 = p2[list(complete.columns)]

q2 = pd.concat(q2s, axis=1)
q2 = q2.sort_index()
q2 = q2[list(complete.columns)]

twopq = pd.concat(twopqs, axis=1)
twopq = twopq.sort_index()
twopq = twopq[list(complete.columns)]
"""

'\np2 = pd.concat(p2s, axis=1)\np2 = p2.sort_index()\np2 = p2[list(complete.columns)]\n\nq2 = pd.concat(q2s, axis=1)\nq2 = q2.sort_index()\nq2 = q2[list(complete.columns)]\n\ntwopq = pd.concat(twopqs, axis=1)\ntwopq = twopq.sort_index()\ntwopq = twopq[list(complete.columns)]\n'