# Import libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
import tensorflow as tf

from helpers import parse_variables, get_risk_level, map_to_color, lin_reg, simulate_quant_trait
from models import no_corr, rare_pc, pc, gc, abyss_bottle_linreg, abyss_maf_linreg
from deep_learning_models import abyss, deep_abyss

import warnings
from scipy.stats import t
from scipy import stats

warnings.filterwarnings("ignore")

2024-06-28 14:11:04.858123: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-28 14:11:04.906202: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-28 14:11:04.906258: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-28 14:11:04.907449: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-28 14:11:04.916202: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-28 14:11:04.917471: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
naming_dict = {
    "no_risk": "no environmental risk",
    "NW_risk": "Smooth linear North-West environmental risk",
    "N_risk" : "Smooth linear North environmental risk",
    "blob_risk": "Localised big blob risk",
    "center_risk": "Localised big central risk",
    "big_square_risk": "big square risk",
    "square_risk" : "Tiny square risk",
    'hi_square_risk' : "Tiny square risk",
    "hi_gauss_blob_risk": "Global Gaussian Risk",
    "two_square_risk": "Two tiny risks"
}

# Load genotype

In [3]:
dict = parse_variables('geno_simulation.txt')
G = int(dict['G'])
L = int(dict['L'])
c = int(dict['c'])
k = int(dict['k'])
M = float(dict['M'])

# Thresholds
very_rare_threshold_L = float(dict['very_rare_threshold_L'])
very_rare_threshold_H = float(dict['very_rare_threshold_H'])

rare_threshold_L = float(dict['rare_threshold_L'])
rare_threshold_H = float(dict['rare_threshold_H'])

common_threshold_L = float(dict['common_threshold_L'])
common_threshold_H = float(dict['common_threshold_H'])

number_of_snps = (G*L)/2 # one loci per chromosome
number_of_individuals = c*k*k

In [4]:
risk_level = get_risk_level()
risk_level = risk_level.split("\n")[-1]
name_risk = risk_level.split('_fun')[0]
name_risk

'NW_risk'

In [5]:
nr_common_PCs = 5
pc_columns = ['PC{}'.format(i) for i in range(1, nr_common_PCs+1)]
nr_rare_PCs = 5
rare_pc_columns = ['PC{}'.format(i) for i in range(1, nr_rare_PCs+1)]

In [6]:
very_rare = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
rare = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl")
common = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/genotype/02_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")

In [7]:
very_rare = very_rare.rename(columns=lambda x: 'VR' + x)/2
rare = rare.rename(columns=lambda x: 'R' + x)/2
common = common.rename(columns=lambda x: 'C' + x)/2
complete = pd.concat([common, rare, very_rare], axis=1)

# Load environmental risk

In [8]:
risk = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/phenotype/environmental_risks/risk_{name_risk}.pkl")

# Create phenotype only ENV dependent

In [9]:
mu= np.zeros(complete.shape[0])
beta = np.zeros(complete.shape[1])
y = np.array(simulate_quant_trait(mu, np.array(complete), beta, np.array(risk[name_risk])))

# Analysis

In [10]:
# No correction
df_no_corr = no_corr(complete, y)

In [11]:
# rare PCA
PC_veryrare = pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/phenotype/PCs/veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl")
df_rare_PCs = rare_pc(complete, y , PC_veryrare, rare_pc_columns)

In [12]:
# PCA
PC_common= pd.read_pickle(f"data/G{G}_L{L}_c{c}_k{k}_M{M}/phenotype/PCs/common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl")
df_PCs = rare_pc(complete, y , PC_common, pc_columns)

In [13]:
# Genomic control
df_GC = gc(df_no_corr)

In [17]:
# minimalist abyss
bottleneck_nr = 2
epoch = 5
patience = 2
dim_columns = ['dim{}'.format(i) for i in range(1, bottleneck_nr+1)]
autoencoder, bottleneck_model, history = abyss(complete*2 - 1, bottleneck_nr, epoch, patience)

abyss_bottle = bottleneck_model(tf.convert_to_tensor(complete*2 - 1, dtype=tf.float32))
abyss_MAF = autoencoder(tf.convert_to_tensor(complete*2 - 1, dtype=tf.float32))
probmaf = (pd.DataFrame(data=abyss_MAF, columns = complete.columns)+1)/2

df_abyss_bottle = abyss_bottle_linreg(complete, y, abyss_bottle)
df_abyss_maf = abyss_maf_linreg(complete, y, probmaf)

In [23]:
# deep abyss
def deep_abyss(geno, bottle, epoch, patience, pheno):
    dim_columns = ['dim{}'.format(i) for i in range(1, bottle+1)]
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test, pheno_train, pheno_test = train_test_split(geno, geno, pheno, test_size=0.2, random_state=42)

    # Define your regularization strength (lambda)
    l2_lambda = 0.001  # Adjust this value as needed

    # Define input layers
    input_shape_geno = geno.shape[1:]
    input_layer_geno = Input(shape=input_shape_geno, name='input_geno')

    input_shape_pheno = pheno.shape[1:]
    input_layer_pheno = Input(shape=input_shape_pheno, name='input_pheno')

    # Define bottleneck size

    size_layer_1 = int(round(input_shape_geno[0]) / 2)

    # Create layers
    encoder_init_1 = layers.Dense(bottle, 
                           activation="elu", 
                           name="encoder_init_1",
                           kernel_regularizer=regularizers.l2(l2_lambda))
    
    decoder_init_2 = layers.Dense(input_shape_geno[0], 
                           activation="elu", 
                           name="decoder_init_2",
                           kernel_regularizer=regularizers.l2(l2_lambda))
    
    predictor = layers.Dense(input_shape_pheno[0], 
                           activation="linear", 
                           name="predictor",
                           kernel_regularizer=regularizers.l2(l2_lambda))

    # Define custom layer for element-wise trainable weights
    class ElementWiseWeightsLayer(tf.keras.layers.Layer):
        def __init__(self, **kwargs):
            super(ElementWiseWeightsLayer, self).__init__(**kwargs)
    
        def build(self, input_shape):
            self.weight = self.add_weight(shape=(), initializer="ones", trainable=True, name="element_wise_weight")
            super(ElementWiseWeightsLayer, self).build(input_shape)
    
        def call(self, inputs):
            return inputs * self.weight
    
    # Define encoder and decoder paths
    bottle_neck = encoder_init_1(input_layer_geno)
    allele_frequency_probability = decoder_init_2(bottle_neck)
    y_predictor = predictor(allele_frequency_probability)
    
    # Define the model
    autoencoder = Model(inputs=input_layer_geno, outputs=[allele_frequency_probability, y_predictor], name="fishy")
    # Extract the bottleneck layer
    bottleneck_model = tf.keras.Model(inputs=autoencoder.input, outputs=autoencoder.get_layer('encoder_init_1').output)
    
    # Compile the model
    autoencoder.compile(optimizer='adam', loss=['mse', 'mse'], loss_weights=[1.0, 2.0])
    
    # Define early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    
    # Train the model
    history = autoencoder.fit(X_train, [X_train, pheno_train], epochs=epochs, batch_size=32, validation_data=(X_test, [X_test, pheno_test]), callbacks=[early_stopping], verbose=0)
    
    # Evaluate the model
    evaluation = autoencoder.evaluate(X_test, [y_test, pheno_test])
    
    # Predict outputs
    allele_frequency_output, y_output = autoencoder.predict(geno)
    
    # Extract encoded outputs from the encoder (bottleneck layer)
    encoded_output = bottleneck_model.predict(geno)


Unnamed: 0,coeff,AFs,Ps_abyss_maf,expected_P,logPs_abyss_maf,expected_logP
0,-0.064727,0.296425,7.446390e-08,0.001159,0.000383,-0.000000
1,0.007621,0.21885,7.105215e-06,0.002317,0.001704,0.000504
2,-0.000402,0.21275,1.744526e-05,0.003476,0.001933,0.001008
3,-0.017530,0.2368,2.568836e-05,0.004635,0.001950,0.001512
4,0.003380,0.217525,4.668449e-05,0.005794,0.003080,0.002018
...,...,...,...,...,...,...
858,0.059531,0.016075,9.929324e-01,0.995365,4.330827,2.237041
859,-0.170837,0.0108,9.955195e-01,0.996524,4.590264,2.333951
860,-0.158448,0.008,9.955583e-01,0.997683,4.758323,2.458890
861,-0.162501,0.010525,9.960850e-01,0.998841,5.148423,2.634981


In [26]:
deep_abyss(complete*2 - 1, bottleneck_nr, epoch, patience, risk[[name_risk]])


NameError: name 'train_test_split' is not defined