# Import libraries

In [1]:
import os
import logging

# Suppress TensorFlow logging
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # 0 = all messages are logged (default), 1 = INFO, 2 = WARNING, 3 = ERROR
logging.getLogger('tensorflow').setLevel(logging.ERROR)

import random
import importlib.util

from collections import Counter
from math import floor

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

from scipy.stats import t, entropy, stats

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

import statsmodels.api as sm

import tensorflow as tf
from tensorflow.keras import regularizers, Input, Model, layers
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

from k_means_constrained import KMeansConstrained

from helpers import (
    parse_variables, get_risk_level, hi_gauss_blob_risk_fun, blob_risk_fun, 
    NW_risk_fun, square_risk_fun, map_to_color, simulate_quant_trait
)

from models import ols_regression, manhattan_linear, gc
from deep_learning_models import abyss, deep_abyss


2024-08-30 17:29:35.901522: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-30 17:29:35.904423: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-30 17:29:35.913847: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-30 17:29:35.934259: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-30 17:29:35.934311: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-30 17:29:35.946783: I tensorflow/core/platform/cpu_feature_guard.cc:

In [18]:
# Read the simulation settings from geno_simulation.txt and expose each one as
# a notebook-level variable.
# NOTE(review): `dict` shadows the builtin; the name is kept because other
# cells may read it.
dict = parse_variables('geno_simulation.txt')

# Integer-valued settings.
G, L, c = int(dict['G']), int(dict['L']), int(dict['c'])

# k and M are taken from the file only when no value already exists in the
# notebook namespace, so a previously defined k / M wins.
if 'k' not in globals():
    k = int(dict['k'])
if 'M' not in globals():
    M = float(dict['M'])

HWE = int(dict['HWE'])
nr_humans, nr_snps = int(dict['nr_humans']), int(dict['nr_snps'])
bottleneck_nr = int(dict['bottleneck_nr'])

# Thresholds (one _L / _H pair per frequency class; presumably lower / upper
# allele-frequency bounds -- confirm against the simulation script).
very_rare_threshold_L, very_rare_threshold_H = (
    float(dict['very_rare_threshold_L']),
    float(dict['very_rare_threshold_H']),
)
rare_threshold_L, rare_threshold_H = (
    float(dict['rare_threshold_L']),
    float(dict['rare_threshold_H']),
)
common_threshold_L, common_threshold_H = (
    float(dict['common_threshold_L']),
    float(dict['common_threshold_H']),
)

# Derived sizes. number_of_snps is a float because of the true division.
number_of_snps = G * L / 2  # one loci per chromosome
number_of_individuals = c * k * k

In [19]:
# Load the three genotype tables (very rare / rare / common) that belong to
# the current simulation settings. All three live in the same directory.
genotype_dir = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype"

very_rare = pd.read_pickle(
    f"{genotype_dir}/01_veryrare_genotype_AF_{very_rare_threshold_L}_{very_rare_threshold_H}.pkl"
)
rare = pd.read_pickle(
    f"{genotype_dir}/01_rare_genotype_AF_{rare_threshold_L}_{rare_threshold_H}.pkl"
)
common = pd.read_pickle(
    f"{genotype_dir}/01_common_genotype_AF_{common_threshold_L}_{common_threshold_H}.pkl"
)

In [20]:
# Display the SNP count; it is shown as a float because it is computed with a
# true division (G*L)/2.
number_of_snps

1250.0

In [21]:
# Tag every column with its frequency class ('VR', 'R' or 'C') and halve the
# stored genotype values.
very_rare = very_rare.add_prefix('VR').div(2)
rare = rare.add_prefix('R').div(2)
common = common.add_prefix('C').div(2)

# Put the three classes side by side (common, rare, very rare) and map the
# halved values x to 2*x - 1.
complete = pd.concat([common, rare, very_rare], axis=1)
complete = complete.mul(2).sub(1)

In [22]:
# Read bottleneck

In [23]:
# Locate the pickle of the configured bottleneck inside the abyss_bottleneck
# folder of the current simulation.
path_bottle = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/phenotype/abyss_bottleneck"


def _bottleneck_field(file_name):
    """Return the third '_'-separated field of *file_name* as an int, or None.

    None is returned when the name has fewer than three fields or the third
    field is not an integer, so unrelated directory entries are skipped
    instead of aborting the search.
    """
    try:
        return int(file_name.split("_")[2])
    except (IndexError, ValueError):
        return None


# os.listdir returns entries in arbitrary order; sorting makes the choice
# reproducible should several files carry the same bottleneck number.
_bottle_matches = sorted(
    f for f in os.listdir(path_bottle) if _bottleneck_field(f) == bottleneck_nr
)
if not _bottle_matches:
    raise FileNotFoundError(
        f"No file with bottleneck number {bottleneck_nr} found in {path_bottle}"
    )
bottle_file = _bottle_matches[0]

In [24]:
# The fourth '_'-separated field of the file name holds a number followed by
# the word 'seconds'; keep the part before 'seconds' and convert it to float.
_seconds_field = bottle_file.split('_')[3]
_seconds_text, _, _ = _seconds_field.partition('seconds')
elapsed_time_bottleneck = float(_seconds_text)

In [25]:
# Load the selected bottleneck pickle from the abyss_bottleneck folder.
_bottle_pickle = f"{path_bottle}/{bottle_file}"
bottle = pd.read_pickle(_bottle_pickle)

In [26]:
# Store the bottleneck's 'cluster' column as the 'pop' column of the genotype table.
# NOTE(review): pandas aligns this assignment on the row index; assumes `bottle`
# and `complete` share the same index -- confirm.
complete['pop'] = bottle['cluster']

# Divide into LD blocks

In [27]:
# Display the combined genotype table, including the 'pop' column.
complete

snps,CV55_AF_0.205,CV1037_AF_0.205,CV1039_AF_0.2275,CV1394_AF_0.205,CV1554_AF_0.2225,CV1555_AF_0.2075,CV1559_AF_0.21,CV1560_AF_0.2425,CV1562_AF_0.235,CV1564_AF_0.23,...,VRV2449_AF_0.0425,VRV2453_AF_0.035,VRV2463_AF_0.02,VRV2468_AF_0.0325,VRV2471_AF_0.0125,VRV2475_AF_0.03,VRV2480_AF_0.0325,VRV2486_AF_0.04,VRV2495_AF_0.0375,pop
0,1.0,1.0,1.0,0.0,-1.0,-1.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
1,1.0,1.0,1.0,0.0,0.0,1.0,-1.0,1.0,-1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
2,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
3,1.0,1.0,1.0,-1.0,1.0,0.0,1.0,0.0,1.0,-1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
4,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,-1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0
196,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0
197,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
198,-1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0


In [43]:
# For every population (value of bottle['cluster']):
#   1. split its SNPs into blocks and write one pickle per block to
#      .../genotype/LD_blocks/<pop>/
#   2. re-read those pickles and write three 0/1 indicator tables per block
#      (minor / het / major) to .../genotype/LD_blocks_one_hot/<pop>/
import shutil  # only needed in this cell, to reset the output directories


def _reset_dir(path):
    """Remove *path* with everything below it (if present) and recreate it empty."""
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path, exist_ok=True)


def _maf_range(block):
    """Return ``(min_maf, max_maf)`` over the SNP columns of *block*, rounded to 5 decimals.

    *block* has one column per SNP, coded as in the one-hot step of this cell:
    -1 = homozygous minor, 0 = heterozygous, 1 = homozygous major. Any other
    value (e.g. NaN) is not counted. Per SNP the frequency is
    ``(2 * n_minor + n_het) / (2 * n_counted)``. A SNP without any counted
    genotype yields NaN, which ``min`` / ``max`` skip.
    """
    num_min = (block == -1).sum()
    num_het = (block == 0).sum()
    num_maj = (block == 1).sum()
    total_humans = num_maj + num_het + num_min
    mafs = (num_min * 2 + num_het) / (total_humans * 2)
    return np.round(mafs.min(), 5), np.round(mafs.max(), 5)


for pop in bottle['cluster'].unique():
    temp_complete = complete[complete['pop'] == pop]
    temp_complete = temp_complete.drop('pop', axis=1)
    genos = temp_complete.T  # transpose: one row per SNP column of `complete`
    sample_size = nr_snps

    # PCA cannot extract more components than min(n_rows, n_columns).
    n_components = min(15, *genos.shape)

    # Calculate the number of clusters based on sample size. At least one
    # cluster is required, otherwise the division below fails when there are
    # fewer than sample_size / 2 SNPs.
    num_clus = max(1, round(genos.shape[0] / sample_size))
    size_clus = int(genos.shape[0] / num_clus)
    # Allow cluster sizes within roughly +/- 20 % of the average size.
    size_min = size_clus - round(size_clus / 5)
    size_max = size_clus + round(size_clus / 5)
    if size_max > genos.shape[0]:
        size_max = None

    # Standardize the data
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(genos)

    # Perform PCA with n components
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df_scaled)

    # Create a new DataFrame to store the principal components
    pc_columns = [f'PC{i+1}' for i in range(n_components)]
    df_pca = pd.DataFrame(data=principal_components, columns=pc_columns)

    # Apply constrained K-Means clustering
    clf = KMeansConstrained(
        n_clusters=num_clus,
        size_min=size_min,
        size_max=size_max,
        random_state=0
    )
    clf.fit_predict(np.array(df_pca))

    # Build shuffled block labels: only the *sizes* of the k-means clusters
    # are used. For each cluster size, that many labels are drawn from a
    # shuffled, repeated range(sample_size_temp) and appended in order.
    # NOTE(review): `random` is not seeded in this cell, so the assignment
    # differs between runs unless it is seeded elsewhere.
    count = Counter(clf.labels_)
    empty = []
    sample_size_temp = floor(len(clf.labels_) / sample_size) + 1
    for val in count.values():
        multi = floor(val / sample_size_temp) + 1
        list_to_sample = multi * list(range(sample_size_temp))
        random.shuffle(list_to_sample)
        empty.extend(list_to_sample[:val])

    genos['clusters_k_means'] = empty # shuffled
    #genos['clusters_k_means'] = clf.labels_ # clustered

    PATH_output = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/LD_blocks/{pop}"
    _reset_dir(PATH_output)

    for num in genos.clusters_k_means.unique():
        to_save = genos.loc[genos['clusters_k_means'] == num]
        to_save = to_save.drop(columns=['clusters_k_means']).T
        to_save.index = temp_complete.index

        # Calculate minimum and maximum MAF for the cluster
        min_maf, max_maf = _maf_range(to_save)
        size = to_save.shape[1]

        # Save the processed cluster data
        to_save.to_pickle(f"{PATH_output}/{num}_{size}_maf_{min_maf}_{max_maf}.pkl")

    path_one_hot_genotype = f"data/G{G}_L{L}_c{c}_k{k}_M{M}_HWE{HWE}/genotype/LD_blocks_one_hot/{pop}"
    _reset_dir(path_one_hot_genotype)

    # Re-read the block pickles written above and one-hot encode each of them.
    path_lds = PATH_output
    ld_files = os.listdir(path_lds)
    for ld_file in ld_files:
        path_ld_file = path_lds + "/" + ld_file
        ld_complete = pd.read_pickle(path_ld_file)
        ld_stem = ld_file.split('.pkl')[0]

        # 0/1 indicator tables, one per genotype code (int64, NaN -> 0).
        db_minor = (ld_complete == -1.0).astype('int64')
        db_het = (ld_complete == 0.0).astype('int64')
        db_major = (ld_complete == 1.0).astype('int64')

        db_minor.to_pickle(f"{path_one_hot_genotype}/{ld_stem}_db_minor.pkl")
        db_het.to_pickle(f"{path_one_hot_genotype}/{ld_stem}_db_het.pkl")
        db_major.to_pickle(f"{path_one_hot_genotype}/{ld_stem}_db_major.pkl")