In [1]:
import os
import math
import pandas as pd
import numpy as np
import umap
from functools import reduce
from sklearn.preprocessing import StandardScaler

# Configuration settings
from chasm.config import CONFIG

# PLINK preprocessing utilities
from chasm.plink_preprocessing import (
    concat_AFs, 
    divide_into_chunks, 
    make_ids,
    make_AFs
)

from chasm.data_preprocessing import (
    is_snp,
    make_df,
    calculate_AFs,
    merge_AFs_ensembl_build,
    divide_into_chunks,
    align_dataframes,
)
from chasm.gwas import ols_regression, pca_of_n_snps, project_on_dimensions

from chasm.visualization import make_population_plot
from chasm.ld_blocks import segmenter
from chasm.abyss import linear_abyss, AE

from chasm.fst import make_fst, make_global_fst
from chasm.cluster import silhouette_score_clusters

2025-02-25 13:55:17.147163: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-25 13:55:17.149261: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-25 13:55:17.196547: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-25 13:55:17.198025: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Changing the GTM data into a pickle file having an id file and calculating the AFs for each SNP 
# with the CHROM:POS for every corresponding RSID

# GTM data
"""
path_data_raw = f"{CONFIG['PATH_data']}/00_raw/recoded_1000G.raw"
path_raw = f"{CONFIG['PATH_data']}/01_raw/"
path_usefull = f"{CONFIG['PATH_data']}/02_usefull/"

geno = make_df(path_data_raw, path_usefull, path_raw)

afs = calculate_AFs(geno)

path_ensembl = f"/mnt/e/1000G_data/usefull/ensembl_build"
merge_AFs_ensembl_build(path_ensembl, path_usefull, afs)
"""
# Divide the AFs into chunks
"""
size_chunck = 20_000
min_maf = 0.01

path_raw = f"{CONFIG['PATH_data']}/01_raw/geno.pkl"
path_afs = f"{CONFIG['PATH_data']}/02_usefull/allele_frequencies.pkl"
path_output = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"

divide_into_chunks(path_raw, path_afs, path_output, size_chunck, min_maf)
"""

'\nsize_chunck = 20_000\nmin_maf = 0.01\n\npath_raw = f"{CONFIG[\'PATH_data\']}/01_raw/geno.pkl"\npath_afs = f"{CONFIG[\'PATH_data\']}/02_usefull/allele_frequencies.pkl"\npath_output = f"{CONFIG[\'PATH_data\']}/03_macro_similar_AF/"\n\ndivide_into_chunks(path_raw, path_afs, path_output, size_chunck, min_maf)\n'

# Choose SNPs to project on n dimensions

# Observe raw dimensions

In [3]:
path_ids = f"{CONFIG['PATH_data']}/02_usefull/ids.pkl"
path_panel = f"{CONFIG['PATH_data']}/00_raw/all.panel"
ids = pd.read_pickle(path_ids)
labels = pd.read_pickle(path_panel)
ids = ids.merge(labels, left_on="IID", right_on="Sample name")

In [4]:
"""
path_macro_similar = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"
iterations = 3

for iter in list(range(iterations)):
    print(f"Running iteration {iter}")
    path_output = f"{CONFIG['PATH_data']}/iteration_{iter}/"
    
    if iter == 0:
        ids[f"cluster_{iter}"] = f"{0}"
        os.makedirs(f"{path_output}/pop_{0}", exist_ok=True)
        ids.to_pickle(f"{path_output}/pop_{0}/ids.pkl")
        
    else:
        pass
    
    cluster_to_add = []
    for sub_pop in ids[f"cluster_{iter}"].unique():
        path_output = f"{CONFIG['PATH_data']}/iteration_{iter}/pop_{sub_pop}/"
        os.makedirs(path_output, exist_ok=True)
        temp_ids = ids[ids[f"cluster_{iter}"] == sub_pop]
        nr_of_projected_dimensions = 3
        geno = project_on_dimensions(path_macro_similar, path_output, temp_ids, nr_of_projected_dimensions=3, nr_snps = 20_000, n_components = 10)
        
        existing_dims = []   
        for PCs in [f for f in os.listdir(path_output) if f.startswith('PCs')]:
            dim = PCs.split('PCs_')[1].split('.pkl')[0]
            path_PC = f"{path_output}/{PCs}"
            PCs = pd.read_pickle(f"{path_PC}")
            temp_ids = temp_ids.copy()
            temp_ids[dim] = list(PCs['PC1'])
            existing_dims.append(dim)
        labels = silhouette_score_clusters(temp_ids, existing_dims, plot=False)
        for element in labels:
            cluster_to_add.append(f"{sub_pop}_{element}")
            
    ids[f"cluster_{iter+1}"] = cluster_to_add
    ids.to_pickle(f"{CONFIG['PATH_data']}/iteration_{iter}/ids.pkl")
    
    # Vizualization
    path_input = f"{CONFIG['PATH_data']}/iteration_{iter}/"
    for pop in [f for f in os.listdir(path_input) if f.startswith('pop')]:
        pop = pop.split('pop_')[1]
        temp_ids = ids[ids[f"cluster_{iter}"] == pop]
        path_pop = f"{path_input}/pop_{pop}"
        for PCs in [f for f in os.listdir(path_pop) if f.startswith('PCs')]:
            dim = PCs.split('PCs_')[1].split('.pkl')[0]
            path_PC = f"{path_pop}/{PCs}"
            PCs = pd.read_pickle(f"{path_PC}")
            temp_ids = temp_ids.copy()
            temp_ids[dim] = list(PCs['PC1'])
        if len(list(temp_ids['Population name'].unique())) > 20:
            make_population_plot(temp_ids, 'dim_1', 'dim_2', 'Superpopulation name', f"iteration {iter} - pop {pop}", palette = 'rocket')
        else:
            make_population_plot(temp_ids, 'dim_1', 'dim_2', 'Population name', f"iteration {iter} - pop {pop}", palette = 'rocket')
            
"""

'\npath_macro_similar = f"{CONFIG[\'PATH_data\']}/03_macro_similar_AF/"\niterations = 3\n\nfor iter in list(range(iterations)):\n    print(f"Running iteration {iter}")\n    path_output = f"{CONFIG[\'PATH_data\']}/iteration_{iter}/"\n    \n    if iter == 0:\n        ids[f"cluster_{iter}"] = f"{0}"\n        os.makedirs(f"{path_output}/pop_{0}", exist_ok=True)\n        ids.to_pickle(f"{path_output}/pop_{0}/ids.pkl")\n        \n    else:\n        pass\n    \n    cluster_to_add = []\n    for sub_pop in ids[f"cluster_{iter}"].unique():\n        path_output = f"{CONFIG[\'PATH_data\']}/iteration_{iter}/pop_{sub_pop}/"\n        os.makedirs(path_output, exist_ok=True)\n        temp_ids = ids[ids[f"cluster_{iter}"] == sub_pop]\n        nr_of_projected_dimensions = 3\n        geno = project_on_dimensions(path_macro_similar, path_output, temp_ids, nr_of_projected_dimensions=3, nr_snps = 20_000, n_components = 10)\n        \n        existing_dims = []   \n        for PCs in [f for f in os.listdir(pa

# Find the dims per snp

In [5]:
"""
path_data = f"{CONFIG['PATH_data']}/"
iteration_files = [f for f in os.listdir(path_data) if f.startswith('iteration')]
snp_ids_dfs = []
for iteration in iteration_files:
    path_iteration = f"{path_data}/{iteration}/"
    for pop in [f for f in os.listdir(path_iteration) if f.startswith('pop')]:
        pop = pop.split('pop_')[1]
        path_snp_ids = f"{path_iteration}/pop_{pop}/snp_ids.pkl"
        snp_ids = pd.read_pickle(path_snp_ids)
        snp_ids = snp_ids.drop(columns=['pval', 'betas', '-logp'])
        snp_ids[f"{iteration}_pop_{pop}_dim"] = list(snp_ids['dim'])
        snp_ids = snp_ids.drop(columns=['dim'])
        snp_ids = snp_ids.sort_index()  # Now sort
        snp_ids_dfs.append(snp_ids)
        
snp_ids = reduce(lambda left, right: pd.merge(left, right, on='snp_rs'), snp_ids_dfs)
snp_ids.to_pickle(f"{CONFIG['PATH_data']}/02_usefull/post_iterations_snp_ids.pkl")
"""

'\npath_data = f"{CONFIG[\'PATH_data\']}/"\niteration_files = [f for f in os.listdir(path_data) if f.startswith(\'iteration\')]\nsnp_ids_dfs = []\nfor iteration in iteration_files:\n    path_iteration = f"{path_data}/{iteration}/"\n    for pop in [f for f in os.listdir(path_iteration) if f.startswith(\'pop\')]:\n        pop = pop.split(\'pop_\')[1]\n        path_snp_ids = f"{path_iteration}/pop_{pop}/snp_ids.pkl"\n        snp_ids = pd.read_pickle(path_snp_ids)\n        snp_ids = snp_ids.drop(columns=[\'pval\', \'betas\', \'-logp\'])\n        snp_ids[f"{iteration}_pop_{pop}_dim"] = list(snp_ids[\'dim\'])\n        snp_ids = snp_ids.drop(columns=[\'dim\'])\n        snp_ids = snp_ids.sort_index()  # Now sort\n        snp_ids_dfs.append(snp_ids)\n        \nsnp_ids = reduce(lambda left, right: pd.merge(left, right, on=\'snp_rs\'), snp_ids_dfs)\nsnp_ids.to_pickle(f"{CONFIG[\'PATH_data\']}/02_usefull/post_iterations_snp_ids.pkl")\n'

# Make autoencoder

In [6]:
# create one dimensional representation of the data

In [51]:
"""
path_data = f"{CONFIG['PATH_data']}/"
iterations = [f for f in os.listdir(path_data) if f.startswith('iteration_0')]
for iteration in iterations:
    path_iteration = f"{path_data}/{iteration}/"
    for pop in [f for f in os.listdir(path_iteration) if f.startswith('pop')]:
        pop = pop.split('pop_')[1]
        path_pop = f"{path_iteration}/pop_{pop}/"
        dims = [f for f in os.listdir(path_pop) if f.startswith('PCs')]

nr_PCs = 3
PCs_labels = []
for i in range(nr_PCs):
    PCs_labels.append(f"PC{i+1}")

dims_df = pd.DataFrame()
for dim  in dims:
    path_dim = f"{path_pop}/{dim}"
    dim = dim.split('PCs_')[1].split('.pkl')[0]
    
    PCs = pd.read_pickle(path_dim)
    for label in PCs_labels:
        PCs.rename(columns={f"{label}": f"{label}_dim_{dim}"}, inplace=True)
        dims_df[f"{label}_{dim}"] = list(PCs[f"{label}_dim_{dim}"])
    

hidden = 2
epoch = 500
patience = 10
[autoencoder, bottleneck_model, history] = AE(dims_df, 1, dims_df.shape[1]*2, epoch, patience)

bottle = bottleneck_model(dims_df.to_numpy())
bottleneck_df = pd.DataFrame(bottle, columns=['Bottleneck_1D'])

ids = pd.read_pickle(f"{CONFIG['PATH_data']}/02_usefull/ids.pkl")
ids['center_dim'] = list(bottleneck_df['Bottleneck_1D'])
ids.to_pickle(f"{CONFIG['PATH_data']}/02_usefull/ids_with_center_dim.pkl")
make_population_plot(ids, 'center_dim', 'center_dim', 'FID', f"show IDs", palette = 'rocket')
"""

'\npath_data = f"{CONFIG[\'PATH_data\']}/"\niterations = [f for f in os.listdir(path_data) if f.startswith(\'iteration_0\')]\nfor iteration in iterations:\n    path_iteration = f"{path_data}/{iteration}/"\n    for pop in [f for f in os.listdir(path_iteration) if f.startswith(\'pop\')]:\n        pop = pop.split(\'pop_\')[1]\n        path_pop = f"{path_iteration}/pop_{pop}/"\n        dims = [f for f in os.listdir(path_pop) if f.startswith(\'PCs\')]\n\nnr_PCs = 3\nPCs_labels = []\nfor i in range(nr_PCs):\n    PCs_labels.append(f"PC{i+1}")\n\ndims_df = pd.DataFrame()\nfor dim  in dims:\n    path_dim = f"{path_pop}/{dim}"\n    dim = dim.split(\'PCs_\')[1].split(\'.pkl\')[0]\n    \n    PCs = pd.read_pickle(path_dim)\n    for label in PCs_labels:\n        PCs.rename(columns={f"{label}": f"{label}_dim_{dim}"}, inplace=True)\n        dims_df[f"{label}_{dim}"] = list(PCs[f"{label}_dim_{dim}"])\n    \n\nhidden = 2\nepoch = 500\npatience = 10\n[autoencoder, bottleneck_model, history] = AE(dims_df,

In [52]:
# Use lower dimensional representation to create AF probabilities

In [45]:
path_data = f"{CONFIG['PATH_data']}/"
iterations = [f for f in os.listdir(path_data) if f.startswith('iteration')]
for iteration in iterations[1:2]:
    path_iteration = f"{path_data}/{iteration}/"
    ids = pd.read_pickle(f"{path_iteration}/ids.pkl")
    dims_dfs = []
    for pop in [f for f in os.listdir(path_iteration) if f.startswith('pop')]:
        pop = pop.split('pop_')[1]
        temp_ids = ids[ids[f"cluster_{iteration.split('_')[1]}"] == pop]
        path_pop = f"{path_iteration}/pop_{pop}/"
        dims = [f for f in os.listdir(path_pop) if f.startswith('PCs')]
        
        nr_PCs = 3
        PCs_labels = []
        for i in range(nr_PCs):
            PCs_labels.append(f"PC{i+1}")

        dims_df = pd.DataFrame()
        for dim  in dims:
            path_dim = f"{path_pop}/{dim}"
            dim = dim.split('PCs_')[1].split('.pkl')[0]
            
            PCs = pd.read_pickle(path_dim)
            for label in PCs_labels:
                PCs.rename(columns={f"{label}": f"pop_{pop}_{label}_dim_{dim}"}, inplace=True)
                dims_df[f"pop_{pop}_{label}_dim_{dim}"] = list(PCs[f"pop_{pop}_{label}_dim_{dim}"])
        dims_df.index = temp_ids.index
        dims_dfs.append(dims_df)
            
    

In [47]:

# Concatenate DataFrames, filling missing columns with NaN
result = pd.concat(dims_dfs, ignore_index=False, sort=True)
result = result.fillna(0)
result

Unnamed: 0,pop_0_0_PC1_dim_dim_1,pop_0_0_PC1_dim_dim_2,pop_0_0_PC1_dim_dim_3,pop_0_0_PC2_dim_dim_1,pop_0_0_PC2_dim_dim_2,pop_0_0_PC2_dim_dim_3,pop_0_0_PC3_dim_dim_1,pop_0_0_PC3_dim_dim_2,pop_0_0_PC3_dim_dim_3,pop_0_1_PC1_dim_dim_1,...,pop_0_3_PC3_dim_dim_3,pop_0_4_PC1_dim_dim_1,pop_0_4_PC1_dim_dim_2,pop_0_4_PC1_dim_dim_3,pop_0_4_PC2_dim_dim_1,pop_0_4_PC2_dim_dim_2,pop_0_4_PC2_dim_dim_3,pop_0_4_PC3_dim_dim_1,pop_0_4_PC3_dim_dim_2,pop_0_4_PC3_dim_dim_3
0,0.933013,-4.068720,10.336285,-3.492381,9.887428,-2.532068,9.639750,2.783962,7.823999,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6.188379,-4.959371,16.235042,-2.123308,21.497266,3.080950,20.465567,0.438365,-1.994241,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.393187,-6.371439,-3.397163,-6.866763,-3.048944,6.415818,1.044204,2.575572,-2.637278,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.743959,-6.591337,7.333168,-6.852792,8.612897,-1.655533,8.877612,-0.498862,-0.818207,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.883044,-5.209089,2.450627,-4.378275,1.098689,2.820204,-2.990109,-2.853764,-3.851209,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,20.935759,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2500,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,23.250069,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2501,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,21.184803,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2502,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,19.753146,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
