In [None]:
import os
import math
import pandas as pd
import numpy as np
import umap
from functools import reduce

# Configuration settings
from chasm.config import CONFIG

# PLINK preprocessing utilities
from chasm.plink_preprocessing import (
    concat_AFs, 
    divide_into_chunks, 
    make_ids,
    make_AFs
)

from chasm.data_preprocessing import (
    is_snp,
    make_df,
    calculate_AFs,
    merge_AFs_ensembl_build,
    divide_into_chunks,
    align_dataframes,
)
from chasm.gwas import ols_regression, pca_of_n_snps, project_on_dimensions

from chasm.visualization import make_population_plot
from chasm.ld_blocks import segmenter
from chasm.abyss import linear_abyss

from chasm.fst import make_fst, make_global_fst
from chasm.cluster import silhouette_score_clusters

2025-02-20 12:32:25.351612: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-20 12:32:25.487869: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 12:32:26.570971: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-20 12:32:26.575375: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Convert the raw GTM genotype data into pickle files (genotypes plus a sample-id file) and
# calculate the allele frequencies (AFs) of every SNP, annotated with the CHROM:POS of the corresponding RSID

# GTM data
"""
path_data_raw = f"{CONFIG['PATH_data']}/00_raw/recoded_1000G.raw"
path_raw = f"{CONFIG['PATH_data']}/01_raw/"
path_usefull = f"{CONFIG['PATH_data']}/02_usefull/"

geno = make_df(path_data_raw, path_usefull, path_raw)

afs = calculate_AFs(geno)

path_ensembl = f"/mnt/e/1000G_data/usefull/ensembl_build"
merge_AFs_ensembl_build(path_ensembl, path_usefull, afs)
"""
# Divide the AFs into chunks
"""
size_chunck = 20_000
min_maf = 0.01

path_raw = f"{CONFIG['PATH_data']}/01_raw/geno.pkl"
path_afs = f"{CONFIG['PATH_data']}/02_usefull/allele_frequencies.pkl"
path_output = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"

divide_into_chunks(path_raw, path_afs, path_output, size_chunck, min_maf)
"""

'\nsize_chunck = 20_000\nmin_maf = 0.01\n\npath_raw = f"{CONFIG[\'PATH_data\']}/01_raw/geno.pkl"\npath_afs = f"{CONFIG[\'PATH_data\']}/02_usefull/allele_frequencies.pkl"\npath_output = f"{CONFIG[\'PATH_data\']}/03_macro_similar_AF/"\n\ndivide_into_chunks(path_raw, path_afs, path_output, size_chunck, min_maf)\n'

# Choose SNPs to project on n dimensions

# Observe raw dimensions

In [3]:
# Load the sample identifiers and the panel of per-sample labels, then attach
# the labels to each sample by matching IID against the panel's "Sample name".
# The merge uses pandas' default inner join, so samples without a panel entry
# are dropped.
path_ids = f"{CONFIG['PATH_data']}/02_usefull/ids.pkl"
path_panel = f"{CONFIG['PATH_data']}/00_raw/all.panel"
ids = pd.read_pickle(path_ids)
labels = pd.read_pickle(path_panel)
ids = pd.merge(
    ids,
    labels,
    left_on="IID",
    right_on="Sample name",
)

In [None]:
"""
path_macro_similar = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"
iterations = 3

for iter in list(range(iterations))[2:]:
    print(f"Running iteration {iter}")
    path_output = f"{CONFIG['PATH_data']}/iteration_{iter}/"
    if iter == 0:
        ids[f"cluster_{iter}"] = f"{0}"
        ids.to_pickle(path_ids)
        
    else:
        pass
    
    cluster_to_add = []
    for sub_pop in ids[f"cluster_{iter}"].unique():
        path_output = f"{CONFIG['PATH_data']}/iteration_{iter}/pop_{sub_pop}/"
        os.makedirs(path_output, exist_ok=True)
        temp_ids = ids[ids[f"cluster_{iter}"] == sub_pop]
        nr_of_projected_dimensions = 3
        geno = project_on_dimensions(path_macro_similar, path_output, temp_ids, nr_of_projected_dimensions=3, nr_snps = 20_000, n_components = 10)
        
        existing_dims = []   
        for dims in [f for f in os.listdir(path_output) if f.startswith('dim')]:
            PCs = pd.read_pickle(f"{path_output}/{dims}/global_PCs.pkl")
            temp_ids = temp_ids.copy()
            temp_ids[dims] = list(PCs['PC1'])
            existing_dims.append(dims)
        labels = silhouette_score_clusters(temp_ids, existing_dims, plot=False)
        for element in labels:
            cluster_to_add.append(f"{sub_pop}_{element}")
            
    ids[f"cluster_{iter+1}"] = cluster_to_add
    ids.to_pickle(path_ids)
    
    # Vizualization
    path_input = f"{CONFIG['PATH_data']}/iteration_{iter}/"
    for pop in [f for f in os.listdir(path_input) if f.startswith('pop')]:
        pop = pop.split('pop_')[1]
        temp_ids = ids[ids[f"cluster_{iter}"] == pop]
        path_pop = f"{path_input}/pop_{pop}"
        for dim in [f for f in os.listdir(path_pop) if f.startswith('dim')]:
            path_dim = f"{path_pop}/{dim}"
            PC_file = [f for f in os.listdir(path_dim) if f.startswith('global_PCs')][0]
            PCs = pd.read_pickle(f"{path_dim}/{PC_file}")
            temp_ids = temp_ids.copy()
            temp_ids[dim] = list(PCs['PC1'])
        if len(list(temp_ids['Population name'].unique())) > 20:
            make_population_plot(temp_ids, 'dim_1', 'dim_2', 'Superpopulation name', f"iteration {iter} - pop {pop}", palette = 'rocket')
        else:
            make_population_plot(temp_ids, 'dim_1', 'dim_2', 'Population name', f"iteration {iter} - pop {pop}", palette = 'rocket')
            
"""

'\npath_macro_similar = f"{CONFIG[\'PATH_data\']}/03_macro_similar_AF/"\niterations = 3\n\nfor iter in list(range(iterations))[2:]:\n    print(f"Running iteration {iter}")\n    path_output = f"{CONFIG[\'PATH_data\']}/iteration_{iter}/"\n    if iter == 0:\n        ids[f"cluster_{iter}"] = f"{0}"\n    else:\n        pass\n    \n    cluster_to_add = []\n    for sub_pop in ids[f"cluster_{iter}"].unique():\n        path_output = f"{CONFIG[\'PATH_data\']}/iteration_{iter}/pop_{sub_pop}/"\n        os.makedirs(path_output, exist_ok=True)\n        temp_ids = ids[ids[f"cluster_{iter}"] == sub_pop]\n        nr_of_projected_dimensions = 3\n        geno = project_on_dimensions(path_macro_similar, path_output, temp_ids, nr_of_projected_dimensions=3, nr_snps = 20_000, n_components = 10)\n        \n        existing_dims = []   \n        for dims in [f for f in os.listdir(path_output) if f.startswith(\'dim\')]:\n            PCs = pd.read_pickle(f"{path_output}/{dims}/global_PCs.pkl")\n            temp_

# Find the dims per snp

In [None]:
# Build one wide table that holds, for every SNP, the 'dim' value stored in the
# snp_ids.pkl of each iteration / population folder (one column per folder).
path_data = f"{CONFIG['PATH_data']}/"
# os.listdir returns entries in arbitrary order; sorting keeps the column order
# of merged_df reproducible between runs and machines.
iteration_files = sorted(f for f in os.listdir(path_data) if f.startswith('iteration'))
snp_ids_dfs = []
for iteration in iteration_files:
    path_iteration = f"{path_data}/{iteration}/"
    for pop in sorted(f for f in os.listdir(path_iteration) if f.startswith('pop')):
        # Keep only the population label, i.e. the part after the 'pop_' prefix.
        pop = pop.split('pop_')[1]
        path_snp_ids = f"{path_iteration}/pop_{pop}/snp_ids.pkl"
        snp_ids = pd.read_pickle(path_snp_ids)
        snp_ids = (
            snp_ids
            # The regression statistics are not needed in the merged table.
            .drop(columns=['pval', 'betas', '-logp'])
            # Give 'dim' a name that is unique per iteration / population so the
            # frames can be merged side by side; errors='raise' keeps the failure
            # loud when a file has no 'dim' column.
            .rename(columns={'dim': f"iter_{iteration}_pop_{pop}_dim"}, errors='raise')
            .sort_index()
        )
        snp_ids_dfs.append(snp_ids)

if not snp_ids_dfs:
    # reduce() on an empty list fails with an unhelpful TypeError; say what is missing.
    raise FileNotFoundError(
        f"No iteration*/pop_*/snp_ids.pkl files found under {path_data}"
    )

# Inner-join all frames on the SNP identifier: only SNPs present in every
# iteration / population remain.
merged_df = reduce(lambda left, right: pd.merge(left, right, on='snp_rs'), snp_ids_dfs)

        

0
0_1
0_4
0_0
0_2
0_3
0_1_1
0_1_0
0_1_2
0_4_0
0_4_1
0_0_2
0_0_0
0_0_1
0_2_2
0_2_0
0_2_1
0_3_4
0_3_1
0_3_0
0_3_2
0_3_3


In [30]:
# Display the merged per-SNP table (one '*_dim' column per population folder).
merged_df

Unnamed: 0,snp_rs,pop_0_dim,pop_0_1_dim,pop_0_4_dim,pop_0_0_dim,pop_0_2_dim,pop_0_3_dim,pop_0_1_1_dim,pop_0_1_0_dim,pop_0_1_2_dim,...,pop_0_0_0_dim,pop_0_0_1_dim,pop_0_2_2_dim,pop_0_2_0_dim,pop_0_2_1_dim,pop_0_3_4_dim,pop_0_3_1_dim,pop_0_3_0_dim,pop_0_3_2_dim,pop_0_3_3_dim
0,rs9828553_G,3,2,1,1,1,2,1,1,2,...,1,1,1,2,3,2,3,2,2,1
1,rs8075603_T,2,1,1,1,1,3,3,3,2,...,2,1,2,2,3,3,1,3,2,3
2,rs2031579_G,3,1,1,2,3,1,2,3,2,...,2,1,1,1,3,2,3,1,1,3
3,rs9615890_A,1,3,2,1,1,3,1,2,2,...,1,2,2,3,1,1,1,3,1,3
4,rs8126694_G,3,3,1,2,3,3,1,2,2,...,3,1,3,1,2,1,1,3,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38939,rs6837350_C,1,2,2,1,2,2,1,3,3,...,3,3,1,3,3,3,3,1,1,2
38940,rs6552684_A,1,1,2,1,2,3,2,2,3,...,2,3,3,1,1,3,3,3,3,3
38941,rs80316200_C,1,2,1,2,3,3,1,3,2,...,3,3,2,3,3,3,3,3,3,3
38942,rs11725738_T,1,2,2,3,2,2,1,1,2,...,2,3,3,2,2,3,3,2,3,2


In [8]:
# Gather, for every result folder whose name contains "projected", the SNP id
# table of each population into one frame and pickle it as
# 09_snps_id/projection_<i>_ids.pkl.
folders = os.listdir(f"{CONFIG['PATH_data']}")
os.makedirs(f"{CONFIG['PATH_data']}/09_snps_id/", exist_ok=True)
# os.listdir gives no ordering guarantee; sort so that the folder -> i mapping
# baked into the output file names is the same on every run.
projected_folders = sorted(folder for folder in folders if "projected" in folder)
i = 0  # stays 0 when there is no projected folder at all
for i, folder in enumerate(projected_folders, start=1):
    dfs = []
    path_projected = f"{CONFIG['PATH_data']}/{folder}"
    pops = sorted(f for f in os.listdir(path_projected) if f.startswith('pop'))
    for pop in pops:
        path_pop = f"{path_projected}/{pop}"
        # Use the alphabetically first entry starting with 'snp' as the table.
        snp_id = sorted(f for f in os.listdir(path_pop) if f.startswith('snp'))[0]
        path_snp = f"{path_pop}/{snp_id}"
        temp = pd.read_pickle(path_snp)
        # Tag every row with its origin so it stays traceable after the concat.
        temp['pop'] = pop
        temp['folder'] = folder
        dfs.append(temp)

    # From here on dfs is the concatenated frame, not the list of parts.
    dfs = pd.concat(dfs, axis=0)
    dfs.to_pickle(f"{CONFIG['PATH_data']}/09_snps_id/projection_{i}_ids.pkl")

In [10]:
# Read every pickle in 09_snps_id whose name starts with 'projection'.
# Each pass rebinds `projection`, so after the loop only the table that was
# read last is still available; os.listdir does not guarantee which one that is.
path_projection_id = f"{CONFIG['PATH_data']}/09_snps_id/"
for projection_folder in [f for f in os.listdir(path_projection_id) if f.startswith('projection')]:
    # Despite its name, projection_folder is an entry (file) name, not a directory.
    projection = pd.read_pickle(f"{path_projection_id}/{projection_folder}")

In [None]:
# Divide into chunks
"""
path_input  = f"{CONFIG['PATH_data']}/03_macro_similar_AF"
for cluster in list(merged_df['clusters'].unique()):
    cluster_df = merged_df[merged_df['clusters'] == cluster]
    for chrom in [f for f in os.listdir(path_input) if f.startswith('chrom')]:
        path_chrom = f"{path_input}/{chrom}"
        for chunk in [f for f in os.listdir(path_chrom) if f.startswith('chunk')]:
            path_chunk = f"{path_chrom}/{chunk}"
            chunk_df = pd.read_pickle(path_chunk)
            chunk_df['IID'] = ids['IID']
            merged = pd.merge(cluster_df[['IID']], chunk_df, on='IID')
            merged.drop('IID', axis=1, inplace=True)
            path_output = f"{CONFIG['PATH_data']}/05_macro_similar_per_pop/pop_{cluster}/{chrom}"
            os.makedirs(path_output, exist_ok=True)
            merged.to_pickle(f"{path_output}/{chunk}")
"""

'\npath_input  = f"{CONFIG[\'PATH_data\']}/03_macro_similar_AF"\nfor cluster in list(merged_df[\'clusters\'].unique()):\n    cluster_df = merged_df[merged_df[\'clusters\'] == cluster]\n    for chrom in [f for f in os.listdir(path_input) if f.startswith(\'chrom\')]:\n        path_chrom = f"{path_input}/{chrom}"\n        for chunk in [f for f in os.listdir(path_chrom) if f.startswith(\'chunk\')]:\n            path_chunk = f"{path_chrom}/{chunk}"\n            chunk_df = pd.read_pickle(path_chunk)\n            chunk_df[\'IID\'] = ids[\'IID\']\n            merged = pd.merge(cluster_df[[\'IID\']], chunk_df, on=\'IID\')\n            merged.drop(\'IID\', axis=1, inplace=True)\n            path_output = f"{CONFIG[\'PATH_data\']}/05_macro_similar_per_pop/pop_{cluster}/{chrom}"\n            os.makedirs(path_output, exist_ok=True)\n            merged.to_pickle(f"{path_output}/{chunk}")\n'

In [9]:
# iteration 1

In [11]:

"""
path_input = f"{CONFIG['PATH_data']}/05_macro_similar_per_pop/"
path_output = f"{CONFIG['PATH_data']}/06_projected_per_pop/"
for pop in [f for f in os.listdir(path_input) if f.startswith('pop')]:
    path_input_pop = f"{path_input}/{pop}"
    path_output_pop = f"{path_output}/{pop}"
    nr_of_projected_dimensions = 3
    project_on_dimensions(path_input_pop, path_output_pop, nr_of_projected_dimensions)
"""

'\npath_input = f"{CONFIG[\'PATH_data\']}/05_macro_similar_per_pop/"\npath_output = f"{CONFIG[\'PATH_data\']}/06_projected_per_pop/"\nfor pop in [f for f in os.listdir(path_input) if f.startswith(\'pop\')]:\n    path_input_pop = f"{path_input}/{pop}"\n    path_output_pop = f"{path_output}/{pop}"\n    nr_of_projected_dimensions = 3\n    project_on_dimensions(path_input_pop, path_output_pop, nr_of_projected_dimensions)\n'

In [None]:
"""
merged_df = pd.read_pickle(f"{CONFIG['PATH_data']}/04_projected/pop_0/samples_with_clusters.pkl")
path_input = f"{CONFIG['PATH_data']}/06_projected_per_pop/"
pops = [f for f in os.listdir(path_input) if f.startswith('pop')]
for pop in pops:
    pop = int(pop.split('_')[1])
    temp_samples = merged_df[merged_df['clusters'] == pop].copy()
    path_pop = f"{path_input}/pop_{pop}"
    dims = [f for f in os.listdir(path_pop) if f.startswith("dim")]
    for dim in dims:
        path_dim = f"{path_pop}/{dim}"
        PCs = [f for f in os.listdir(path_dim) if f.startswith("global")][0]
        path_PC = f"{path_dim}/{PCs}"
        PC_dim = pd.read_pickle(path_PC)
        temp_samples[dim] = list(PC_dim['PC1'])
    
    make_population_plot(temp_samples, 'dim_1', 'dim_2', 'Population name', 'test', palette = 'rocket')
    
    # clustering
    labels = silhouette_score_clusters(temp_samples, ['dim_1', 'dim_2', 'dim_3'], plot=False)
    temp_samples['clusters'] = labels
    temp_samples.to_pickle(f"{CONFIG['PATH_data']}/06_projected_per_pop/pop_{pop}/samples_with_clusters.pkl")
    make_population_plot(temp_samples, 'dim_1', 'dim_2', 'clusters', 'test', palette = 'rocket')
"""

'\nmerged_df = pd.read_pickle(f"{CONFIG[\'PATH_data\']}/04_projected/samples_with_clusters.pkl")\npath_input = f"{CONFIG[\'PATH_data\']}/06_projected_per_pop/"\npops = [f for f in os.listdir(path_input) if f.startswith(\'pop\')]\nfor pop in pops:\n    pop = int(pop.split(\'_\')[1])\n    temp_samples = merged_df[merged_df[\'clusters\'] == pop].copy()\n    path_pop = f"{path_input}/pop_{pop}"\n    dims = [f for f in os.listdir(path_pop) if f.startswith("dim")]\n    for dim in dims:\n        path_dim = f"{path_pop}/{dim}"\n        PCs = [f for f in os.listdir(path_dim) if f.startswith("global")][0]\n        path_PC = f"{path_dim}/{PCs}"\n        PC_dim = pd.read_pickle(path_PC)\n        temp_samples[dim] = list(PC_dim[\'PC1\'])\n    \n    make_population_plot(temp_samples, \'dim_1\', \'dim_2\', \'Population name\', \'test\', palette = \'rocket\')\n    \n    # clustering\n    labels = silhouette_score_clusters(temp_samples, [\'dim_1\', \'dim_2\', \'dim_3\'], plot=False)\n    temp_samples[\

In [12]:
"""
path_input_macro  = f"{CONFIG['PATH_data']}/03_macro_similar_AF"
path_input = f"{CONFIG['PATH_data']}/06_projected_per_pop/"
pops = [f for f in os.listdir(path_input) if f.startswith('pop')]
for pop in pops:
    pop_nr = int(pop.split('_')[1])
    path_pop = f"{path_input}/{pop}"
    samples = pd.read_pickle(f"{path_pop}/samples_with_clusters.pkl")
    for cluster in list(samples['clusters'].unique()):
        cluster_df = samples[samples['clusters'] == cluster]
        
        for chrom in [f for f in os.listdir(path_input_macro) if f.startswith('chrom')]:
            path_chrom = f"{path_input_macro}/{chrom}"
            for chunk in [f for f in os.listdir(path_chrom) if f.startswith('chunk')]:
                path_chunk = f"{path_chrom}/{chunk}"
                chunk_df = pd.read_pickle(path_chunk)
                chunk_df['IID'] = ids['IID']
                merged = pd.merge(cluster_df[['IID']], chunk_df, on='IID')
                merged.drop('IID', axis=1, inplace=True)
                path_output = f"{CONFIG['PATH_data']}/07_macro_similar_per_sub_pop/pop_{pop_nr}_subpop_{cluster}/{chrom}"
                os.makedirs(path_output, exist_ok=True)
                merged.to_pickle(f"{path_output}/{chunk}")
                
"""

'\npath_input_macro  = f"{CONFIG[\'PATH_data\']}/03_macro_similar_AF"\npath_input = f"{CONFIG[\'PATH_data\']}/06_projected_per_pop/"\npops = [f for f in os.listdir(path_input) if f.startswith(\'pop\')]\nfor pop in pops:\n    pop_nr = int(pop.split(\'_\')[1])\n    path_pop = f"{path_input}/{pop}"\n    samples = pd.read_pickle(f"{path_pop}/samples_with_clusters.pkl")\n    for cluster in list(samples[\'clusters\'].unique()):\n        cluster_df = samples[samples[\'clusters\'] == cluster]\n        \n        for chrom in [f for f in os.listdir(path_input_macro) if f.startswith(\'chrom\')]:\n            path_chrom = f"{path_input_macro}/{chrom}"\n            for chunk in [f for f in os.listdir(path_chrom) if f.startswith(\'chunk\')]:\n                path_chunk = f"{path_chrom}/{chunk}"\n                chunk_df = pd.read_pickle(path_chunk)\n                chunk_df[\'IID\'] = ids[\'IID\']\n                merged = pd.merge(cluster_df[[\'IID\']], chunk_df, on=\'IID\')\n                merged

In [13]:
"""
path_input = f"{CONFIG['PATH_data']}/07_macro_similar_per_sub_pop/"
path_output = f"{CONFIG['PATH_data']}/08_projected_per_sub_pop/"
for pop in [f for f in os.listdir(path_input) if f.startswith('pop')]:
    path_input_pop = f"{path_input}/{pop}"
    path_output_pop = f"{path_output}/{pop}"
    nr_of_projected_dimensions = 3
    project_on_dimensions(path_input_pop, path_output_pop, nr_of_projected_dimensions)
    
"""

'\npath_input = f"{CONFIG[\'PATH_data\']}/07_macro_similar_per_sub_pop/"\npath_output = f"{CONFIG[\'PATH_data\']}/08_projected_per_sub_pop/"\nfor pop in [f for f in os.listdir(path_input) if f.startswith(\'pop\')]:\n    path_input_pop = f"{path_input}/{pop}"\n    path_output_pop = f"{path_output}/{pop}"\n    nr_of_projected_dimensions = 3\n    project_on_dimensions(path_input_pop, path_output_pop, nr_of_projected_dimensions)\n    \n'

In [7]:
"""
path_propjected = f"{CONFIG['PATH_data']}/06_projected_per_pop/"
path_input = f"{CONFIG['PATH_data']}/08_projected_per_sub_pop/"
pops = [f for f in os.listdir(path_input) if f.startswith('pop')]
for pop in pops:

    sub_pop = int(pop.split('_')[3])
    pop = int(pop.split('_')[1])
    
    path_propjected_pop = f"{path_propjected}/pop_{pop}"
    merged_df = pd.read_pickle(f"{path_propjected_pop}/samples_with_clusters.pkl")
    temp_samples = merged_df[merged_df['clusters'] == sub_pop].copy()
    path_pop = f"{path_input}/pop_{pop}_subpop_{sub_pop}"
    dims = [f for f in os.listdir(path_pop) if f.startswith("dim")]
    for dim in dims:
        path_dim = f"{path_pop}/{dim}"
        PCs = [f for f in os.listdir(path_dim) if f.startswith("global")][0]
        path_PC = f"{path_dim}/{PCs}"
        PC_dim = pd.read_pickle(path_PC)
        temp_samples[dim] = list(PC_dim['PC1'])
    
    make_population_plot(temp_samples, 'dim_3', 'dim_2', 'Population name', 'test', palette = 'rocket')
"""

'\npath_propjected = f"{CONFIG[\'PATH_data\']}/06_projected_per_pop/"\npath_input = f"{CONFIG[\'PATH_data\']}/08_projected_per_sub_pop/"\npops = [f for f in os.listdir(path_input) if f.startswith(\'pop\')]\nfor pop in pops:\n\n    sub_pop = int(pop.split(\'_\')[3])\n    pop = int(pop.split(\'_\')[1])\n    \n    path_propjected_pop = f"{path_propjected}/pop_{pop}"\n    merged_df = pd.read_pickle(f"{path_propjected_pop}/samples_with_clusters.pkl")\n    temp_samples = merged_df[merged_df[\'clusters\'] == sub_pop].copy()\n    path_pop = f"{path_input}/pop_{pop}_subpop_{sub_pop}"\n    dims = [f for f in os.listdir(path_pop) if f.startswith("dim")]\n    for dim in dims:\n        path_dim = f"{path_pop}/{dim}"\n        PCs = [f for f in os.listdir(path_dim) if f.startswith("global")][0]\n        path_PC = f"{path_dim}/{PCs}"\n        PC_dim = pd.read_pickle(path_PC)\n        temp_samples[dim] = list(PC_dim[\'PC1\'])\n    \n    make_population_plot(temp_samples, \'dim_3\', \'dim_2\', \'Populat

# Find perfect MAF prediction

In [49]:
"""
folders = os.listdir(f"{CONFIG['PATH_data']}")
os.makedirs(f"{CONFIG['PATH_data']}/09_snps_id/", exist_ok=True)
projected_folders = [folder for folder in folders if "projected" in folder]
i = 0
for folder in projected_folders:
    dfs = []
    path_projected = f"{CONFIG['PATH_data']}/{folder}"
    pops = [f for f in os.listdir(path_projected) if f.startswith('pop')]
    for pop in pops:
        path_pop = f"{path_projected}/{pop}"
        snp_id = [f for f in os.listdir(path_pop) if f.startswith('snp')][0]
        path_snp = f"{path_pop}/{snp_id}"
        temp = pd.read_pickle(path_snp)
        temp['pop'] = pop
        temp['folder'] = folder
        dfs.append(temp)
        
    dfs = pd.concat(dfs, axis=0)
    i = i + 1
    dfs.to_pickle(f"{CONFIG['PATH_data']}/09_snps_id/projection_{i}_ids.pkl")
"""

'\nfolders = os.listdir(f"{CONFIG[\'PATH_data\']}")\nos.makedirs(f"{CONFIG[\'PATH_data\']}/09_snps_id/", exist_ok=True)\nprojected_folders = [folder for folder in folders if "projected" in folder]\ni = 0\nfor folder in projected_folders:\n    dfs = []\n    path_projected = f"{CONFIG[\'PATH_data\']}/{folder}"\n    pops = [f for f in os.listdir(path_projected) if f.startswith(\'pop\')]\n    for pop in pops:\n        path_pop = f"{path_projected}/{pop}"\n        snp_id = [f for f in os.listdir(path_pop) if f.startswith(\'snp\')][0]\n        path_snp = f"{path_pop}/{snp_id}"\n        temp = pd.read_pickle(path_snp)\n        temp[\'pop\'] = pop\n        temp[\'folder\'] = folder\n        dfs.append(temp)\n        \n    dfs = pd.concat(dfs, axis=0)\n    i = i + 1\n    dfs.to_pickle(f"{CONFIG[\'PATH_data\']}/09_snps_id/projection_{i}_ids.pkl")\n'

In [None]:
# For the first three SNP columns of the first chunk of the first chromosome,
# look up the rows describing that SNP in every stored projection table.
path_input = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"
# Only the projection_*_ids.pkl tables are wanted; sort because os.listdir
# returns entries in arbitrary order.
projection_ids = sorted(
    f for f in os.listdir(f"{CONFIG['PATH_data']}/09_snps_id/") if f.startswith('projection')
)
# Read each projection table once up front instead of once per SNP: the tables
# do not depend on the SNP being looked up.
projections = {
    name: pd.read_pickle(f"{CONFIG['PATH_data']}/09_snps_id/{name}")
    for name in projection_ids
}

# The [0:1] / [0:3] slices restrict this exploratory run to a tiny subset;
# sorting makes that subset the same on every run.
for chrom in sorted(f for f in os.listdir(path_input) if f.startswith('chrom'))[0:1]:
    path_chrom = f"{path_input}/{chrom}"
    for chunk in sorted(f for f in os.listdir(path_chrom) if f.startswith('chunk'))[0:1]:
        path_chunk = f"{path_chrom}/{chunk}"
        chunk_df = pd.read_pickle(path_chunk)
        for snp in list(chunk_df.columns)[0:3]:
            for projection_id in projection_ids:
                path_projection = f"{CONFIG['PATH_data']}/09_snps_id/{projection_id}"
                projection = projections[projection_id]
                # Rebound on every pass: after the loops snps_folders holds the
                # rows of the last SNP in the last projection table.
                snps_folders = projection[projection['snp_rs'] == snp].sort_values('pop')
                
                
                


yey rs3928804_T
rs3928804_T
yey rs6695244_A
rs6695244_A
yey rs12745158_G
rs12745158_G


In [51]:
# Display the rows found for the last SNP / projection table of the lookup loop.
snps_folders

Unnamed: 0,pval,betas,snp_rs,-logp,dim,pop,folder
2,0.403572,-0.42925,rs12745158_G,0.394079,2,pop_0_subpop_0,08_projected_per_sub_pop
2,0.184803,2.707016,rs12745158_G,0.733292,1,pop_0_subpop_1,08_projected_per_sub_pop
2,0.271207,1.207662,rs12745158_G,0.566698,2,pop_0_subpop_2,08_projected_per_sub_pop
2,0.058531,5.27024,rs12745158_G,1.232614,1,pop_1_subpop_0,08_projected_per_sub_pop
2,0.222673,-1.315814,rs12745158_G,0.652333,2,pop_1_subpop_1,08_projected_per_sub_pop
1,0.422229,1.603859,rs12745158_G,0.374452,2,pop_1_subpop_2,08_projected_per_sub_pop
2,0.133921,1.40312,rs12745158_G,0.873152,1,pop_1_subpop_3,08_projected_per_sub_pop
1,0.392363,0.87637,rs12745158_G,0.406312,2,pop_2_subpop_0,08_projected_per_sub_pop
2,0.090999,3.384176,rs12745158_G,1.040963,1,pop_2_subpop_1,08_projected_per_sub_pop
1,0.451859,-0.525493,rs12745158_G,0.344997,3,pop_2_subpop_2,08_projected_per_sub_pop


In [None]:
# Walk over the rows of snps_folders and rebuild, for each row, the directory
# path <PATH_data>/<folder>/<pop>/dim_<dim>/. The path is only assigned here,
# so after the loop path_projected refers to the last row.
for index, row in snps_folders.iterrows():
    pop, dim, folder = row['pop'], row['dim'], row['folder']
    path_projected = f"{CONFIG['PATH_data']}/{folder}/{pop}/dim_{dim}/"

In [62]:
# Load the allele-frequency table from 02_usefull.
# Here path_usefull is the path of the pickle file itself.
path_usefull = f"{CONFIG['PATH_data']}/02_usefull/allele_frequencies.pkl"
AFs = pd.read_pickle(path_usefull)


In [64]:
# Allele-frequency row(s) of `snp`, the loop variable left bound by the
# projection lookup loop (i.e. the last SNP it visited).
AFs[AFs['snp_rs'] == snp]

Unnamed: 0,snp_rs,AF,RSID,POS,CHROM
1363,rs12745158_G,0.5,rs12745158,39802413,1


In [61]:
# NOTE(review): the only visible assignment to snp_id binds a file name (str)
# inside the SNP-id collection loop, and a str cannot be indexed like this.
# The DataFrame shown in the output presumably came from a cell that is no
# longer in the notebook -- confirm which snp_ids.pkl it was and load it
# explicitly so the cell survives Restart & Run All.
snp_id[snp_id['snp_rs'] == snp]

Unnamed: 0,pval,betas,snp_rs,-logp,dim
2,0.317308,2.297452,rs12745158_G,0.498519,2
