In [None]:
import os
import math
import pandas as pd
import numpy as np

# Configuration settings
from chasm.config import CONFIG

# PLINK preprocessing utilities
from chasm.plink_preprocessing import (
    test, 
    make_AFs, 
    concat_AFs, 
    divide_into_chunks, 
    make_ids,
    is_snp
)



In [None]:
# Convert the GTM genotype table into pickles (sample-id columns + SNP matrix) and
# compute one allele frequency (AF) per SNP column, annotated with CHROM and POS
# for the corresponding RSID.
#
# DISABLED CELL: everything between the triple quotes below is a plain string
# literal, so none of this code runs when the cell is executed.
#
# What the disabled code does, step by step:
#   1. Reads the space-separated `recoded_1000G.raw` table and splits its columns
#      with the imported `is_snp` helper (its exact rule is not visible here).
#      Columns rejected by `is_snp` go to `usefull/ids.pkl`, the remaining columns
#      to `01_raw/geno.pkl`.
#   2. Computes AF per column as column_sum / (2 * number_of_rows).
#   3. Derives `RSID` by stripping a trailing `_A`/`_C`/`_T`/`_G` from the column
#      name, inner-merges the AF table on `RSID` with every pickle whose file name
#      starts with 'build' under `chrom_1` .. `chrom_22`, sorts by CHROM then POS
#      and writes `usefull/allele_frequencies.pkl`.
#      (POS presumably comes from the build pickles -- TODO confirm.)
#
# NOTE(review), to address before re-enabling:
#   - `path_ensembl` is a hard-coded absolute path (/mnt/e/...) instead of being
#     derived from CONFIG['PATH_data'] like every other path in this cell.
#   - The DataFrame built with columns=["AF"] is immediately replaced by the
#     `reset_index()` result two lines later, so that first assignment is dead.
#   - `path_data_ids` and `path_data_lbls3` are assigned but not used in this cell.
#   - The loop variable `build` is rebound from a file name to the loaded DataFrame.
"""
# GTM data
path_data_raw = f"{CONFIG['PATH_data']}/00_raw/recoded_1000G.raw"
path_data_ids = f"{CONFIG['PATH_data']}/00_raw/recoded_1000G.raw.noadmixed.ids"
path_data_lbls3 = f"{CONFIG['PATH_data']}/00_raw/recoded_1000G.raw.noadmixed.lbls3"

geno = pd.read_csv(path_data_raw, sep=" ")
non_snp_cols = [col for col in geno.columns if not is_snp(col)]
path_usefull = f"{CONFIG['PATH_data']}/usefull/"
os.makedirs(path_usefull, exist_ok=True)
geno[non_snp_cols].to_pickle(f"{path_usefull}/ids.pkl")
geno = geno.drop(columns=non_snp_cols)
path_raw = f"{CONFIG['PATH_data']}/01_raw/"
os.makedirs(path_raw, exist_ok=True)
geno.to_pickle(f"{path_raw}/geno.pkl")
allele_frequencies = geno.sum(axis=0) / (2 * geno.shape[0])

allele_frequencies_df = pd.DataFrame(allele_frequencies, columns=["AF"])
# Assuming allele_frequencies is a Pandas Series
allele_frequencies_df = allele_frequencies.reset_index()
allele_frequencies_df.columns = ["snp_rs", "AF"]  # Rename columns
# Create a new column without the allele suffix
allele_frequencies_df['RSID'] = allele_frequencies_df['snp_rs'].str.replace(r'_[ACTG]$', '', regex=True)

merged_dfs = []
path_ensembl = f"/mnt/e/1000G_data/usefull/ensembl_build"
for chrom in list(range(22)):
    chrom += 1
    path_ensembl_chrom = f"{path_ensembl}/chrom_{chrom}"
    for build in [f for f in os.listdir(path_ensembl_chrom) if f.startswith('build')]:
        build = pd.read_pickle(f"{path_ensembl_chrom}/{build}")
        build['CHROM'] = chrom
        merged_dfs.append(pd.merge(allele_frequencies_df, build, left_on='RSID', right_on='RSID', how='inner'))
        
df = pd.concat(merged_dfs, axis=0)
# Sorting by CHROM and then POS
df_sorted = df.sort_values(by=["CHROM", "POS"]).reset_index(drop=True)
df_sorted.to_pickle(f"{path_usefull}/allele_frequencies.pkl")
"""

In [35]:
# Divide the AFs into chunks
#
# DISABLED CELL: everything between the triple quotes below is a plain string
# literal, so none of this code runs; the cell's only output is the repr of
# that string.
#
# What the disabled code does:
#   - Loads `01_raw/geno.pkl`, fills missing values with 0 and maps every value g
#     to (g - 1) * -1, i.e. 1 - g. A missing value therefore ends up as 1.
#     (Presumably genotypes are coded 0/1/2 and become 1/0/-1 -- TODO confirm.)
#   - For every chromosome in `usefull/allele_frequencies.pkl`: keeps rows with
#     AF > min_maf, sorts them by AF in descending order and cuts them into
#     consecutive segments of roughly `size_chunck` SNPs.
#   - Segment sizes: there are `nr_snps_total // size_chunck` segments; the
#     remainder is spread evenly over them and the first `rest` segments get one
#     extra SNP, so the sizes add up to `nr_snps_total`.
#   - Each segment's genotype columns are pickled to
#     `03_macro_similar_AF/chrom_<chrom>/chunk_<i>_size_<n>_mafs_<min>_<max>.pkl`.
#
# NOTE(review), to address before re-enabling:
#   - When a chromosome has fewer than `size_chunck` SNPs, `num_subframes` is 0 and
#     the integer division raises ZeroDivisionError. The broad `except Exception`
#     turns that into a single segment, but it would also hide any other error
#     raised inside the `try` block.
#   - The filter is one-sided (AF > min_maf): SNPs with AF close to 1 are kept
#     even though the threshold is named `min_maf` -- TODO confirm this is intended.
#   - `feature_size` is assigned but not used.
"""
size_chunck = 20_000
min_maf = 0.01

path_raw = f"{CONFIG['PATH_data']}/01_raw/"
geno = pd.read_pickle(f"{path_raw}/geno.pkl")
geno.fillna(0, inplace=True)
geno = (geno - 1)*-1

path_afs = f"{CONFIG['PATH_data']}/usefull/allele_frequencies.pkl"
path_output = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"
os.makedirs(path_output, exist_ok=True)
afs = pd.read_pickle(path_afs)

for chrom in afs['CHROM'].unique():
    path_output_chrom = f"{path_output}/chrom_{chrom}/"
    os.makedirs(path_output_chrom, exist_ok=True)
    
    afs_chrom = afs[afs['CHROM'] == chrom]
    afs_chrom = afs_chrom[afs_chrom['AF'] > min_maf]

    afs_chrom = afs_chrom.sort_values(by=["AF"], ascending=False).reset_index(drop=True)
    nr_snps_total = len(afs_chrom)
    num_subframes = nr_snps_total//size_chunck
    remaining_rows = nr_snps_total%size_chunck
    
    try:
        to_divide_in = remaining_rows//num_subframes
        rest = nr_snps_total- ((size_chunck + to_divide_in)*num_subframes)
        snps_per_segments = np.ones(num_subframes) * (size_chunck+ to_divide_in)
        to_add = np.concatenate((np.ones(rest), np.zeros(num_subframes - rest)),axis = 0)
        snps_per_segments = snps_per_segments + to_add
    except Exception as e:
        snps_per_segments = [nr_snps_total]
        
    # Make Chunks per chromosomes
    start = 0
    end = 0
    i = 0
    for nr_snps in snps_per_segments:
        end = int(end + nr_snps)
        AF_chunk = afs_chrom[start:end].copy()
        start = int(start + nr_snps)
        feature_size = AF_chunk.shape[0]
        i+=1
        minaf = np.round(AF_chunk['AF'].min(),2)
        maxaf = np.round(AF_chunk['AF'].max(),2)
        geno[AF_chunk['snp_rs']].to_pickle(f"{path_output_chrom}/chunk_{i}_size_{len(AF_chunk)}_mafs_{minaf}_{maxaf}.pkl")
"""

'\nsize_chunck = 20_000\nmin_maf = 0.01\n\npath_raw = f"{CONFIG[\'PATH_data\']}/01_raw/"\ngeno = pd.read_pickle(f"{path_raw}/geno.pkl")\ngeno.fillna(0, inplace=True)\ngeno = (geno - 1)*-1\n\npath_afs = f"{CONFIG[\'PATH_data\']}/usefull/allele_frequencies.pkl"\npath_output = f"{CONFIG[\'PATH_data\']}/03_macro_similar_AF/"\nos.makedirs(path_output, exist_ok=True)\nafs = pd.read_pickle(path_afs)\n\nfor chrom in afs[\'CHROM\'].unique():\n    path_output_chrom = f"{path_output}/chrom_{chrom}/"\n    os.makedirs(path_output_chrom, exist_ok=True)\n    \n    afs_chrom = afs[afs[\'CHROM\'] == chrom]\n    afs_chrom = afs_chrom[afs_chrom[\'AF\'] > min_maf]\n\n    afs_chrom = afs_chrom.sort_values(by=["AF"], ascending=False).reset_index(drop=True)\n    nr_snps_total = len(afs_chrom)\n    num_subframes = nr_snps_total//size_chunck\n    remaining_rows = nr_snps_total%size_chunck\n    \n    try:\n        to_divide_in = remaining_rows//num_subframes\n        rest = nr_snps_total- ((size_chunck + to_div

# Choose SNPs to project onto one dimension

In [36]:
# Step 1: assemble the genotype matrix that is used as PCA input.
#
# Roughly the same number of SNP columns is drawn at random from every chunk
# pickle of every chromosome directory under `03_macro_similar_AF/`, so that
# about `nr_snps_for_PCA` SNPs are collected in total. The result is `genos`,
# a DataFrame with one column per sampled SNP.

nr_snps_for_PCA = 20_000
# Fixed seed: without it every run would draw a different set of SNP columns.
seed_snp_sampling = 42

path_input = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"

# os.listdir returns entries in arbitrary order. Sort them so the sampling is
# reproducible, and keep only directories so stray files are not mistaken for
# chromosomes.
chroms = sorted(
    entry
    for entry in os.listdir(path_input)
    if os.path.isdir(os.path.join(path_input, entry))
)
if not chroms:
    raise FileNotFoundError(f"No chromosome directories found in {path_input}")

nr_snps_for_PCA_per_chrom = math.ceil(nr_snps_for_PCA / len(chroms))

# One generator shared by all draws: re-running this cell reproduces the same
# selection, while different chunks still receive different random draws.
rng = np.random.default_rng(seed_snp_sampling)

sampled_chunks = []
for chrom in chroms:
    path_chrom = os.path.join(path_input, chrom)
    # Only the chunk pickles are relevant; sorted for a deterministic order.
    chunks = sorted(name for name in os.listdir(path_chrom) if name.endswith(".pkl"))
    if not chunks:
        # Nothing to sample here; also avoids a division by zero below.
        continue
    nr_snps_for_PCA_per_chunks = math.ceil(nr_snps_for_PCA_per_chrom / len(chunks))

    for chunk in chunks:
        geno = pd.read_pickle(os.path.join(path_chrom, chunk))
        # A chunk may hold fewer columns than requested; take what is available.
        n = min(nr_snps_for_PCA_per_chunks, geno.shape[1])
        sampled_chunks.append(geno.sample(n=n, axis=1, random_state=rng))

# Column-wise concatenation: all chunks are expected to share the same row index.
genos = pd.concat(sampled_chunks, axis=1)


In [37]:
# Inspect the genotype matrix assembled for the PCA.
genos

Unnamed: 0,rs4951095_A,rs6665357_C,rs843949_C,rs171137_A,rs9441941_C,rs1340771_G,rs2077061_T,rs1702304_A,rs58830878_G,rs1635506_C,...,rs113570940_C,rs1894532_C,rs133662_T,rs2267305_C,rs139134_A,rs115079653_C,rs867086_C,rs9607372_A,rs737923_G,rs66468952_A
0,1,-1,-1,0,0,1,1,0,0,1,...,1.0,1,1,-1,0,0,-1,-1,0,-1
1,1,1,0,0,0,1,-1,1,0,0,...,1.0,0,0,0,0,0,-1,0,0,1
2,0,0,1,-1,0,0,0,0,0,-1,...,1.0,0,1,-1,0,1,-1,1,0,0
3,1,1,1,-1,1,0,0,1,1,-1,...,1.0,1,1,0,0,1,0,-1,0,1
4,0,0,-1,0,-1,1,1,0,1,0,...,1.0,-1,1,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499,1,1,0,0,0,0,0,0,0,0,...,1.0,0,1,-1,0,1,-1,-1,-1,1
2500,0,0,0,0,0,0,-1,1,0,1,...,1.0,1,0,-1,0,1,1,0,1,-1
2501,1,0,0,-1,1,0,1,1,0,0,...,1.0,1,-1,0,-1,1,1,-1,0,1
2502,0,1,0,0,1,0,0,0,0,0,...,1.0,1,0,0,0,0,0,0,0,0


In [38]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize every SNP column (zero mean, unit variance).
# NOTE: `genos` is rebound from a DataFrame to a NumPy array here, so the column
# names (SNP ids) and the row index are no longer attached to it afterwards.
scaler = StandardScaler()
genos = scaler.fit_transform(genos)  # Returns a NumPy array

# Apply PCA
n_components = 5  # Number of principal components to keep
# With the default svd_solver='auto', scikit-learn may pick a randomized solver
# for a matrix of this size; a fixed random_state keeps the components
# reproducible across runs.
pca = PCA(n_components=n_components, random_state=42)
genos_pca = pca.fit_transform(genos)  # Transform the data

# Convert PCA output to a DataFrame with one column per component (PC1..PCn)
genos_pca = pd.DataFrame(genos_pca, columns=[f'PC{i+1}' for i in range(n_components)])

# Fraction of the total variance explained by each retained component
explained_variance = pca.explained_variance_ratio_

In [39]:
# Inspect the PCA-transformed data (one column per retained component, PC1..PC5).
genos_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
0,28.046845,3.023890,1.533245,0.039748,-0.842892
1,16.100059,2.647944,-10.248734,3.914047,-1.073932
2,34.738629,5.739448,1.321606,-0.495051,-3.520648
3,27.760890,2.078005,1.241756,-4.175199,-0.883958
4,33.299482,5.766866,1.700281,-0.102952,-0.575061
...,...,...,...,...,...
2499,-9.691925,-7.408922,-26.339319,9.674844,1.914666
2500,-9.301613,-8.050064,-26.604209,8.188526,-1.307271
2501,-8.929657,-6.764144,-26.099826,8.686889,-1.227916
2502,-9.571332,-7.937263,-27.943715,9.161649,-0.943432
