In [1]:
import os
import math
import pandas as pd
import numpy as np

# Configuration settings
from chasm.config import CONFIG

# PLINK preprocessing utilities
from chasm.plink_preprocessing import (
    test, 
    concat_AFs, 
    divide_into_chunks, 
    make_ids,
    make_AFs
)

from chasm.data_preprocessing import (
    is_snp,
    make_df,
    calculate_AFs,
    merge_AFs_ensembl_build,
    divide_into_chunks
)

from chasm.gwas import ols_regression, pca_of_n_snps

In [2]:
# Disabled one-off preprocessing. Both steps are wrapped in triple-quoted string
# literals, so nothing in this cell is executed.
#
# Step 1 (per the original note): convert the GTM data into a pickle file with an
# accompanying id file, and calculate the allele frequency (AF) of each SNP
# together with the CHROM:POS of the corresponding RSID.
# NOTE(review): if this step is re-enabled, `path_output` is passed to make_df()
# but is only assigned in the second disabled block below, and `path_raw` is
# assigned here without being used.
"""
# GTM data
path_data_raw = f"{CONFIG['PATH_data']}/00_raw/recoded_1000G.raw"
path_raw = f"{CONFIG['PATH_data']}/01_raw/"
path_usefull = f"{CONFIG['PATH_data']}/02_usefull/"

geno = make_df(path_data_raw, path_usefull, path_output)

afs = calculate_AFs(geno)

path_ensembl = f"/mnt/e/1000G_data/usefull/ensembl_build"
merge_AFs_ensembl_build(path_ensembl, path_usefull, afs)
"""
# Step 2: divide the allele frequencies into chunks.
# This string is the last expression of the cell, so the notebook echoes it back
# as the cell output.
"""
size_chunck = 20_000
min_maf = 0.01

path_raw = f"{CONFIG['PATH_data']}/01_raw/"
path_afs = f"{CONFIG['PATH_data']}/usefull/allele_frequencies.pkl"
path_output = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"

divide_into_chunks(path_raw, path_afs, path_output, size_chunck, min_maf)
"""

'\nsize_chunck = 20_000\nmin_maf = 0.01\n\npath_raw = f"{CONFIG[\'PATH_data\']}/01_raw/"\npath_afs = f"{CONFIG[\'PATH_data\']}/usefull/allele_frequencies.pkl"\npath_output = f"{CONFIG[\'PATH_data\']}/03_macro_similar_AF/"\n\ndivide_into_chunks(path_raw, path_afs, path_output, size_chunck, min_maf)\n'

# Choose SNPs to project onto one dimension

First PCA

In [23]:
def split_dataframe_iteratively(df, nr_of_projected_dimensions):
    """Cut a DataFrame column-wise into consecutive pieces.

    The pieces are taken from left to right. At every step the columns that
    are still unassigned are floor-divided by the number of pieces still to
    be produced, so any remainder ends up in the later pieces
    (10 columns in 3 pieces -> widths 3, 3, 4).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be partitioned. It is copied first, so the
        returned pieces are slices of the copy rather than of `df` itself.
    nr_of_projected_dimensions : int
        Number of pieces to produce. Zero or a negative value yields an
        empty list.

    Returns
    -------
    list of pandas.DataFrame
        The pieces in their original column order; a piece may have zero
        columns when there are more pieces than columns.
    """
    remaining = df.copy()
    pieces = []

    # Count the outstanding pieces down: n, n-1, ..., 1.
    for pieces_left in range(nr_of_projected_dimensions, 0, -1):
        width = remaining.shape[1] // pieces_left
        pieces.append(remaining.iloc[:, :width])
        remaining = remaining.iloc[:, width:]

    return pieces

In [21]:
# Preview cell: load a single genotype chunk (the first chunk of the first
# chromosome folder) into `geno` so it can be inspected interactively.
path_input = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"

# os.listdir() returns entries in arbitrary order; sort so that "first" refers
# to the same folder/file on every run and on every machine.
chroms = sorted(f for f in os.listdir(path_input) if f.startswith('chrom'))

# Not used in this cell; kept so the same module-level names exist as before.
p_vals = []
betas = []
snps = []

# Reset explicitly so a frame left over from an earlier run is never reused
# silently when no chunk is found.
geno = None
for chrom in chroms[0:1]:
    path_chrom = os.path.join(path_input, chrom)
    chunks = sorted(os.listdir(path_chrom))
    for chunk in chunks[0:1]:
        path_chunk = os.path.join(path_chrom, chunk)
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        geno = pd.read_pickle(path_chunk)

if geno is None:
    raise FileNotFoundError(f"No genotype chunk found under {path_input}")

In [24]:
# Split the loaded genotype chunk column-wise into three consecutive pieces.
# Note: this rebinds `chunks`, which the loading cells also use for the
# directory listing of chunk files.
chunks = split_dataframe_iteratively(df=geno, nr_of_projected_dimensions=3)

Unnamed: 0,rs3928804_T,rs6695244_A,rs12745158_G,rs3010023_A,rs703115_C,rs7516288_A,rs6704281_T,rs10888929_T,rs10801960_A,rs7547336_G,...,rs4908526_T,rs6690012_T,rs6676486_G,rs706435_C,rs9988428_G,rs1980798_T,rs6425998_C,rs1915870_C,rs12743597_G,rs7548519_A
0,0,0,1,-1,-1,0,0,-1,1,1,...,0,-1,-1,0,0,0,-1,-1,0,0
1,0,0,-1,1,0,-1,0,-1,0,-1,...,0,0,0,-1,-1,0,-1,-1,-1,0
2,0,1,1,1,0,-1,1,0,1,1,...,-1,1,1,0,-1,-1,0,1,0,-1
3,0,-1,-1,0,-1,1,1,-1,1,1,...,-1,0,0,-1,0,1,1,0,1,-1
4,0,0,1,0,0,-1,-1,0,1,-1,...,1,-1,0,0,0,-1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2499,-1,-1,-1,0,0,-1,0,0,-1,0,...,0,0,0,0,1,0,1,1,0,1
2500,1,1,0,1,0,1,0,1,-1,0,...,-1,1,1,0,-1,1,0,1,1,1
2501,0,1,0,1,1,1,0,0,0,0,...,-1,0,1,0,-1,1,1,-1,1,0
2502,0,1,-1,1,-1,0,-1,0,-1,0,...,0,1,1,0,0,0,0,1,1,1


In [None]:
nr_of_projected_dimensions = 3

# Print the cumulative fraction i / n at each of the n - 1 internal cut points
# when something is divided into n equal parts (1/3 and 2/3 for n = 3).
# The divisor follows `nr_of_projected_dimensions` instead of a hard-coded 3,
# and the loop starts at 1 rather than incrementing the loop variable in the body.
for i in range(1, nr_of_projected_dimensions):
    to_keep = i / nr_of_projected_dimensions
    print(to_keep)

0.3333333333333333
0.6666666666666666


In [7]:

# Settings for the global PCA.
n_components = 10
nr_snps = 20_000

# Input: the chunked genotype folder; output: a dedicated PCA folder, created
# on demand.
path_input = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"
path_output = f"{CONFIG['PATH_data']}/04_PCA/"
os.makedirs(path_output, exist_ok=True)

# `genos_pca` is indexed by 'PC1' in the association scan further down.
# NOTE(review): presumably pca_of_n_snps also writes the components to the
# given .pkl path - confirm in chasm.gwas.
path_global_pcs = f"{path_output}/global_PCs.pkl"
genos_pca = pca_of_n_snps(path_input, path_global_pcs, nr_snps, n_components)

In [29]:
# Association scan: call ols_regression once per SNP column of every chunk,
# pairing the SNP with the first principal component, and collect the beta and
# p-value reported under the SNP's name.
path_input = f"{CONFIG['PATH_data']}/03_macro_similar_AF/"

# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the result table reproducible across runs and machines.
chroms = sorted(f for f in os.listdir(path_input) if f.startswith('chrom'))

# Loop-invariant: look PC1 up once instead of once per SNP.
pc1 = genos_pca['PC1']

p_vals = []
betas = []
snps = []
for chrom in chroms:
    path_chrom = os.path.join(path_input, chrom)
    chunks = sorted(os.listdir(path_chrom))
    for chunk in chunks:
        path_chunk = os.path.join(path_chrom, chunk)
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        geno = pd.read_pickle(path_chunk)
        for snp in geno.columns:
            beta_values, p_values = ols_regression(pc1, geno[snp], covs=None)
            p_vals.append(p_values[snp])
            betas.append(beta_values[snp])
            snps.append(snp)

# From here on `p_vals` is the result table (one row per SNP), no longer a list.
p_vals = pd.DataFrame(data = {'pval': p_vals, 'betas':betas, 'snp_rs':snps})
# NOTE(review): the saved output of this cell shows a NumPy warning coming from
# this log10 call. A p-value of exactly 0 maps to +inf here and a NaN p-value
# stays NaN - confirm which case occurs and whether it needs handling.
p_vals['-logp'] = -np.log10(p_vals['pval'])


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [30]:
# Significance cut-off on the -log10(p) scale.
# NOTE(review): 8.3 corresponds to p of roughly 5e-9; the name suggests a
# Bonferroni correction, but the derivation is not shown - confirm the value.
bonfer = 8.3

# Split the scan results into SNPs above the cut-off and the rest. Rows whose
# '-logp' is NaN satisfy neither comparison and so appear in neither frame.
neg_log_p = p_vals['-logp']
to_keep = p_vals[neg_log_p > bonfer]
other = p_vals[neg_log_p <= bonfer]

In [33]:
# Load the allele-frequency table; it is joined onto the scan results via the
# 'snp_rs' column in the next step.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
path_afs = f"{CONFIG['PATH_data']}/usefull/allele_frequencies.pkl"
pos = pd.read_pickle(path_afs)

In [35]:
def _annotate_with_positions(assoc, positions, key='snp_rs'):
    """Inner-join `positions` onto `assoc` on `key`, safely re-runnable.

    This cell overwrites its own inputs, so running it twice used to merge the
    already-annotated frame again and produce `_x` / `_y` suffixed duplicates
    of every annotation column. Dropping any non-key columns of `positions`
    that are already present in `assoc` before merging makes a re-run yield
    the same frame as the first run.
    """
    already_joined = positions.columns.difference([key])
    base = assoc.drop(columns=already_joined, errors='ignore')
    return base.merge(positions, on=key, how='inner')


# Attach the columns of `pos` to both result sets; SNPs without a match in
# `pos` are dropped by the inner join.
to_keep = _annotate_with_positions(to_keep, pos)
other = _annotate_with_positions(other, pos)

In [36]:
# Show the SNPs whose -logp exceeds `bonfer`, now carrying the columns merged in from `pos`.
to_keep

Unnamed: 0,pval,betas,snp_rs,-logp,AF,RSID,POS,CHROM
0,1.641501e-64,9.543889,rs3928804_T,63.784759,0.500000,rs3928804,160943517,1
1,8.059506e-24,5.723635,rs12745158_G,23.093692,0.500000,rs12745158,39802413,1
2,1.092316e-21,-5.472473,rs703115_C,20.961652,0.500000,rs703115,158673247,1
3,1.854242e-09,3.489105,rs10888929_T,8.731834,0.500000,rs10888929,39681671,1
4,3.042075e-13,4.139335,rs814346_A,12.516830,0.500000,rs814346,190795603,1
...,...,...,...,...,...,...,...,...
21349,5.053940e-20,11.640626,rs2013215_C,19.296370,0.055911,rs2013215,49447222,22
21350,2.773652e-50,-18.127401,rs6004890_G,49.556948,0.055511,rs6004890,26011025,22
21351,1.676378e-17,-11.429114,rs117767918_A,16.775628,0.051118,rs117767918,26721111,22
21352,3.996715e-09,-8.010530,rs12167043_A,8.398297,0.050919,rs12167043,17286193,22


In [37]:
# Show the SNPs at or below the `bonfer` cut-off, now carrying the columns merged in from `pos`.
other

Unnamed: 0,pval,betas,snp_rs,-logp,AF,RSID,POS,CHROM
0,0.001380,-1.850217,rs6695244_A,2.860106,0.500000,rs6695244,18407223,1
1,0.481211,0.413966,rs3010023_A,0.317665,0.500000,rs3010023,212042477,1
2,0.147103,-0.847900,rs7516288_A,0.832378,0.500000,rs7516288,22320295,1
3,0.010445,-1.461384,rs6704281_T,1.981080,0.500000,rs6704281,221334304,1
4,0.007284,1.557526,rs10801960_A,2.137619,0.500000,rs10801960,117483702,1
...,...,...,...,...,...,...,...,...
17585,0.013558,3.196345,rs543164958_G,1.867794,0.057508,rs543164958,17511736,22
17586,0.000003,-5.930929,rs113570940_C,5.479788,0.055711,rs113570940,19545170,22
17587,0.302625,-1.339838,rs377221550_T,0.519095,0.055511,rs377221550,15611674,22
17588,0.078167,-2.233307,rs3984495_A,1.106978,0.054712,rs3984495,18743311,22
