In [1]:
import pandas as pd
import numpy as np
import pyreadr
import matplotlib.pyplot as plt
import seaborn as sns
import zstandard as zstd
from matplotlib.patches import Patch
from adjustText import adjust_text
import os

In [2]:
# Every relative path used later in the notebook is resolved against this directory.
os.chdir("/home/lucytian/data/1_Single_Cell_PRS/2_cV2F")

# Tab-separated, gzip-compressed variant annotation table. Later cells read its
# '#CHROM', 'POS', 'rsID', 'ID' and per-tissue 'cV2F_<TISSUE>' columns.
# low_memory=False makes pandas parse the file in one pass instead of in chunks.
annot = pd.read_csv(
    'ukb_genoHM3_mvp_afr_tissue_cV2F.annot.pvar.gz',
    sep='\t',
    compression='gzip',
    low_memory=False,
)

In [3]:
# Phenotype code -> human-readable trait name.
pheno_name_dict = {
    'INI30120': 'Lymphocyte Count',
    'INI50030700': 'eGFR',
    'INI1003063': 'FEV1/FVC ratio',
    'INI20030780': 'LDL-C',
}

In [4]:
# Directory (relative to the working directory) holding the baseline model fits;
# later cells read '<base_d>/<trait>/.../snpnet.BETAs.tsv.gz' from it.
base_d = 'baselines/406k_geno_v2_UKB_18PCs/fit_w_val'
# Directory holding the tissue-specific cV2F model fits;
# later cells read '<data_d>/<tissue>/<trait>/.../snpnet.BETAs.tsv.gz' from it.
data_d = 'mvp_afr_0.9_0.01/20231221/406k_geno_v2_UKB_18PCs/fit_w_val'

In [5]:
## Pre-calculated per-tissue cV2F cutoffs (top 5% of cV2F scores).
## A SNP counts as "high cV2F" when its score is >= the cutoff of its tissue.
cutoffs = {
    'BLOOD': 0.7324814022,
    'LIVER': 0.6460890472,
    'LUNG': 0.6198484063,
    'KIDNEY': 0.619365716,
}

In [6]:
def assign_color_size(x, y, x1, cv2f, cutoff):
    """Partition SNP positions into three disjoint groups by two criteria.

    Standard 1: ``cv2f[i] >= cutoff`` AND ``y[i]`` lies at least five standard
    deviations (population std, ``np.std`` default) away from the mean of ``y``.

    Standard 2: ``|x[i] - y[i]|`` is larger than half of one "increment", where
    an increment is one tenth of the combined value range of ``x`` and ``y``.

    Parameters
    ----------
    x, y : sequence of float
        Paired values, one per SNP, in the same order. Expected to be NaN-free
        (callers fill missing values before calling).
    x1 : any
        Unused; kept only so existing call sites keep working.
    cv2f : sequence of float
        Per-SNP score compared against ``cutoff``; same length/order as ``x``.
    cutoff : float
        Minimum ``cv2f`` value for a SNP to be eligible for standard 1.

    Returns
    -------
    tuple of (set, set, set)
        ``(standard_1_only, standard_2_only, overlap)`` -- sets of integer
        positions into the inputs. The three sets are pairwise disjoint.
        Empty input yields three empty sets.

    Raises
    ------
    ValueError
        If ``x``, ``y`` and ``cv2f`` do not all have the same length.
    """
    x_arr = np.asarray(x, dtype=float)
    y_arr = np.asarray(y, dtype=float)
    cv2f_arr = np.asarray(cv2f, dtype=float)

    # Guard against silent NumPy broadcasting of mismatched inputs.
    if not (x_arr.shape == y_arr.shape == cv2f_arr.shape):
        raise ValueError("x, y and cv2f must have the same length")

    # Nothing to classify; also avoids min()/max() failing on empty input.
    if x_arr.size == 0:
        return set(), set(), set()

    # Standard 1: high score and an extreme y value (outside mean +/- 5 std).
    y_center = np.mean(y_arr)
    y_spread = 5 * np.std(y_arr)
    is_extreme = (y_arr <= y_center - y_spread) | (y_arr >= y_center + y_spread)
    meets_standard_1 = (cv2f_arr >= cutoff) & is_extreme

    # Standard 2: x and y differ by more than half a tenth of the joint range.
    min_val = min(x_arr.min(), y_arr.min())
    max_val = max(x_arr.max(), y_arr.max())
    increment = (max_val - min_val) / 10
    meets_standard_2 = np.abs(x_arr - y_arr) > (increment / 2)

    # The three masks below are mutually exclusive by construction, so no
    # additional disjointness check is needed.
    overlap = set(np.flatnonzero(meets_standard_1 & meets_standard_2).tolist())
    standard_1_only = set(np.flatnonzero(meets_standard_1 & ~meets_standard_2).tolist())
    standard_2_only = set(np.flatnonzero(meets_standard_2 & ~meets_standard_1).tolist())
    return standard_1_only, standard_2_only, overlap

In [7]:
# Phenotype code -> tissue whose cV2F scores (column 'cV2F_<TISSUE>') and
# cutoff are used for that trait.
tissue_trait_dict = {
    'INI30120': 'BLOOD',
    'INI50030700': 'KIDNEY',
    'INI1003063': 'LUNG',
    'INI20030780': 'LIVER',
}

In [8]:
# Per-trait SNP tables keyed by phenotype code. `snp_by_group` is reassigned on
# every iteration, so without this dict only the last trait would survive.
snp_tables = {}

for tr in ['INI30120', 'INI50030700', 'INI1003063']:
    tissue = tissue_trait_dict[tr]
    cv2f_col = 'cV2F_' + tissue

    # Coefficient tables of the tissue-specific cV2F fit and of the baseline fit.
    # The first 23 rows of each table are skipped -- presumably non-SNP
    # (covariate) rows; TODO confirm against the snpnet output layout.
    df_beta = pd.read_csv(f'{data_d}/{tissue}/{tr}/snpnet.BETAs.tsv.gz', sep='\t', compression='gzip')
    df_beta = df_beta.iloc[23:, :]
    df_beta_base = pd.read_csv(f'{base_d}/{tr}/snpnet.BETAs.tsv.gz', sep='\t', compression='gzip')
    df_beta_base = df_beta_base.iloc[23:, :]

    # Inner joins: keep SNPs present in both fits and in the annotation table.
    # After the first merge, '*_x' columns come from the baseline fit and
    # '*_y' columns from the cV2F fit.
    merged = df_beta_base.merge(df_beta, on='#ID')
    merged = merged.merge(annot[['#CHROM', 'POS', 'rsID', 'ID', cv2f_col]], left_on='#ID', right_on='ID')

    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered view (avoids SettingWithCopyWarning).
    merged = merged[merged['#CHROM'] != 'MT'].copy()
    # Encode chromosome X as 23 so the column can be sorted numerically.
    merged['#CHROM'] = merged['#CHROM'].replace('X', 23).astype(int)
    merged[cv2f_col] = merged[cv2f_col].fillna(0)
    merged['BETA_x'] = merged['BETA_x'].fillna(0)
    merged['BETA_y'] = merged['BETA_y'].fillna(0)
    # Contiguous 0..n-1 index so row positions equal the list positions that
    # assign_color_size reports.
    merged = merged.sort_values(by=['#CHROM', 'POS']).reset_index(drop=True)

    x_1 = merged['#CHROM'].tolist()
    x_cv2f = merged[cv2f_col].tolist()
    y = merged['BETA_y'].tolist()
    x = merged['BETA_x'].tolist()

    s1, s2, overlap = assign_color_size(x, y, x_1, x_cv2f, cutoffs[tissue])

    # Label rows by position instead of merging back on rsID: an rsID merge
    # duplicates/mislabels rows when rsIDs are missing or repeated, and a
    # frame-wide fillna('Group D') would overwrite NaNs in unrelated columns.
    group_labels = np.full(len(merged), 'Group D', dtype=object)
    for label, members in (('Group C', s1), ('Group B', s2), ('Group A', overlap)):
        group_labels[np.array(sorted(members), dtype=np.intp)] = label

    snp_by_group = merged.rename(columns={'BETA_y': 'Criterion 1', 'A1_x': 'A1'})
    snp_by_group['group'] = group_labels
    snp_by_group['Criterion 2'] = abs(snp_by_group['BETA_x'] - snp_by_group['Criterion 1'])
    snp_by_group = snp_by_group[['#ID', '#CHROM', 'POS', 'A1', 'rsID', cv2f_col, 'Criterion 1', 'Criterion 2', 'group']]
    snp_tables[tr] = snp_by_group
    #snp_by_group.to_csv('_'.join(pheno_name_dict[tr].split('/')) + '_SNPs.tsv', index=False, sep='\t')
    

In [9]:
# Show the table left over from the loop above, i.e. the one for the last
# trait it processed (INI1003063).
snp_by_group

Unnamed: 0,#ID,#CHROM,POS,A1,rsID,cV2F_LUNG,Criterion 1,Criterion 2,group
0,1:913889:G:A,1,913889,A,rs2340596,0.489926,0.000001,0.000053,Group D
1,1:918384:G:T,1,918384,T,rs13303118,0.610554,0.000079,0.000023,Group D
2,1:949608:G:A,1,949608,A,rs1921,0.654584,-0.000253,0.000018,Group D
3,1:1006223:G:A,1,1006223,A,rs9442394,0.331460,0.000009,0.000003,Group D
4,1:1018704:A:G,1,1018704,G,rs9442372,0.272479,0.000232,0.000005,Group D
...,...,...,...,...,...,...,...,...,...
10900,X:153711305:C:A,23,153711305,A,rs5945432,0.000000,-0.000046,0.000045,Group D
10901,X:153713787:C:T,23,153713787,T,rs7057286,0.000000,0.000097,0.000023,Group D
10902,X:153760654:G:A,23,153760654,A,rs2230037,0.000000,0.000076,0.000052,Group D
10903,X:153881525:G:C,23,153881525,C,rs17328091,0.000000,0.000112,0.000036,Group D


In [10]:
### LDL-C
# Same SNP grouping as for the other traits, but reading both fits from their
# 'exclude_APOE' sub-directories.
tr='INI20030780'
cv2f_col = 'cV2F_' + tissue_trait_dict[tr]

# The first 23 rows of each coefficient table are skipped -- presumably non-SNP
# (covariate) rows; TODO confirm against the snpnet output layout.
df_beta = pd.read_csv(f'{data_d}/{tissue_trait_dict[tr]}/{tr}/exclude_APOE/snpnet.BETAs.tsv.gz', sep='\t', compression='gzip')
df_beta = df_beta.iloc[23:, :]
df_beta_base = pd.read_csv(f'{base_d}/{tr}/exclude_APOE/snpnet.BETAs.tsv.gz', sep='\t', compression='gzip')
df_beta_base = df_beta_base.iloc[23:, :]

# Inner joins: '*_x' columns come from the baseline fit, '*_y' from the cV2F fit.
merged = df_beta_base.merge(df_beta, on='#ID')
merged = merged.merge(annot[['#CHROM', 'POS', 'rsID', 'ID', cv2f_col]], left_on='#ID', right_on='ID')

# .copy() so the column assignments below act on an independent frame rather
# than on a filtered view (avoids SettingWithCopyWarning).
merged = merged[merged['#CHROM'] != 'MT'].copy()
# Encode chromosome X as 23 so the column can be sorted numerically.
merged['#CHROM'] = merged['#CHROM'].replace('X', 23)
merged['#CHROM'] = merged['#CHROM'].astype(int)
merged[cv2f_col] = merged[cv2f_col].fillna(0)
merged['BETA_x'] = merged['BETA_x'].fillna(0)
merged['BETA_y'] = merged['BETA_y'].fillna(0)
merged = merged.sort_values(by=['#CHROM', 'POS'])
x_1 = merged['#CHROM'].tolist()
x_cv2f = merged[cv2f_col].tolist()
y = merged['BETA_y'].tolist()
x = merged['BETA_x'].tolist()
ids = merged['rsID'].tolist()

s1, s2, overlap = assign_color_size(x, y, x_1, x_cv2f, cutoffs[tissue_trait_dict[tr]])

# These three names are no longer needed to build snp_by_group; they are still
# defined because cells outside this view may read them.
rs_candidates = [ids[i] for i in s1] + [ids[i] for i in s2] + [ids[i] for i in overlap]
rs_group = ['Group C'] * len(s1) + ['Group B'] * len(s2) + ['Group A'] * len(overlap)
select_df = pd.DataFrame(data={'rsID': rs_candidates, 'group': rs_group})

# Label rows by position instead of merging back on rsID: an rsID merge
# duplicates/mislabels rows when rsIDs are missing or repeated, and a
# frame-wide fillna('Group D') would overwrite NaNs in unrelated columns.
group_labels = np.full(len(merged), 'Group D', dtype=object)
for label, members in (('Group C', s1), ('Group B', s2), ('Group A', overlap)):
    group_labels[np.array(sorted(members), dtype=np.intp)] = label

# reset_index gives the table a 0..n-1 index in chromosome/position order;
# `merged` itself keeps its own index.
snp_by_group = merged.reset_index(drop=True).rename(columns={'BETA_y': 'Criterion 1', 'A1_x': 'A1'})
snp_by_group['group'] = group_labels
snp_by_group['Criterion 2'] = abs(snp_by_group['BETA_x'] - snp_by_group['Criterion 1'])
snp_by_group = snp_by_group[['#ID', '#CHROM', 'POS', 'A1', 'rsID', cv2f_col, 'Criterion 1', 'Criterion 2', 'group']]

In [11]:
# Show the LDL-C (INI20030780) SNP table built in the preceding cell.
snp_by_group

Unnamed: 0,#ID,#CHROM,POS,A1,rsID,cV2F_LIVER,Criterion 1,Criterion 2,group
0,1:967658:C:T,1,967658,T,rs4970349,0.772511,-0.001575,0.000799,Group D
1,1:1171417:C:T,1,1171417,T,rs6603782,0.484944,0.000363,0.000073,Group D
2,1:1451380:T:C,1,1451380,C,rs9439458,0.437485,0.000169,0.000042,Group D
3,1:1551927:T:C,1,1551927,C,rs7418389,0.724051,-0.000290,0.000025,Group D
4,1:1687791:C:T,1,1687791,T,rs72634840,0.465423,-0.000200,0.000102,Group D
...,...,...,...,...,...,...,...,...,...
11715,X:153640406:C:T,23,153640406,T,rs62617809,0.000000,0.000156,0.000022,Group D
11716,X:153689893:G:A,23,153689893,A,rs35285799,0.000000,-0.000154,0.000061,Group D
11717,X:153881525:G:C,23,153881525,C,rs17328091,0.000000,-0.000202,0.000064,Group D
11718,X:154302678:C:T,23,154302678,T,rs5945286,0.000000,-0.000046,0.000126,Group D
