In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import os
from os.path import join
import time

%load_ext autoreload
%autoreload 2

In [2]:
OUT_PATH='/n/groups/price/martin/LDSPEC_data/UKBB_trait'
DATA_PATH='/n/groups/price/martin/LDSPEC_data/UKBB_trait_from_Steven'

### Load data

In [3]:
# Get the ID list
df_fam = pd.read_csv('/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/unrelated_337K.txt',
                     sep=' ', index_col=False)
df_fam.columns = ['FID', 'IID']
id_list = list(df_fam['IID'])
print('n_sample=%d' % len(id_list))

n_sample=337425


In [4]:
# 64 independent traits
xl = pd.ExcelFile(DATA_PATH+'/Description_080419.xlsx')
df_64_inde_trait = xl.parse('64 Recommended indepent traits', )
inde_trait_list = list(df_64_inde_trait['Trait_Identifier'])

In [5]:
# UKBB phen file
df_phen=pd.read_csv(DATA_PATH+'/UKB_v3.061518.tab', sep='\t', index_col=False)
df_cov=pd.read_csv(DATA_PATH+'/ukb4777.processed_and_post.plinkPCs.tab.gz', 
                   sep='\t', index_col=False, compression='gzip')
df_phen_bc=pd.read_csv(DATA_PATH+'/UKB_biochemistry.051619.tab',
                       sep='\t', index_col=False)
df_cov_bc=pd.read_csv(DATA_PATH+'/UKB_biochemistry.051619.cov',
                       sep='\t', index_col=False)

### trait_list, .phen, and .cov files

In [6]:
trait_list = sorted([x for x in df_phen.columns if x not in ['FID', 'IID']])
print('# trait_list=%d'%len(trait_list))
temp_df = df_phen.loc[df_phen['FID'].isin(id_list)].copy()
trait_list = [x for x in trait_list if (~temp_df[x].isna()).sum()>1e5] # filter for N>1e5
print('# Remove N<1e5 traits, trait_list=%d'%len(trait_list))
trait_list_indpt = [x for x in trait_list if 'UKB_460K.%s'%x in inde_trait_list]
print('# Restrict to 64 indpt traits, trait_list_indpt=%d'%len(trait_list_indpt))

trait_list_bc = sorted([x for x in df_phen_bc.columns if x not in ['FID', 'IID']])
print('# trait_list_bc=%d'%len(trait_list_bc))
temp_df = df_phen_bc.loc[df_phen_bc['FID'].isin(id_list)].copy()
trait_list_bc = [x for x in trait_list_bc if (~temp_df[x].isna()).sum()>1e5]
print('# Remove N<1e5 traits, trait_list_bc=%d'%len(trait_list_bc))
trait_list_bc_indpt = [x for x in trait_list_bc if 'UKB_460K.%s'%x in inde_trait_list]
print('# Restrict to 64 indpt traits, trait_list_bc_indpt=%d'%len(trait_list_bc_indpt))

with open(OUT_PATH+'/trait_list.txt', 'w') as f:
    for x in trait_list:
        f.write('%s\n'%x)
with open(OUT_PATH+'/trait_list_indpt.txt', 'w') as f:
    for x in trait_list_indpt:
        f.write('%s\n'%x)
with open(OUT_PATH+'/trait_list_other.txt', 'w') as f:
    for x in [x for x in trait_list if x not in trait_list_indpt]:
        f.write('%s\n'%x)
        
with open(OUT_PATH+'/trait_list_bc.txt', 'w') as f:
    for x in trait_list_bc:
        f.write('%s\n'%x)
with open(OUT_PATH+'/trait_list_bc_indpt.txt', 'w') as f:
    for x in trait_list_bc_indpt:
        f.write('%s\n'%x)
with open(OUT_PATH+'/trait_list_bc_other.txt', 'w') as f:
    for x in [x for x in trait_list_bc if x not in trait_list_bc_indpt]:
        f.write('%s\n'%x)
        
with open(OUT_PATH+'/trait_list_all.txt', 'w') as f:
    for x in trait_list+trait_list_bc:
        f.write('%s\n'%x)
with open(OUT_PATH+'/trait_list_all_indpt.txt', 'w') as f:
    for x in trait_list_indpt+trait_list_bc_indpt:
        f.write('%s\n'%x)
with open(OUT_PATH+'/trait_list_all_other.txt', 'w') as f:
    for x in [x for x in trait_list+trait_list_bc if x not in trait_list_indpt+trait_list_bc_indpt]:
        f.write('%s\n'%x)

# trait_list=93
# Remove N<1e5 traits, trait_list=90
# Restrict to 64 indpt traits, trait_list_indpt=23
# trait_list_bc=30
# Remove N<1e5 traits, trait_list_bc=28
# Restrict to 64 indpt traits, trait_list_bc_indpt=9


In [7]:
# Write phen file
for trait in trait_list:
    temp_df = df_phen.loc[df_phen['FID'].isin(id_list), ['FID', 'IID', trait]].copy()
    temp_df.dropna(inplace=True)
    temp_df.to_csv(OUT_PATH + '/%s.phen'%trait, sep='\t', header=True, index=False)
    print('%s, n_sample=%d' % (trait, temp_df.shape[0]))
    
for trait in trait_list_bc:
    temp_df = df_phen_bc.loc[df_phen_bc['FID'].isin(id_list), ['FID', 'IID', trait]].copy()
    temp_df.dropna(inplace=True)
    temp_df.to_csv(OUT_PATH + '/%s.phen'%trait, sep='\t', header=True, index=False)
    print('%s, n_sample=%d' % (trait, temp_df.shape[0]))

blood_EOSINOPHIL_COUNT, n_sample=323294
blood_HIGH_LIGHT_SCATTER_RETICULOCYTE_COUNT, n_sample=321545
blood_LYMPHOCYTE_COUNT, n_sample=326239
blood_MEAN_CORPUSCULAR_HEMOGLOBIN, n_sample=326123
blood_MEAN_PLATELET_VOL, n_sample=327140
blood_MEAN_SPHERED_CELL_VOL, n_sample=321581
blood_MONOCYTE_COUNT, n_sample=325257
blood_PLATELET_COUNT, n_sample=326556
blood_PLATELET_DISTRIB_WIDTH, n_sample=326768
blood_RBC_DISTRIB_WIDTH, n_sample=325330
blood_RED_COUNT, n_sample=327144
blood_WHITE_COUNT, n_sample=326659
bmd_HEEL_TSCOREz, n_sample=327675
body_BALDING1, n_sample=155070
body_BALDING4, n_sample=155070
body_BMIz, n_sample=336329
body_HEIGHTz, n_sample=336694
body_LEFT_HANDED, n_sample=337353
body_WHRadjBMIz, n_sample=336781
bp_DIASTOLICadjMEDz, n_sample=310771
bp_SYSTOLICadjMEDz, n_sample=310771
cancer_ALL, n_sample=337422
cancer_BREAST, n_sample=337422
cancer_MELANOMA, n_sample=337422
cancer_PROSTATE, n_sample=337422
cov_EDU_COLLEGE, n_sample=334288
cov_EDU_YEARS, n_sample=334288
cov_SMOKI

### .resid.phen file

In [8]:
cov_list = ['cov_ASSESS_CENTER', 'cov_GENO_ARRAY', 'cov_SEX']
qcov_list = ['cov_AGE', 'cov_AGE_SQ'] + ['PC%d'%x for x in np.arange(1,21)] 

# Categorical covaraites 
temp_df_cov = df_cov[['FID', 'IID'] + cov_list]
temp_df_cov.index = temp_df_cov['IID']
temp_df_cov = pd.get_dummies(temp_df_cov[cov_list], columns=cov_list, drop_first=True)

# df_reg
df_reg = df_cov[['FID', 'IID'] + qcov_list]
df_reg.index = df_reg['IID']
df_reg = df_reg.join(temp_df_cov)
PC_list = [x for x in df_reg.columns if 'PC' in x]
df_reg[PC_list] *= np.sqrt(df_reg.shape[0])
df_reg['const'] = 1
df_reg.drop(['FID', 'IID'], axis=1, inplace=True)
df_reg = df_reg.astype(dtype=np.float32)

# Compute residual
for trait in trait_list:
    df_phen = pd.read_csv(OUT_PATH+'/%s.phen'%trait, sep='\t', header=0, index_col=False)
    df_phen.index = df_phen['IID']    
    sample_list = sorted(set(df_phen.index) & set(df_reg.index))
    print('%s, n_sample=%d, n_sample_retain=%d' % (trait, df_phen.shape[0], len(sample_list)))

    temp_df_reg = df_reg.loc[sample_list].copy()
    drop_list = [x for x in temp_df_reg if (temp_df_reg[x].std()<1e-2) & (x!='const')]
    if len(drop_list)>0:
        temp_df_reg = temp_df_reg.drop(columns=drop_list)
        print('    col dropped: %s'%', '.join(drop_list))
        
    v_y = df_phen.loc[sample_list][trait].values.astype(np.float32)
    mat_X = temp_df_reg.values.astype(np.float32)
    n_sample = v_y.shape[0]
    mat_xtx = np.dot(mat_X.T, mat_X)/n_sample
    mat_xty = np.dot(mat_X.T, v_y)/n_sample
    coef_ = np.linalg.solve(mat_xtx, mat_xty).reshape([-1])
    v_y_res = v_y - mat_X.dot(coef_)
    print('    var(v_y)=%0.4g, var(v_y_res)=%0.4g, R_square=%0.4g\n'
          %(v_y.var(), v_y_res.var(), 1- v_y_res.var()/v_y.var()))
    
    temp_df = pd.DataFrame( data = {
        "FID" : sample_list,
        "IID" : sample_list,
        trait : v_y_res / v_y_res.std()
    }
    )
    temp_df.to_csv(OUT_PATH + '/%s.resid.phen'%trait, sep='\t', header=True, index=False)

blood_EOSINOPHIL_COUNT, n_sample=323294, n_sample_retain=323294
    var(v_y)=0.9929, var(v_y_res)=0.9855, R_square=0.007434

blood_HIGH_LIGHT_SCATTER_RETICULOCYTE_COUNT, n_sample=321545, n_sample_retain=321545
    var(v_y)=0.9968, var(v_y_res)=0.9929, R_square=0.003932

blood_LYMPHOCYTE_COUNT, n_sample=326239, n_sample_retain=326239
    var(v_y)=0.9868, var(v_y_res)=0.9782, R_square=0.008709

blood_MEAN_CORPUSCULAR_HEMOGLOBIN, n_sample=326123, n_sample_retain=326123
    var(v_y)=0.9519, var(v_y_res)=0.9426, R_square=0.00983

blood_MEAN_PLATELET_VOL, n_sample=327140, n_sample_retain=327140
    var(v_y)=0.9921, var(v_y_res)=0.9887, R_square=0.003405

blood_MEAN_SPHERED_CELL_VOL, n_sample=321581, n_sample_retain=321581
    var(v_y)=0.9738, var(v_y_res)=0.9706, R_square=0.003254

blood_MONOCYTE_COUNT, n_sample=325257, n_sample_retain=325257
    var(v_y)=0.9898, var(v_y_res)=0.9857, R_square=0.004183

blood_PLATELET_COUNT, n_sample=326556, n_sample_retain=326556
    var(v_y)=0.9906, var(v_y

    var(v_y)=0.2286, var(v_y_res)=0.2277, R_square=0.004205

parents_DIABETES, n_sample=328406, n_sample_retain=328406
    var(v_y)=0.1382, var(v_y_res)=0.1372, R_square=0.007199

parents_HEART_DISEASE, n_sample=328406, n_sample_retain=328406
    var(v_y)=0.1878, var(v_y_res)=0.1858, R_square=0.01092

parents_HIGH_BLOOD_PRESSURE, n_sample=328406, n_sample_retain=328406
    var(v_y)=0.3111, var(v_y_res)=0.3041, R_square=0.02237

parents_LUNG_CANCER, n_sample=328406, n_sample_retain=328406
    var(v_y)=0.07614, var(v_y_res)=0.07567, R_square=0.006249

parents_PARKINSON, n_sample=328406, n_sample_retain=328406
    var(v_y)=0.03363, var(v_y_res)=0.03356, R_square=0.002114

parents_PROSTATE_CANCER, n_sample=328406, n_sample_retain=328406
    var(v_y)=0.06391, var(v_y_res)=0.06374, R_square=0.00255

parents_SEVERE_DEPRESSION, n_sample=328406, n_sample_retain=328406
    var(v_y)=0.09062, var(v_y_res)=0.09017, R_square=0.004919

parents_STROKE, n_sample=328406, n_sample_retain=328406
    var(v

In [9]:
# trait_bc
cov_list = ['cov_ASSESS_CENTER', 'cov_GENO_ARRAY', 'cov_SEX']
qcov_list = ['cov_AGE', 'cov_AGE_SQ', 'biochemistry_dilutionfactor'] + ['PC%d'%x for x in np.arange(1,21)] 

# Categorical covaraites 
temp_df_cov = df_cov_bc[['FID', 'IID'] + cov_list]
temp_df_cov.index = temp_df_cov['IID']
temp_df_cov = pd.get_dummies(temp_df_cov[cov_list], columns=cov_list, drop_first=True)

# df_reg
df_reg = df_cov_bc[['FID', 'IID'] + qcov_list].copy()
df_reg.dropna(inplace=True)
df_reg.index = df_reg['IID']
df_reg = df_reg.join(temp_df_cov)
PC_list = [x for x in df_reg.columns if 'PC' in x]
df_reg[PC_list] *= np.sqrt(df_reg.shape[0])
df_reg['const'] = 1
df_reg.drop(['FID', 'IID'], axis=1, inplace=True)
df_reg = df_reg.astype(dtype=np.float32)

# Compute residual
for trait in trait_list_bc:
    df_phen = pd.read_csv(OUT_PATH+'/%s.phen'%trait, sep='\t', header=0, index_col=False)
    df_phen.index = df_phen['IID']    
    sample_list = sorted(set(df_phen.index) & set(df_reg.index))
    print('%s, n_sample=%d, n_sample_retain=%d' % (trait, df_phen.shape[0], len(sample_list)))

    temp_df_reg = df_reg.loc[sample_list].copy()
    drop_list = [x for x in temp_df_reg if (temp_df_reg[x].std()<1e-2) & (x!='const')]
    if len(drop_list)>0:
        temp_df_reg = temp_df_reg.drop(columns=drop_list)
        print('    col dropped: %s'%', '.join(drop_list))
        
    v_y = df_phen.loc[sample_list][trait].values.astype(np.float32)
    mat_X = temp_df_reg.values.astype(np.float32)
    n_sample = v_y.shape[0]
    mat_xtx = np.dot(mat_X.T, mat_X)/n_sample
    mat_xty = np.dot(mat_X.T, v_y)/n_sample
    coef_ = np.linalg.solve(mat_xtx, mat_xty).reshape([-1])
    v_y_res = v_y - mat_X.dot(coef_)
    print('    var(v_y)=%0.4g, var(v_y_res)=%0.4g, R_square=%0.4g\n'
          %(v_y.var(), v_y_res.var(), 1- v_y_res.var()/v_y.var()))
    
    temp_df = pd.DataFrame( data = {
        "FID" : sample_list,
        "IID" : sample_list,
        trait : v_y_res / v_y_res.std()
    }
    )
    temp_df.to_csv(OUT_PATH + '/%s.resid.phen'%trait, sep='\t', header=True, index=False)

biochemistry_AlanineAminotransferase, n_sample=319181, n_sample_retain=317355
    var(v_y)=121.4, var(v_y_res)=109.8, R_square=0.09573

biochemistry_Albumin, n_sample=294381, n_sample_retain=292543
    col dropped: biochemistry_dilutionfactor
    var(v_y)=6.7, var(v_y_res)=6.44, R_square=0.03876

biochemistry_AlkalinePhosphatase, n_sample=320600, n_sample_retain=318768
    var(v_y)=509.7, var(v_y_res)=486.7, R_square=0.04525

biochemistry_ApolipoproteinA, n_sample=292858, n_sample_retain=291023
    col dropped: biochemistry_dilutionfactor
    var(v_y)=0.07301, var(v_y_res)=0.06134, R_square=0.1599

biochemistry_ApolipoproteinB, n_sample=320120, n_sample_retain=318292
    var(v_y)=0.05673, var(v_y_res)=0.05645, R_square=0.00501

biochemistry_AspartateAminotransferase, n_sample=318500, n_sample_retain=316688
    var(v_y)=52.04, var(v_y_res)=48.65, R_square=0.06518

biochemistry_Calcium, n_sample=293883, n_sample_retain=292048
    col dropped: biochemistry_dilutionfactor
    var(v_y)=0.00

### Sanity check 

In [11]:
PHEN_FILE = '/n/groups/price/martin/LDSPEC_data/UKBB_trait/@.resid.phen'
PHEN_FILE_OLD = '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/UKBB_trait/@.resid.phen'
# PHEN_FILE = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/UKBB_trait/@.resid.phen"
# PHEN_FILE_OLD = "/n/groups/price/martin/data_GDREG/WES_50K_imp/UKBB_trait/@.resid.phen"
# for trait in trait_list + trait_list_bc : 
for trait in trait_list_indpt + trait_list_bc_indpt : 
    temp_df1 = pd.read_csv(PHEN_FILE.replace('@', trait), sep='\t', header=0, index_col=False)
    temp_df1.index = temp_df1['IID']
    temp_df2 = pd.read_csv(PHEN_FILE_OLD.replace('@', trait), sep='\t', header=0, index_col=False)
    temp_df2.index = temp_df2['IID']
#     temp_df2.index = [int(x.split('_')[1]) for x in temp_df2['IID']]
    sample_list = sorted(set(temp_df1.index) & set(temp_df2.index))
    print('%s n_sample1=%d, n_sample2=%d, n_overlap=%d'
          % (trait, temp_df1.shape[0], temp_df2.shape[0], len(sample_list)))
    
    temp_df1 = temp_df1.loc[sample_list]
    temp_df2 = temp_df2.loc[sample_list]
    
    print('    cor=%0.3f, mean_abs_err=%0.4g' % (
        np.corrcoef(temp_df1[trait], temp_df2[trait])[0,1], 
        np.absolute(temp_df1[trait]-temp_df2[trait]).mean()
    ))

blood_PLATELET_COUNT n_sample1=326556, n_sample2=326556, n_overlap=326556
    cor=1.000, mean_abs_err=7.361e-05
blood_RBC_DISTRIB_WIDTH n_sample1=325330, n_sample2=325330, n_overlap=325330
    cor=1.000, mean_abs_err=0.001384
blood_RED_COUNT n_sample1=327144, n_sample2=327144, n_overlap=327144
    cor=1.000, mean_abs_err=8.19e-05
blood_WHITE_COUNT n_sample1=326659, n_sample2=326659, n_overlap=326659
    cor=1.000, mean_abs_err=2.394e-05
bmd_HEEL_TSCOREz n_sample1=327675, n_sample2=327675, n_overlap=327675
    cor=1.000, mean_abs_err=0.0005877
body_BALDING1 n_sample1=155070, n_sample2=155070, n_overlap=155070
    cor=1.000, mean_abs_err=7.956e-05
body_BMIz n_sample1=336329, n_sample2=336329, n_overlap=336329
    cor=1.000, mean_abs_err=4.084e-05
body_HEIGHTz n_sample1=336694, n_sample2=336694, n_overlap=336694
    cor=1.000, mean_abs_err=3.442e-05
body_WHRadjBMIz n_sample1=336781, n_sample2=336781, n_overlap=336781
    cor=1.000, mean_abs_err=8.161e-07
bp_DIASTOLICadjMEDz n_sample1=3107