In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import os
from os.path import join
import time

%load_ext autoreload
%autoreload 2

In [2]:
OUT_PATH='/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/UKBB_trait'
DATA_PATH='/n/groups/price/martin/data_GDREG/UKBB_trait_from_Steven'

### Load data

In [3]:
# Get the ID list
df_fam = pd.read_csv('/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/unrelated_337K.txt',
                     sep=' ', index_col=False)
df_fam.columns = ['FID', 'IID']
id_list = list(df_fam['IID'])
print('n_sample=%d' % len(id_list))

n_sample=337425


In [4]:
# 64 independent traits
xl = pd.ExcelFile(DATA_PATH+'/Description_080419.xlsx')
df_64_inde_trait = xl.parse('64 Recommended indepent traits', )
inde_trait_list = list(df_64_inde_trait['Trait_Identifier'])

In [5]:
# UKBB phen file
df_phen=pd.read_csv(DATA_PATH+'/UKB_v3.061518.tab', sep='\t', index_col=False)
df_cov=pd.read_csv(DATA_PATH+'/ukb4777.processed_and_post.plinkPCs.tab.gz', 
                   sep='\t', index_col=False, compression='gzip')
df_phen_bc=pd.read_csv(DATA_PATH+'/UKB_biochemistry.051619.tab',
                       sep='\t', index_col=False)
df_cov_bc=pd.read_csv(DATA_PATH+'/UKB_biochemistry.051619.cov',
                       sep='\t', index_col=False)

### trait_list, .phen, and .cov files

In [6]:
# trait_list
trait_list = sorted([x for x in df_phen.columns if 'UKB_460K.%s'%x in inde_trait_list])
trait_list_bc = sorted([x for x in df_phen_bc.columns if 'UKB_460K.%s'%x in inde_trait_list])
print('# trait_list=%d'%len(trait_list))
print('# trait_list_bc=%d'%len(trait_list_bc))

with open(OUT_PATH+'/trait_list.txt', 'w') as f:
    for x in trait_list:
        f.write('%s\n'%x)
with open(OUT_PATH+'/trait_list_bc.txt', 'w') as f:
    for x in trait_list_bc:
        f.write('%s\n'%x)
with open(OUT_PATH+'/trait_list_all.txt', 'w') as f:
    for x in trait_list+trait_list_bc:
        f.write('%s\n'%x)

# trait_list=23
# trait_list_bc=9


In [7]:
# Write phen file
for trait in trait_list:
    temp_df = df_phen.loc[df_phen['FID'].isin(id_list), ['FID', 'IID', trait]].copy()
    temp_df.dropna(inplace=True)
    temp_df.to_csv(OUT_PATH + '/%s.phen'%trait, sep='\t', header=True, index=False)
    print('%s, n_sample=%d' % (trait, temp_df.shape[0]))
    
for trait in trait_list_bc:
    temp_df = df_phen_bc.loc[df_phen_bc['FID'].isin(id_list), ['FID', 'IID', trait]].copy()
    temp_df.dropna(inplace=True)
    temp_df.to_csv(OUT_PATH + '/%s.phen'%trait, sep='\t', header=True, index=False)
    print('%s, n_sample=%d' % (trait, temp_df.shape[0]))

blood_PLATELET_COUNT, n_sample=326556
blood_RBC_DISTRIB_WIDTH, n_sample=325330
blood_RED_COUNT, n_sample=327144
blood_WHITE_COUNT, n_sample=326659
bmd_HEEL_TSCOREz, n_sample=327675
body_BALDING1, n_sample=155070
body_BMIz, n_sample=336329
body_HEIGHTz, n_sample=336694
body_WHRadjBMIz, n_sample=336781
bp_DIASTOLICadjMEDz, n_sample=310771
cancer_BREAST, n_sample=337422
cancer_PROSTATE, n_sample=337422
cov_EDU_YEARS, n_sample=334288
disease_ALLERGY_ECZEMA_DIAGNOSED, n_sample=337006
disease_HYPOTHYROIDISM_SELF_REP, n_sample=337422
lung_FEV1FVCzSMOKE, n_sample=274120
lung_FVCzSMOKE, n_sample=274120
mental_NEUROTICISM, n_sample=274265
other_MORNINGPERSON, n_sample=301336
pigment_SUNBURN, n_sample=251786
repro_MENARCHE_AGE, n_sample=176122
repro_MENOPAUSE_AGE, n_sample=104413
repro_NumberChildrenEverBorn_Pooled, n_sample=335534
biochemistry_AlkalinePhosphatase, n_sample=320600
biochemistry_AspartateAminotransferase, n_sample=318500
biochemistry_Cholesterol, n_sample=321534
biochemistry_Creati

### .resid.phen file

In [8]:
cov_list = ['cov_ASSESS_CENTER', 'cov_GENO_ARRAY', 'cov_SEX']
qcov_list = ['cov_AGE', 'cov_AGE_SQ'] + ['PC%d'%x for x in np.arange(1,21)] 

# Categorical covaraites 
temp_df_cov = df_cov[['FID', 'IID'] + cov_list]
temp_df_cov.index = temp_df_cov['IID']
temp_df_cov = pd.get_dummies(temp_df_cov[cov_list], columns=cov_list, drop_first=True)

# df_reg
df_reg = df_cov[['FID', 'IID'] + qcov_list]
df_reg.index = df_reg['IID']
df_reg = df_reg.join(temp_df_cov)
PC_list = [x for x in df_reg.columns if 'PC' in x]
df_reg[PC_list] *= np.sqrt(df_reg.shape[0])
df_reg['const'] = 1
df_reg.drop(['FID', 'IID'], axis=1, inplace=True)
df_reg = df_reg.astype(dtype=np.float32)

# Compute residual
for trait in trait_list:
    df_phen = pd.read_csv(OUT_PATH+'/%s.phen'%trait, sep='\t', header=0, index_col=False)
    df_phen.index = df_phen['IID']    
    sample_list = sorted(set(df_phen.index) & set(df_reg.index))
    print('%s, n_sample=%d, n_sample_retain=%d' % (trait, df_phen.shape[0], len(sample_list)))

    temp_df_reg = df_reg.loc[sample_list].copy()
    drop_list = [x for x in temp_df_reg if (temp_df_reg[x].std()<1e-2) & (x!='const')]
    if len(drop_list)>0:
        temp_df_reg = temp_df_reg.drop(columns=drop_list)
        print('    col dropped: %s'%', '.join(drop_list))
        
    v_y = df_phen.loc[sample_list][trait].values.astype(np.float32)
    mat_X = temp_df_reg.values.astype(np.float32)
    n_sample = v_y.shape[0]
    mat_xtx = np.dot(mat_X.T, mat_X)/n_sample
    mat_xty = np.dot(mat_X.T, v_y)/n_sample
    coef_ = np.linalg.solve(mat_xtx, mat_xty).reshape([-1])
    v_y_res = v_y - mat_X.dot(coef_)
    print('    var(v_y)=%0.4g, var(v_y_res)=%0.4g, R_square=%0.4g\n'
          %(v_y.var(), v_y_res.var(), 1- v_y_res.var()/v_y.var()))
    
    temp_df = pd.DataFrame( data = {
        "FID" : sample_list,
        "IID" : sample_list,
        trait : v_y_res / v_y_res.std()
    }
    )
    temp_df.to_csv(OUT_PATH + '/%s.resid.phen'%trait, sep='\t', header=True, index=False)

blood_PLATELET_COUNT, n_sample=326556, n_sample_retain=326556
    var(v_y)=0.9906, var(v_y_res)=0.9883, R_square=0.002315

blood_RBC_DISTRIB_WIDTH, n_sample=325330, n_sample_retain=325330
    var(v_y)=0.9712, var(v_y_res)=0.9659, R_square=0.005489

blood_RED_COUNT, n_sample=327144, n_sample_retain=327144
    var(v_y)=0.97, var(v_y_res)=0.9665, R_square=0.003551

blood_WHITE_COUNT, n_sample=326659, n_sample_retain=326659
    var(v_y)=0.9827, var(v_y_res)=0.9732, R_square=0.009671

bmd_HEEL_TSCOREz, n_sample=327675, n_sample_retain=327675
    var(v_y)=0.9775, var(v_y_res)=0.9716, R_square=0.00599

body_BALDING1, n_sample=155070, n_sample_retain=155070
    col dropped: cov_SEX_1
    var(v_y)=0.2172, var(v_y_res)=0.2151, R_square=0.01009

body_BMIz, n_sample=336329, n_sample_retain=336329
    var(v_y)=0.9885, var(v_y_res)=0.9774, R_square=0.01119

body_HEIGHTz, n_sample=336694, n_sample_retain=336694
    var(v_y)=0.9658, var(v_y_res)=0.9407, R_square=0.02594

body_WHRadjBMIz, n_sample=3367

In [9]:
# trait_bc
cov_list = ['cov_ASSESS_CENTER', 'cov_GENO_ARRAY', 'cov_SEX']
qcov_list = ['cov_AGE', 'cov_AGE_SQ', 'biochemistry_dilutionfactor'] + ['PC%d'%x for x in np.arange(1,21)] 

# Categorical covaraites 
temp_df_cov = df_cov_bc[['FID', 'IID'] + cov_list]
temp_df_cov.index = temp_df_cov['IID']
temp_df_cov = pd.get_dummies(temp_df_cov[cov_list], columns=cov_list, drop_first=True)

# df_reg
df_reg = df_cov_bc[['FID', 'IID'] + qcov_list].copy()
df_reg.dropna(inplace=True)
df_reg.index = df_reg['IID']
df_reg = df_reg.join(temp_df_cov)
PC_list = [x for x in df_reg.columns if 'PC' in x]
df_reg[PC_list] *= np.sqrt(df_reg.shape[0])
df_reg['const'] = 1
df_reg.drop(['FID', 'IID'], axis=1, inplace=True)
df_reg = df_reg.astype(dtype=np.float32)

# Compute residual
for trait in trait_list_bc:
    df_phen = pd.read_csv(OUT_PATH+'/%s.phen'%trait, sep='\t', header=0, index_col=False)
    df_phen.index = df_phen['IID']    
    sample_list = sorted(set(df_phen.index) & set(df_reg.index))
    print('%s, n_sample=%d, n_sample_retain=%d' % (trait, df_phen.shape[0], len(sample_list)))

    temp_df_reg = df_reg.loc[sample_list].copy()
    drop_list = [x for x in temp_df_reg if (temp_df_reg[x].std()<1e-2) & (x!='const')]
    if len(drop_list)>0:
        temp_df_reg = temp_df_reg.drop(columns=drop_list)
        print('    col dropped: %s'%', '.join(drop_list))
        
    v_y = df_phen.loc[sample_list][trait].values.astype(np.float32)
    mat_X = temp_df_reg.values.astype(np.float32)
    n_sample = v_y.shape[0]
    mat_xtx = np.dot(mat_X.T, mat_X)/n_sample
    mat_xty = np.dot(mat_X.T, v_y)/n_sample
    coef_ = np.linalg.solve(mat_xtx, mat_xty).reshape([-1])
    v_y_res = v_y - mat_X.dot(coef_)
    print('    var(v_y)=%0.4g, var(v_y_res)=%0.4g, R_square=%0.4g\n'
          %(v_y.var(), v_y_res.var(), 1- v_y_res.var()/v_y.var()))
    
    temp_df = pd.DataFrame( data = {
        "FID" : sample_list,
        "IID" : sample_list,
        trait : v_y_res / v_y_res.std()
    }
    )
    temp_df.to_csv(OUT_PATH + '/%s.resid.phen'%trait, sep='\t', header=True, index=False)

biochemistry_AlkalinePhosphatase, n_sample=320600, n_sample_retain=318768
    var(v_y)=509.7, var(v_y_res)=487, R_square=0.04454

biochemistry_AspartateAminotransferase, n_sample=318500, n_sample_retain=316688
    var(v_y)=52.04, var(v_y_res)=48.68, R_square=0.06466

biochemistry_Cholesterol, n_sample=321534, n_sample_retain=319694
    var(v_y)=1.294, var(v_y_res)=1.234, R_square=0.04643

biochemistry_Creatinine, n_sample=320794, n_sample_retain=318959
    var(v_y)=207, var(v_y_res)=134.8, R_square=0.3491

biochemistry_IGF1, n_sample=319545, n_sample_retain=317604
    var(v_y)=30.36, var(v_y_res)=28.11, R_square=0.07407

biochemistry_Phosphate, n_sample=293982, n_sample_retain=292141
    col dropped: biochemistry_dilutionfactor
    var(v_y)=0.02553, var(v_y_res)=0.02598, R_square=-0.01771

biochemistry_TotalBilirubin, n_sample=317254, n_sample_retain=315431
    var(v_y)=13.66, var(v_y_res)=12.71, R_square=0.06922

biochemistry_TotalProtein, n_sample=294055, n_sample_retain=292217
    c

### Sanity check 

In [26]:
PHEN_FILE = "/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001/UKBB_trait/@.resid.phen"
PHEN_FILE_OLD = "/n/groups/price/martin/data_GDREG/WES_50K_imp/UKBB_trait/@.resid.phen"
for trait in trait_list + trait_list_bc : 
    temp_df1 = pd.read_csv(PHEN_FILE.replace('@', trait), sep='\t', header=0, index_col=False)
    temp_df1.index = temp_df1['IID']
    temp_df2 = pd.read_csv(PHEN_FILE_OLD.replace('@', trait), sep='\t', header=0, index_col=False)
    temp_df2.index = [int(x.split('_')[1]) for x in temp_df2['IID']]
    sample_list = sorted(set(temp_df1.index) & set(temp_df2.index))
    print('%s n_sample1=%d, n_sample2=%d, n_overlap=%d'
          % (trait, temp_df1.shape[0], temp_df2.shape[0], len(sample_list)))
    
    temp_df1 = temp_df1.loc[sample_list]
    temp_df2 = temp_df2.loc[sample_list]
    
    print('    cor=%0.3f, mean_abs_err=%0.4g' % (
        np.corrcoef(temp_df1[trait], temp_df2[trait])[0,1], 
        np.absolute(temp_df1[trait]-temp_df2[trait]).mean()
    ))

blood_PLATELET_COUNT n_sample1=326556, n_sample2=326666, n_overlap=326555
    cor=1.000, mean_abs_err=0.004696
blood_RBC_DISTRIB_WIDTH n_sample1=325330, n_sample2=325440, n_overlap=325329
    cor=1.000, mean_abs_err=0.0138
blood_RED_COUNT n_sample1=327144, n_sample2=327254, n_overlap=327143
    cor=1.000, mean_abs_err=0.01353
blood_WHITE_COUNT n_sample1=326659, n_sample2=326768, n_overlap=326658
    cor=1.000, mean_abs_err=0.01075
bmd_HEEL_TSCOREz n_sample1=327675, n_sample2=327783, n_overlap=327674
    cor=1.000, mean_abs_err=0.01143
body_BALDING1 n_sample1=155070, n_sample2=155117, n_overlap=155069
    cor=1.000, mean_abs_err=0.4974
body_BMIz n_sample1=336329, n_sample2=336441, n_overlap=336328
    cor=1.000, mean_abs_err=0.009044
body_HEIGHTz n_sample1=336694, n_sample2=336806, n_overlap=336693
    cor=1.000, mean_abs_err=0.024
body_WHRadjBMIz n_sample1=336781, n_sample2=336894, n_overlap=336780
    cor=1.000, mean_abs_err=0.01007
bp_DIASTOLICadjMEDz n_sample1=310771, n_sample2=3108

In [22]:
temp_df1.index

Int64Index([1000019, 1000022, 1000035, 1000046, 1000063, 1000081, 1000105,
            1000112, 1000137, 1000153,
            ...
            6026444, 6026475, 6026489, 6026493, 6026509, 6026511, 6026525,
            6026538, 6026542, 6026556],
           dtype='int64', name='IID', length=326556)

In [23]:
temp_df2.index

Index(['3540154', '3885612', '4366506', '4593906', '3290211', '3063674',
       '4433313', '4236030', '2044367', '1523594',
       ...
       '5428227', '4135122', '1667087', '2301154', '4033247', '4122500',
       '3078725', '3999932', '3454751', '3876480'],
      dtype='object', length=326666)