In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import ldspec
import zipfile

# autoreload
%load_ext autoreload
%autoreload 2

### Get small 50K data

In [4]:
temp_df = pd.read_csv(
    "/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/unrelated_337K.txt", sep=' ', header=None
)
np.random.seed(0)
ind_select = np.random.choice([0, 1], size=temp_df.shape[0], p=[0.85, 0.15]).astype(bool)
temp_df_small = temp_df.loc[ind_select]
# temp_df_small.to_csv(
#     "/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/unrelated_337K_ss50K.txt",
#     sep=' ', header=None, index=False,
# )
temp_df_rep = temp_df.loc[~ind_select]
temp_df_rep.to_csv(
    "/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/unrelated_337K_rep287K.txt",
    sep=' ', header=None, index=False,
)

### Get NB44K and EAS2K

In [2]:
df_sample = pd.read_csv(
    '/n/groups/price/UKBiobank/app10438assoc/ukb4777.processed_and_post.ukbPCs.tab.gz', sep='\t',
)
print('df_sample', df_sample.shape[0])
df_unrelated_337K = pd.read_csv(
    "/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/unrelated_337K.txt", sep=' ', header=None
)
print('df_unrelated_337K', df_unrelated_337K.shape[0])
df_nonStringentBritish = pd.read_csv(
    "/n/groups/price/UKBiobank/sampleQC/remove.nonStringentBritish.FID_IID.txt", sep=' ', header=None
)
print('df_nonStringentBritish', df_nonStringentBritish.shape[0])
df_White = pd.read_csv(
    "/n/groups/price/UKBiobank/sampleQC/remove.White.FID_IID.txt", sep=' ',
)
print('df_White', df_White.shape[0])
df_related = pd.read_csv(
    "/n/groups/price/UKBiobank/sampleQC/remove.related.FID_IID.txt", sep=' ', header=None,
)
print('df_related', df_related.shape[0])

df_sample 488374
df_unrelated_337K 337426
df_nonStringentBritish 78674
df_White 459335
df_related 81158


In [11]:
### 40K non-British Europeans (Weissbrod) (suffix: NB44K)
### df_nonStringentBritish & df_White \ df_unrelated_337K \ df_related
temp_set = set(df_nonStringentBritish[0]) & set(df_White['FID'])
temp_set = temp_set - set(df_unrelated_337K[0]) - set(df_related[0])

temp_df = df_sample.loc[df_sample['FID'].isin(temp_set), ['FID', 'IID']]
print('temp_df', temp_df.shape[0])
display(df_sample.loc[df_sample['IID'].isin(temp_df['FID'])].value_counts('cov_ETHNICITY'))
temp_df.to_csv(
    "/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/sample_nonBritish_White_unrelated_44K.txt",
    sep=' ', header=None, index=False,
)

temp_df 44012


cov_ETHNICITY
1001.0    18189
1003.0    14715
1002.0    10650
1.0         458
dtype: int64

In [16]:
### 1.5K EAS (suffix: EAS2K)
### Chinese
temp_set = set(df_sample.loc[df_sample['cov_ETHNICITY'].isin([5]), 'FID'])
temp_set = temp_set - set(df_unrelated_337K[0]) - set(df_related[0])

temp_df = df_sample.loc[df_sample['FID'].isin(temp_set), ['FID', 'IID']]
print('temp_df', temp_df.shape[0])
display(df_sample.loc[df_sample['IID'].isin(temp_df['FID'])].value_counts('cov_ETHNICITY'))
temp_df.to_csv(
    "/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/sample_unrelated_EAS2K.txt",
    sep=' ', header=None, index=False,
)

temp_df 1422


cov_ETHNICITY
5.0    1422
dtype: int64

### Check sumstats 

In [5]:
sumstats_file1 = '/n/groups/price/martin/LDSPEC_data/UKBimp_rep287K_MAF001_chimp/sumstats/@.nomhc.sumstats.gz'
sumstats_file2 = '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/sumstats/@.nomhc.sumstats.gz'
trait = 'blood_RED_COUNT'
temp_df1 = pd.read_csv(sumstats_file1.replace('@', trait), sep="\t", index_col=None)
temp_df2 = pd.read_csv(sumstats_file2.replace('@', trait), sep="\t", index_col=None)
temp_df1.index = temp_df1['SNP']
temp_df2.index = temp_df2['SNP']
snp_list = list(set(temp_df1.index) & set(temp_df2.index))[:10000]
np.corrcoef(temp_df1.loc[snp_list, 'Z'], temp_df2.loc[snp_list, 'Z'])

array([[1.        , 0.95796357],
       [0.95796357, 1.        ]])

### Check LD results

In [3]:
SNP_RANGE_FILE="/n/groups/price/martin/LDSPEC_data/UKBimp_ss50K_MAF001_chimp/ukb_imp_v3.snp_range.txt"
SNP_RANGE_FILE_UF="/n/groups/price/martin/LDSPEC_data/UKBimp_ss50K_MAF001_chimp/ukb_imp_v3.snp_range.uf.txt"
LD_FILE="/n/scratch/users/j/jz286/imp_geno_ss50K_chimp.ldspec_ld_1e6/ukb_imp_v3_chimp.@_ld.npz"

snp_range_list = []
with open(SNP_RANGE_FILE, "r") as f:
    for line in f:
        snp_range_list.append(line.strip())

snp_range_uf_list = []
for snp_range in snp_range_list:
    ld_file = LD_FILE.replace("@", snp_range)
    if os.path.exists(ld_file):
        if zipfile.is_zipfile(ld_file):
            continue            
    snp_range_uf_list.append(snp_range)

with open(SNP_RANGE_FILE_UF, "w") as f:
    for snp_range in snp_range_uf_list:
        f.write("%s\n" % snp_range)
        
print("%d/%d unfinished" % (len(snp_range_uf_list), len(snp_range_list)))

2/1490 unfinished


In [6]:
DATA_PATH = "/n/groups/price/martin/LDSPEC_data/"

In [9]:
temp_col = pd.read_csv(
    "/n/groups/price/martin/LDSPEC_data/finemapping/release1.1/UKBB_94traits_release1.cols", 
    sep='\t', header=None,
)

df_fm = pd.read_csv(
    "/n/groups/price/martin/LDSPEC_data/finemapping/release1.1/UKBB_94traits_release1.bed.gz",
    sep='\t', header=None,
)
df_fm.columns = temp_col[0]

In [14]:
(df_fm['trait']=='Height').sum()

192668

### Check assoc testing results 

In [2]:
DATA_PATH = "/n/groups/price/martin/LDSPEC_data/"
trait_list = pd.read_csv(
    '/n/groups/price/martin/LDSPEC_data/UKBB_trait_ss50K/trait_list_all_indpt.txt', header=None,
)[0].to_list()
rho_r_ratio_list = [-1, -0.5, 0, 0.5, 1]
pairwise_file = '/n/groups/price/martin/LDSPEC_data/rv1_assoc/jnt_test_120724/UKBimp_ss50K_MAF001_chimp.@t.@r.tsv'

for trait in trait_list:
    log_file = pairwise_file.replace('@t', trait).replace('.@r.tsv', '.log')
    with open(log_file, 'r') as f:
        line_list = []
        for line in f:
            line_list.append(line)
    print(trait)
    print(line_list[-2:])

blood_PLATELET_COUNT
['     c22_s190000_e199305\n', '    sys_time=29342.8s, sys_mem=8.1GB\n']
blood_RBC_DISTRIB_WIDTH
['     c22_s190000_e199305\n', '    sys_time=31800.9s, sys_mem=8GB\n']
blood_RED_COUNT
['     c22_s190000_e199305\n', '    sys_time=31252.9s, sys_mem=8GB\n']
blood_WHITE_COUNT
['     c22_s190000_e199305\n', '    sys_time=24523.5s, sys_mem=7.9GB\n']
bmd_HEEL_TSCOREz
['     c22_s190000_e199305\n', '    sys_time=36117.5s, sys_mem=8GB\n']
body_BALDING1
['     c22_s190000_e199305\n', '    sys_time=18212.4s, sys_mem=7.8GB\n']
body_BMIz
['     c22_s190000_e199305\n', '    sys_time=17792.1s, sys_mem=7.9GB\n']
body_HEIGHTz
['     c22_s190000_e199305\n', '    sys_time=60054.7s, sys_mem=8.5GB\n']
body_WHRadjBMIz
['     c22_s190000_e199305\n', '    sys_time=14211.3s, sys_mem=7.9GB\n']
bp_DIASTOLICadjMEDz
['     c22_s190000_e199305\n', '    sys_time=18209.9s, sys_mem=7.8GB\n']
cancer_BREAST
['     c22_s190000_e199305\n', '    sys_time=8968.3s, sys_mem=7.8GB\n']
cancer_PROSTATE
['   