In [54]:
import pandas as pd
import numpy as np
import re

In [55]:
Afr_test = pd.read_csv('test_pop/Afr_test.fam', sep='\t', header=None)

In [56]:
Afr_test.columns = ['#FID', 'IID', 'PID', 'MID', 'Sex', 'pheno']

In [57]:
pheno_dict = {'INI30120' : 'blood_count', 'INI20030780': 'blood_biochemistry', 'INI1003063':'spirometry', 'INI50030700': 'blood_biochemistry'}

In [58]:
for p in ['INI30120', 'INI20030780', 'INI1003063', 'INI50030700']:
    pheno_df = pd.read_csv(f'/net/bmc-lab5/data/kellis/group/tanigawa/data/ukb21942/pheno/{pheno_dict[p]}.tsv.gz', compression='gzip', sep='\t')
    to_save = pheno_df[['#FID', 'IID', p]].merge(Afr_test[['#FID', 'IID']], on=['#FID', 'IID']).rename(columns={'#FID': 'FID', p:'PHENO'})
    to_save = to_save.dropna()
    to_save.to_csv(f'test_pheno/{p}.pheno.txt', header=True, index=False, sep='\t')

In [13]:
sumstat_dir = '/home/lucytian/data/4_polypred/gwas_geno'

lym = f"{sumstat_dir}/blood_count/pop_10PCs/Afr_train_val/INI30120.glm.tsv.gz" 
ldl = f"{sumstat_dir}/blood_biochemistry/pop_10PCs/Afr_train_val/INI20030780.glm.tsv.gz" #
fev = f"{sumstat_dir}/spirometry/pop_10PCs/Afr_train_val/INI1003063.glm.tsv.gz"
egfr = f"{sumstat_dir}/blood_biochemistry/pop_10PCs/Afr_train_val/INI50030700.glm.tsv.gz"

In [14]:
snpvar = pd.read_csv('priors/snps_with_var.gz', sep='\t', compression='gzip')

In [49]:
dfs = []
#min_maf = 0.001
for p in [lym, ldl, fev, egfr]:
    match = re.search(r'INI\d+', p)
    if match:
        trait_id = match.group()
        print(trait_id)
    else:
        raise ValueError(f"Could not extract trait ID from path: {p}")
    
    df = pd.read_csv(p, compression='gzip', sep='\t')
    df = df.rename(columns={'#CHROM': 'CHR', 'POS': 'BP', 'REF': 'A2'})
    df = df[~df['CHR'].isin(['X', 'Y', 'MT'])]
    df['CHR'] = df['CHR'].astype(int)
    
    merged = df.merge(snpvar, on=['CHR', 'BP'], suffixes=("_sum", "_snpvar"))

    # Alleles match
    same = (merged['A1_sum'] == merged['A1_snpvar']) & (merged['A2_sum'] == merged['A2_snpvar'])

    # Alleles are flipped
    flipped = (merged['A1_sum'] == merged['A2_snpvar']) & (merged['A2_sum'] == merged['A1_snpvar'])

    mask = same | flipped
    merged = merged.loc[mask].copy()     # filter AND copy to avoid chained assignment
    flipped = flipped.loc[mask] 

    # Flip beta where alleles are flipped
    merged.loc[flipped, 'BETA'] *= -1
    merged.loc[flipped, 'ALT_FREQS'] = 1 - merged.loc[flipped, 'ALT_FREQS']
    merged['effect_allele'] = np.where(flipped, merged['A2_sum'], merged['A1_sum'])
    merged['alt_allele']    = np.where(flipped, merged['A1_sum'], merged['A2_sum'])
    
    vc = merged['SNP'].value_counts()
    merged = merged[merged["SNP"].isin(vc[vc==1].index)]
    
    merged = merged[['SNP', 'CHR', 'BP', 'effect_allele', 'alt_allele', 'BETA', 'SE', 'SNPVAR', 'ALT_FREQS']].rename(columns={'effect_allele': 'A1', 'alt_allele': 'A2', 'ALT_FREQS': 'MAF'})

    merged.to_csv(f'sumstat/{trait_id}_Afr_sumstats.txt.gz',  sep=" ", index=False, compression='gzip')

INI30120


  df = pd.read_csv(p, compression='gzip', sep='\t')


INI20030780


  df = pd.read_csv(p, compression='gzip', sep='\t')


INI1003063


  df = pd.read_csv(p, compression='gzip', sep='\t')


INI50030700


  df = pd.read_csv(p, compression='gzip', sep='\t')
