# Harmonize Summary Statistics For GWAS Browser
- **Author** - Frank Grenn
- **Date Started** - May 2020
- **Quick Description:** Harmonize the different GWAS summary statistics into one shared column layout so they can be used later by the GWAS browser.



In [None]:
import pandas as pd
import numpy as np

In [None]:
# Base directory for the app data; the harmonized CSV files are written here.
DATADIR="/path/to/AppDataProcessing"

In [None]:
def formatVariant(chrm,bp,ref,alt):
    """Build the variant identifier ``<chrm>:<bp>_<ref>/<alt>``.

    Every argument is converted with ``str`` before being joined, so numbers,
    strings and missing values are all accepted.
    """
    return f"{chrm!s}:{bp!s}_{ref!s}/{alt!s}"

## META5

In [None]:
# Load the META5 summary statistics (tab-separated file).
data = pd.read_csv(
    "/path/to/resultsForSmr_filtered.tab.gz",
    sep="\t",
)
# Quick look at the size and the first rows.
print(data.shape, data.head(), sep="\n")

In [None]:
# Load the SNP position table that will be merged onto the summary statistics.
pos_data = pd.read_csv(
    "/path/to/HRC_RS_conversion_final_with_CHR.txt",
    sep = "\t",
)
print(pos_data.shape, pos_data.head(), sep="\n")

In [None]:
# Left join: keep every summary-statistics row and attach the position record
# whose ID equals the row's SNP (rows without a match get missing values).
merge = data.merge(pos_data, how='left', left_on='SNP', right_on='ID')
print(merge.shape, merge.head(), sep="\n")

In [None]:
# Split POS once on the first ":"; the part before it becomes the chromosome,
# the part after it the position (both stay strings).
split = merge['POS'].str.split(":", n=1, expand=True)
merge['chromosome'] = split[0]
merge['position'] = split[1]

In [None]:
# -log10 of the p-value column.
merge['log_pvalue'] = -np.log10(merge['p'])

In [None]:
# Build the chr:bp_ref/alt identifier for every row.
merge['variant'] = merge.apply(
    lambda row: formatVariant(row['chromosome'], row['position'], row['REF'], row['ALT']),
    axis=1,
)

In [None]:
# Inspect the merged table after the derived columns were added.
print(merge.shape, merge.head(), sep="\n")

In [None]:
# Show the available column names before selecting the harmonized subset.
print(merge.columns)

In [None]:
# Keep only the columns that go into the harmonized META5 table, in output order.
meta5_harm = merge[[
    'SNP', 'A1', 'A2', 'REF', 'ALT',
    'b', 'se', 'freq', 'p', 'log_pvalue',
    'chromosome', 'position', 'variant',
]]

Rename the columns to the harmonized names shared by all datasets.

In [None]:
# Assign the harmonized names positionally (same order as the selected columns).
meta5_harm.columns = [
    'RSID', 'A1', 'A2', 'REF', 'ALT',
    'BETA', 'SE', 'FREQ', 'P', 'LOG_P',
    'CHR', 'BP', 'CHR_BP_REF_ALT',
]
print(meta5_harm.shape, meta5_harm.head(), sep="\n")

In [None]:
# Write the harmonized META5 table; index=False (the documented boolean form)
# keeps the row index out of the CSV.
meta5_harm.to_csv(f"{DATADIR}/meta5_sumstats_harmonized.csv", index=False)

# Progression (Insomnia)



In [None]:
# Load the insomnia progression summary statistics (tab-separated).
# The path is built from DATADIR so input and output locations stay in sync.
progi_data = pd.read_csv(f"{DATADIR}/locuszoom/base_INS.txt", sep="\t")
#locus_snp='rs61863020'

print(progi_data.shape)
print(progi_data.head())

In [None]:
# Load the reference table (comma-separated) that is joined on SNP below.
# The path is built from DATADIR so input and output locations stay in sync.
progi_pos_data = pd.read_csv(f"{DATADIR}/locuszoom/reference.txt")
print(len(progi_pos_data.index))
print(progi_pos_data.head())

In [None]:
# Left join on the shared SNP column: every summary-statistics row is kept.
progi_merge = progi_data.merge(progi_pos_data, how='left', on='SNP')

In [None]:
# Cast the position to a 32-bit integer and the chromosome to a string.
# NOTE(review): the integer cast raises if START has missing values, e.g. for
# SNPs that found no match in the left join - confirm that is intended.
progi_merge = progi_merge.astype({'START': 'int32', 'CHR': 'str'})

In [None]:
# Row count and first rows of the merged table.
print(len(progi_merge), progi_merge.head(), sep="\n")

In [None]:
# -log10 of the p-value column.
progi_merge['log_pvalue'] = -np.log10(progi_merge['P'])

In [None]:
# Build the chr:bp_ref/alt identifier for every row.
progi_merge['variant'] = progi_merge.apply(
    lambda row: formatVariant(row['CHR'], row['START'], row['REF'], row['ALT']),
    axis=1,
)

In [None]:
# Re-inspect the table now that the derived columns exist.
print(len(progi_merge), progi_merge.head(), sep="\n")

In [None]:
# Show the available column names before selecting the harmonized subset.
print(progi_merge.columns)

In [None]:
# Add empty A1/A2 columns so the table has every column of the harmonized layout.
# NOTE(review): presumably this dataset does not report A1/A2 - confirm.
progi_merge['A1']=None
progi_merge['A2']=None

In [None]:
# Keep only the columns that go into the harmonized table, in output order.
progi_harm = progi_merge[[
    'RSID', 'A1', 'A2', 'REF', 'ALT',
    'BETA', 'SE', 'MAF', 'P', 'log_pvalue',
    'CHR', 'START', 'variant',
]]

In [None]:
# Assign the harmonized names positionally (same order as the selected columns).
progi_harm.columns = [
    'RSID', 'A1', 'A2', 'REF', 'ALT',
    'BETA', 'SE', 'FREQ', 'P', 'LOG_P',
    'CHR', 'BP', 'CHR_BP_REF_ALT',
]
print(progi_harm.shape, progi_harm.head(), sep="\n")

In [None]:
# Write the harmonized insomnia-progression table; index=False (the documented
# boolean form) keeps the row index out of the CSV.
progi_harm.to_csv(f"{DATADIR}/prog_ins_sumstats_harmonized.csv", index=False)

# Progression (Hoehn and Yahr)

In [None]:
# Load the Hoehn and Yahr progression summary statistics (tab-separated).
# The path is built from DATADIR so input and output locations stay in sync.
proghy_data = pd.read_csv(f"{DATADIR}/locuszoom/surv_HY3.txt", sep="\t")
#locus_snp='rs382940'
print(proghy_data.shape)
print(proghy_data.head())

In [None]:
# Load the reference table (comma-separated) that is joined on SNP below.
# The path is built from DATADIR so input and output locations stay in sync.
proghy_pos_data = pd.read_csv(f"{DATADIR}/locuszoom/reference.txt")
print(len(proghy_pos_data.index))
print(proghy_pos_data.head())

In [None]:
# Left join on the shared SNP column: every summary-statistics row is kept.
proghy_merge = proghy_data.merge(proghy_pos_data, how='left', on='SNP')

In [None]:
# Cast the position to a 32-bit integer and the chromosome to a string.
# NOTE(review): the integer cast raises if START has missing values, e.g. for
# SNPs that found no match in the left join - confirm that is intended.
proghy_merge = proghy_merge.astype({'START': 'int32', 'CHR': 'str'})

In [None]:
# Row count and first rows of the merged table.
print(len(proghy_merge), proghy_merge.head(), sep="\n")

In [None]:
# -log10 of the p-value column.
proghy_merge['log_pvalue'] = -np.log10(proghy_merge['P'])

In [None]:
# Build the chr:bp_ref/alt identifier for every row.
proghy_merge['variant'] = proghy_merge.apply(
    lambda row: formatVariant(row['CHR'], row['START'], row['REF'], row['ALT']),
    axis=1,
)

In [None]:
# Re-inspect the table now that the derived columns exist.
print(len(proghy_merge), proghy_merge.head(), sep="\n")

In [None]:
# Add empty A1/A2 columns so the table has every column of the harmonized layout.
# NOTE(review): presumably this dataset does not report A1/A2 - confirm.
proghy_merge['A1']=None
proghy_merge['A2']=None

In [None]:
# Keep only the columns that go into the harmonized table, in output order.
proghy_harm = proghy_merge[[
    'RSID', 'A1', 'A2', 'REF', 'ALT',
    'BETA', 'SE', 'MAF', 'P', 'log_pvalue',
    'CHR', 'START', 'variant',
]]

In [None]:
# Assign the harmonized names positionally (same order as the selected columns).
proghy_harm.columns = [
    'RSID', 'A1', 'A2', 'REF', 'ALT',
    'BETA', 'SE', 'FREQ', 'P', 'LOG_P',
    'CHR', 'BP', 'CHR_BP_REF_ALT',
]
print(proghy_harm.shape, proghy_harm.head(), sep="\n")

In [None]:
# Write the harmonized Hoehn and Yahr table; index=False (the documented
# boolean form) keeps the row index out of the CSV.
proghy_harm.to_csv(f"{DATADIR}/prog_hy_sumstats_harmonized.csv", index=False)

# Asian GWAS

In [None]:
# Load the Asian GWAS summary statistics (tab-separated file).
asiangwas_data = pd.read_csv(
    "/path/to/6724PDcases-24851controls-5843213snps-summary-stats-metaP-SE.txt.gz",
    sep='\t',
)
print(asiangwas_data.shape, asiangwas_data.head(), sep="\n")

In [None]:
#separate the SNP col
asiangwas_split = asiangwas_data[['CHR','BP','SNP','A1','A2','BETA','P','SE']]

asiangwas_split[['SNP','BP_tmp','Ref','Alt']]=asiangwas_split['SNP'].str.split(':',expand = True)
asiangwas_split = asiangwas_split[['CHR','BP','SNP','Ref','Alt','A1','A2','BETA','P','SE']]
asiangwas_split = asiangwas_split.reset_index(drop=True)
print(asiangwas_split.head())


Check whether the A1 and A2 alleles reported in the summary statistics correspond to the Alt and Ref alleles, respectively, that we got from splitting the SNP column.

In [None]:
# Flag rows whose Alt allele (parsed from the SNP column) is present and equals A1.
# Missing values are detected with notna(): comparing a Series with `!= None`
# is True for every row and therefore filters nothing.
test = asiangwas_split[['Alt', 'A1']].assign(
    NE=asiangwas_split['Alt'].notna() & (asiangwas_split['Alt'] == asiangwas_split['A1'])
)
print(test.head())
print("these two numbers should be the same:")
# Number of rows that have an Alt allele at all ...
print(len(asiangwas_split[asiangwas_split['Alt'].notna()].index))
# ... versus the number of rows where that Alt allele matches A1.
print(len(test[test.NE].index))

In [None]:
#if above two numbers are the same then we can say A1 equals Alt for these variants
asiangwas_split['Alt']=asiangwas_split['A1']
asiangwas_split['Ref']=asiangwas_split['A2']

In [None]:
# Add an empty FREQ column so the table has every column of the harmonized layout.
# NOTE(review): presumably this dataset has no allele frequency column - confirm.
asiangwas_split['FREQ'] = None

In [None]:
# -log10 of the p-value column.
asiangwas_split['log_pvalue'] = -np.log10(asiangwas_split['P'])

In [None]:
# Store the chromosome as a string.
asiangwas_split['CHR'] = asiangwas_split['CHR'].astype(str)

In [None]:
# Build the chr:bp_ref/alt identifier for every row.
asiangwas_split['variant'] = asiangwas_split.apply(
    lambda row: formatVariant(row['CHR'], row['BP'], row['Ref'], row['Alt']),
    axis=1,
)

In [None]:
# Inspect the first rows after the derived columns were added.
print(asiangwas_split.head())

In [None]:
# Show the available column names before selecting the harmonized subset.
print(asiangwas_split.columns)

In [None]:
# Keep only the columns that go into the harmonized table, in output order.
asiangwas_harm = asiangwas_split[[
    'SNP', 'A1', 'A2', 'Ref', 'Alt',
    'BETA', 'SE', 'FREQ', 'P', 'log_pvalue',
    'CHR', 'BP', 'variant',
]]

In [None]:
# Assign the harmonized names positionally (same order as the selected columns).
asiangwas_harm.columns = [
    'RSID', 'A1', 'A2', 'REF', 'ALT',
    'BETA', 'SE', 'FREQ', 'P', 'LOG_P',
    'CHR', 'BP', 'CHR_BP_REF_ALT',
]
print(asiangwas_harm.shape, asiangwas_harm.head(), sep="\n")

In [None]:
# Write the harmonized Asian GWAS table; index=False (the documented boolean
# form) keeps the row index out of the CSV.
asiangwas_harm.to_csv(f"{DATADIR}/asiangwas_sumstats_harmonized.csv", index=False)