In [55]:
import pandas as pd

In [56]:
# Load the Kunkle et al. Stage 1 results table; the file is read as space-delimited.
kunkle_gwas_data = pd.read_csv(
    "/home/eramamur/resources/gwas/Kunkle_etal_Stage1_results.txt",
    sep=' ',
)

In [57]:
# Load the Jansen et al. summary-statistics table; the file is read as tab-delimited.
jansen_gwas_data = pd.read_csv(
    "/home/eramamur/resources/gwas/AD_sumstats_Jansenetal.txt",
    sep='\t',
)

In [58]:
# Display the first five rows of the Kunkle table to inspect its column names
# (MarkerName, Effect_allele, Non_Effect_allele, Beta, SE, Pvalue are used below).
kunkle_gwas_data.head()

Unnamed: 0,Chromosome,Position,MarkerName,Effect_allele,Non_Effect_allele,Beta,SE,Pvalue
0,1,100000012,rs10875231,T,G,-0.0026,0.0168,0.8758
1,1,100000827,rs6678176,T,C,0.0008,0.0156,0.9574
2,1,100000843,rs78286437,T,C,-0.0136,0.033,0.6792
3,1,100000989,chr1:100000989:I,A,ATC,-0.0099,0.0343,0.7731
4,1,100001138,rs144406489,A,G,-0.0061,0.0612,0.9204


In [59]:
# Display the first five rows of the Jansen table to inspect its column names
# (SNP, A1, A2, P, Z, MAF, BETA, SE are used below).
jansen_gwas_data.head()

Unnamed: 0,uniqID.a1a2,CHR,BP,A1,A2,SNP,Z,P,Nsum,Neff,dir,MAF,BETA,SE
0,1:715265_T_C,1,715265,T,C,rs12184267,2.121973,0.03384,381761,381761.0,??+?,0.040807,0.012275,0.005785
1,1:715367_G_A,1,715367,G,A,rs12184277,1.957915,0.05024,382151,382151.0,??+?,0.041069,0.011285,0.005764
2,1:717485_A_C,1,717485,A,C,rs12184279,1.912438,0.05582,382180,382180.0,??+?,0.040576,0.011087,0.005797
3,1:720381_T_G,1,720381,T,G,rs116801199,2.295404,0.02171,382954,382954.0,??+?,0.042162,0.013052,0.005686
4,1:721290_C_G,1,721290,C,G,rs12565286,2.315602,0.02058,382779,382779.0,??+?,0.042378,0.013137,0.005673


In [60]:
# Load the tab-delimited SNP table (first line is the header). The functions
# below read its LD_RSID, REF and ALT columns.
snp_data = pd.read_csv(
    "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ad_variants_processing/snigdha_snp_list_unique_haploreg_hg19_positions_1kb_and_500bp_sequences_microglia_monocyte_neuron_bulk_scores_and_biccn_classifiers_scores_satpathy_regression_effect_scores.txt",
    header=0,
    sep='\t',
)

In [61]:
def get_jansen_gwas_data(snp_data_row, gwas_data=None):
    """Look up Jansen GWAS statistics for one SNP row, oriented to its REF/ALT.

    The GWAS table is filtered to rows whose ``SNP`` equals the row's
    ``LD_RSID``. The first such row whose (A1, A2) alleles equal (REF, ALT)
    is returned as-is; if they equal (ALT, REF) instead, the signs of ``Z``
    and ``BETA`` are flipped. Allele comparison is case-insensitive.

    Parameters
    ----------
    snp_data_row : mapping-like (e.g. a ``pd.Series`` row)
        Must provide ``LD_RSID``, ``REF`` and ``ALT``.
    gwas_data : pd.DataFrame, optional
        Table with columns ``SNP``, ``A1``, ``A2``, ``P``, ``Z``, ``MAF``,
        ``BETA`` and ``SE``. Defaults to the notebook-level
        ``jansen_gwas_data`` when omitted.

    Returns
    -------
    pd.Series
        Indexed by ``Jansen.P``, ``Jansen.Z``, ``Jansen.MAF``, ``Jansen.BETA``
        and ``Jansen.SE``. All values are ``None`` when no allele-compatible
        GWAS row exists or when REF/ALT is missing.
    """
    columnNames = ["Jansen.P",
                   "Jansen.Z",
                   "Jansen.MAF",
                   "Jansen.BETA",
                   "Jansen.SE"]
    no_match = pd.Series([None] * len(columnNames), index=columnNames)

    if gwas_data is None:
        # Resolved at call time so the notebook-level table is used by default.
        gwas_data = jansen_gwas_data

    rsid = snp_data_row["LD_RSID"]
    ref = snp_data_row["REF"]
    alt = snp_data_row["ALT"]
    # A missing allele (NaN) is not a string; calling .upper() on it would
    # raise and abort the whole DataFrame.apply, so treat it as "no match".
    if not (isinstance(ref, str) and isinstance(alt, str)):
        return no_match
    ref = ref.upper()
    alt = alt.upper()

    matching_rsid_rows = gwas_data.loc[gwas_data["SNP"] == rsid]

    for _, gwas_row in matching_rsid_rows.iterrows():
        a1 = gwas_row["A1"]
        a2 = gwas_row["A2"]
        if not (isinstance(a1, str) and isinstance(a2, str)):
            continue
        a1 = a1.upper()
        a2 = a2.upper()

        if a1 == ref and a2 == alt:
            sign = 1
        elif a1 == alt and a2 == ref:
            # Alleles are swapped relative to REF/ALT: flip signed statistics.
            sign = -1
        else:
            continue

        return pd.Series([gwas_row["P"],
                          sign * gwas_row["Z"],
                          gwas_row["MAF"],
                          sign * gwas_row["BETA"],
                          gwas_row["SE"]], index=columnNames)
    return no_match

In [62]:
def get_kunkle_gwas_data(snp_data_row, gwas_data=None):
    """Look up Kunkle GWAS statistics for one SNP row, oriented to its REF/ALT.

    The GWAS table is filtered to rows whose ``MarkerName`` equals the row's
    ``LD_RSID``. The first such row whose (Effect_allele, Non_Effect_allele)
    equal (REF, ALT) is returned as-is; if they equal (ALT, REF) instead, the
    sign of ``Beta`` is flipped. Allele comparison is case-insensitive.

    Parameters
    ----------
    snp_data_row : mapping-like (e.g. a ``pd.Series`` row)
        Must provide ``LD_RSID``, ``REF`` and ``ALT``.
    gwas_data : pd.DataFrame, optional
        Table with columns ``MarkerName``, ``Effect_allele``,
        ``Non_Effect_allele``, ``Pvalue``, ``Beta`` and ``SE``. Defaults to
        the notebook-level ``kunkle_gwas_data`` when omitted.

    Returns
    -------
    pd.Series
        Indexed by ``Kunkle.P``, ``Kunkle.BETA`` and ``Kunkle.SE``. All values
        are ``None`` when no allele-compatible GWAS row exists or when REF/ALT
        is missing.
    """
    columnNames = ["Kunkle.P",
                   "Kunkle.BETA",
                   "Kunkle.SE"]
    no_match = pd.Series([None] * len(columnNames), index=columnNames)

    if gwas_data is None:
        # Resolved at call time so the notebook-level table is used by default.
        gwas_data = kunkle_gwas_data

    rsid = snp_data_row["LD_RSID"]
    ref = snp_data_row["REF"]
    alt = snp_data_row["ALT"]
    # A missing allele (NaN) is not a string; calling .upper() on it would
    # raise and abort the whole DataFrame.apply, so treat it as "no match".
    if not (isinstance(ref, str) and isinstance(alt, str)):
        return no_match
    ref = ref.upper()
    alt = alt.upper()

    matching_rsid_rows = gwas_data.loc[gwas_data["MarkerName"] == rsid]

    for _, gwas_row in matching_rsid_rows.iterrows():
        a1 = gwas_row["Effect_allele"]
        a2 = gwas_row["Non_Effect_allele"]
        if not (isinstance(a1, str) and isinstance(a2, str)):
            continue
        a1 = a1.upper()
        a2 = a2.upper()

        if a1 == ref and a2 == alt:
            sign = 1
        elif a1 == alt and a2 == ref:
            # Alleles are swapped relative to REF/ALT: flip the signed effect.
            sign = -1
        else:
            continue

        return pd.Series([gwas_row["Pvalue"],
                          sign * gwas_row["Beta"],
                          gwas_row["SE"]], index=columnNames)
    return no_match

In [52]:
# Run each GWAS lookup once per SNP row; every call returns a Series, so each
# result is a DataFrame aligned to snp_data's index.
# NOTE(review): this cell's execution counter (52) is lower than those of the
# cells defining the two lookup functions (61, 62), so the stored results may
# come from earlier definitions - re-run after a kernel restart to confirm.
jansen_snp_data = snp_data.apply(get_jansen_gwas_data, axis="columns")
kunkle_snp_data = snp_data.apply(get_kunkle_gwas_data, axis="columns")

In [63]:
# Append the Jansen.* and Kunkle.* columns to the SNP table.
# Columns with those names that are already present (for example from an
# earlier run of this cell) are dropped first, so re-running the cell does not
# produce duplicated columns. On a first run nothing is dropped.
gwas_column_names = list(jansen_snp_data.columns) + list(kunkle_snp_data.columns)
snp_data = pd.concat(
    [snp_data.drop(columns=gwas_column_names, errors="ignore"),
     jansen_snp_data,
     kunkle_snp_data],
    axis=1,
)

In [84]:
# Save the SNP table as a tab-delimited file without the row index.
snp_data.to_csv(
    "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ad_variants_processing/snigdha_snp_list_unique_haploreg_hg19_positions_1kb_and_500bp_sequences_microglia_monocyte_neuron_bulk_scores_and_biccn_classifiers_scores_satpathy_regression_effect_scores_jansen_kunkle_gwas_data.txt",
    index=False,
    sep='\t',
)

In [85]:
# Load the tab-delimited sentinel/LD mapping table (first line is the header).
# get_sentinel_snps reads its LD_RSID, LEAD_RSID, REF and ALT columns.
sentinel_snp_mapping_data = pd.read_csv(
    "/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ad_variants_processing/snigdha_snp_list_sentinel_ld_mapping.txt",
    header=0,
    sep='\t',
)

In [95]:
def get_sentinel_snps(snp_data_row, mapping_data=None):
    """Return the lead rsIDs mapped to one SNP row, sorted and de-duplicated.

    A mapping row counts as a match when its ``LD_RSID``, ``REF`` and ``ALT``
    all equal the corresponding values of ``snp_data_row`` (exact comparison).

    Parameters
    ----------
    snp_data_row : mapping-like (e.g. a ``pd.Series`` row)
        Must provide ``LD_RSID``, ``REF`` and ``ALT``.
    mapping_data : pd.DataFrame, optional
        Table with columns ``LD_RSID``, ``LEAD_RSID``, ``REF`` and ``ALT``.
        Defaults to the notebook-level ``sentinel_snp_mapping_data``.

    Returns
    -------
    list
        Unique ``LEAD_RSID`` values of the matching rows in sorted order;
        empty when nothing matches.
    """
    if mapping_data is None:
        # Resolved at call time so the notebook-level table is used by default.
        mapping_data = sentinel_snp_mapping_data

    is_match = (
        (mapping_data["LD_RSID"] == snp_data_row["LD_RSID"])
        & (mapping_data["REF"] == snp_data_row["REF"])
        & (mapping_data["ALT"] == snp_data_row["ALT"])
    )
    # Missing lead IDs are dropped: they cannot be sorted with strings or
    # joined into an output line.
    lead_rsids = mapping_data.loc[is_match, "LEAD_RSID"].dropna()
    # Sorting gives a stable order; iterating a set of strings can differ
    # between interpreter runs because of hash randomization.
    return sorted(set(lead_rsids))

In [101]:
# One list of lead rsIDs per SNP row, computed row by row.
sentinel_snps = snp_data.apply(get_sentinel_snps, axis="columns")

In [105]:
# Write a header line, then one line per SNP row holding that row's lead
# rsIDs joined by tabs (an empty list yields an empty line).
with open("/projects/pfenninggroup/machineLearningForComputationalBiology/eramamur_stuff/ad_variants_processing/snigdha_snp_list_unique_sentinel_snp_only.txt", 'w') as out_file:
    out_file.write("LEAD_RSIDS" + "\n")
    out_file.writelines(
        "\t".join(lead_rsids) + "\n" for lead_rsids in sentinel_snps
    )