# Generate Locus Compare Input Files Using Sieberts et al. eQTL Data
- **Author(s)** - Frank Grenn
- **Date Started** - January 2021
- **Quick Description:** Make meta5 and Sieberts et al. data files for locus compare plots
- **Data:**   
Data from => https://www.nature.com/articles/s41597-020-00642-8  
Downloaded from => https://www.synapse.org/#!Synapse:syn16984815

In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Working directory, with the QTL folder and its TSV subfolder nested under it
WRKDIR = '$PATH/AppDataProcessing'
QTLDIR = WRKDIR + "/qtl"
TSVDIR = QTLDIR + "/tsv"


## 1) Gene List

In [None]:
# Read genes_by_locus.csv and collect every value of its GENE column
evidence = pd.read_csv(WRKDIR + "/genes_by_locus.csv")
evidence_genes = evidence.GENE.tolist()

# Number of genes, then the first ten of them
print(len(evidence_genes), evidence_genes[:10], sep="\n")


## 2) Get the GWAS data for one of: meta5, progression (HY3 or INS), or the Asian GWAS

#### (a) meta5

In [None]:
# META5: pick the summary statistics file and the genes listed for this GWAS
gwas_id = "META5"
gwas_in = "$PATH/AppDataProcessing/meta5_sumstats_harmonized.csv"

evidence_genes = evidence.GENE[evidence.GWAS == gwas_id].tolist()
print(len(evidence_genes))

# Load the summary statistics and preview their shape and first rows
gwas = pd.read_csv(gwas_in)
print(gwas.shape, gwas.head(), sep="\n")

In [None]:
# Drop SNPs that have no rsid (RSID recorded as ".")
gwas_smr_rsid = gwas.loc[gwas.RSID.ne(".")]
print(gwas_smr_rsid.shape, gwas_smr_rsid.head(), sep="\n")

In [None]:
# The file contains repeated rows; keep only the first occurrence of each
gwas_smr_unique = gwas_smr_rsid[~gwas_smr_rsid.duplicated()]
print(gwas_smr_unique.shape, gwas_smr_unique.head(), sep="\n")

In [None]:
# Rebind `gwas` to the rsid-filtered, de-duplicated table for the steps below
gwas = gwas_smr_unique

#### (b) progression (NOTE: need to run notebook twice for this because each locus has its own summary stats file)

##### Using HY3 Sum Stats File OR...

In [None]:
# Progression loci, HY summary statistics
##rs382940, 9:108058562, 2
# Earlier input, kept for reference:
#gwas_in = '$PATH/AppDataProcessing/locuszoom/surv_HY3.txt'
gwas_id = "Progression"
gwas_in = "$PATH/AppDataProcessing/prog_hy_sumstats_harmonized.csv"

# Genes listed for this GWAS at locus number 2
evidence_genes = evidence.GENE[
    (evidence.GWAS == gwas_id) & (evidence.LOC_NUM == 2)
].tolist()
print(len(evidence_genes))

# Load the summary statistics and preview their shape and first rows
gwas = pd.read_csv(gwas_in)
print(gwas.shape, gwas.head(), sep="\n")



In [None]:
#ignore SNPs with no rsid
# pd.read_csv parses empty fields as NaN by default, and NaN != "" evaluates to
# True, so comparing against "" alone keeps those rows. Drop missing values too.
gwas_smr_rsid = gwas[gwas.RSID.notna() & (gwas.RSID != "")]
print(gwas_smr_rsid.shape)
print(gwas_smr_rsid.head())

In [None]:
# The file contains repeated rows; keep only the first occurrence of each
gwas_smr_unique = gwas_smr_rsid[~gwas_smr_rsid.duplicated()]
print(gwas_smr_unique.shape, gwas_smr_unique.head(), sep="\n")

In [None]:
# Rebind `gwas` to the rsid-filtered, de-duplicated table for the steps below
gwas = gwas_smr_unique

##### ... Using INS Sum Stats File

In [None]:
# Progression loci, INS summary statistics
##rs61863020, 10:112956055, 1
# Earlier input, kept for reference:
##gwas_in = '$PATH/AppDataProcessing/locuszoom/base_INS.txt'
gwas_id = "Progression"
gwas_in = '$PATH/AppDataProcessing/prog_ins_sumstats_harmonized.csv'

# Genes listed for this GWAS at locus number 1
evidence_genes = evidence.GENE[
    (evidence.GWAS == gwas_id) & (evidence.LOC_NUM == 1)
].tolist()
print(len(evidence_genes))

# Load the summary statistics and preview their shape and first rows
gwas = pd.read_csv(gwas_in)
print(gwas.shape, gwas.head(), sep="\n")


In [None]:
#ignore SNPs with no rsid
# pd.read_csv parses empty fields as NaN by default, and NaN != "" evaluates to
# True, so comparing against "" alone keeps those rows. Drop missing values too.
gwas_smr_rsid = gwas[gwas.RSID.notna() & (gwas.RSID != "")]
print(gwas_smr_rsid.shape)
print(gwas_smr_rsid.head())


In [None]:
# The file contains repeated rows; keep only the first occurrence of each
gwas_smr_unique = gwas_smr_rsid[~gwas_smr_rsid.duplicated()]
print(gwas_smr_unique.shape, gwas_smr_unique.head(), sep="\n")

In [None]:
# Rebind `gwas` to the rsid-filtered, de-duplicated table for the steps below
gwas =gwas_smr_unique

#### (c) Asian GWAS

In [None]:
# Asian GWAS: pick the summary statistics file and the genes listed for this GWAS
# Earlier input, kept for reference:
#gwas_in = "$PATH/summary_stats/asian_GWAS/6724PDcases-24851controls-5843213snps-summary-stats-metaP-SE.txt.gz"
gwas_id = "Asian"
gwas_in = "$PATH/AppDataProcessing/asiangwas_sumstats_harmonized.csv"

evidence_genes = evidence.GENE[evidence.GWAS == gwas_id].tolist()
print(len(evidence_genes))

# Load the summary statistics and preview their shape and first rows
gwas = pd.read_csv(gwas_in)
print(gwas.shape, gwas.head(), sep="\n")

In [None]:
#ignore SNPs with no rsid
# pd.read_csv parses empty fields as NaN by default, and NaN != "" evaluates to
# True, so comparing against "" alone keeps those rows. Drop missing values too.
gwas_smr_rsid = gwas[gwas.RSID.notna() & (gwas.RSID != "")]
print(gwas_smr_rsid.shape)
print(gwas_smr_rsid.head())


In [None]:
# The file contains repeated rows; keep only the first occurrence of each
gwas_smr_unique = gwas_smr_rsid[~gwas_smr_rsid.duplicated()]
print(gwas_smr_unique.shape, gwas_smr_unique.head(), sep="\n")

In [None]:
# Rebind `gwas` to the rsid-filtered, de-duplicated table for the steps below
gwas = gwas_smr_unique

## 3) Get Risk Variant Data

In [None]:
# Read the GWAS risk variant table and preview its shape and first rows
gwas_risk_variants = pd.read_csv("$PATH/AppDataProcessing/gwas_risk_variants.csv")
print(gwas_risk_variants.shape, gwas_risk_variants.head(), sep="\n")


In [None]:
# TSV folder for the selected GWAS; the bare name below shows it as the cell output
GWASTSVDIR = TSVDIR + "/" + gwas_id
GWASTSVDIR

## 4) eQTL
Split the eQTL data by the genes we want, to make it easier to read.

In [None]:
# Write one shell command per gene to the swarm file. Each command greps the
# cortex eQTL release CSV for "<gene>," and keeps comma-separated fields 3, 4
# and 8 (awk prints them space-separated) in $PATH/qtl_temp/<gene>_eqtl.csv.
# The `with` block closes the file even if a write fails part-way through.
with open("$PATH/AppDataProcessing/qtl/split_cortex_eqtl_by_gene.swarm","w") as swarm:
    for gene in evidence_genes:
        swarm.write(f"grep {gene}, $PATH/cortical_meta_eqtl_summary_stats/Cortex_MetaAnalysis_ROSMAP_CMC_HBCC_Mayo_cis_eQTL_release.csv | awk -F',' '{{print $3,$4,$8}}' > $PATH/qtl_temp/{gene}_eqtl.csv\n")

In [None]:
#swarm -f $PATH/AppDataProcessing/qtl/split_cortex_eqtl_by_gene.swarm -g 5 -t 2 --partition quick --time=01:00:00

In [None]:
def format_locuscompare_tsvs(gene):
    """Write the eQTL and GWAS locus compare TSV files for one gene.

    Reads the header-less, space-separated ``$PATH/qtl_temp/{gene}_eqtl.csv``,
    names its three columns ``RSID``, ``CHR_BP_REF_ALT`` and ``P``, drops rows
    whose RSID is ".", and writes the result to
    ``$PATH/qtl_temp/{gene}_eqtl.tsv``. The rows of the notebook-level ``gwas``
    DataFrame whose RSID appears among the kept eQTL RSIDs are then written,
    with the same three columns, to ``$PATH/qtl_temp/{gene}_gwas.tsv``.

    The size of the eQTL file in bytes is printed first. Nothing is written
    when that file is empty or yields no rows.

    Args:
        gene: Gene name used to build the input and output file names.

    Raises:
        OSError: If the eQTL file for ``gene`` cannot be stat'ed (for example,
            it does not exist).
    """
    eqtl_file = f"$PATH/qtl_temp/{gene}_eqtl.csv"

    # Stat the file once and reuse the size for both the printout and the check.
    eqtl_size = os.path.getsize(eqtl_file)
    print(eqtl_size)
    if eqtl_size == 0:
        return

    gene_eqtl_data = pd.read_csv(eqtl_file, header=None, sep=" ")
    if len(gene_eqtl_data.index) == 0:
        return

    # Add column names to the gene eQTL data and drop SNPs without an rsid.
    gene_eqtl_data.columns = ['RSID', 'CHR_BP_REF_ALT', 'P']
    gene_eqtl_data = gene_eqtl_data[gene_eqtl_data.RSID != "."]
    gene_eqtl_data.to_csv(f"$PATH/qtl_temp/{gene}_eqtl.tsv", index=False, sep="\t")

    # Keep only the GWAS SNPs that are also present in this gene's eQTL data.
    gwas_filter = gwas[gwas.RSID.isin(gene_eqtl_data.RSID)]
    gwas_filter[['RSID', 'CHR_BP_REF_ALT', 'P']].to_csv(
        f"$PATH/qtl_temp/{gene}_gwas.tsv", index=False, sep="\t"
    )
    
    

In [None]:
# Build the locus compare TSVs for every gene, printing the gene name and a
# 1-based running count as progress. enumerate replaces the manual counter.
for count, gene in enumerate(evidence_genes, start=1):
    print(gene)
    print(count)
    format_locuscompare_tsvs(gene)

    

In [None]:
#grep "\.$(printf '\t')" SLC39A1_eqtl.tsv 