# Loci Gene List
- **Author** - Frank Grenn
- **Date Started** - April 2020
- **Quick Description:** Code to collect the summary statistics and nearby genes for GWAS risk loci, in the format used by the app.

 - #### 1) Get List of Genes 1Mb Up and Downstream of Risk Variants
 - #### 2) Summary Stats For Risk Variants
 - #### 3) Update the `genes_by_locus.csv` File
 - #### 4) Update the `gwas_risk_variants.csv` File


In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Root folder of the app's processed data; the other data paths are built from it.
DATADIR = '/path/to/AppDataProcessing'
# Working folder for the intermediate .bed / overlap / gene-list files.
WRKDIR = DATADIR + '/genelist'
# refFlat gene annotation table (the file name indicates the hg19 build).
refFlat_hg19 = '/path/to/refFlat_HG19.txt'
# Summary statistics of the GWAS being added -- change this for each new GWAS.
sum_stats_file = DATADIR + "/asiangwas_sumstats_harmonized.csv"

## 1) Get List of Genes 1Mb Up and Downstream of Risk Variants
#### refFlat .txt to .bed for overlap later

In [None]:
# Load the refFlat annotation as a headerless, tab-separated table, so its
# columns are addressed by integer position from here on.
refFlat = pd.read_csv(
    refFlat_hg19,
    header=None,
    sep='\t',
)
# Preview the first rows to sanity-check the parse.
print(refFlat.head(5))

In [None]:
# Reorder the refFlat columns so the three coordinate fields come first, as a
# BED file requires.
# NOTE(review): assumes the standard UCSC refFlat layout (0=geneName, 1=name,
# 2=chrom, 4=txStart, 5=txEnd) -- confirm against the input file.
bed_column_positions = [2, 4, 5, 0, 1]
refFlat_bed = refFlat.iloc[:, bed_column_positions]
print(refFlat_bed.head(5))

In [None]:
# Save the reordered annotation as a tab-separated .bed file, with neither the
# DataFrame index nor a header row, for the bedtools overlap step.
refFlat_bed.to_csv(
    f"{WRKDIR}/refFlat_HG19.bed",
    sep='\t',
    header=False,
    index=False,
)

#### Extract Summary Stats for the GWAS Risk Variants
code may need changing depending on file

In [None]:
# Read the GWAS summary statistics (comma-separated, first row is the header).
sum_stats = pd.read_csv(filepath_or_buffer=sum_stats_file)
# Report the table dimensions, then a short preview of the rows.
for summary in (sum_stats.shape, sum_stats.head()):
    print(summary)

#### Filter the summary stats for the risk variants (how to do this depends on the GWAS)


In [None]:
# Keep only the summary-stat rows for the GWAS risk variants, selected by rsID.
risk_rsids = ['rs246814', 'rs9638616']
risk_vars = sum_stats[sum_stats['RSID'].isin(risk_rsids)]
print(risk_vars.head(5))

In [None]:
# Columns carried forward for each risk variant.
info_columns = ['CHR', 'RSID', 'A1', 'A2', 'BETA', 'P', 'SE', 'BP', 'REF', 'ALT']

# Select those columns and renumber the rows 0..n-1 so that the row index can
# later be turned into a locus number.
risk_vars_info = risk_vars.loc[:, info_columns].reset_index(drop=True)
print(risk_vars_info.head(5))


#### Manually assign the nearest gene and Locus Number
hopefully this is stated in the gwas study paper somewhere

In [None]:
# Manually assign the nearest gene, keyed by rsID rather than by row position,
# so the labels stay correct whatever order the variants appear in the
# summary-statistics file.
near_gene_by_rsid = {
    'rs246814': 'SV2C',
    'rs9638616': 'WBSCR17',
}
risk_vars_info['NEAR_GENE'] = risk_vars_info['RSID'].map(near_gene_by_rsid)

# Fail loudly if a risk variant has no gene assigned, instead of writing NaN
# into the app's data files.
unassigned_rsids = risk_vars_info.loc[risk_vars_info['NEAR_GENE'].isna(), 'RSID'].tolist()
if unassigned_rsids:
    raise ValueError(f"No nearest gene assigned for: {unassigned_rsids}")

# Locus number: 1-based position of the variant in the (reset) row index.
risk_vars_info['LOC_NUM'] = risk_vars_info.index + 1

In [None]:
# Preview the table to confirm the NEAR_GENE and LOC_NUM columns are present.
print(risk_vars_info.head())

#### Make a .bed for the Risk Variants
Expand each variant position into a window from 1 Mb upstream to 1 Mb downstream.

In [None]:
# Half-width of the window placed around each risk variant (1 Mb each side).
flank_bp = 1000000

# Build the BED table as a fresh DataFrame instead of assigning into a slice
# of risk_vars_info, which avoids pandas' SettingWithCopyWarning.
variant_bp = risk_vars_info['BP'].astype(int)
risk_vars_bed = pd.DataFrame({
    'CHR': 'chr' + risk_vars_info['CHR'].astype(str),
    # Clamp at 0: a variant within 1 Mb of the chromosome start would otherwise
    # get a negative start coordinate, which is not a valid BED interval.
    'Start': (variant_bp - flank_bp).clip(lower=0),
    'End': variant_bp + flank_bp,
    'Nearest Gene': risk_vars_info['NEAR_GENE'],
    'Locus Number': risk_vars_info['LOC_NUM'],
})

print(risk_vars_bed.head())

In [None]:


# Write the risk-variant windows as a tab-separated BED file. The header row is
# omitted: BED has no column-header line, and this matches how the refFlat BED
# is written, so the file does not depend on a parser tolerating a header.
risk_vars_bed.to_csv(
    f"{WRKDIR}/Asian_GWAS.bed",
    sep='\t',
    header=False,
    index=False,
)

#### Get Overlap
run this in terminal

In [None]:
# Print (not run) the shell commands that intersect the risk-variant windows
# with the refFlat genes; copy the output into a terminal.
print(
    "module load bedtools\n"
    f"intersectBed -a {WRKDIR}/Asian_GWAS.bed -b {WRKDIR}/refFlat_HG19.bed"
    f" -wb > {WRKDIR}/asian_gwas_genes.txt"
)

In [None]:
# Load the bedtools intersect output: tab-separated with no header row, so the
# columns are addressed by integer position.
overlap = pd.read_csv(
    f"{WRKDIR}/asian_gwas_genes.txt",
    header=None,
    sep="\t",
)
print(overlap.head(5))

#### Write the List of Genes to a File

In [None]:
# Take overlap columns 4 and 8, drop repeated pairs, and label them as the
# locus number and gene name.
gene_list = (
    overlap.loc[:, [4, 8]]
    .drop_duplicates()
    .rename(columns={4: 'LOC_NUM', 8: 'GENE'})
)
print(gene_list.shape)
# Save the per-locus gene list as a tab-separated file without the row index.
gene_list.to_csv(f"{WRKDIR}/asian_gwas_gene_list.txt", index=False, sep='\t')


## 2) Summary Stats For Risk Variants

In [None]:
# Display the risk-variant table as the cell output before more columns are added.
risk_vars_info

#### Count Number of Genes Per Locus

In [None]:
# Number of distinct genes per locus in the bedtools overlap (column 4 is the
# locus number and column 8 the gene name, as used to build gene_list).
genes_per_locus = overlap[[4, 8]].drop_duplicates().groupby(4).size()

# Assign all counts in one step instead of the chained
# `df['NUM_GENES'][mask] = ...` form, which raises SettingWithCopyWarning and
# does not update the frame under pandas copy-on-write. Loci with no
# overlapping gene get 0.
risk_vars_info['NUM_GENES'] = (
    risk_vars_info['LOC_NUM'].map(genes_per_locus).fillna(0).astype(int)
)

In [None]:
# Show the whole table to check the per-locus NUM_GENES counts.
print(risk_vars_info)

#### Do any other calculations that are missing

In [None]:
import math

In [None]:
# Force BETA to a numeric dtype; values that cannot be parsed become NaN.
beta_raw = risk_vars_info['BETA']
risk_vars_info['BETA'] = pd.to_numeric(beta_raw, errors='coerce')

In [None]:
# Odds ratio = e ** BETA, computed element-wise over the BETA column.
beta_values = risk_vars_info['BETA']
risk_vars_info['OR'] = np.exp(beta_values)

In [None]:
# Build a "chromosome:position" label for each variant, e.g. "5:12345".
chr_text = risk_vars_info['CHR'].astype(str)
bp_text = risk_vars_info['BP'].astype(str)
risk_vars_info['CHR_BP'] = chr_text.str.cat(bp_text, sep=":")

In [None]:
# No minor allele frequency is computed in this notebook; add MAF as an empty
# placeholder column (presumably so the output keeps the columns the app
# expects -- confirm).
risk_vars_info['MAF'] = None

In [None]:
# Preview the table with the derived columns before it is written to disk.
print(risk_vars_info.head())

In [None]:
# Save the per-locus summary table as CSV, leaving out the DataFrame index.
asian_loci_path = f"{DATADIR}/AsianLoci.csv"
risk_vars_info.to_csv(asian_loci_path, index=False)

## 3) Update the `genes_by_locus.csv` File
append new data to the current version of the file

In [None]:
# Load the current genes-by-locus table that the new GWAS will be added to.
genes_by_locus = pd.read_csv(filepath_or_buffer=f"{DATADIR}/genes_by_locus.csv")
# Report its dimensions and a short preview.
for summary in (genes_by_locus.shape, genes_by_locus.head()):
    print(summary)

In [None]:
# Keep only the META5 and Progression rows; rows for any other GWAS label are
# dropped before the new data is appended.
retained_gwas = ['META5', 'Progression']
genes_by_locus = genes_by_locus[genes_by_locus['GWAS'].isin(retained_gwas)]
print(genes_by_locus.shape)

In [None]:
# Display the new per-locus gene list as the cell output before appending it.
gene_list

#### append the new gwas data 

In [None]:
# Tag every new gene row with the GWAS it came from.
gene_list = gene_list.assign(GWAS='Asian')

# Stack the new rows under the existing table. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0;
# sort=True keeps the previous behaviour of sorting the columns when the two
# frames' columns are not already aligned.
genes_by_locus_new = pd.concat([genes_by_locus, gene_list], sort=True)
print(genes_by_locus_new.shape)
print(genes_by_locus_new.tail())

In [None]:
# Overwrite genes_by_locus.csv with the combined table, leaving out the index.
genes_by_locus_path = f"{DATADIR}/genes_by_locus.csv"
genes_by_locus_new.to_csv(genes_by_locus_path, index=False)

## 4) Update the `gwas_risk_variants.csv` File

In [None]:
# Load the current table of risk variants across all GWAS.
all_risk_vars = pd.read_csv(filepath_or_buffer=f"{DATADIR}/gwas_risk_variants.csv")
# Report its dimensions and a short preview.
for summary in (all_risk_vars.shape, all_risk_vars.head()):
    print(summary)

In [None]:
# Display the new GWAS's risk-variant table as the cell output for reference.
risk_vars_info

In [None]:
# Columns of the new GWAS that go into the combined risk-variant table.
shared_columns = ['LOC_NUM', 'RSID', 'CHR', 'BP', 'REF', 'ALT', 'NEAR_GENE']

# Take an explicit copy so the columns added below go onto an independent
# frame, not a slice of risk_vars_info (avoids SettingWithCopyWarning).
for_all = risk_vars_info[shared_columns].copy()
# "chromosome:position" label for each variant.
for_all['CHR_BP'] = for_all['CHR'].astype(str) + ":" + for_all['BP'].astype(str)
# Tag every row with the GWAS it came from.
for_all['GWAS'] = 'Asian'
# The columns are now LOC_NUM, RSID, CHR, BP, REF, ALT, NEAR_GENE, CHR_BP and
# GWAS, so the previous no-op reassignment of for_all.columns is dropped.
print(for_all.head())

In [None]:
# Stack the new GWAS's variants under the existing table. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE(review): 'Asian' rows already present in gwas_risk_variants.csv are not
# removed first, so re-running this notebook appears to append them a second
# time -- confirm whether that is intended.
append_risk_vars = pd.concat([all_risk_vars, for_all])
print(append_risk_vars.shape)
print(append_risk_vars.head())
print(append_risk_vars.tail())

In [None]:
# Overwrite gwas_risk_variants.csv with the combined table, leaving out the index.
risk_variants_path = f"{DATADIR}/gwas_risk_variants.csv"
append_risk_vars.to_csv(risk_variants_path, index=False)