# Finemapping Processing
- **Author** - Frank Grenn
- **Date Started** - April 2020
- **Quick Description:** filter the finemapping data for the GWAS browser. This code keeps variants with prob > 0.01, assigns each variant to a browser locus number, and annotates each variant with coding-variant information.


In [None]:
import pandas as pd
import numpy as np

get all rows from results.csv with prob > 0.01

In [None]:
# Load the raw finemapping output and preview its dimensions and first rows.
results = pd.read_csv("/path/to/finemap/results.csv")
print(results.shape)
print(results.head())

In [None]:
# Show the column names of the loaded table.
print(results.columns)

In [None]:
# Narrow the table to the identifier and statistic columns used downstream.
results_cols = results.loc[:, ["SNP", "chr", "position", "A1", "A2", "freq", "p", "prob", "log10bf"]]
print(results_cols.shape)
print(results_cols.head())

In [None]:
# Keep only the rows whose "prob" value is strictly above 0.01.
results_filter = results_cols.loc[results_cols["prob"].gt(0.01)]
print(results_filter.shape)
print(results_filter.head())

now liftover the above df (which is in hg38 coordinates) back to hg19

In [None]:
#add some identifier to the cols
# Tag every filtered row with a sequential integer id ("name") so rows can be
# matched up again after the external liftover step.
# assign() builds a new frame instead of writing into a slice of results_cols,
# which avoids pandas' SettingWithCopyWarning / chained-assignment pitfalls.
results_filter = results_filter.assign(name=np.arange(len(results_filter)))
print(results_filter.shape)
print(results_filter.head())
print(results_filter.tail())

In [None]:
# Build the interval file handed to liftover: chr, position, position + 1, name.
# Copy the column subset first so that adding 'position2' modifies this frame
# itself rather than a slice of results_filter (avoids SettingWithCopyWarning).
results_f_sub = results_filter[['chr', 'position', 'name']].copy()
results_f_sub['position2'] = results_f_sub['position'] + 1
results_f_sub = results_f_sub[['chr', 'position', 'position2', 'name']]
print(results_f_sub.shape)
print(results_f_sub.head())
# NOTE(review): a header row is written here (only the index is suppressed),
# while the liftover result is later read with header=None — confirm the
# liftover tool accepts or ignores the header line.
results_f_sub.to_csv("/path/to/AppDataProcessing/finemapping/for_liftover.txt", sep='\t', index=None)

In [None]:
#get the liftover results
# The file is read without a header row, so the four column names are set here.
liftover_hg19 = pd.read_csv(
    "/path/to/AppDataProcessing/finemapping/liftover_results.bed",
    sep="\t",
    header=None,
)
liftover_hg19.columns = ["chr", "position_hg19", "position2", "name"]
# Only the interval start is kept as the hg19 position; the end is dropped.
liftover_hg19 = liftover_hg19.drop(columns="position2")
print(liftover_hg19.shape)
print(liftover_hg19.head())

In [None]:
#now merge back with the original data
# Join on the sequential "name" id. Both frames carry a "chr" column, so the
# result holds chr_x (from the liftover file) and chr_y (from the original data).
results_lo = liftover_hg19.merge(results_filter, on="name", how="inner")
print(results_lo.shape)
print(results_lo.head())

also get the 90 risk variants from GWAS_loci_overview.csv  
we will check whether the new hg19 positions fall inside any of the loci ranges to assign locus numbers

In [None]:
# Load the GWAS loci overview table and preview it.
loci = pd.read_csv("/path/to/AppDataProcessing/GWAS_loci_overview.csv")
print(loci.shape)
print(loci.head())

In [None]:
# Build a window of 1,000,000 bp on either side of each locus position.
# "BP" is text with comma thousands separators, so the commas are stripped
# before converting to int.
loci['start'] = loci['BP'].str.replace(",", "").astype(int) - 1000000
# end = BP + 1,000,000, which is the same as start + 2,000,000
loci['end'] = loci['start'] + 2000000
# Prefix the chromosome label with "chr".
loci['CHR'] = 'chr' + loci['CHR'].astype(str)
print(loci.shape)
print(loci.head())

In [None]:
# Load the per-locus finemapping summary and preview it.
# NOTE(review): `summary` is not referenced again in the cells visible here.
summary = pd.read_csv("/path/to/AppDataProcessing/finemapping/loci_fm_summary.csv")
print(summary.shape)
print(summary.head())

iterate over the filtered results df and assign each row to a locus number from the GWAS_loci_overview ranges

In [None]:
# Assign every lifted-over variant to the GWAS locus whose window
# (loci['start'] .. loci['end'], both bounds exclusive) contains its hg19
# position on the same chromosome.
#
# Matching rows are collected by index label and the two result frames are
# built once after the loop. The original code called DataFrame.append on
# every iteration; that method was removed in pandas 2.0 and re-copied the
# accumulated frame each time. Selecting rows with .loc also keeps the original
# column dtypes instead of upcasting them row by row.
results_lo['index_variant'] = ""
results_lo['locus number'] = ""

matched_labels = []
unmatched_labels = []

for index, row in results_lo.iterrows():
    index_row = loci[
        (loci["CHR"] == row["chr_x"])
        & (loci['start'] < row["position_hg19"])
        & (loci['end'] > row["position_hg19"])
    ]

    #if not assigned to a locus
    if index_row.empty:
        # Look the row up by label (.loc), matching the labels iterrows() yields.
        print(results_lo.loc[index])
        print("\n\n\n")
        unmatched_labels.append(index)
        continue

    # If several locus windows contain the position, the first one in the
    # loci table is used.
    results_lo.at[index, 'index_variant'] = index_row['SNP'].iloc[0]
    results_lo.at[index, 'locus number'] = index_row['Locus Number'].iloc[0]
    matched_labels.append(index)

# Variants that fell inside a locus window / variants that did not.
have_locus = results_lo.loc[matched_labels].copy()
no_locus = results_lo.loc[unmatched_labels].copy()

print(results_lo.shape)
print(results_lo.head())
print("final df:")
print(have_locus.shape)

In [None]:
# Number of distinct locus numbers that received at least one variant.
print(len(set(have_locus['locus number'])))

In [None]:
# Compare row counts: variants assigned to a locus vs. all lifted-over variants.
print(have_locus.shape)
print(results_lo.shape)

In [None]:
#see the loci that aren't covered by the results
# One-sided difference: locus numbers present in the overview table but absent
# from have_locus. (`^` is the symmetric difference and would also report
# values found only in have_locus.)
set(loci['Locus Number']) - set(have_locus['locus number'])

check for coding variants

In [None]:
# Load the tab-separated annotation table and preview it.
# low_memory=False makes pandas read the file in one pass rather than in chunks.
annot = pd.read_csv("/path/to/HRC_ouput_annovar_ALL.txt",sep='\t', skiprows=0, low_memory = False)
print(annot.shape)
print(annot.head())

In [None]:
# Preview the variants that were assigned to a locus.
have_locus.head()

In [None]:
# Prepare the join keys for the annotation merge: strip the "chr" prefix from
# the chromosome and turn the hg19 position into an integer-formatted string.
# NOTE(review): presumably annot stores Chr without the prefix and Start as
# text — confirm against the annotation file.
have_locus = have_locus.assign(
    chr_x=have_locus['chr_x'].str.replace("chr", ""),
    position_hg19=have_locus['position_hg19'].astype(int).astype(str),
)

print(have_locus.shape)
print(have_locus.head())

In [None]:
# Left-join the annotation onto the variants by chromosome and position, so
# every variant is kept even when it has no annotation row.
coding = have_locus.merge(
    annot,
    left_on=["chr_x", "position_hg19"],
    right_on=["Chr", "Start"],
    how="left",
)
print(coding.shape)
print(coding.head())

In [None]:
# Save the annotated variants (without the index column).
coding.to_csv("/path/to/AppDataProcessing/finemapping/finemapcoding.csv",index = None)

In [None]:
#check how many coding variants we have
# Compare by value: `'.' is not c` tested object identity, which only works by
# accident of string caching and triggers a SyntaxWarning on modern Python.
# Rows with no annotation match (NaN from the left merge) are excluded too,
# since they carry no AA change.
aa_changes = [c for c in coding['AAChange.refGene'] if pd.notna(c) and c != '.']
len(aa_changes)

### Make DataFrames containing rows with and without coding data

In [None]:
# Split on the exonic-function column: "." marks rows without coding data,
# every other value (including missing ones) goes to fm_coding.
fm_coding = coding[coding['ExonicFunc.refGene'].ne('.')]
print(fm_coding.shape)
fm_not_coding = coding[coding['ExonicFunc.refGene'].eq('.')]
print(fm_not_coding.shape)

merge the finemapping results with coding variants with coding variant data from the app to look for which AA change to use

In [None]:
# Load the coding-variant table already used by the app and preview it.
app_coding_var = pd.read_csv("/path/to/AppDataProcessing/results/CodingVariants.csv")
print(app_coding_var.shape)
print(app_coding_var.head())

In [None]:
# Left-join the coding finemapping rows onto the app's coding-variant table
# (finemap "SNP" against app "ID"); rows without a match get NaN in "ID".
merged = pd.merge(left=fm_coding, right=app_coding_var, left_on="SNP", right_on="ID", how="left")
# Print the shape explicitly: a bare expression is only displayed when it is
# the last statement of a notebook cell, so `merged.shape` alone showed nothing.
print(merged.shape)
merged

In [None]:
# Rows that found a partner in the app's coding-variant table (non-missing
# "ID"). reset_index() renumbers the rows and keeps the old labels in an
# "index" column.
has_app_coding = merged.dropna(subset=['ID']).reset_index()
has_app_coding

In [None]:
# Sanity check: print the label of every row whose app "AA Change" text occurs
# inside the annotation's "AAChange.refGene" string. If every row is printed,
# the app's AA change can be used to resolve multi-transcript annotations.
for index, app_change, annot_change in zip(
    has_app_coding.index,
    has_app_coding['AA Change'],
    has_app_coding['AAChange.refGene'],
):
    if app_change in annot_change:
        print(index)


In [None]:
# List the merged column names (including the _x/_y suffixed ones).
has_app_coding.columns

In [None]:
# Keep the shared output columns, taking the amino-acid change from the app's
# 'AA Change' column and exposing it under the 'AAChange.refGene' name used by
# the other partial tables.
# The original line also chain-assigned the same 13 labels to
# fm_not_coding.columns; fm_not_coding still has every column of `coding` at
# this point, so that raised a length-mismatch ValueError. fm_not_coding is
# formatted in its own cell further down, so the stray assignment is removed.
has_app_coding = has_app_coding[['locus number_x','SNP','Chr','position_hg19','Ref','Alt','freq','p','prob','log10bf','Func.refGene','ExonicFunc.refGene','AA Change']]
has_app_coding = has_app_coding.rename(columns={'AA Change': 'AAChange.refGene'})
has_app_coding

now the ones we still need to check

In [None]:
# Coding rows that found no partner in the app table (missing "ID").
no_app_coding = merged.loc[merged['ID'].isna()]
no_app_coding

if just one AAChange.refGene then we should be fine

In [None]:
# Rows whose AAChange.refGene has no comma, i.e. a single AA change entry,
# reduced to the shared output columns.
no_app_coding_single = no_app_coding.loc[
    no_app_coding['AAChange.refGene'].str.count(',').eq(0),
    ['locus number_x','SNP','Chr','position_hg19','Ref','Alt','freq','p','prob','log10bf','Func.refGene','ExonicFunc.refGene','AAChange.refGene'],
]
no_app_coding_single

if multiple then we need to find the one we want

In [None]:
# Rows whose AAChange.refGene lists several comma-separated entries.
no_app_coding_multi = no_app_coding.loc[no_app_coding['AAChange.refGene'].str.count(',').ne(0)]
no_app_coding_multi

use gnomad to get the best transcript for each SNP  
rs72819488:ENST00000317620  
rs4858798:ENST00000328631  
rs2230457:ENST00000304400

then use this R code to look up the NCBI RefSeq ids for those transcripts, so we can pick the matching AA change

```
library(biomaRt)
ensembl <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
# Ensembl transcript ids to look up, without version suffix.
# (The versioned ids "ENST00000317620.9", "ENST00000328631.5" and
# "ENST00000304400.7" were assigned first and immediately overwritten.)
values <- c("ENST00000317620","ENST00000328631","ENST00000304400")
# `filters` is an argument of getBM(); it was previously misplaced inside the
# attributes vector, so no filter was actually passed.
results <- getBM(attributes = c("refseq_mrna", "ensembl_transcript_id", "hgnc_symbol"),
                 filters = "ensembl_transcript_id",
                 values = values,
                 mart = ensembl)

# Print the RefSeq mRNA ids returned for each transcript.
for(value in values)
{
  print(value)
  print(results[which(results$ensembl_transcript_id == value),]$refseq_mrna)
  print("\n")
}
#listDatasets(useMart("ensembl"))
#listFilters(ensembl)
#listAttributes(ensembl)


```

rs72819488:ENST00000317620:NM_001165978  
rs4858798:ENST00000328631:NM_001005909 or NM_016291  
rs2230457:ENST00000304400:NM_001364583 or NM_000919

In [None]:
# RefSeq transcript ids accepted when choosing among several AA changes.
refseq_ids = [
    "NM_001165978",  # rs72819488
    "NM_001005909",  # rs4858798
    "NM_016291",     # rs4858798 (alternative)
    "NM_001364583",  # rs2230457
    "NM_000919",     # rs2230457 (alternative)
]

In [None]:
# For rows whose annotation lists several comma-separated AA changes, keep the
# entry that mentions one of the chosen RefSeq transcript ids.
# Work on an explicit copy so the assignment below modifies this frame itself
# rather than a slice of an upstream frame (avoids SettingWithCopyWarning).
no_app_coding_multi = no_app_coding_multi.copy()

# The original ran this loop twice (once to print, once to assign); a single
# pass prints the same matches in the same order and assigns as it goes.
for index, row in no_app_coding_multi.iterrows():
    for val in row['AAChange.refGene'].split(","):
        for rs in refseq_ids:
            if rs in val:
                print(val)
                print(rs)
                # If several entries match, the last matching entry wins.
                no_app_coding_multi.at[index, 'AAChange.refGene'] = val

no_app_coding_multi

In [None]:
# Reduce the resolved multi-entry rows to the shared output columns.
no_app_coding_multi = no_app_coding_multi.loc[
    :,
    ['locus number_x','SNP','Chr','position_hg19','Ref','Alt','freq','p','prob','log10bf','Func.refGene','ExonicFunc.refGene','AAChange.refGene'],
]
no_app_coding_multi

format the df with no coding data

In [None]:
# List the columns of the non-coding rows before selecting the output subset.
fm_not_coding.columns

In [None]:
# Reduce the non-coding rows to the shared output columns and rename
# 'locus number' to 'locus number_x' so the labels line up with the tables
# derived from `merged`.
fm_not_coding = fm_not_coding[
    ['locus number','SNP','Chr','position_hg19','Ref','Alt','freq','p','prob','log10bf','Func.refGene','ExonicFunc.refGene','AAChange.refGene']
].rename(columns={'locus number': 'locus number_x'})
fm_not_coding

#### now append everything together

In [None]:
# Row/column counts of the four partial tables that are about to be combined.
print(fm_not_coding.shape)
print(no_app_coding_multi.shape)
print(no_app_coding_single.shape)
print(has_app_coding.shape)

In [None]:
# Column labels of the four partial tables; they should match before combining.
print(fm_not_coding.columns)
print(no_app_coding_multi.columns)
print(no_app_coding_single.columns)
print(has_app_coding.columns)

In [None]:
# Stack the four partial tables into the final table, in the same order as
# before. pd.concat replaces the chained DataFrame.append calls, which were
# removed in pandas 2.0.
final_df = pd.concat([no_app_coding_multi, no_app_coding_single, fm_not_coding, has_app_coding])
# Rename the columns to the labels used in the output file.
final_df.columns = ['Locus Number','SNP','Chr','Position', 'Ref', 'Alt', 'Freq', 'P-value', 'prob', 'log10bf','Func.refGene','ExonicFunc.refGene','AAChange.refGene']
# Cells equal to "." become the literal string "NA".
final_df = final_df.replace(".","NA")
final_df = final_df.drop_duplicates()
final_df.shape

In [None]:
# Save the filtered finemapping table (without the index column).
final_df.to_csv("/path/to/AppDataProcessing/results/fineMappingFilteredData.csv",index=None)

In [None]:
# Reload the saved table from disk and preview it.
final_df = pd.read_csv("/path/to/AppDataProcessing/results/fineMappingFilteredData.csv")
print(final_df.head())

In [None]:
# Prefix the chromosome labels with "chr".
# NOTE(review): re-running this cell without reloading final_df would add the
# prefix a second time.
final_df['Chr'] = 'chr'+final_df['Chr'].astype(str)
print(final_df.head())

In [None]:
# BED-style export of the variants, ordered by chromosome and then position.
variants = final_df[['Chr', 'Position', 'SNP']].sort_values(by=['Chr', 'Position'])

# Position is written twice so it serves as both interval start and end.
variants = variants[['Chr', 'Position', 'Position', 'SNP']]
print(variants.shape)
print(variants.head())
# No header and no index: the file is consumed by bedtools.
variants.to_csv(
    "/path/to/AppDataProcessing/finemapping/filtered.bed",
    index=None,
    header=None,
    sep='\t',
)

In [None]:
# Load the headerless, tab-separated refFlat table; columns get integer labels.
refhg19 = pd.read_csv("/path/to/refFlat_HG19.txt",sep='\t',header =None)

print(refhg19.head())

In [None]:

# Pick columns 2, 4, 5 and 0 (in that order) and name them chr/start/end/gene.
# The table was read with header=None, so its column labels are the integer
# positions and selecting by label is the same as selecting by position.
ref = refhg19[[2, 4, 5, 0]].rename(columns={2: 'chr', 4: 'start', 5: 'end', 0: 'gene'})

In [None]:
# Preview the renamed gene-interval table.
print(ref.head())

In [None]:
# Sort the gene intervals by chromosome, then start, then end.
ref  = ref.sort_values(by=['chr','start','end'])
print(ref.head())

In [None]:
# Write the sorted gene intervals as a headerless, tab-separated BED file.
ref.to_csv("/path/to/ref.bed",index=None,header=None,sep='\t')

In [None]:
# Only prints the bedtools command; nothing is executed here.
print("bedtools intersect -a AppDataProcessing/finemapping/filtered.bed -b ref.bed -wb > test.txt")

In [None]:
# Only prints the bedtools command; nothing is executed here. Its output file
# (test_closest.txt) is the one loaded in the next cell.
print("bedtools closest -a AppDataProcessing/finemapping/filtered.bed -b ref.bed -wb > test_closest.txt")

In [None]:
# Load the headerless bedtools-closest output.
# Note: this rebinds `results`, which earlier held the raw finemapping table.
results = pd.read_csv("/path/to/test_closest.txt",sep='\t',header=None)
print(results.shape)
print(results.head())

In [None]:
# Drop rows that are exact duplicates across all columns.
results = results.drop_duplicates()

In [None]:
# Row count after removing duplicates.
print(results.shape)

In [None]:
# Keep columns 3 and 7 of the bedtools output as variant id and nearest gene.
# NOTE(review): assumes the 4 filtered.bed columns come first and are followed
# by the 4 ref.bed columns, so that 3 = SNP and 7 = gene — confirm.
rs_neargene = results.iloc[:,[3,7]]
rs_neargene.columns = ['RSID','NearGene']
print(rs_neargene.shape)
print(rs_neargene.head())

In [None]:
# Number of distinct (RSID, NearGene) pairs.
print(rs_neargene.drop_duplicates().shape)

In [None]:
# Attach the nearest gene to each variant. The inner join drops variants with
# no RSID match, and repeated (RSID, NearGene) pairs produce repeated rows.
merge = final_df.merge(rs_neargene, left_on='SNP', right_on='RSID', how='inner')
print(merge.shape)
print(merge.head())

In [None]:
# Remove the exact duplicate rows created by the merge.
merge_nodup = merge.drop_duplicates()
print(merge_nodup.shape)

In [None]:
# Preview the de-duplicated table.
print(merge_nodup.head())

In [None]:
# Save the de-duplicated table with the NearGene column (without the index).
merge_nodup.to_csv("/path/to/results.csv",index=None)

In [None]:
# For comparison: size and first rows of the table before the near-gene merge.
print(final_df.shape)
print(final_df.head())

In [None]:
# Check whether final_df itself contains exact duplicate rows.
nodup = final_df.drop_duplicates()
print(nodup.shape)