# Import Gene Annotation File

https://github.com/hakha-most/gwas_eqtl/blob/master/gene_annotations/genes.protein_coding.v39.gtf

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gzip

# Local path to GTF file
gtf_file_path = 'genes.protein_coding.v39.gtf'

# Load the tab-separated annotation table; its first row holds the column names
gtf_df = pd.read_csv(gtf_file_path, sep='\t', header=0)

# Preview the genes with the smallest start coordinates. sort_values returns a
# new DataFrame instead of sorting in place, so head() is chained onto it;
# gtf_df itself keeps the file's row order.
gtf_df.sort_values('start').head()

#### convert chr to int

In [8]:
# Drop the first three characters (the 'chr' prefix) and cast the remainder to int.
# NOTE(review): astype(int) raises on non-numeric labels such as 'chrX' —
# confirm the annotation file only lists numbered chromosomes.
gtf_df['chr'] = gtf_df['chr'].str[3:].astype(int)
gtf_df['start'] = gtf_df['start'].astype(int)
# Per-chromosome range of gene start coordinates. The aggregations are passed
# as a list: a set has no defined order, so the min/max column order could
# change from one session to the next.
gtf_df.groupby('chr')['start'].agg(['min', 'max']).reset_index()

Gene positions restart on each chromosome (coordinates are chromosome-relative), so any position-based join must also match on `chr`.

### File Description

This file contains **gene annotations** for **protein-coding genes** and provides information about the **location of genes on the genome**. Below is a description of each column:

1. **chr**: The chromosome where the gene is located (e.g., `chr1`, `chr2`, etc.).
2. **start**: The start position of the gene on the chromosome (in base pairs).
3. **end**: The end position of the gene on the chromosome (in base pairs).
4. **strand**: Indicates the strand on which the gene is located (`+` for forward strand, `-` for reverse strand).
5. **GeneSymbol**: The symbol or name of the gene (e.g., `OR4F5`, `SAMD11`), typically assigned by organizations like HGNC (HUGO Gene Nomenclature Committee).
6. **cons**: This column indicates the type of gene. In this file, all genes are classified as **protein_coding**.
7. **gene**: The Ensembl gene ID, a unique identifier for the gene (e.g., `ENSG00000186092`).
8. **hgnc_id**: The unique identifier for the gene assigned by the **HGNC** (HUGO Gene Nomenclature Committee).
9. **tss**: The **transcription start site**, the position where transcription of the gene starts on the chromosome.
10. **tes**: The **transcription end site**, the position where transcription of the gene ends on the chromosome.

# Import GWAS data

 https://docs.google.com/spreadsheets/d/1kvPoupSzsSFBNSztMzl04xMoSC3Kcx3CrjVf4yBmESU/edit?gid=178908679#gid=178908679 (row 7217)
 
 File: `50_irnt.gwas.imputed_v3.both_sexes.tsv.bgz`

In [10]:
# Location of the compressed GWAS summary statistics
gwas_file_path = '50_irnt.gwas.imputed_v3.both_sexes.tsv.bgz'

# Decompress the .bgz file through the gzip module in text mode and parse
# the tab-delimited stream into a pandas DataFrame
with gzip.open(gwas_file_path, mode='rt') as gwas_handle:
    gwas_df = pd.read_csv(gwas_handle, delimiter='\t')

In [None]:
# Display the GWAS DataFrame to inspect the structure of the GWAS file
gwas_df

### GWAS Summary Statistics Dataset Description

1. **variant**: The unique identifier for each SNP (Single Nucleotide Polymorphism). This can include information like chromosome, position, reference allele, and alternative allele (e.g., `1:12345:A:G`).
   - **Example**: `1:10583:T:G`

2. **minor_allele**: The allele that is less frequent in the population (minor allele) for this particular SNP.
   - **Example**: `G`

3. **minor_AF**: The **minor allele frequency** (AF), which represents the frequency of the minor allele in the population. It ranges from 0 to 1.
   - **Example**: `0.35` (35% of the allele copies in the sample are the minor allele)

4. **low_confidence_variant**: A flag indicating whether the variant has **low confidence** due to imputation quality or other uncertainties. Values may be `TRUE` or `FALSE`.
   - **Example**: `FALSE`

5. **n_complete_samples**: The number of samples for which complete genotype data is available for this variant.
   - **Example**: `300,000`

6. **AC**: The **allele count** of the minor allele, i.e., the number of times the minor allele appears in the study population (across all samples).
   - **Example**: `50000`

7. **ytx**: The dot product of the phenotype vector `y` and the genotype (alternate-allele dosage) vector `x` for this variant. It is an intermediate quantity of the regression, not an effect size.

8. **beta**: The **effect size** of the SNP on the trait being studied (in this case, likely height). It represents the change in the trait per additional copy of the minor allele.
   - **Example**: `0.05` (the trait increases by 0.05 units for each additional copy of the minor allele)

9. **se**: The **standard error** of the effect size (beta), indicating the precision of the estimated effect.
   - **Example**: `0.01`

10. **tstat**: The **t-statistic** for the beta estimate, which is the ratio of the beta estimate to its standard error.
    - **Example**: `5.0` (higher values indicate more significant associations)

11. **pval**: The **p-value** of the association between the SNP and the trait. This indicates the significance of the result, with smaller p-values suggesting stronger evidence that the SNP is associated with the trait.
    - **Example**: `1.2e-6` (a very small p-value, indicating strong evidence of association)

# Find Variant Chromosome and Position

In [13]:
# Split the colon-separated variant ID into chromosome, position, ref and alt
gwas_df[['chr', 'pos', 'ref', 'alt']] = gwas_df['variant'].str.split(':', expand=True)
# Drop chromosome X so the remaining labels can be cast to int. The explicit
# copy makes the assignments below write to an independent frame rather than
# to a slice of the original (which triggers SettingWithCopyWarning).
gwas_df = gwas_df.loc[gwas_df['chr'] != 'X'].copy()
gwas_df['chr'] = gwas_df['chr'].astype(int)
gwas_df['pos'] = gwas_df['pos'].astype(int)

# Per-chromosome position range. A list (not a set) keeps the min/max column
# order deterministic.
gwas_df.groupby('chr')['pos'].agg(['min', 'max']).reset_index()

Unnamed: 0,variant,minor_allele,minor_AF,low_confidence_variant,n_complete_samples,AC,ytx,beta,se,tstat,pval,chr,pos,ref,alt
0,1:15791:C:T,T,5.44076e-09,True,360388,0.003922,0.003474,18.0499,178.468,0.101138,0.919441,1,15791,C,T


As with the gene annotations, variant positions restart on each chromosome.

# Join to Find 5 Nearest Genes 

In [31]:
def repeated_forward_merge(gwas_df, gtf_df, num_merges=3):
    """Attach the next ``num_merges`` gene starts at or after each variant.

    Parameters
    ----------
    gwas_df : DataFrame with columns ``variant``, ``pval``, ``chr``, ``pos``.
    gtf_df : DataFrame with columns ``chr``, ``start``, ``GeneSymbol``, ``gene``.
    num_merges : number of successive forward neighbours to attach.

    Returns
    -------
    DataFrame with the four variant columns plus ``start_f{i}``,
    ``GeneSymbol_f{i}`` and ``gene_f{i}`` for ``i = 1..num_merges``, ordered by
    the last ``start_f`` column. Missing neighbours are NaN.

    Step 1 accepts a gene whose start equals the variant position; later steps
    search strictly beyond the previous hit, so a second gene sharing that
    exact start is never returned.
    """
    variants = gwas_df[['variant', 'pval', 'chr', 'pos']].sort_values('pos')
    genes = gtf_df[['chr', 'start', 'GeneSymbol', 'gene']].sort_values('start')

    # merge_asof needs both keys to share a dtype; float also lets the
    # intermediate key columns hold the +inf placeholder used below.
    genes['start'] = genes['start'].astype(float)
    variants['pos'] = variants['pos'].astype(float)

    def attach_next_gene(left, key, step, **asof_options):
        # One forward as-of join of `left` (sorted on `key`) against the genes.
        hit = pd.merge_asof(
            left,
            genes,
            by='chr',
            left_on=key,
            right_on='start',
            direction='forward',
            **asof_options,
        )
        # Rows without a match get +inf instead of NaN so the column can act
        # as the (null-free, sortable) key of the following join.
        hit['start'] = hit['start'].fillna(np.inf)
        labelled = hit.rename(
            columns={name: f'{name}_f{step}' for name in ('start', 'GeneSymbol', 'gene')}
        )
        return labelled.sort_values(f'start_f{step}')

    merged_df = attach_next_gene(variants, 'pos', 1, suffixes=('', '_f1'))
    for step in range(2, num_merges + 1):
        merged_df = attach_next_gene(
            merged_df,
            f'start_f{step - 1}',
            step,
            allow_exact_matches=False,
            suffixes=(f'_f{step - 1}', f'_f{step}'),
        )

    # Turn the +inf placeholders back into missing values.
    for step in range(1, num_merges + 1):
        column = f'start_f{step}'
        merged_df[column] = merged_df[column].replace(np.inf, np.nan)

    return merged_df

# Forward pass: attach the three nearest gene starts at or after each variant position
f_final = repeated_forward_merge(gwas_df, gtf_df, num_merges=3)

In [33]:
# Backward counterpart of the forward merge; -inf placeholders are turned back into NaN before returning
def repeated_backward_merge_with_nan(gwas_df, gtf_df, num_merges=3):
    """Attach the ``num_merges`` gene starts at or before each variant.

    Parameters
    ----------
    gwas_df : DataFrame with at least ``chr`` and ``pos``; all of its columns
        are carried through to the result.
    gtf_df : DataFrame with columns ``chr``, ``start``, ``GeneSymbol``, ``gene``.
    num_merges : number of successive backward neighbours to attach.

    Returns
    -------
    DataFrame with the input columns plus ``start_b{i}``, ``GeneSymbol_b{i}``
    and ``gene_b{i}`` for ``i = 1..num_merges``, ordered by the last
    ``start_b`` column. Missing neighbours are NaN.

    Step 1 accepts a gene whose start equals the variant position; later steps
    search strictly before the previous hit.
    """
    variants = gwas_df.sort_values('pos')
    genes = gtf_df[['chr', 'start', 'GeneSymbol', 'gene']].sort_values('start')

    # merge_asof needs both keys to share a dtype; float also lets the
    # intermediate key columns hold the -inf placeholder used below.
    genes['start'] = genes['start'].astype(float)
    variants['pos'] = variants['pos'].astype(float)

    def attach_previous_gene(left, key, step, **asof_options):
        # One backward as-of join of `left` (sorted on `key`) against the genes.
        hit = pd.merge_asof(
            left,
            genes,
            by='chr',
            left_on=key,
            right_on='start',
            direction='backward',
            **asof_options,
        )
        # Rows without a match get -inf instead of NaN so the column can act
        # as the (null-free, sortable) key of the following join.
        hit['start'] = hit['start'].fillna(-np.inf)
        labelled = hit.rename(
            columns={name: f'{name}_b{step}' for name in ('start', 'GeneSymbol', 'gene')}
        )
        return labelled.sort_values(f'start_b{step}')

    merged_df = attach_previous_gene(variants, 'pos', 1, suffixes=('', '_b1'))
    for step in range(2, num_merges + 1):
        merged_df = attach_previous_gene(
            merged_df,
            f'start_b{step - 1}',
            step,
            allow_exact_matches=False,
            suffixes=(f'_b{step - 1}', f'_b{step}'),
        )

    # Turn the -inf placeholders back into missing values.
    for step in range(1, num_merges + 1):
        column = f'start_b{step}'
        merged_df[column] = merged_df[column].replace(-np.inf, np.nan)

    return merged_df

# Backward pass, run on the forward result so each variant row carries both
# its following (_f*) and preceding (_b*) genes
f_b_final = repeated_backward_merge_with_nan(f_final, gtf_df, num_merges=3)

In [35]:
# Signed offsets between each neighbouring gene start and the variant position
def calculate_differences(df, num_merges=3):
    """Add ``diff_f{i}`` and ``diff_b{i}`` columns (gene start minus ``pos``).

    The columns are written into ``df`` itself — all forward offsets first,
    then all backward offsets — and the same object is returned.
    """
    variant_pos = df['pos']
    for direction in ('f', 'b'):
        for rank in range(1, num_merges + 1):
            df[f'diff_{direction}{rank}'] = df[f'start_{direction}{rank}'] - variant_pos

    return df

# NOTE: calculate_differences writes the diff_* columns into f_b_final and
# returns that same object, so both names refer to one DataFrame
f_b_final_with_differences = calculate_differences(f_b_final, num_merges=3)

In [None]:
# Mean signed offset (gene start minus variant position) for each neighbour rank
f_b_final_with_differences[['diff_f1', 'diff_f2', 'diff_f3', 'diff_b1', 'diff_b2', 'diff_b3']].mean()

## Validation of the above results

Implement a row-wise function that finds the 5 nearest genes for each variant and computes inverse-distance-weighted s_het scores. This independent solution is used to cross-check the entries produced by the `merge_asof` method above.

In [139]:
from pandarallel import pandarallel
import os
# NOTE(review): hard-coded absolute path; the relative file names read below
# resolve against this directory
os.chdir("/Users/tanvibansal/Documents/GitHub/Capstone")
#grab a small sample of the gwas dataframe containing mutation information 
# The two head() results are discarded (they are not the cell's last
# expression), so these lines display nothing
gwas_df.head()
gtf_df.head()
# 100,000 distinct row positions drawn at random (replace=False)
# NOTE(review): gwas_sample is not referenced again in the code visible here
gwas_sample = gwas_df.iloc[np.random.choice(range(len(gwas_df)),size=100000,replace=False)] 

#drop the genes that are not present in the s_het file as these may cause na's that will break the ldsc algorithm in the next step
# sheet_name=1 selects the workbook's second sheet; its 'ensg' column is
# renamed to 'GeneSymbol' and becomes the index used for the join below
s_het_info = pd.read_excel('s_het_info.xlsx',sheet_name=1).rename(columns={"ensg":"GeneSymbol"}).set_index("GeneSymbol")
# Inner join keeps only genes present in both tables, then keeps the location columns
gtf_df_tidy = s_het_info.join(gtf_df.set_index("GeneSymbol"),how="inner")[["chr","start","end","gene"]]

#write a function to find the k (default 5) nearest genes by absolute difference in mutation position and gene start position
def nearest_gene_finder(m, k=5):
    """Return the row ``m`` extended with its ``k`` nearest genes.

    Distance is the absolute difference between ``m.pos`` and the ``start``
    of every row of the module-level ``gtf_df_tidy`` whose ``chr`` equals
    ``m.chr``.

    Parameters
    ----------
    m : one row of the variant table; must expose ``chr`` and ``pos``.
    k : number of neighbours to report (default 5, the original fixed value).

    Returns
    -------
    One-row DataFrame holding the fields of ``m`` plus ``GeneSymbol.f1..fk``
    (index labels of ``gtf_df_tidy``) and ``dist.f1..fk`` (kilobases),
    nearest first. If the chromosome has fewer than ``k`` genes, the
    remaining slots are NaN instead of raising a length-mismatch error.
    """
    #get the absolute l1 distance
    same_chr_starts = gtf_df_tidy.loc[gtf_df_tidy.chr == m.chr, "start"]
    dists = np.abs(same_chr_starts - m.pos)
    top_k = dists.sort_values().iloc[:k]

    # Pad to exactly k entries so the column assignments below always line up
    n_missing = k - len(top_k)
    gene_labels = list(top_k.index) + [np.nan] * n_missing
    dists_kb = list(top_k.values / 1000) + [np.nan] * n_missing  #convert distance to kilobases by dividing through 1000

    out = pd.DataFrame(m).T
    out[[f"GeneSymbol.f{i}" for i in range(1, k + 1)]] = gene_labels
    out[[f"dist.f{i}" for i in range(1, k + 1)]] = dists_kb
    return out

# gc.collect() is used below but gc was never imported in this notebook
import gc

#create a sample of the gwas dataframe with only mutations on chr 22 for this stage
gwas_sample_22 = gwas_df.loc[gwas_df.chr == 22]

#apply the function over each row of the sample mutation df
pandarallel.initialize(progress_bar=True)
mutn_5_nearest_genes_sample = gwas_sample_22.parallel_apply(nearest_gene_finder, axis=1)
gc.collect()
# Each element is a one-row DataFrame; stack them into a single table
mutn_5_nearest_genes_sample_df = pd.concat(list(mutn_5_nearest_genes_sample))

#now join the s_het values onto each of our 5 nearest genes and compute the distance weighted s_het
s_het_w = mutn_5_nearest_genes_sample_df.copy(deep=True)
for i in range(1, 6):
    gene_col = "GeneSymbol.f%s" % (i)
    # post_mean lookup indexed under the name of the i-th neighbour column
    s_het_temp = s_het_info[["post_mean"]].reset_index().rename(columns={"GeneSymbol": gene_col}).set_index(gene_col)
    s_het_w = s_het_w.reset_index().set_index(gene_col).join(s_het_temp)
    # weight = s_het posterior mean / distance in kilobases
    s_het_w["s_het_w.f%s" % (i)] = s_het_w["post_mean"] / s_het_w["dist.f%s" % (i)]
    s_het_w = s_het_w.drop(columns="post_mean")
s_het_w = s_het_w.reset_index()
s_het_w

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.
   0.00%                                          |        0 /    17977 |      
   0.00%                                          |        0 /    17977 |      
   0.00%                                          |        0 /    17977 |      
   0.00%                                          |        0 /    17977 |      
   0.00%                                          |        0 /    17977 |      
   0.00%                                          |        0 /    17977 |      
   0.00%                                          |        0 /    17976 |      
   0.00%                                          |        0 /    17976 |      
   0.00%                                          |        0 /    17976 |      
   0.00%                                          |        0 /    17976 |     MMMMMMMMM   1.00%      

Unnamed: 0,GeneSymbol.f5,GeneSymbol.f4,GeneSymbol.f3,GeneSymbol.f2,GeneSymbol.f1,index,variant,minor_allele,minor_AF,low_confidence_variant,n_complete_samples,AC,ytx,beta,se,tstat,pval,chr,pos,ref,alt,dist.f1,dist.f2,dist.f3,dist.f4,dist.f5,s_het_w.f1,s_het_w.f2,s_het_w.f3,s_het_w.f4,s_het_w.f5
0,ENSG00000183307,ENSG00000177663,ENSG00000215568,ENSG00000172967,ENSG00000198445,13184537,22:16464274:A:C,C,0.084046,False,360388,60578.3,-262.512,0.003413,0.003183,1.07204,0.283702,22,16464274,A,C,607.367,800.028,978.552,1101.570,1132.913,0.000004,0.000002,2.546518e-07,0.000008,0.000105
1,ENSG00000183307,ENSG00000177663,ENSG00000215568,ENSG00000172967,ENSG00000198445,13184538,22:16488635:C:A,A,0.080987,False,360388,58373.6,-220.884,0.003806,0.003111,1.22341,0.221175,22,16488635,C,A,583.006,775.667,954.191,1077.209,1108.552,0.000005,0.000002,2.611532e-07,0.000008,0.000107
2,ENSG00000183307,ENSG00000177663,ENSG00000215568,ENSG00000172967,ENSG00000198445,13184539,22:16488702:G:C,C,0.080078,False,360388,57718.1,-250.356,0.003739,0.003121,1.19791,0.230954,22,16488702,G,C,582.939,775.600,954.124,1077.142,1108.485,0.000005,0.000002,2.611715e-07,0.000008,0.000107
3,ENSG00000183307,ENSG00000177663,ENSG00000215568,ENSG00000172967,ENSG00000198445,13184540,22:16495833:C:A,A,0.078102,False,360388,56294.0,-249.417,0.004095,0.003069,1.33402,0.182199,22,16495833,C,A,575.808,768.469,946.993,1070.011,1101.354,0.000005,0.000002,2.631382e-07,0.000008,0.000108
4,ENSG00000183307,ENSG00000177663,ENSG00000215568,ENSG00000172967,ENSG00000198445,13184541,22:16498458:G:A,A,0.07444,False,360388,53654.4,-203.467,0.004117,0.003224,1.27711,0.201566,22,16498458,G,A,573.183,765.844,944.368,1067.386,1098.729,0.000005,0.000002,2.638696e-07,0.000008,0.000108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179761,ENSG00000008735,ENSG00000100299,ENSG00000251322,ENSG00000100312,ENSG00000079974,13364298,22:51229717:A:T,T,0.29612,False,360388,213436.0,-437.663,-0.002442,0.002004,-1.21861,0.222992,22,51229717,A,T,23.788,53.093,116.874,168.535,190.495,0.000127,0.000151,2.122508e-03,0.000009,0.003706
179762,ENSG00000008735,ENSG00000100299,ENSG00000251322,ENSG00000100312,ENSG00000079974,13364299,22:51229805:T:C,C,0.072999,False,360388,52615.7,-172.528,0.001794,0.003184,0.563533,0.573073,22,51229805,T,C,23.876,53.181,116.962,168.623,190.583,0.000126,0.000151,2.120911e-03,0.000009,0.003705
179763,ENSG00000008735,ENSG00000100299,ENSG00000251322,ENSG00000100312,ENSG00000079974,13364300,22:51231220:A:G,G,0.053676,False,360388,38688.2,-20.4974,0.002084,0.003916,0.532177,0.594604,22,51231220,A,G,25.291,54.596,118.377,170.038,191.998,0.000119,0.000147,2.095559e-03,0.000009,0.003677
179764,ENSG00000008735,ENSG00000100299,ENSG00000251322,ENSG00000100312,ENSG00000079974,13364301,22:51237063:T:C,C,0.299014,False,360388,215522.0,-260.204,-0.001741,0.001944,-0.895286,0.370635,22,51237063,T,C,31.134,60.439,124.220,175.881,197.841,0.000097,0.000133,1.996989e-03,0.000009,0.003569


## Prepare the annotation file containing our features for the LDSC program 

In [None]:
#format the dataframe's column names, columns to retain, and match the SNPs present in the bim file
# drop=True: s_het_w already has a default index, so a plain reset_index()
# would only add a junk 'level_0' column and make this cell raise
# "cannot insert level_0, already exists" when it is run a second time
s_het_w = s_het_w.reset_index(drop=True).rename(columns={'chr': 'CHR', 'pos': 'BP', 'variant': 'SNP'})
annot_df = s_het_w[['CHR', 'BP', 'SNP', 's_het_w.f1', 's_het_w.f2', 's_het_w.f3', 's_het_w.f4', 's_het_w.f5']]
annot_df = annot_df.sort_values(['CHR', 'BP'])
annot_df

In [143]:
# Display the annotation table (CHR, BP, SNP and the five weighted s_het columns)
annot_df

Unnamed: 0,CHR,BP,SNP,s_het_w.f1,s_het_w.f2,s_het_w.f3,s_het_w.f4,s_het_w.f5
0,22,16464274,22:16464274:A:C,0.000004,0.000002,2.546518e-07,0.000008,0.000105
1,22,16488635,22:16488635:C:A,0.000005,0.000002,2.611532e-07,0.000008,0.000107
2,22,16488702,22:16488702:G:C,0.000005,0.000002,2.611715e-07,0.000008,0.000107
3,22,16495833,22:16495833:C:A,0.000005,0.000002,2.631382e-07,0.000008,0.000108
4,22,16498458,22:16498458:G:A,0.000005,0.000002,2.638696e-07,0.000008,0.000108
...,...,...,...,...,...,...,...,...
179761,22,51229717,22:51229717:A:T,0.000127,0.000151,2.122508e-03,0.000009,0.003706
179762,22,51229805,22:51229805:T:C,0.000126,0.000151,2.120911e-03,0.000009,0.003705
179763,22,51231220,22:51231220:A:G,0.000119,0.000147,2.095559e-03,0.000009,0.003677
179764,22,51237063,22:51237063:T:C,0.000097,0.000133,1.996989e-03,0.000009,0.003569


In [177]:
#column names for the bim file according to the plink documentation site
# NOTE(review): PLINK documents the third .bim column as the genetic position
# (morgans/centimorgans) and the fourth as the base-pair coordinate, so "POS"
# here would hold the genetic position — confirm before using it
names = ["CHR","SNP","POS","BP","A1","A2"]
os.chdir("/Users/tanvibansal/Documents/GitHub/ldsc")
bim = pd.read_csv("22.bim",sep="\t",header=None,names=names)

#join the annotated df onto the bim file and sort in the order of the bim file exactly to prep for ldsc
# Both frames have an SNP column: the bim IDs stay in "SNP", the
# chr:pos:ref:alt IDs from annot_df land in "SNP.a"
annot_df_filt = bim.set_index(["CHR","BP"]).join(annot_df.set_index(["CHR","BP"]),how="left",rsuffix=".a").drop_duplicates(subset=["SNP"])
annot_df_sort = annot_df_filt.reset_index().set_index("SNP").loc[bim.SNP.values]
annot_df_sort["CM"] = 0.0

#select the cols of interest and write the file out as .annot to use for ldsc
annot_df_prep = annot_df_sort.reset_index()[["CHR","BP","SNP", "CM", "s_het_w.f1", "s_het_w.f2", "s_het_w.f3", "s_het_w.f4", "s_het_w.f5"]]
# dropna removes bim SNPs that received no annotation, so the result can have
# fewer rows than the bim file; the SNP list written below records which remain
annot_df_prep = annot_df_prep.dropna()
# Make sure both output locations exist before writing into them
os.makedirs("1kg_eur/s_het_22", exist_ok=True)
annot_df_prep.to_csv("1kg_eur/s_het_ch22.annot",sep="\t",index=False)
annot_df_prep

unique_snps = annot_df_prep['SNP'].unique()  # Get unique SNPs to avoid duplicates

# Save the SNPs to a text file, one ID per line
with open('1kg_eur/s_het_22/filtered_snps.txt', 'w') as f:
    f.writelines(f"{snp}\n" for snp in unique_snps)

Unnamed: 0,CHR,BP,SNP,CM,s_het_w.f1,s_het_w.f2,s_het_w.f3,s_het_w.f4,s_het_w.f5
2,22,16877135,rs140378,0.0,0.000014,0.000003,4.405055e-07,0.000013,0.000164
3,22,16877230,rs131560,0.0,0.000014,0.000003,4.405795e-07,0.000013,0.000165
4,22,16886873,rs7287144,0.0,0.000015,0.000004,4.482213e-07,0.000013,0.000167
5,22,16888900,rs5748616,0.0,0.000015,0.000004,4.498615e-07,0.000013,0.000167
6,22,16892858,rs5748662,0.0,0.000015,0.000004,4.530991e-07,0.000013,0.000168
...,...,...,...,...,...,...,...,...,...
19150,22,51178090,rs2285395,0.0,0.005470,0.000108,3.801953e-03,0.000013,0.005084
19151,22,51181759,rs13056621,0.0,0.001562,0.000125,3.599541e-03,0.000013,0.004953
19152,22,51186228,rs3865766,0.0,0.000835,0.000153,3.380337e-03,0.000012,0.004803
19153,22,51211392,rs3888396,0.0,0.000551,0.000231,2.517184e-03,0.000010,0.004101


#### we need to run the following 2 commands to get the ldsc computation to run as intended 

1. Make the .bed/.bim files with our filtered SNPs 
plink --bfile /Users/tanvibansal/Documents/GitHub/ldsc/1kg_eur/22 --extract /Users/tanvibansal/Documents/GitHub/ldsc/1kg_eur/s_het_22/filtered_snps.txt --make-bed --out /Users/tanvibansal/Documents/GitHub/ldsc/1kg_eur/s_het_22

2. Run the ld score regression with our inputs specified

python ldsc.py --l2 --bfile /Users/tanvibansal/Documents/GitHub/ldsc/1kg_eur/s_het_22 --ld-wind-cm 1 --annot /Users/tanvibansal/Documents/GitHub/ldsc/1kg_eur/s_het_ch22.annot --out 1kg_eur/s_het_22

In [None]:
# Plot histograms for each difference column
def plot_histograms(df, columns):
    """Draw one 50-bin histogram per entry of ``columns`` on a 3-wide grid.

    Parameters
    ----------
    df : DataFrame containing every column named in ``columns``.
    columns : sequence of column names to plot, in display order.

    Up to six columns use the original 2x3 layout and 14x10 figure. Longer
    lists add rows (and figure height), where the fixed 2x3 grid would make
    ``plt.subplot`` fail on the seventh panel.
    """
    grid_cols = 3
    grid_rows = max(2, -(-len(columns) // grid_cols))  # ceil division, never fewer than 2 rows
    plt.figure(figsize=(14, 5 * grid_rows))

    for panel, col in enumerate(columns, start=1):
        plt.subplot(grid_rows, grid_cols, panel)  # Create a subplot for each column
        df[col].hist(bins=50, color='skyblue', edgecolor='black')
        plt.title(f'Distribution of {col}')
        plt.xlabel('Value')
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

# Offsets from the merge_asof approach
columns_to_plot = ['diff_f1', 'diff_f2', 'diff_f3', 'diff_b1', 'diff_b2', 'diff_b3']
# Call the function to plot histograms
plot_histograms(f_b_final_with_differences, columns_to_plot)

# Distances (kilobases) from the row-wise nearest-gene validation. The frame
# built there is mutn_5_nearest_genes_sample_df and its columns are named
# dist.f1 .. dist.f5; the previous names did not exist in this notebook.
columns_to_plot = ["dist.f1", "dist.f2", "dist.f3", "dist.f4", "dist.f5"]
plot_histograms(mutn_5_nearest_genes_sample_df, columns_to_plot)

In [None]:
# Reload the raw second sheet so s_het_info again has a plain 'ensg' column,
# which merge_s_het_info_with_all renames itself (the validation cell rebinds
# this name to a frame indexed by the renamed column).
# NOTE(review): the relative path resolves against the current working
# directory, which earlier cells change with os.chdir
s_het_info = pd.read_excel('s_het_info.xlsx',sheet_name=1)

In [40]:
# Look up post_mean for every forward and backward neighbour gene
def merge_s_het_info_with_all(f_b_final_with_differences, s_het_info, num_merges=3):
    """Add ``s_het_post_f{i}`` / ``s_het_post_b{i}`` columns via left merges.

    Parameters
    ----------
    f_b_final_with_differences : DataFrame with ``GeneSymbol_f{i}`` and
        ``GeneSymbol_b{i}`` columns for ``i = 1..num_merges``.
    s_het_info : DataFrame with ``ensg`` and ``post_mean`` columns.
    num_merges : number of neighbours per direction.

    Returns
    -------
    New DataFrame with the input columns followed by, for each rank in turn,
    the forward and then the backward ``s_het_post_*`` column. Genes absent
    from ``s_het_info`` get NaN.
    """
    # Give the lookup key the name used for the temporary merge column
    renamed_info = s_het_info.rename(columns={'ensg': 'GeneSymbol'})

    annotated = f_b_final_with_differences
    for rank in range(1, num_merges + 1):
        for direction in ('f', 'b'):
            tag = f'{direction}{rank}'
            joined = pd.merge(
                annotated,
                renamed_info[['GeneSymbol', 'post_mean']],
                left_on=f'GeneSymbol_{tag}',
                right_on='GeneSymbol',
                how='left',
                suffixes=('', f'_{tag}'),
            )
            # Label the looked-up value with its direction and rank
            joined = joined.rename(columns={'post_mean': f's_het_post_{tag}'})
            # The right-hand key column is only a by-product of the merge
            annotated = joined.drop(columns='GeneSymbol', errors='ignore')

    return annotated

# Attach post_mean from s_het_info for every forward/backward neighbour gene
# (columns s_het_post_f1..3 and s_het_post_b1..3)
f_b_final_merged_s_het = merge_s_het_info_with_all(f_b_final_with_differences, s_het_info, num_merges=3)

In [None]:
# Inspect the table after the s_het merge
f_b_final_merged_s_het

In [47]:
# f_b_final_merged_s_het[['diff_b1']] = abs(f_b_final_merged_s_het[['diff_b1']])
# f_b_final_merged_s_het[['diff_b2']] = abs(f_b_final_merged_s_het[['diff_b3']])
# f_b_final_merged_s_het[['diff_b3']] = abs(f_b_final_merged_s_het[['diff_b3']])
# Replace every backward offset column by its magnitude
def apply_abs_diff_b(f_b_final_merged_s_het, num_merges=3):
    """Overwrite ``diff_b1..diff_b{num_merges}`` with their absolute values.

    The frame is modified in place and the same object is returned.
    """
    backward_columns = [f'diff_b{rank}' for rank in range(1, num_merges + 1)]
    for column in backward_columns:
        magnitudes = f_b_final_merged_s_het[column].abs()
        f_b_final_merged_s_het[column] = magnitudes

    return f_b_final_merged_s_het

# Convert the backward offsets to magnitudes; apply_abs_diff_b edits the frame
# in place and returns the same object
f_b_final_merged_s_het = apply_abs_diff_b(f_b_final_merged_s_het, num_merges=3)

In [None]:
# Inspect the table after taking absolute backward offsets
f_b_final_merged_s_het

In [49]:
# Function to keep only the distance and s_het measures from the dataframe
def keep_dist_and_s_het_measures(df, num_merges=3):
    """Return a copy of ``df`` restricted to identifiers, offsets and s_het values.

    Kept columns, in order: ``variant``, ``chr``, ``pos``, then for each rank
    ``i = 1..num_merges``: ``diff_f{i}``, ``diff_b{i}``, ``s_het_post_f{i}``,
    ``s_het_post_b{i}``.

    The result is an explicit copy because the next pipeline step adds columns
    to it; assigning into a bare column selection triggers pandas'
    SettingWithCopyWarning.
    """
    columns_to_keep = ['variant', 'chr', 'pos']
    for i in range(1, num_merges + 1):
        columns_to_keep += [
            f'diff_f{i}',
            f'diff_b{i}',
            f's_het_post_f{i}',
            f's_het_post_b{i}',
        ]

    return df[columns_to_keep].copy()

# Keep only variant identifiers, offsets and s_het values for the weighting step
filtered_f_b_final = keep_dist_and_s_het_measures(f_b_final_merged_s_het, num_merges=3)

In [None]:
# Inspect the reduced table
filtered_f_b_final

In [None]:
# Function to compute s_het weighted by 1/distance per gene-SNP pair
def compute_weighted_s_het(df, num_merges=3, min_dist=None):
    """Add inverse-distance and distance-weighted s_het columns to ``df``.

    For each rank ``i = 1..num_merges`` the columns ``inv_dist_f{i}``,
    ``inv_dist_b{i}``, ``weighted_s_het_f{i}`` and ``weighted_s_het_b{i}`` are
    written into ``df`` (in that order), and ``df`` itself is returned.

    Parameters
    ----------
    df : DataFrame with ``diff_f{i}``, ``diff_b{i}``, ``s_het_post_f{i}`` and
        ``s_het_post_b{i}`` columns.
    num_merges : number of neighbours per direction.
    min_dist : optional lower bound applied to the distances before inverting.
        With the default ``None`` a distance of 0 (variant exactly on a gene
        start) yields an infinite weight, as before; pass a positive value to
        keep the weights finite.
    """
    for i in range(1, num_merges + 1):
        dist_f = df[f'diff_f{i}']
        dist_b = df[f'diff_b{i}']
        if min_dist is not None:
            # clip leaves NaN (missing neighbour) untouched
            dist_f = dist_f.clip(lower=min_dist)
            dist_b = dist_b.clip(lower=min_dist)

        # Calculate inverse distance for forward and backward directions
        df[f'inv_dist_f{i}'] = 1 / dist_f
        df[f'inv_dist_b{i}'] = 1 / dist_b

        # Compute weighted s_het by multiplying s_het by inverse distance
        df[f'weighted_s_het_f{i}'] = df[f'inv_dist_f{i}'] * df[f's_het_post_f{i}']
        df[f'weighted_s_het_b{i}'] = df[f'inv_dist_b{i}'] * df[f's_het_post_b{i}']

    return df

# NOTE: compute_weighted_s_het adds its columns to filtered_f_b_final in place
# and returns that same object, so both names refer to one DataFrame
weighted_f_b_final = compute_weighted_s_het(filtered_f_b_final, num_merges=3)

In [None]:
# Inspect the table with inverse distances and weighted s_het values
weighted_f_b_final

In [53]:
# Function to keep only the weighted s_het values from the dataframe
def keep_weighted_s_het(df, num_merges=3):
    """Return a copy of ``df`` restricted to identifiers and weighted s_het values.

    Kept columns, in order: ``variant``, ``chr``, ``pos``, then for each rank
    ``i = 1..num_merges``: ``weighted_s_het_f{i}``, ``weighted_s_het_b{i}``.

    An explicit copy is returned so later edits to the result neither touch
    ``df`` nor trigger pandas' SettingWithCopyWarning.
    """
    columns_to_keep = ['variant', 'chr', 'pos']
    for i in range(1, num_merges + 1):
        columns_to_keep += [f'weighted_s_het_f{i}', f'weighted_s_het_b{i}']

    return df[columns_to_keep].copy()

# Final table: variant identifiers plus the six distance-weighted s_het columns
weighted_s_het_only = keep_weighted_s_het(weighted_f_b_final, num_merges=3)

In [None]:
# Inspect the final weighted s_het table
weighted_s_het_only

In [None]:
#oliver's results
# Show the rows whose variant also appears in m["variant.mut"]
# NOTE(review): `m` is not defined in the visible part of this notebook —
# confirm it is created before this cell runs
shared_variant_mask = np.isin(weighted_s_het_only.variant, m["variant.mut"].values)
weighted_s_het_only.iloc[np.flatnonzero(shared_variant_mask)]