# Import Gene Annotation File

https://github.com/hakha-most/gwas_eqtl/blob/master/gene_annotations/genes.protein_coding.v39.gtf

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gzip

# Local path to GTF file
gtf_file_path = '/Users/oliverkafka/Documents/NYU/3rd Sem NYU/Captone Project/Data/genes.protein_coding.v39.gtf'
# Path to GWAS file
gwas_file_path = '/Users/oliverkafka/Documents/NYU/3rd Sem NYU/Captone Project/Data/50_irnt.gwas.imputed_v3.both_sexes.tsv.bgz'
# Path to S Het file
s_het_file_path = '/Users/oliverkafka/Documents/NYU/3rd Sem NYU/Captone Project/Data/s_het_info.xlsx'

In [None]:
# Load the gene annotation table; the file's first line supplies the column names
gtf_df = pd.read_csv(gtf_file_path, sep='\t', header=0)

# Preview the first genes ordered by start coordinate.
# sort_values returns a new frame, so the sorted result must be used directly;
# gtf_df itself is left in file order.
gtf_df.sort_values('start').head()

In [None]:
gtf_df.shape

In [None]:
# Drop the first three characters of the chromosome label (the 'chr' prefix)
# and store chromosome and start coordinate as integers.
gtf_df['chr'] = gtf_df['chr'].str.slice(3).astype(int)
gtf_df = gtf_df.astype({'start': int})

In [None]:
# Smallest and largest gene start per chromosome.
# A list (not a set) keeps the output columns in a fixed min, max order.
gtf_df.groupby('chr')['start'].agg(['min', 'max']).reset_index()

Gene positions restart on each chromosome: coordinates are relative to the chromosome, not to the whole genome.

### File Description

This file contains **gene annotations** for **protein-coding genes** and provides information about the **location of genes on the genome**. Below is a description of each column:

1. **chr**: The chromosome where the gene is located (e.g., `chr1`, `chr2`, etc.).
2. **start**: The start position of the gene on the chromosome (in base pairs).
3. **end**: The end position of the gene on the chromosome (in base pairs).
4. **strand**: Indicates the strand on which the gene is located (`+` for forward strand, `-` for reverse strand).
5. **GeneSymbol**: The symbol or name of the gene (e.g., `OR4F5`, `SAMD11`), typically assigned by organizations like HGNC (HUGO Gene Nomenclature Committee).
6. **cons**: This column indicates the type of gene. In this file, all genes are classified as **protein_coding**.
7. **gene**: The Ensembl gene ID, a unique identifier for the gene (e.g., `ENSG00000186092`).
8. **hgnc_id**: The unique identifier for the gene assigned by the **HGNC** (HUGO Gene Nomenclature Committee).
9. **tss**: The **transcription start site**, the position where transcription of the gene starts on the chromosome.
10. **tes**: The **transcription end site**, the position where transcription of the gene ends on the chromosome.

# Import GWAS data

 https://docs.google.com/spreadsheets/d/1kvPoupSzsSFBNSztMzl04xMoSC3Kcx3CrjVf4yBmESU/edit?gid=178908679#gid=178908679 (row 7217)
 
 File: `50_irnt.gwas.imputed_v3.both_sexes.tsv.bgz`

In [None]:
# The .bgz archive is read through gzip in text mode and parsed as a
# tab-separated table.
with gzip.open(gwas_file_path, mode='rt') as gwas_handle:
    gwas_df = pd.read_csv(gwas_handle, delimiter='\t')

In [None]:
# Display the GWAS table to inspect its structure
# (this shows the whole frame, not just the first rows; use .head() for a preview)
gwas_df

### GWAS Summary Statistics Dataset Description

1. **variant**: The unique identifier for each SNP (Single Nucleotide Polymorphism). This can include information like chromosome, position, reference allele, and alternative allele (e.g., `1:12345:A:G`).
   - **Example**: `1:10583:T:G`

2. **minor_allele**: The allele that is less frequent in the population (minor allele) for this particular SNP.
   - **Example**: `G`

3. **minor_AF**: The **minor allele frequency** (AF), which represents the frequency of the minor allele in the population. It ranges from 0 to 1.
   - **Example**: `0.35` (35% of individuals carry the minor allele)

4. **low_confidence_variant**: A flag indicating whether the variant has **low confidence** due to imputation quality or other uncertainties. Values may be `TRUE` or `FALSE`.
   - **Example**: `FALSE`

5. **n_complete_samples**: The number of samples for which complete genotype data is available for this variant.
   - **Example**: `300,000`

6. **AC**: The **allele count** of the minor allele, i.e., the number of times the minor allele appears in the study population (across all samples).
   - **Example**: `50000`

7. **ytx**: The dot product of the phenotype vector `y` and the genotype (dosage) vector `x` for this variant. It is an intermediate quantity of the regression, not an effect size.

8. **beta**: The **effect size** of the SNP on the trait being studied (in this case, likely height). It represents the change in the trait per additional copy of the minor allele.
   - **Example**: `0.05` (the trait increases by 0.05 units for each additional copy of the minor allele)

9. **se**: The **standard error** of the effect size (beta), indicating the precision of the estimated effect.
   - **Example**: `0.01`

10. **tstat**: The **t-statistic** for the beta estimate, which is the ratio of the beta estimate to its standard error.
    - **Example**: `5.0` (higher values indicate more significant associations)

11. **pval**: The **p-value** of the association between the SNP and the trait. This indicates the significance of the result, with smaller p-values suggesting stronger evidence that the SNP is associated with the trait.
    - **Example**: `1.2e-6` (a very small p-value, indicating strong evidence of association)

In [None]:
gwas_df.shape

# Find Variant Chromosome and Position

In [None]:
# A variant ID has the form chromosome:position:ref:alt; split it into
# four separate columns and show one row as a sanity check.
variant_fields = gwas_df['variant'].str.split(':', expand=True)
gwas_df[['chr', 'pos', 'ref', 'alt']] = variant_fields
gwas_df.head(1)

In [None]:
# Remove chromosome X so the remaining chromosome labels can be cast to int.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below act on a real frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
gwas_df = gwas_df[gwas_df['chr'] != 'X'].copy()
gwas_df['chr'] = gwas_df['chr'].astype(int)
gwas_df['pos'] = gwas_df['pos'].astype(int)

In [None]:
# Smallest and largest variant position per chromosome.
# A list (not a set) keeps the output columns in a fixed min, max order.
gwas_df.groupby('chr')['pos'].agg(['min', 'max']).reset_index()

It appears that position resets on each chromosome 

# Join to Find the Nearest Genes (3 on Each Side of the Variant)

In [None]:
def repeated_forward_merge(gwas_df, gtf_df, num_merges=3):
    """Attach the next `num_merges` genes at or after each variant position.

    For every variant, the first pass picks the gene on the same chromosome
    with the smallest `start` that is >= `pos`. Each later pass picks the gene
    whose `start` is strictly greater than the start found in the previous
    pass.

    Parameters
    ----------
    gwas_df : DataFrame with columns 'variant', 'pval', 'chr', 'pos'.
    gtf_df : DataFrame with columns 'chr', 'start', 'GeneSymbol', 'gene'.
    num_merges : number of genes to collect per variant (the first pass is
        always performed).

    Returns
    -------
    DataFrame with the four variant columns plus, for k = 1..num_merges,
    'start_fk', 'GeneSymbol_fk' and 'gene_fk'. Missing genes are NaN.
    Rows end up ordered by the last start column that was merged.
    """
    gene_cols = ['start', 'GeneSymbol', 'gene']

    # merge_asof needs both sides sorted on their key and with matching dtypes
    variants = gwas_df[['variant', 'pval', 'chr', 'pos']].sort_values('pos')
    genes = gtf_df[['chr'] + gene_cols].sort_values('start')
    variants['pos'] = variants['pos'].astype(float)
    genes['start'] = genes['start'].astype(float)

    # Pass 1: nearest gene start at or beyond the variant (exact hits allowed)
    merged = pd.merge_asof(
        variants,
        genes,
        left_on='pos',
        right_on='start',
        by='chr',
        direction='forward',
        suffixes=('', '_f1'),
    )
    # +inf stands in for "no gene found" so the column can serve as the next
    # merge key (missing keys are not accepted) and sorts last
    merged.loc[merged['start'].isna(), 'start'] = np.inf
    merged = merged.rename(
        columns={name: f'{name}_f1' for name in gene_cols}
    ).sort_values('start_f1')

    # Passes 2..num_merges: step to the next strictly larger gene start
    for step in range(2, num_merges + 1):
        previous_key = f'start_f{step-1}'
        merged = pd.merge_asof(
            merged,
            genes,
            left_on=previous_key,
            right_on='start',
            by='chr',
            direction='forward',
            allow_exact_matches=False,
            suffixes=(f'_f{step-1}', f'_f{step}'),
        )
        merged.loc[merged['start'].isna(), 'start'] = np.inf
        merged = merged.rename(
            columns={name: f'{name}_f{step}' for name in gene_cols}
        ).sort_values(f'start_f{step}')

    # Turn the +inf placeholders back into NaN
    for step in range(1, num_merges + 1):
        key = f'start_f{step}'
        merged[key] = merged[key].mask(merged[key] == np.inf)

    return merged

# Forward pass: for each GWAS variant, collect the three nearest genes whose start is at or after the variant position
f_final = repeated_forward_merge(gwas_df, gtf_df, num_merges=3)

In [None]:
f_final

In [None]:
# Backward counterpart of the forward merge; -inf placeholders are converted back to NaN before returning
def repeated_backward_merge_with_nan(gwas_df, gtf_df, num_merges=3):
    """Attach the previous `num_merges` genes at or before each variant position.

    For every row, the first pass picks the gene on the same chromosome with
    the largest `start` that is <= `pos`. Each later pass picks the gene whose
    `start` is strictly smaller than the start found in the previous pass.

    Parameters
    ----------
    gwas_df : DataFrame with at least 'chr' and 'pos'; all of its columns are
        carried through to the result.
    gtf_df : DataFrame with columns 'chr', 'start', 'GeneSymbol', 'gene'.
    num_merges : number of genes to collect per row (the first pass is always
        performed).

    Returns
    -------
    DataFrame with the input columns plus, for k = 1..num_merges,
    'start_bk', 'GeneSymbol_bk' and 'gene_bk'. Missing genes are NaN.
    Rows end up ordered by the last start column that was merged.
    """
    gene_cols = ['start', 'GeneSymbol', 'gene']

    # merge_asof needs both sides sorted on their key and with matching dtypes
    variants = gwas_df.sort_values('pos')
    genes = gtf_df[['chr'] + gene_cols].sort_values('start')
    variants['pos'] = variants['pos'].astype(float)
    genes['start'] = genes['start'].astype(float)

    # Pass 1: nearest gene start at or before the variant (exact hits allowed)
    merged = pd.merge_asof(
        variants,
        genes,
        left_on='pos',
        right_on='start',
        by='chr',
        direction='backward',
        suffixes=('', '_b1'),
    )
    # -inf stands in for "no gene found" so the column can serve as the next
    # merge key (missing keys are not accepted) and sorts first
    merged.loc[merged['start'].isna(), 'start'] = -np.inf
    merged = merged.rename(
        columns={name: f'{name}_b1' for name in gene_cols}
    ).sort_values('start_b1')

    # Passes 2..num_merges: step to the next strictly smaller gene start
    for step in range(2, num_merges + 1):
        previous_key = f'start_b{step-1}'
        merged = pd.merge_asof(
            merged,
            genes,
            left_on=previous_key,
            right_on='start',
            by='chr',
            direction='backward',
            allow_exact_matches=False,
            suffixes=(f'_b{step-1}', f'_b{step}'),
        )
        merged.loc[merged['start'].isna(), 'start'] = -np.inf
        merged = merged.rename(
            columns={name: f'{name}_b{step}' for name in gene_cols}
        ).sort_values(f'start_b{step}')

    # Turn the -inf placeholders back into NaN
    for step in range(1, num_merges + 1):
        key = f'start_b{step}'
        merged[key] = merged[key].mask(merged[key] == -np.inf)

    return merged

# Backward pass on top of the forward result: add the three nearest genes whose start is at or before each variant position
f_b_final = repeated_backward_merge_with_nan(f_final, gtf_df, num_merges=3)

In [None]:
f_b_final

In [None]:
# Signed offsets (gene start minus variant position) for every forward and backward gene column
def calculate_differences(df, num_merges=3):
    """Add 'diff_fk' and 'diff_bk' columns (k = 1..num_merges) to `df`.

    Each new column is `start_<dir>k - pos`. All forward columns are added
    first, then all backward columns. `df` is modified in place and the same
    object is returned.
    """
    for direction in ('f', 'b'):
        for rank in range(1, num_merges + 1):
            gene_start = df[f'start_{direction}{rank}']
            df[f'diff_{direction}{rank}'] = gene_start.sub(df['pos'])

    return df

# Adds the diff_* columns to f_b_final in place; both names refer to the same DataFrame afterwards
f_b_final_with_differences = calculate_differences(f_b_final, num_merges=3)

In [None]:
f_b_final_with_differences

In [None]:
f_b_final_with_differences[['diff_f1', 'diff_f2', 'diff_f3', 'diff_b1', 'diff_b2', 'diff_b3']].tail(100)

In [None]:
f_b_final_with_differences[['diff_f1', 'diff_f2', 'diff_f3', 'diff_b1', 'diff_b2', 'diff_b3']].mean()

In [None]:
# Plot histograms for each difference column
def plot_histograms(df, columns):
    """Draw one 50-bin histogram per column on a three-column subplot grid.

    Parameters
    ----------
    df : DataFrame holding the columns to plot.
    columns : iterable of column names in `df`.

    The grid always has at least two rows, so up to six columns give the
    original 2x3 layout on a 14x10 figure. Extra rows (and figure height) are
    added when more than six columns are requested, instead of failing on a
    subplot index that does not fit the grid.
    """
    columns = list(columns)
    grid_cols = 3
    # Ceiling division, but never fewer than the two rows of the default layout
    grid_rows = max(2, (len(columns) + grid_cols - 1) // grid_cols)

    plt.figure(figsize=(14, 5 * grid_rows))

    for i, col in enumerate(columns):
        plt.subplot(grid_rows, grid_cols, i + 1)  # One subplot per column
        df[col].hist(bins=50, color='skyblue', edgecolor='black')
        plt.title(f'Distribution of {col}')
        plt.xlabel('Value')
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

# Signed distance columns to plot: three forward (f) and three backward (b)
columns_to_plot = ['diff_f1', 'diff_f2', 'diff_f3', 'diff_b1', 'diff_b2', 'diff_b3']

# Draw the six distance distributions
plot_histograms(f_b_final_with_differences, columns_to_plot)


In [None]:
s_het_info = pd.read_excel(s_het_file_path)

In [None]:
s_het_info

In [None]:
# Function to perform the merge for all forward and backward gene columns and rename the 'post_mean' columns
def merge_s_het_info_with_all(f_b_final_with_differences, s_het_info, num_merges=3,
                              key_prefix='GeneSymbol'):
    """Left-join the s_het 'post_mean' value onto every forward/backward gene.

    Parameters
    ----------
    f_b_final_with_differences : DataFrame containing the columns
        '<key_prefix>_fk' and '<key_prefix>_bk' for k = 1..num_merges.
    s_het_info : DataFrame with columns 'ensg' (join key) and 'post_mean'.
    num_merges : number of forward/backward gene slots to annotate.
    key_prefix : prefix of the gene-identifier columns to join on. The default
        'GeneSymbol' keeps the original behaviour; pass 'gene' to join on the
        'gene_fk' / 'gene_bk' columns instead.

    Returns
    -------
    A new DataFrame with 's_het_post_f1', 's_het_post_b1', 's_het_post_f2', ...
    appended in that order. Rows with no match in `s_het_info` get NaN.

    NOTE(review): 'ensg' looks like an Ensembl gene ID column, while the
    GeneSymbol_* columns hold gene symbols. If that is right, the default join
    matches nothing and key_prefix='gene' is the intended key. Confirm against
    the s_het file.
    """
    # Build the two-column lookup once; the key is exposed as 'GeneSymbol' so
    # the helper column dropped after each merge has a fixed name
    lookup = s_het_info.rename(columns={'ensg': 'GeneSymbol'})[['GeneSymbol', 'post_mean']]

    annotated = f_b_final_with_differences
    for i in range(1, num_merges + 1):
        # Forward slot first, then backward, to keep the original column order
        for direction in ('f', 'b'):
            annotated = pd.merge(
                annotated,
                lookup,
                left_on=f'{key_prefix}_{direction}{i}',
                right_on='GeneSymbol',
                how='left',
                suffixes=('', f'_{direction}{i}')
            )

            # Tag the joined score with its direction and rank
            annotated = annotated.rename(
                columns={'post_mean': f's_het_post_{direction}{i}'}
            )

            # Drop the lookup's key column brought in by the merge
            annotated = annotated.drop(columns='GeneSymbol', errors='ignore')

    return annotated

# Annotate each of the six neighbouring genes (3 forward, 3 backward) with its s_het post_mean
f_b_final_merged_s_het = merge_s_het_info_with_all(f_b_final_with_differences, s_het_info, num_merges=3)

In [None]:
f_b_final_merged_s_het.columns

In [None]:
f_b_final_merged_s_het

In [None]:
def apply_abs_diff_b(f_b_final_merged_s_het, num_merges=3):
    """Replace each 'diff_bk' column (k = 1..num_merges) with its absolute value.

    The frame is modified in place and the same object is returned; forward
    ('diff_fk') columns are left untouched.
    """
    frame = f_b_final_merged_s_het
    backward_cols = [f'diff_b{rank}' for rank in range(1, num_merges + 1)]
    for col in backward_cols:
        frame[col] = abs(frame[col])

    return frame

# Make the backward distances non-negative (the frame is updated in place)
f_b_final_merged_s_het = apply_abs_diff_b(f_b_final_merged_s_het, num_merges=3)

In [None]:
f_b_final_merged_s_het

In [None]:
# A variant located exactly on a gene start has distance 0, which would make
# the later 1/distance weight infinite; use a distance of 1 for those rows.
# Only the nearest gene in each direction (f1 / b1) is patched here.
# NOTE(review): the first forward and first backward merges both allow exact
# matches, so such a variant appears to get the same gene as f1 and b1.
# Confirm this double counting is intended.
f_b_final_merged_s_het.loc[f_b_final_merged_s_het['start_f1'] == f_b_final_merged_s_het['pos'], 'diff_f1'] = 1
f_b_final_merged_s_het.loc[f_b_final_merged_s_het['start_b1'] == f_b_final_merged_s_het['pos'], 'diff_b1'] = 1

In [None]:
# Function to keep only the distance and s_het measures from the dataframe
def keep_dist_and_s_het_measures(df, num_merges=3):
    """Return a copy of `df` restricted to identifiers, distances and s_het scores.

    Kept columns, in order: 'variant', 'chr', 'pos', then for k = 1..num_merges
    'diff_fk', 'diff_bk', 's_het_post_fk', 's_het_post_bk'.

    The result is an explicit copy, so callers can add columns to it without
    affecting `df` and without triggering pandas' SettingWithCopyWarning.

    Raises
    ------
    KeyError if any of the expected columns is missing from `df`.
    """
    columns_to_keep = ['variant', 'chr', 'pos']

    # Collect the diff and s_het_post column names for both directions
    for i in range(1, num_merges + 1):
        columns_to_keep.extend([
            f'diff_f{i}',
            f'diff_b{i}',
            f's_het_post_f{i}',
            f's_het_post_b{i}',
        ])

    # .copy() detaches the selection from the source frame
    return df[columns_to_keep].copy()

# Reduce the annotated frame to variant identifiers, distances and s_het scores
filtered_f_b_final = keep_dist_and_s_het_measures(f_b_final_merged_s_het, num_merges=3)

In [None]:
filtered_f_b_final

In [None]:
# Weight each gene's s_het score by the inverse of its distance to the variant
def compute_weighted_s_het(df, num_merges=3):
    """Add inverse-distance and distance-weighted s_het columns to `df`.

    For each k = 1..num_merges this adds, in order, 'inv_dist_fk',
    'inv_dist_bk' (1 / diff), then 'weighted_s_het_fk', 'weighted_s_het_bk'
    (inverse distance times 's_het_post_*'). `df` is modified in place and
    the same object is returned.
    """
    directions = ('f', 'b')

    for rank in range(1, num_merges + 1):
        # Inverse distances for both directions first ...
        for side in directions:
            df[f'inv_dist_{side}{rank}'] = 1 / df[f'diff_{side}{rank}']

        # ... then the weighted scores that depend on them
        for side in directions:
            weight = df[f'inv_dist_{side}{rank}']
            df[f'weighted_s_het_{side}{rank}'] = weight * df[f's_het_post_{side}{rank}']

    return df

# Adds inv_dist_* and weighted_s_het_* columns to filtered_f_b_final in place; both names refer to the same DataFrame afterwards
weighted_f_b_final = compute_weighted_s_het(filtered_f_b_final, num_merges=3)

In [None]:
weighted_f_b_final

In [None]:
weighted_f_b_final[['inv_dist_f1', 'inv_dist_f2', 'inv_dist_f3', 
                          'inv_dist_b1', 'inv_dist_b2', 'inv_dist_b3']].agg(['min', 'max', 'mean'])

In [None]:
weighted_f_b_final[weighted_f_b_final['inv_dist_f1'] == float('inf')]

In [None]:
f_b_final_merged_s_het[f_b_final_merged_s_het['variant'] == '20:627259:T:C']

In [None]:
# Select the variant identifiers together with the distance-weighted s_het columns
def keep_weighted_s_het(df, num_merges=3):
    """Return `df` restricted to 'variant', 'chr', 'pos' and the weighted scores.

    The weighted columns follow in the order 'weighted_s_het_f1',
    'weighted_s_het_b1', 'weighted_s_het_f2', ... up to `num_merges`.
    """
    weighted_cols = [
        f'weighted_s_het_{side}{rank}'
        for rank in range(1, num_merges + 1)
        for side in ('f', 'b')
    ]
    selection = ['variant', 'chr', 'pos'] + weighted_cols

    return df[selection]

# Final table: variant identifiers plus the six distance-weighted s_het scores
weighted_s_het_only = keep_weighted_s_het(weighted_f_b_final, num_merges=3)

In [None]:
weighted_s_het_only

In [None]:
weighted_s_het_only[['weighted_s_het_f1', 'weighted_s_het_b1', 'weighted_s_het_f2', 
                          'weighted_s_het_b2', 'weighted_s_het_f3', 'weighted_s_het_b3']].agg(['min', 'max', 'mean'])