In [1]:
import pandas as pd
import numpy as np

# Read the compiled cS2G / CoRE-BED table (comma-delimited CSV) into a DataFrame
data = pd.read_csv(
    '/home/bettimj/gamazon_rotation/mod_core-bed/hi-c/cs2g/cs2g_compiled_core-bed.csv',
    sep=",",
)

In [2]:
# Display the loaded table as the cell output for a visual sanity check
data

Unnamed: 0,position (hg19),Disease/trait,Disease tissues,SNP,exp_val_gene,cS2G_gene,gene_overlap,tissue_gene_contact,gene_contact,gene_contact_total,CoRE-BED score of functionally validated gene
0,"1:109,817,590",LDL,"adipose, adrenal, liver, muscle",rs12740374,SORT1,CELSR2,CELSR2,"epithelial_C1orf112,placenta_and_eem_AK2,liver...","C1orf112, AK2, KDM1A, ST7L, CSDE1, CLCA4, RUN...","C1orf112, AK2, KDM1A, ST7L, ST7L, CSDE1, CLCA...",0.125
1,"1:162,020,969",QT interval,heart,rs7539120,NOS1AP,,,,,,
2,"2:60,718,043",Fetal hemoglobin level,"blood, bone, hsc_and_b_cell, liver",rs1427407,BCL11A,,BCL11A,,,,1.0
3,"2:60,725,451",Fetal hemoglobin level,"blood, bone, hsc_and_b_cell, liver",rs7606173,BCL11A,BCL11A,,,,,
4,"3:38,767,315",QRS prolongation,heart,rs6801957,SCN5A,SCN5A,SCN10A,"endocrine_XYLB,endocrine_ACVR2B,endocrine_RBMS...","XYLB, ACVR2B, RBMS3, GOLGA4, MAGI1, SYNPR, OX...","XYLB, ACVR2B, RBMS3, GOLGA4, MAGI1, SYNPR, OX...",
5,"4:90,674,431",Parkinson's disease,"brain, pns",rs356168,SNCA,SNCA,"SNCA,AC097478.3","pns_CD38,pns_MCUB,pns_RUFY3,epithelial_RUFY3,s...","CD38, MCUB, RUFY3, UBA6, TLL1, POLR2B, ARAP2,...","CD38, MCUB, RUFY3, RUFY3, RUFY3, UBA6, UBA6, ...",1.0
6,"5:56,031,822",Breast cancer,"adipose, breast, epithelial",rs17432750,MAP3K1,MAP3K1,,"brain_PDE4D,hsc_and_b_cell_RAB3C,brain_ITGA2,b...","PDE4D, RAB3C, ITGA2, S100Z, ZNF131, LVRN, C5o...","PDE4D, RAB3C, ITGA2, S100Z, ZNF131, LVRN, C5o...",
7,"5:56,052,695",Breast cancer,"adipose, breast, epithelial",rs62355900,MAP3K1,,,,,,
8,"5:56,053,479",Breast cancer,"adipose, breast, epithelial",rs74345699,MAP3K1,,,,,,
9,"5:56,134,276",Breast cancer,"adipose, breast, epithelial",rs16886397,MAP3K1,MAP3K1,MAP3K1,"stromal_ISL1,heart_ISL1,mesench_ISL1,epithelia...","ISL1, FAM13B, DEPDC1B, FLT4, TRIO, MTREX, DNA...","ISL1, ISL1, ISL1, FAM13B, DEPDC1B, FLT4, TRIO...",1.0


In [3]:
import pandas as pd
import numpy as np

# Function to generate the disease tissue-relevant credible set and compute the scores
def compute_core_bed_scores(row):
    """Compute softmax-normalised CoRE-BED scores for the candidate genes of one SNP.

    Parameters
    ----------
    row : mapping (e.g. a ``pandas.Series`` passed by ``DataFrame.apply(axis=1)``)
        Must provide the keys ``"Disease tissues"``, ``"tissue_gene_contact"``,
        ``"gene_overlap"`` and ``"gene_contact_total"``. Each value is either a
        comma-separated string or missing (NaN/None).

    Returns
    -------
    dict or None
        ``{gene: score}`` whose values sum to 1, ordered from best to worst
        contact rank. ``None`` when no candidate gene can be derived from the row.
    """
    from collections import Counter  # function-scope import keeps this cell self-contained

    def split_list(value):
        # Comma-separated cell -> stripped, non-empty tokens.
        return [token.strip() for token in value.split(',') if token.strip()]

    # Step 1: credible set = genes contacted in one of the disease-relevant tissues.
    credible_set = set()
    if pd.notnull(row["Disease tissues"]) and pd.notnull(row["tissue_gene_contact"]):
        disease_tissues = set(split_list(row["Disease tissues"]))
        for pair in split_list(row["tissue_gene_contact"]):
            # Entries look like "<tissue>_<gene>"; tissue names can contain '_'
            # themselves, so only the last underscore separates the gene.
            tissue, separator, gene = pair.rpartition('_')
            tissue, gene = tissue.strip(), gene.strip()
            if separator and gene and tissue in disease_tissues:
                credible_set.add(gene)

    # Step 2: no tissue-relevant contact -> fall back to the overlapped gene(s).
    if not credible_set and pd.notnull(row["gene_overlap"]):
        credible_set = set(split_list(row["gene_overlap"]))

    if not credible_set:
        return None

    # Step 3: contact instances C(g). Count exact gene tokens; a substring count
    # would credit "ST7" for every occurrence of "ST7L".
    contact_total = row["gene_contact_total"]
    token_counts = Counter(split_list(contact_total)) if isinstance(contact_total, str) else Counter()
    gene_counts = {gene: token_counts[gene] for gene in credible_set}

    # Step 4: rank by descending contact count and apply exponential decay.
    # Ties are broken alphabetically so the result does not depend on set
    # iteration order (which changes between interpreter sessions).
    ranked_genes = sorted(credible_set, key=lambda gene: (-gene_counts[gene], gene))
    gene_ranks = {gene: rank for rank, gene in enumerate(ranked_genes, 1)}
    rank_weights = {gene: np.exp(-0.3 * (rank - 1)) for gene, rank in gene_ranks.items()}

    # Step 5: base score B(g) = 1 / (number of genes sharing g's rank).
    # NOTE(review): ranks come from enumerate() and are therefore unique, so
    # B(g) is always 1 here -- confirm whether tied genes were meant to share a rank.
    rank_sizes = Counter(gene_ranks.values())
    base_scores = {gene: 1 / rank_sizes[gene_ranks[gene]] for gene in ranked_genes}

    # Step 6: CoRE-BED score S(g) = B(g) * rank weight.
    initial_scores = {gene: base_scores[gene] * rank_weights[gene] for gene in ranked_genes}

    # Step 7: overlap weight w(g): double the score of genes the SNP overlaps.
    # Tokens are stripped so "A, B" matches gene "B" as well as "A".
    overlap_genes = set(split_list(row["gene_overlap"])) if pd.notnull(row["gene_overlap"]) else set()
    final_scores = {
        gene: score * (2 if gene in overlap_genes else 1)
        for gene, score in initial_scores.items()
    }

    # Step 8: softmax normalisation so the scores of one SNP sum to 1.
    exp_scores = {gene: np.exp(score) for gene, score in final_scores.items()}
    sum_exp_scores = sum(exp_scores.values())
    return {gene: value / sum_exp_scores for gene, value in exp_scores.items()}

# Score every SNP row; rows without any candidate gene end up with None
data['gene_scores'] = data.apply(compute_core_bed_scores, axis=1)

# Collect one record per scored SNP: its best gene and that gene's score
highest_scoring_genes_list = []
for snp_id, gene_scores in zip(data['SNP'], data['gene_scores']):
    # Nothing to report for SNPs without scores
    if not gene_scores:
        continue
    highest_scoring_gene = max(gene_scores, key=gene_scores.get)
    highest_score = gene_scores[highest_scoring_gene]
    best_record = {
        'SNP': snp_id,
        'Highest_Scoring_Gene': highest_scoring_gene,
        'CoRE_BED_Score': highest_score
    }
    highest_scoring_genes_list.append(best_record)

# Tabulate the per-SNP winners
highest_scoring_genes = pd.DataFrame(highest_scoring_genes_list)

# Echo each SNP's top gene and score
for index, row in highest_scoring_genes.iterrows():
    print(f"SNP: {row['SNP']}, Highest Scoring Gene: {row['Highest_Scoring_Gene']}, CoRE-BED Score: {row['CoRE_BED_Score']}")

# Persist the winners table (row index omitted)
highest_scoring_genes.to_csv(
    '/home/bettimj/gamazon_rotation/mod_core-bed/hi-c/cs2g/cs2g_top_scoring_genes.csv',
    index=False,
)

SNP: rs12740374, Highest Scoring Gene: CELSR2, CoRE-BED Score: 0.02773976946376605
SNP: rs1427407, Highest Scoring Gene: BCL11A, CoRE-BED Score: 1.0
SNP: rs6801957, Highest Scoring Gene: SCN10A, CoRE-BED Score: 1.0
SNP: rs356168, Highest Scoring Gene: SNCA, CoRE-BED Score: 0.1173045165976123
SNP: rs16886397, Highest Scoring Gene: MAP3K1, CoRE-BED Score: 0.0972367296277918
SNP: rs9349379, Highest Scoring Gene: PHACTR1, CoRE-BED Score: 1.0
SNP: rs1772203, Highest Scoring Gene: PREP, CoRE-BED Score: 0.039787726723258386
SNP: rs7775698, Highest Scoring Gene: HBS1L, CoRE-BED Score: 1.0
SNP: rs66650371, Highest Scoring Gene: ARMT1, CoRE-BED Score: 0.11651131519463531
SNP: rs1990620, Highest Scoring Gene: THSD7A, CoRE-BED Score: 0.020087863841107927
SNP: rs1382568, Highest Scoring Gene: BLK, CoRE-BED Score: 0.4478009495563474
SNP: rs922483, Highest Scoring Gene: BLK, CoRE-BED Score: 0.5251254116917727
SNP: rs2370615, Highest Scoring Gene: VPS13B, CoRE-BED Score: 0.25523197194586256
SNP: rs112

In [10]:
import pandas as pd
import numpy as np

# Function to generate the disease tissue-relevant credible set and compute the scores
def compute_core_bed_scores(row):
    """Compute softmax-normalised CoRE-BED scores for the candidate genes of one SNP.

    Parameters
    ----------
    row : mapping (e.g. a ``pandas.Series`` passed by ``DataFrame.apply(axis=1)``)
        Must provide the keys ``"Disease tissues"``, ``"tissue_gene_contact"``,
        ``"gene_overlap"`` and ``"gene_contact_total"``. Each value is either a
        comma-separated string or missing (NaN/None).

    Returns
    -------
    dict or None
        ``{gene: score}`` whose values sum to 1, ordered from best to worst
        contact rank. ``None`` when no candidate gene can be derived from the row.
    """
    from collections import Counter  # function-scope import keeps this cell self-contained

    def split_list(value):
        # Comma-separated cell -> stripped, non-empty tokens.
        return [token.strip() for token in value.split(',') if token.strip()]

    # Step 1: credible set = genes contacted in one of the disease-relevant tissues.
    credible_set = set()
    if pd.notnull(row["Disease tissues"]) and pd.notnull(row["tissue_gene_contact"]):
        disease_tissues = set(split_list(row["Disease tissues"]))
        for pair in split_list(row["tissue_gene_contact"]):
            # Entries look like "<tissue>_<gene>"; tissue names can contain '_'
            # themselves, so only the last underscore separates the gene.
            tissue, separator, gene = pair.rpartition('_')
            tissue, gene = tissue.strip(), gene.strip()
            if separator and gene and tissue in disease_tissues:
                credible_set.add(gene)

    # Step 2: no tissue-relevant contact -> fall back to the overlapped gene(s).
    if not credible_set and pd.notnull(row["gene_overlap"]):
        credible_set = set(split_list(row["gene_overlap"]))

    if not credible_set:
        return None

    # Step 3: contact instances C(g). Count exact gene tokens; a substring count
    # would credit "ST7" for every occurrence of "ST7L".
    contact_total = row["gene_contact_total"]
    token_counts = Counter(split_list(contact_total)) if isinstance(contact_total, str) else Counter()
    gene_counts = {gene: token_counts[gene] for gene in credible_set}

    # Step 4: rank by descending contact count and apply exponential decay.
    # Ties are broken alphabetically so the result does not depend on set
    # iteration order (which changes between interpreter sessions).
    ranked_genes = sorted(credible_set, key=lambda gene: (-gene_counts[gene], gene))
    gene_ranks = {gene: rank for rank, gene in enumerate(ranked_genes, 1)}
    rank_weights = {gene: np.exp(-0.3 * (rank - 1)) for gene, rank in gene_ranks.items()}

    # Step 5: base score B(g) = 1 / (number of genes sharing g's rank).
    # NOTE(review): ranks come from enumerate() and are therefore unique, so
    # B(g) is always 1 here -- confirm whether tied genes were meant to share a rank.
    rank_sizes = Counter(gene_ranks.values())
    base_scores = {gene: 1 / rank_sizes[gene_ranks[gene]] for gene in ranked_genes}

    # Step 6: CoRE-BED score S(g) = B(g) * rank weight.
    initial_scores = {gene: base_scores[gene] * rank_weights[gene] for gene in ranked_genes}

    # Step 7: overlap weight w(g): double the score of genes the SNP overlaps.
    # Tokens are stripped so "A, B" matches gene "B" as well as "A".
    overlap_genes = set(split_list(row["gene_overlap"])) if pd.notnull(row["gene_overlap"]) else set()
    final_scores = {
        gene: score * (2 if gene in overlap_genes else 1)
        for gene, score in initial_scores.items()
    }

    # Step 8: softmax normalisation so the scores of one SNP sum to 1.
    exp_scores = {gene: np.exp(score) for gene, score in final_scores.items()}
    sum_exp_scores = sum(exp_scores.values())
    return {gene: value / sum_exp_scores for gene, value in exp_scores.items()}

# Apply the function to compute scores and store the results
# (rows without any candidate gene get None)
data['gene_scores'] = data.apply(compute_core_bed_scores, axis=1)

# Records for the per-SNP best gene and for every (SNP, gene) score
highest_scoring_genes_list = []
all_gene_scores_list = []

# Extract the highest-scoring gene for each SNP and save all scores
for index, row in data.iterrows():
    gene_scores = row['gene_scores']
    if gene_scores:
        # Get the highest-scoring gene and its score
        highest_scoring_gene = max(gene_scores, key=gene_scores.get)
        highest_score = gene_scores[highest_scoring_gene]
        # Append to the list as a dictionary
        highest_scoring_genes_list.append({
            'SNP': row['SNP'],
            'Highest_Scoring_Gene': highest_scoring_gene,
            'CoRE_BED_Score': highest_score
        })
        # Save all gene scores for this SNP
        for gene, score in gene_scores.items():
            all_gene_scores_list.append({
                'SNP': row['SNP'],
                'Gene': gene,
                'CoRE_BED_Score': score
            })

# Convert the lists to DataFrames. Columns are given explicitly so the frames
# keep their schema (and the column accesses below work) even when no SNP
# produced any score.
highest_scoring_genes = pd.DataFrame(
    highest_scoring_genes_list,
    columns=['SNP', 'Highest_Scoring_Gene', 'CoRE_BED_Score'],
)
all_gene_scores = pd.DataFrame(
    all_gene_scores_list,
    columns=['SNP', 'Gene', 'CoRE_BED_Score'],
)

# Optionally, print out the highest scores for each SNP
for index, row in highest_scoring_genes.iterrows():
    print(f"SNP: {row['SNP']}, Highest Scoring Gene: {row['Highest_Scoring_Gene']}, CoRE-BED Score: {row['CoRE_BED_Score']}")

# Save the highest-scoring genes to a CSV file.
# The input is the cS2G table, so the result belongs next to the other cS2G
# outputs; the previous t2d_gwas/t2d_top_scoring_genes.csv target would have
# overwritten the T2D results with cS2G data.
highest_scoring_genes.to_csv('/home/bettimj/gamazon_rotation/mod_core-bed/hi-c/cs2g/cs2g_top_scoring_genes.csv', index=False)

# Make sure CoRE_BED_Score is numeric
all_gene_scores['CoRE_BED_Score'] = pd.to_numeric(all_gene_scores['CoRE_BED_Score'], errors='coerce')

# Make sure the SNP column keeps the order from the input dataset.
# Categorical categories must be unique and non-null, so de-duplicate the
# input SNP ids (first occurrence defines the order) before using them.
snp_order = data['SNP'].dropna().drop_duplicates()
all_gene_scores['SNP'] = pd.Categorical(all_gene_scores['SNP'], categories=snp_order, ordered=True)

# Sort the dataframe by SNP and CoRE_BED_Score (in descending order)
all_gene_scores = all_gene_scores.sort_values(by=['SNP', 'CoRE_BED_Score'], ascending=[True, False])

# Save the sorted DataFrame
all_gene_scores.to_csv('/home/bettimj/gamazon_rotation/mod_core-bed/hi-c/cs2g/cs2g_all_gene_scores.csv', index=False)

SNP: rs12740374, Highest Scoring Gene: CELSR2, CoRE-BED Score: 0.02773976946376605
SNP: rs1427407, Highest Scoring Gene: BCL11A, CoRE-BED Score: 1.0
SNP: rs6801957, Highest Scoring Gene: SCN10A, CoRE-BED Score: 1.0
SNP: rs356168, Highest Scoring Gene: SNCA, CoRE-BED Score: 0.1173045165976123
SNP: rs16886397, Highest Scoring Gene: MAP3K1, CoRE-BED Score: 0.0972367296277918
SNP: rs9349379, Highest Scoring Gene: PHACTR1, CoRE-BED Score: 1.0
SNP: rs1772203, Highest Scoring Gene: PREP, CoRE-BED Score: 0.039787726723258386
SNP: rs7775698, Highest Scoring Gene: HBS1L, CoRE-BED Score: 1.0
SNP: rs66650371, Highest Scoring Gene: ARMT1, CoRE-BED Score: 0.11651131519463531
SNP: rs1990620, Highest Scoring Gene: THSD7A, CoRE-BED Score: 0.020087863841107927
SNP: rs1382568, Highest Scoring Gene: BLK, CoRE-BED Score: 0.4478009495563474
SNP: rs922483, Highest Scoring Gene: BLK, CoRE-BED Score: 0.5251254116917727
SNP: rs2370615, Highest Scoring Gene: VPS13B, CoRE-BED Score: 0.25523197194586256
SNP: rs112