In [1]:
import pandas as pd 
import re 
import numpy as np

# Compute the similarity between LLM and the enriched GO names (any GO term that is significant)

1. Remove 'regulation of' (and anything preceding it) from the names, then compute the semantic similarity between the GO and LLM names.
2. Pick the GO term with the highest semantic similarity.
3. Plot LLM JI vs. GO JI, colored by semantic similarity.

## Trim names

In [2]:
# Read the tab-separated input table.
input_file = 'data/omics_revamped_LLM_gprofiler_new_gene_name_DF.tsv'
input_df = pd.read_csv(input_file, sep='\t')

# Keep the rows in which BOTH the GO term and the LLM name contain
# "regulation of" (case-insensitive); they are the examples used below.
term_has_regulation = input_df['Term'].str.contains('regulation of', flags=re.IGNORECASE, regex=True)
llm_has_regulation = input_df['LLM Name'].str.contains('regulation of', flags=re.IGNORECASE, regex=True)
example_df = input_df.loc[term_has_regulation & llm_has_regulation]
print(example_df.shape)
example_df

In [38]:
import re
def remove_regulation_of(text):
    """Lower-case ``text`` and drop everything up to the last 'regulation of '.

    The match is greedy, so any qualifier in front of the phrase (for example
    'positive' or 'negative') is removed together with it.

    Parameters
    ----------
    text : str
        The name to trim.

    Returns
    -------
    str
        The lower-cased text with the leading '... regulation of ' removed;
        when the phrase does not occur, just the lower-cased text.
    """
    return re.sub(r'.*regulation of ', '', text.lower())


def iter_df_trim_name(row):
    """Trim 'regulation of' from the 'LLM Name' and 'Term' fields of one row.

    An LLM name joined by the word "and" is trimmed part by part, so that
    'regulation of' is removed from each part separately, and is then
    re-joined with ' and '.

    Parameters
    ----------
    row : mapping
        Anything indexable by 'Term' and 'LLM Name' (e.g. a DataFrame row),
        both holding strings.

    Returns
    -------
    tuple of (str, str)
        ``(trimmed LLM name, trimmed GO term)``.
    """
    trim_GO_term = remove_regulation_of(row['Term'])

    # Split on the standalone word "and" only. A plain str.split('and') also
    # cuts inside words such as "Ligand" or "Expansion" and mangles the name.
    parts = re.split(r'\band\b', row['LLM Name'], flags=re.IGNORECASE)
    if len(parts) == 1:
        return remove_regulation_of(row['LLM Name']), trim_GO_term

    # Strip each part so re-joining does not leave doubled spaces, and skip
    # parts that are empty after trimming (e.g. a dangling trailing "and").
    trimmed_parts = [remove_regulation_of(part).strip() for part in parts]
    return ' and '.join(part for part in trimmed_parts if part), trim_GO_term
# Example usage
# example_string = "This is a test string with Regulation of Gene Expression"
# cleaned_string = remove_regulation_of(example_string)
# print(cleaned_string)  # Output: "gene expression"

# Trim the names of the example rows only. Work on an explicit copy so the
# new columns are not written onto a slice of input_df (which can trigger
# pandas' SettingWithCopyWarning).
example_df = example_df.copy()
example_df[['trimed LLM Name', 'trimed Term']] = example_df.apply(
    iter_df_trim_name, axis=1, result_type='expand'
)
# Show up to 10 random rows; capping n avoids a ValueError when fewer exist.
example_df[['LLM Name', 'Term', 'trimed LLM Name', 'trimed Term']].sample(min(10, len(example_df)))

In [27]:
# Trim the names for every row of the full table, then persist the result.
trimmed_columns = input_df.apply(iter_df_trim_name, axis=1, result_type='expand')
input_df[['trimed LLM Name', 'trimed Term']] = trimmed_columns
input_df.to_csv(
    'data/omics_revamped_LLM_gprofiler_new_gene_name_DF_trimed_name.tsv',
    sep='\t',
    index=False,
)

In [33]:
from semanticSimFunctions import getNameSimilarities_no_repeat
from transformers import AutoTokenizer, AutoModel
%reload_ext autoreload
%autoreload 2
# Load the tokenizer and the encoder from the same pretrained checkpoint.
sapbert_checkpoint = 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext'
SapBERT_tokenizer = AutoTokenizer.from_pretrained(sapbert_checkpoint)
SapBERT_model = AutoModel.from_pretrained(sapbert_checkpoint)

## test run with 10 examples

# `example_sim_df` is not defined anywhere in the visible notebook, so this
# call raises a NameError on a fresh kernel. Build the 10-row test sample here.
# NOTE(review): assumed to be a sample of example_df - confirm the intended source.
example_sim_df = example_df.sample(n=min(10, len(example_df)), random_state=0)

new_sim_df, llm_emb_dict, go_emb_dict = getNameSimilarities_no_repeat(
    example_sim_df,
    'trimed LLM Name',
    'trimed Term',
    SapBERT_tokenizer,
    SapBERT_model,
    llm_name_embedding_dict={},
    go_term_embedding_dict={},
    simMetric='cosine_similarity',
    epsilon=0.05,
)


## Run the semantic similarity computation from the command line (using the same environment)

python run_omics_sem_sim.py --inputFile data/omics_revamped_LLM_gprofiler_new_gene_name_DF_trimed_name.tsv --nameCol1 'trimed LLM Name' --nameCol2 'trimed Term'


Note:
- This will save the embeddings for all LLM names and GO term names as separate files.


## Find the most semantically similar GO term among the significant ones

In [3]:
import pandas as pd
# Read the tab-separated table of similarity values and preview it.
sim_val_file = 'data/omics_revamped_LLM_gprofiler_new_gene_name_DF_trimed_name_simVals_DF.tsv'
sim_val_df = pd.read_table(sim_val_file)
print(sim_val_df.shape)
sim_val_df.head()

In [4]:
# Cut-offs and the columns they apply to.
adj_pval_thresh = 0.05
llm_confidence_thresh = 0.1
llm_conf_field = 'Score'
go_pval_field = 'Adjusted P-value'

# Keep rows that pass both cut-offs: LLM score at or above its threshold and
# adjusted p-value at or below its threshold.
llm_is_confident = sim_val_df[llm_conf_field] >= llm_confidence_thresh
go_is_significant = sim_val_df[go_pval_field] <= adj_pval_thresh
both_named = sim_val_df[llm_is_confident & go_is_significant]

print(both_named.shape)
print(both_named['GeneSetID'].nunique())

In [5]:
from utils.analyze_enrichment_utils import cal_JI_coverage
def get_max_semantic_sim(df):
    """Return the row of ``df`` with the largest 'LLM_name_GO_term_sim' value.

    The row is looked up by the index label that ``idxmax`` reports, so the
    result is a Series named after that label.
    """
    best_label = df['LLM_name_GO_term_sim'].idxmax()
    return df.loc[best_label]
# Columns that identify one gene set; later steps group and merge on them.
group_col = ["Source", "GeneSetID", "GeneSetName", "GeneList"]
grouped_df = both_named.groupby(group_col)

# For every gene set keep the single row with the highest name similarity.
# Selecting rows by the per-group idxmax labels gives the same rows as applying
# get_max_semantic_sim to each group, but does not rely on groupby.apply passing
# the grouping columns to the function (deprecated in pandas 2.2). Once those
# columns are excluded, reset_index(drop=True) would silently discard the group
# keys that the merges below need. It also keeps the original column dtypes.
best_row_labels = grouped_df['LLM_name_GO_term_sim'].idxmax()
max_semantic_sim_df = both_named.loc[best_row_labels].reset_index(drop=True)

max_semantic_sim_df = cal_JI_coverage(max_semantic_sim_df)

# Preview the columns of interest
# ('gprofiler_JI' is presumably added by cal_JI_coverage - confirm).
max_semantic_sim_df[[ 'n_Genes', 'GeneList', 'LLM Name', 'Term','Adjusted P-value', 'intersection_size', 'term_size', 'trimed LLM Name', 'trimed Term',
       'LLM_name_GO_term_sim', 'gprofiler_JI']].head(10)

In [17]:
# Read the two tab-separated LLM tables (gene counts and JI) that are merged
# into the per-gene-set results further down.
llm_coverage_df = pd.read_table("data/omics_revamped_LLM_genecounts_DF.tsv")

llm_JI_file = 'data/omics_revamped_LLM_w_best_matching_GO_terms_for_JI.tsv'
llm_ji_df = pd.read_table(llm_JI_file)


In [16]:
# Step 1: left-join the LLM coverage table onto the best-matching GO rows.
coverage_keys = ['Source','GeneSetID','GeneSetName', 'GeneList', 'n_Genes']
merged_DF = max_semantic_sim_df.merge(llm_coverage_df, on=coverage_keys, how='left')
print(merged_DF.shape)

# Step 2: left-join the LLM JI table; the shape is printed after each join so
# that an unexpected change in row count is visible.
ji_keys = ['Source','GeneSetID','GeneList','n_Genes', 'LLM Name','Supporting Count']
merged_DF = merged_DF.merge(llm_ji_df, on=ji_keys, how='left')
print(merged_DF.shape)
print(merged_DF.columns)

# Plot the JI correlation scatter 

In [20]:
import matplotlib.pyplot as plt
import seaborn as sns
# Global figure styling: keep SVG text as text (not paths) and use 7 pt
# sans-serif for all labels, ticks and titles.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'font.size': 7,
    'font.family': 'sans-serif',
    'xtick.labelsize': 7,
    'ytick.labelsize': 7,
    'axes.labelsize': 7,
    'axes.titlesize': 7,
})

In [14]:
similarity_thre = 0.5

# Alternative axes (Jaccard-index view), kept for reference:
# y_field = 'gprofiler_JI'
# y_label = 'g:Profiler coverage'
# x_field = 'LLM_JI'
# x_label = 'GPT-4 coverage'

x_field, x_label = 'Supporting Count', 'Number of covered genes\n(GPT-4)'
y_field, y_label = 'intersection_size', 'Number of covered genes\n(g:Profiler)'

# Split the rows by name similarity (>= threshold vs. < threshold).
n_gene_sets = merged_DF.shape[0]
is_high_similarity = merged_DF['LLM_name_GO_term_sim'] >= similarity_thre
is_low_similarity = merged_DF['LLM_name_GO_term_sim'] < similarity_thre
filtered_data_high = merged_DF[is_high_similarity]
filtered_data_low = merged_DF[is_low_similarity]
print(f'high similarity: {filtered_data_high.shape[0]}', f'\nin percentage: {filtered_data_high.shape[0]/n_gene_sets *100 :.2f}%')
print(f'low similarity: {filtered_data_low.shape[0]}', f'\nin percentage: {filtered_data_low.shape[0]/n_gene_sets *100 :.2f}%')

# Within the high-similarity group, separate rows where the x value exceeds
# the y value from the rest.
x_exceeds_y = filtered_data_high[x_field] > filtered_data_high[y_field]
filtered_data_high_JI_high = filtered_data_high[x_exceeds_y]
print(f'high similarity high number genes: {filtered_data_high_JI_high.shape[0]}', f'\nin percentage: {filtered_data_high_JI_high.shape[0]/n_gene_sets *100 :.2f}%')
filtered_data_high_JI_low = filtered_data_high[filtered_data_high[x_field] <= filtered_data_high[y_field]]

# Scatter of x_field vs. y_field, one colour per similarity group.
fig, ax = plt.subplots(figsize=(3, 3))

for subset, colour, legend_label in (
    (filtered_data_high, '#ae2012', 'High similarity'),
    (filtered_data_low, '#294c60', 'Low similarity'),
):
    ax.scatter(
        x=subset[x_field],
        y=subset[y_field],
        c=colour,
        s=8,  # marker size
        label=legend_label,
    )

ax.set_xlabel(x_label)
ax.set_ylabel(y_label)

# Same range and ticks on both axes so the diagonal is the line x == y.
ax.set_xlim(-0.05, 40)
ax.set_ylim(-0.05, 40)
ticks = range(0, 41, 10)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.plot([0, 40], [0, 40], lw=1, color='black', linestyle='--')

sns.despine(ax=ax)
ax.legend()
fig.savefig('figures/omics_LLM_genes_vs_gprofiler_genes_sep_similarity.svg', dpi=300)
plt.show()

In [69]:
def clean_gene_list(row):
    """Return the row's 'intersections' string with commas turned into spaces.

    ``row`` is anything indexable by 'intersections' that yields a string,
    for example a DataFrame row.
    """
    return row['intersections'].replace(',', ' ')

# Rewrite the comma-separated 'intersections' strings as space-separated,
# then write the merged table to disk.
merged_DF['intersections'] = merged_DF.apply(clean_gene_list, axis=1)
merged_DF.to_csv(
    'data/omics_revamped_LLM_gprofiler_new_gene_name_DF_trimed_name_simVals_max_named.tsv',
    sep='\t',
    index=False,
)