In [None]:
import altair as alt
import ast
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Any
from umap import UMAP

from nesta_ds_utils.viz.altair import saving as viz_save
from dsp_ai_eval.getters.scite import get_scite_df_w_embeddings
from dsp_ai_eval.getters.gpt import get_gpt_themes_embeddings, get_cluster_summaries_cleaned

from dsp_ai_eval import PROJECT_DIR, config

# Turn off Altair's row-count limit so the charts below can embed the full dataset.
alt.data_transformers.disable_max_rows()

# Sentence-embedding model named by the project config; `get_n_most_similar_abstracts`
# reads this module-level object to embed the GPT representative documents.
model = SentenceTransformer(config["embedding_model"])

In [None]:
def get_n_most_similar_abstracts(gpt_cluster_summaries: pd.DataFrame,
                                 abstracts: pd.DataFrame,
                                 n: int = 3) -> Dict[str, pd.DataFrame]:
    """
    Find the n abstracts most similar to each cluster in the GPT cluster summaries.

    For each row of `gpt_cluster_summaries`, the first entry of 'representative_docs'
    (a GPT-generated sentence) is embedded with the module-level `model`, and the cosine
    similarity between that embedding and every abstract embedding is computed. The n
    highest-scoring abstracts are kept, ordered from most to least similar.

    Parameters:
    - gpt_cluster_summaries (pd.DataFrame): A DataFrame with at least two columns: 'representative_docs',
      holding the representative documents of each cluster either as a list or as the string repr of a
      list, and 'topic_name', holding the name of the topic associated with each cluster.
    - abstracts (pd.DataFrame): A DataFrame containing the abstracts with their embeddings in a column named
      'embeddings'. Each embedding should be stored in a format that can be converted to a pandas Series.
    - n (int, optional): The number of similar abstracts to retrieve for each topic. Defaults to 3.

    Returns:
    - Dict[str, pd.DataFrame]: A dictionary mapping each topic name to a DataFrame (an independent copy of
      the selected rows of `abstracts`) with two extra columns: 'topic' (the topic name) and 'similarity'
      (the cosine similarity score). If two clusters share a topic name, the later one overwrites the
      earlier one.
    """
    most_similar_abstracts = {}

    # Loop-invariant: expand the embeddings column into one 2-D array up front
    # instead of rebuilding it for every cluster.
    abstract_embeddings = abstracts['embeddings'].apply(pd.Series).values

    for _, row in gpt_cluster_summaries.iterrows():
        representative_docs = row['representative_docs']
        # The column is normally a stringified list; also accept an already-parsed list.
        if isinstance(representative_docs, str):
            representative_docs = ast.literal_eval(representative_docs)
        doc = representative_docs[0]

        reference_embedding = model.encode(doc)

        if len(abstracts) == 0:
            # Nothing to compare against: fall through and return an empty frame for this topic.
            similarities = np.empty(0)
        else:
            # One batched call scores the reference document against every abstract.
            similarities = cosine_similarity([reference_embedding], abstract_embeddings)[0]

        # argsort is ascending, so reverse it before taking the first n positions.
        top_indices = np.argsort(similarities)[::-1][:n]

        # Copy so that adding columns never writes into (or warns about) a slice of `abstracts`.
        similar_abstracts = abstracts.iloc[top_indices].copy()
        similar_abstracts['topic'] = row['topic_name']
        similar_abstracts['similarity'] = similarities[top_indices]

        most_similar_abstracts[row['topic_name']] = similar_abstracts

    return most_similar_abstracts

In [None]:
# Load the two inputs through the project getters. Later cells read the columns
# 'answer_cleaned', 'embeddings', 'gpt_model', 'temperature' from `answers_long` and
# 'title_abstract', 'embeddings', 'total_cites' from `abstracts`.
answers_long = get_gpt_themes_embeddings()
abstracts = get_scite_df_w_embeddings()

In [None]:
# Prep the two datasets so that they have the same columns, then concatenate them.
#
# Each frame is built with a rename -> assign -> select chain, which always yields a fresh
# frame: no column is assigned onto a subset of another frame (no SettingWithCopyWarning),
# and re-running the cell on its own output gives the same result.
SHARED_COLUMNS = ['doc', 'embeddings', 'total_cites', 'gpt_model', 'temperature', 'source']

abstracts = (
    abstracts
    .rename(columns={'title_abstract': 'doc'})  # no-op if the column was already renamed
    .assign(gpt_model='research abstract', temperature='NA', source='abstract')
    .loc[:, SHARED_COLUMNS]
)

answers_long = (
    answers_long
    .rename(columns={'answer_cleaned': 'doc'})  # no-op if the column was already renamed
    .assign(
        # Embeddings arrive as the string repr of a list; parse only values that are still strings.
        embeddings=lambda df: df['embeddings'].apply(
            lambda value: ast.literal_eval(value) if isinstance(value, str) else value
        ),
        # GPT answers have no citations; 0 is used as a placeholder.
        total_cites=0,
        source='gpt',
    )
    .loc[:, SHARED_COLUMNS]
)

# ignore_index gives the combined frame a fresh 0..n-1 index.
all_data = pd.concat([abstracts, answers_long], ignore_index=True)

In [None]:
# Expand the per-row embedding vectors of every document into a single 2-D array.
embeddings = all_data['embeddings'].apply(pd.Series).to_numpy()

# Project the embeddings with UMAP, seeded with a fixed random_state.
umap_2d = UMAP(random_state=42)
embeddings_2d = umap_2d.fit_transform(embeddings)

# Attach the projected coordinates as columns "x" and "y". The column-wise concat aligns
# on the index: `all_data` was built with ignore_index=True and the new frame gets the
# default index, so rows pair up by position.
df_vis = pd.concat(
    [all_data, pd.DataFrame(embeddings_2d, columns=["x", "y"])],
    axis=1,
)

In [None]:
# Row count per `gpt_model` value (research abstracts carry the label 'research abstract').
df_vis['gpt_model'].value_counts()

In [None]:
# Research abstracts are drawn fully opaque; every other point is faded to 0.2.
opacity_condition = alt.condition(
    alt.datum.gpt_model == "research abstract",
    alt.value(1),
    alt.value(0.2),
)

# Fixed colour for each value of `gpt_model`.
color_scale = alt.Scale(
    domain=['gpt-3.5-turbo', 'gpt-4', 'research abstract'],
    range=['#0d0887', '#7e03a8', '#f0f921'],
)

# Scatter of the 2-D projection; axis ticks, labels and titles are hidden on both axes.
scatter_plot = (
    alt.Chart(df_vis)
    .mark_circle(size=100)
    .encode(
        x=alt.X('x:Q', axis=alt.Axis(ticks=False, labels=False, title=None)),
        y=alt.Y('y:Q', axis=alt.Axis(ticks=False, labels=False, title=None)),
        color=alt.Color('gpt_model', scale=color_scale),
        opacity=opacity_condition,
        tooltip=['source', 'gpt_model', 'doc'],
    )
    .configure_legend(title=None, labelFontSize=20, titleFontSize=20)
    .properties(width=800, height=600)
    .interactive()
)

# Write the chart as HTML, then pass it to the project's save helper with PNG output requested.
scatter_plot.save(PROJECT_DIR / "outputs/figures/gpt_abstracts_overlap.html")
viz_save.save(scatter_plot, "gpt_abstracts_overlap", PROJECT_DIR / "outputs/figures", save_png=True)

scatter_plot.display()

In the previous plot it can be hard to tell whether GPT summaries are obscuring research abstracts, so in the next plot we scale the size of each point by its number of citations. My hypothesis is that abstracts that have been cited hundreds of times are more influential, and therefore more likely to be similar to GPT summaries. If that is right, then the small, seemingly outlying clusters of GPT summaries may actually sit close to a couple of highly influential research papers.

In [None]:
# Try a plot where point size is scaled by number of citations

# Rows with zero citations (which includes every GPT answer, whose `total_cites` was set to 0)
# get a fixed size of 100; all other rows get ten times their citation count.
# NOTE(review): abstracts with zero citations also take the fixed size of 100, so they are
# drawn larger than abstracts with 1-9 citations (sizes 10-90) - confirm this is intended.
df_vis['size'] = np.where(df_vis['total_cites'] == 0, 100, df_vis['total_cites'] * 10)

scatter_plot = (
    alt.Chart(df_vis)
    .mark_circle()
    .encode(
        x=alt.X('x:Q', axis=alt.Axis(ticks=False, labels=False, title=None)),
        y=alt.Y('y:Q', axis=alt.Axis(ticks=False, labels=False, title=None)),
        color='gpt_model',
        size='size:Q',
        opacity=alt.value(0.75),
        tooltip=['source', 'gpt_model', 'doc'],
    )
    .properties(width=800, height=600)
    .interactive()
)

scatter_plot.display()

# Calculate most similar papers

For each cluster of GPT summaries, find the N most similar research abstracts.

In [None]:
# Load the cleaned GPT cluster summaries; the similarity search reads its
# 'representative_docs' and 'topic_name' columns.
gpt_cluster_summaries = get_cluster_summaries_cleaned()

In [None]:
# For every GPT cluster, keep the 3 abstracts with the highest cosine similarity to the
# cluster's first representative document. `abstracts` is the frame produced by the
# data-prep cell (research abstracts only), so the GPT answers are not candidates.
most_similar_abstracts = get_n_most_similar_abstracts(gpt_cluster_summaries, abstracts, n=3)

In [None]:
# Stack the per-topic result frames into one long table with a fresh 0..n-1 index;
# the 'topic' column added by the similarity search identifies each row's cluster.
per_topic_frames = list(most_similar_abstracts.values())
concatenated_df = pd.concat(per_topic_frames, ignore_index=True)

In [None]:
# Persist the combined table (without the index) under the project's outputs/data folder.
# All columns are written, including the raw 'embeddings' vectors.
concatenated_df.to_csv(PROJECT_DIR / "outputs/data/similar_abstracts.csv", index=False)

In [None]:
# Show the most similar abstracts found for one topic.
concatenated_df.loc[concatenated_df['topic'].eq('International Technology Transfer')]

In [None]:
# Show the most similar abstracts found for one topic.
concatenated_df.loc[concatenated_df['topic'].eq('Skill Development and Technology Diffusion')]