In [None]:
import altair as alt
import ast
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Any
from umap import UMAP

from nesta_ds_utils.viz.altair import saving as viz_save
from dsp_ai_eval.getters.scite import get_scite_df_w_embeddings
from dsp_ai_eval.getters.gpt import get_gpt_themes_embeddings, get_cluster_summaries_cleaned, get_topics

from dsp_ai_eval import PROJECT_DIR, config, logging

# Lift Altair's default row limit so the full combined dataset can be charted below
alt.data_transformers.disable_max_rows()

# Sentence-embedding model named in the project config; used further down to encode
# the representative GPT document of each cluster
model = SentenceTransformer(config["embedding_model"])

In [None]:
def get_n_most_similar_abstracts(gpt_cluster_summaries: pd.DataFrame,
                                 abstracts: pd.DataFrame,
                                 n: int = 3) -> Dict[str, pd.DataFrame]:
    """
    Finds the n most similar abstracts for each cluster represented in the GPT cluster summaries.

    For each cluster, it takes the first representative document in that cluster (ie a GPT-generated
    sentence), encodes it with the module-level `model`, and computes the cosine similarity between
    that embedding and the embeddings of all the abstracts.

    It returns a dictionary where each key is a topic name from the GPT cluster summaries, and the
    value is a DataFrame containing the top n most similar abstracts, along with their similarity
    scores and assigned topic. If two clusters share a topic name, the later one overwrites the earlier.

    Parameters:
    - gpt_cluster_summaries (pd.DataFrame): A DataFrame with at least two columns: 'representative_docs'
      which contains the representative documents for each cluster (either a list, or the string
      literal of a list), and 'topic_name' which contains the name of the topic associated with
      each cluster.
    - abstracts (pd.DataFrame): A DataFrame containing the abstracts with their embeddings in a column named
      'embeddings'. Each embedding should be stored in a format that can be converted to a pandas Series.
    - n (int, optional): The number of similar abstracts to retrieve for each topic. Defaults to 3.

    Returns:
    - Dict[str, pd.DataFrame]: A dictionary mapping each topic name to a DataFrame containing the top n similar
      abstracts (most similar first), with additional columns 'topic' and 'similarity' for the topic name
      and similarity score, respectively. The input `abstracts` frame is not modified.
    """
    # The abstract embedding matrix is the same for every cluster, so build it once
    # instead of re-expanding the column on each loop iteration.
    abstract_embeddings = abstracts['embeddings'].apply(pd.Series).values

    most_similar_abstracts = {}

    for _, row in gpt_cluster_summaries.iterrows():
        representative_docs = row['representative_docs']
        # Only parse when the docs are still serialised; tolerate already-parsed lists.
        if isinstance(representative_docs, str):
            representative_docs = ast.literal_eval(representative_docs)
        doc = representative_docs[0]

        reference_embedding = model.encode(doc)

        if len(abstract_embeddings) == 0:
            # cosine_similarity rejects empty input; an empty result is the sensible answer here
            similarities = np.array([], dtype=float)
        else:
            # One vectorised call: row 0 holds the similarity of the reference to every abstract
            similarities = cosine_similarity([reference_embedding], abstract_embeddings)[0]

        top_indices = np.argsort(similarities)[::-1][:n]

        # Copy so the new columns are written to an independent frame, not to a slice of `abstracts`
        similar_abstracts = abstracts.iloc[top_indices].copy()
        similar_abstracts['topic'] = row['topic_name']
        similar_abstracts['similarity'] = similarities[top_indices]

        most_similar_abstracts[row['topic_name']] = similar_abstracts

    return most_similar_abstracts

In [None]:
# Load the two corpora that are compared in this notebook. Both frames are expected to
# carry an 'embeddings' column (it is selected from each of them in the cells below).
answers_long = get_gpt_themes_embeddings()
abstracts = get_scite_df_w_embeddings()

In [None]:
# Bring the abstracts into the schema shared with the GPT answers
# (doc, embeddings, total_cites, gpt_model, temperature, source, topic_name)
# so that the two datasets can be concatenated later on.
abstracts = (
    abstracts[['title_abstract', 'embeddings', 'total_cites']]
    .rename(columns={'title_abstract': 'doc'})
    .assign(
        gpt_model='research abstract',
        temperature='NA',
        source='abstract',
        topic_name='--',
    )
)[['doc', 'embeddings', 'total_cites', 'gpt_model', 'temperature', 'source', 'topic_name']]

gpt_cluster_summaries = get_cluster_summaries_cleaned()
topics = get_topics()

In [None]:
# Keep only the columns needed downstream and attach each answer's topic assignment.
# `.assign` builds a new frame, which avoids writing a column onto a column-subset slice
# (the pattern that triggers pandas' SettingWithCopyWarning).
answers_long = (
    answers_long[['answer_cleaned', 'embeddings', 'gpt_model', 'temperature']]
    .assign(topic=topics)
)

In [None]:
# Left join on the shared 'topic' column: every answer row is kept and gains the
# cluster summary columns where a matching summary exists.
answers_long = answers_long.merge(gpt_cluster_summaries, how='left', on='topic')

In [None]:
# Preview the merged frame to check that the cluster summary columns were attached
answers_long.head()

In [None]:

# Bring the GPT answers into the same schema as the abstracts.
# Embeddings are parsed from their string-literal form; values that are already parsed
# are left untouched so that re-running this cell does not raise.
answers_long = (
    answers_long
    .assign(
        embeddings=answers_long['embeddings'].apply(
            lambda emb: ast.literal_eval(emb) if isinstance(emb, str) else emb
        )
    )
    .rename(columns={'answer_cleaned': 'doc'})
    # GPT answers have no citation count; 0 is used as a placeholder
    .assign(total_cites=0, source='gpt')
)[['doc', 'embeddings', 'total_cites', 'gpt_model', 'temperature', 'source', 'topic_name']]

# Stack abstracts and GPT answers into one frame with a fresh 0..N-1 index
all_data = pd.concat([abstracts, answers_long], ignore_index=True)

In [None]:
# Sanity check: how many documents come from the GPT responses ('gpt') vs how many
# are research abstracts ('abstract')
all_data['source'].value_counts()

In [None]:
# We're interested in finding out how many abstracts (if any) fall near clusters of GPT responses, which is why
# we're only looking at GPT topic names for now.
# Rows without a topic name get the same '--' placeholder that the abstracts use.
# Assign the result back instead of calling fillna(inplace=True) on a column selection:
# the chained in-place form is deprecated and has no effect under pandas Copy-on-Write.
all_data["topic_name"] = all_data["topic_name"].fillna("--")
all_data['topic_name'].value_counts()

In [None]:
# Expand the per-document embedding sequences into a 2D array (one row per document)
embeddings = all_data['embeddings'].apply(pd.Series).values

# Project the embeddings to two dimensions for plotting; random_state is fixed at 42
umap_2d = UMAP(random_state=42)
embeddings_2d = umap_2d.fit_transform(embeddings)

# Attach the 2D coordinates as columns 'x' and 'y' next to the document metadata
df_vis = pd.concat(
    [all_data, pd.DataFrame(embeddings_2d, columns=["x", "y"])],
    axis=1,
)

In [None]:
# Row counts per 'gpt_model' value; research abstracts appear under the label 'research abstract'
df_vis['gpt_model'].value_counts()

In [None]:
# Research abstracts are drawn more opaque (0.5) than GPT answers (0.25)
opacity_condition = alt.condition(
    alt.datum.source == "abstract",
    alt.value(0.5),
    alt.value(0.25),
)

# 2D scatter of all documents, coloured by source; axes are hidden because the
# projected coordinates have no interpretable units
scatter_plot = (
    alt.Chart(df_vis)
    .mark_circle(size=100)
    .encode(
        x=alt.X('x:Q', axis=alt.Axis(ticks=False, labels=False, title=None, grid=False)),
        y=alt.Y('y:Q', axis=alt.Axis(ticks=False, labels=False, title=None, grid=False)),
        color=alt.Color('source'),
        opacity=opacity_condition,
        tooltip=['source', 'doc'],
    )
    .configure_legend(title=None, labelFontSize=20, titleFontSize=20)
    .properties(width=800, height=600)
    .interactive()
)

# scatter_plot.save(PROJECT_DIR / f"outputs/figures/gpt_abstracts_overlap.html")
# viz_save.save(scatter_plot, f"gpt_abstracts_overlap", PROJECT_DIR / "outputs/figures", save_png=True)

scatter_plot.display()

In the previous plot it can be hard to see whether GPT summaries are obscuring research abstracts, so in the next plot we scale the size of each abstract's point by its number of citations. I would hypothesise that abstracts that have been cited hundreds of times are more influential and therefore more likely to be similar to GPT summaries. So where there are small, seemingly outlying clusters of GPT summaries, perhaps there are actually a couple of highly influential research papers nearby?

In [None]:
# Try a plot where point size is scaled by number of citations

def map_citations_to_size(citations, quantile_values, min_citations=5):
    """
    Map a citation count to a human-readable bucket label based on citation quartiles.

    Parameters:
    - citations: The citation count of a single document. A count of 0 or a missing
      value (NaN/None) is labelled 'NA'.
    - quantile_values: A mapping indexable by 0.25, 0.5 and 0.75 (e.g. the Series returned by
      `Series.quantile([0.25, 0.5, 0.75])`) giving the quartile cut points. Each is truncated to int.
    - min_citations (int, optional): Lower bound shown in the label of the lowest bucket.
      Defaults to 5, the value that was previously hard-coded (presumably the minimum
      citation count present in the abstracts dataset - TODO confirm).

    Returns:
    - str: 'NA', '<min_citations>-<q25-1>', '<q25>-<q50-1>', '<q50>-<q75-1>' or '<q75>+'.
    """
    a = int(quantile_values[0.25])
    b = int(quantile_values[0.5])
    c = int(quantile_values[0.75])

    # Missing counts must not fall through to the top bucket via the final `else`
    if pd.isna(citations) or citations == 0:
        return 'NA'
    elif 0 < citations < a:
        return f'{min_citations}-{a-1}'
    elif a <= citations < b:
        return f'{a}-{b-1}'
    elif b <= citations < c:
        return f'{b}-{c-1}'
    else:
        return f'{c}+'

# Specify the desired quantiles as a list of probabilities
quantiles = [0.25, 0.5, 0.75]

# Citation quartiles are computed over the research abstracts only
quantile_values = df_vis.loc[df_vis['source'] == 'abstract', 'total_cites'].quantile(quantiles)
logging.info(f"quantiles: {quantile_values}")

# Bucket label per document; only shown in the tooltip
df_vis['point_size'] = df_vis['total_cites'].apply(lambda x: map_citations_to_size(x, quantile_values))

print(df_vis['point_size'].value_counts())
print(df_vis['point_size'].unique().tolist())

# Size encoding used by the abstract layer: point size follows the citation count
size_encode = alt.Size(
    "total_cites:Q",
    scale=alt.Scale(
        range=[50, 2000]
    ),
    legend=alt.Legend(title="Number of citations", titleFontSize=12, labelPadding=100, labelFontSize=12,
                      symbolFillColor='blue'),
)

# Define the base chart with common encoding settings.
# Jitter (Box-Muller form, scale 0.4) is added to x and y so overlapping points separate;
# it is calculated when the chart is rendered, so exact positions can differ between renders.
base_chart = alt.Chart(df_vis).mark_circle().transform_calculate(
        jittered_x="datum.x + sqrt(-2*log(random()))*cos(2*PI*random())*0.4",
        jittered_y="datum.y + sqrt(-2*log(random()))*sin(2*PI*random())*0.4"
    ).encode(
    x=alt.X('jittered_x:Q', axis=alt.Axis(ticks=False, labels=False, title=None, grid=False)),
    y=alt.Y('jittered_y:Q', axis=alt.Axis(ticks=False, labels=False, title=None, grid=False)),
    tooltip=['source', 'doc', 'point_size', 'total_cites']
)

# Create separate layers: GPT answers as small, faint, fixed-size points ...
gpt_points = base_chart.transform_filter(alt.datum.source == 'gpt').mark_circle(color='#9B30FF', size=50).encode(
    opacity=alt.value(0.1),
)

# ... and research abstracts sized by citation count
abstract_points = base_chart.transform_filter(alt.datum.source == 'abstract').mark_circle(color='#3CB371').encode(
    opacity=alt.value(0.4), size=size_encode
)

# Layer the charts
layered_chart = alt.layer(gpt_points, abstract_points).properties(width=900, height=600).interactive()

# Make sure the output folder exists before saving, so a fresh checkout does not fail here
figures_dir = PROJECT_DIR / "outputs/figures"
figures_dir.mkdir(parents=True, exist_ok=True)

layered_chart.save(figures_dir / "gpt_abstracts_overlap.html")
viz_save.save(layered_chart, "gpt_abstracts_overlap", figures_dir, save_png=True)

# Display the chart
layered_chart.display()

Now do the same plot, but this time with the GPT points coloured by topic.

In [None]:
# Point size follows the citation count
size_encode = alt.Size(
    "total_cites:Q",
    scale=alt.Scale(
        range=[50, 2000]
    ),
    legend=alt.Legend(title="Number of citations", titleFontSize=12, labelPadding=100, labelFontSize=12),
)

# Define the base chart with common encoding settings.
# The size encoding sits on the base chart here, so it applies to both layers.
base_chart = alt.Chart(df_vis).transform_calculate(
        # Adding jitter calculations to x and y fields directly
        jittered_x="datum.x + sqrt(-2*log(random()))*cos(2*PI*random())*0.4",
        jittered_y="datum.y + sqrt(-2*log(random()))*sin(2*PI*random())*0.4"
    ).encode(
    x=alt.X('jittered_x:Q', axis=alt.Axis(ticks=False, labels=False, title=None, grid=False)),
    y=alt.Y('jittered_y:Q', axis=alt.Axis(ticks=False, labels=False, title=None, grid=False)),
    size=size_encode,
    tooltip=['source', 'doc', 'point_size', 'total_cites', 'topic_name']
)

# GPT answers are coloured by their topic name
topic_color_encoding = alt.Color('topic_name:N', legend=alt.Legend(title="Topics"))

# Create separate layers
gpt_points = base_chart.transform_filter(alt.datum.source == 'gpt').mark_circle().encode(
    color=topic_color_encoding,
    opacity=alt.value(0.1),
)

# Research abstracts keep a single fixed colour so they stand apart from the topic palette
abstract_points = base_chart.transform_filter(alt.datum.source == 'abstract').mark_circle().encode(
    color=alt.value('#9B30FF'),
    opacity=alt.value(0.3),
)

# Layer the charts
layered_chart = alt.layer(gpt_points, abstract_points).properties(width=900, height=600).interactive()

# Make sure the output folder exists before saving, so a fresh checkout does not fail here
figures_dir = PROJECT_DIR / "outputs/figures"
figures_dir.mkdir(parents=True, exist_ok=True)

layered_chart.save(figures_dir / "gpt_abstracts_overlap_topics.html")
viz_save.save(layered_chart, "gpt_abstracts_overlap_topics", figures_dir, save_png=True)

# Display the chart
layered_chart.display()

# Calculate most similar papers

For each cluster of GPT summaries, find the N most similar research abstracts.

Using a metric such as cosine similarity is important because the distances seen in the 2D plots above may be misleading - this was pointed out by Max in a PR review. Cosine similarity, computed on the full embeddings rather than on the 2D projection, gives a more reliable/holistic picture of how similar or different two text vectors actually are.

In [None]:
# Re-fetch the cleaned GPT cluster summaries (same getter as used earlier in the notebook)
gpt_cluster_summaries = get_cluster_summaries_cleaned()

In [None]:
# For each GPT cluster, pick the 3 abstracts with the highest cosine similarity to the
# cluster's first representative document; the result is a dict keyed by topic name
most_similar_abstracts = get_n_most_similar_abstracts(gpt_cluster_summaries, abstracts, n=3)

In [None]:
# Stack the per-topic frames into a single table; the 'topic' column added by
# get_n_most_similar_abstracts identifies which cluster each row belongs to
concatenated_df = pd.concat(most_similar_abstracts.values(), ignore_index=True)

In [None]:
# Save the matches as CSV (without the index) for review outside the notebook
concatenated_df.to_csv(PROJECT_DIR / "outputs/data/similar_abstracts.csv", index=False)

In [None]:
# List the topic names present in the results
concatenated_df['topic'].unique()

In [None]:
# Inspect the most similar abstracts for a single topic
concatenated_df[concatenated_df['topic']=='International Technology Transfer and its Impact on UK']

In [None]:
# Inspect the most similar abstracts for a single topic
concatenated_df[concatenated_df['topic']=='Skill Development and Technology Diffusion']

In [None]:
# Inspect the most similar abstracts for a single topic
concatenated_df[concatenated_df['topic']=='Regional Disparities in Technology Diffusion']