# Scientific literature mining

Using embeddings, we will mine scientific literature to identify relationships between papers and find similar papers.

We have already extracted the title and abstract of several preprints from arXiv. These papers cover multiple topics:
- *nanoporous materials*
- *many-body*
- *machine learning*
- *quantum computing*
- *biomolecular modeling*

## Load required libraries


In [1]:
from itertools import cycle
import json
import textwrap

from fastembed import TextEmbedding
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import umap.umap_ as umap

# Register tqdm's pandas integration; this is what makes `progress_apply`
# available on DataFrames later in the notebook.
tqdm.pandas()

## Load the model

In [2]:
# Instantiate the text-embedding model shared by every embedding call below.
model = TextEmbedding(model_name="nomic-ai/nomic-embed-text-v1.5-Q")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

## Load the data

In [3]:
# Read the pre-extracted arXiv records and flatten them into a DataFrame.
# The context manager guarantees the file handle is closed once parsing is
# done, and the explicit encoding stops non-ASCII characters in titles or
# abstracts from depending on the platform's default locale.
with open("arxiv_papers.json", encoding="utf-8") as f_in:
    data = json.load(f_in)
papers = pd.json_normalize(data)

Display number of papers and the first record of the paper dataset:

In [4]:
# Summarise the dataset: its shape, followed by the first record as a sanity check.
print(
    f"Dimensions of papers dataframe: {papers.shape}",
    "First paper:",
    papers.iloc[0],
    sep="\n",
)

Dimensions of papers dataframe: (400, 5)
First paper:
id                          http://arxiv.org/abs/2402.01321v1
date                                     2024-02-02T11:17:55Z
title       Ionic Current Rectification in Nanopores: Effe...
abstract    Ionic Current Rectification (ICR) can appear i...
category                                 nanoporous materials
Name: 0, dtype: object


In [5]:
def get_embedding(row: pd.Series) -> np.ndarray:
    """Embed one paper from its title and abstract.

    The title and abstract are joined with a single space and handed to the
    notebook-level ``model``; the first vector it yields is returned.

    NOTE(review): the nomic-embed model card describes task-instruction
    prefixes (e.g. ``search_document: `` / ``clustering: ``) for input text.
    None is added here -- confirm whether that is intentional.
    """
    merged_text = " ".join((row["title"], row["abstract"]))
    vectors = list(model.embed(merged_text))
    return vectors[0]

# Embed every paper row by row; `progress_apply` reports progress while it runs.
papers["embedding"] = papers.progress_apply(get_embedding, axis=1)

  0%|          | 0/400 [00:00<?, ?it/s]

In [6]:
# Print the first record again to check that the "embedding" column was added.
print(papers.iloc[0])

id                           http://arxiv.org/abs/2402.01321v1
date                                      2024-02-02T11:17:55Z
title        Ionic Current Rectification in Nanopores: Effe...
abstract     Ionic Current Rectification (ICR) can appear i...
category                                  nanoporous materials
embedding    [1.384288, 1.9946833, -3.2957623, -0.6682733, ...
Name: 0, dtype: object


In [7]:
# Stack the per-paper vectors into a single 2-D array for UMAP.
embeddings_array = np.vstack(papers["embedding"].to_numpy())

# Project onto two dimensions using cosine distance; the fixed random_state
# seeds the reducer so repeated runs use the same seed.
umap_reducer = umap.UMAP(n_components=2, metric="cosine", random_state=42)
reduce_embeddings = umap_reducer.fit_transform(embeddings_array)

# Keep the two projected coordinates alongside each paper.
papers["umap_x"] = reduce_embeddings[:, 0]
papers["umap_y"] = reduce_embeddings[:, 1]

  warn(


In [8]:
def wrap_text(text: str, width: int=70) -> str:
    """Break ``text`` into lines of at most ``width`` characters.

    The wrapped lines are joined with the HTML ``<br>`` tag instead of a
    newline so the result can be used inside an HTML label.
    """
    wrapped_lines = textwrap.wrap(text, width=width)
    return "<br>".join(wrapped_lines)

def set_tooltip(row: pd.Series) -> str:
    """Build the HTML hover label for one paper.

    Args:
        row: A record exposing a ``"title"`` entry.

    Returns:
        The title preceded by a bold ``Title:`` tag and followed by ``<br>``.
    """
    # Single quotes inside the f-string: reusing the enclosing double quotes
    # is only valid syntax from Python 3.12 (PEP 701) and raises SyntaxError
    # on earlier interpreters.
    return f"<b>Title:</b> {row['title']}<br>"

# Build one hover label per paper (row-wise apply).
papers["tooltip"] = papers.apply(set_tooltip, axis=1)

In [9]:
# Endless iterator over Plotly's qualitative palette: one colour per category.
colors = cycle(px.colors.qualitative.Plotly)

layout = dict(
    title="2D UMAP Embeddings",
    width=800,
    height=600,
    plot_bgcolor="rgba(0,0,0,0)",
    hovermode="closest",
)
fig = go.Figure(layout=layout)

# Add one trace per category so each gets its own colour and legend entry.
for label, color in zip(papers["category"].unique(), colors):
    subset = papers.loc[papers["category"] == label]
    fig.add_trace(
        go.Scattergl(
            x=subset["umap_x"],
            y=subset["umap_y"],
            name=label,
            mode="markers",
            marker={
                "color": color,
                "size": 8,
                "line": {"width": 0.5},
                "opacity": 0.75,
            },
            text=subset["tooltip"],
        )
    )

fig.show()

In [10]:
# Free-text query to compare against every paper in the dataset.
target = (
    "We find that long DNA molecules that have binding affinity "
    "for the nanostars are preferentially enriched on the interface"
)

# Take the first vector the model yields for the query text.
target_vectors = list(model.embed(target))
target_embedding = target_vectors[0]

In [11]:
# Cosine similarity between every paper embedding and the target sentence.
# The target vector is reshaped into a single-row 2-D array so it is passed
# in the same 2-D form as the stacked paper embeddings.
similarities = cosine_similarity(
    np.vstack(papers["embedding"].values),
    target_embedding.reshape(1, -1),
)


In [12]:
# Collapse the similarity array to 1-D and store one score per paper.
papers["similarity_score"] = similarities.flatten()

In [13]:
# Rank papers from most to least similar to the target sentence.
most_similar_papers = papers.sort_values(by="similarity_score", ascending=False)

# Report the best matches. The header quotes the actual `target` text so it
# cannot drift away from the query the scores were computed for, and `top_n`
# keeps the header count and the number of printed rows in sync.
top_n = 10
print(f"Top {top_n} papers most similar to '{target}':")
for rank, (_, row) in enumerate(most_similar_papers.head(top_n).iterrows(), 1):
    print(f"{rank:2d}: {row['title']} (Score: {row['similarity_score']:.3f})")

Top 10 papers most similar to 'Machine Learning in Physical Sciences: Theory and Applications':
 1: Controlling the size and adhesion of DNA droplets using surface-active DNA molecules (Score: 0.760)
 2: Identification of DNA Bases Using Nanopores Created in Finite-Size Nanoribbons from Graphene, Phosphorene, and Silicene (Score: 0.689)
 3: DNA translocation through nanopores with salt gradients: The role of osmotic flow (Score: 0.659)
 4: First principles investigation of nanopore sequencing using variable voltage bias on graphene-based nanoribbons (Score: 0.658)
 5: Condensation and activator/repressor control of a transcription-regulated biomolecular liquid (Score: 0.657)
 6: Quantum Capacitance Modifies Interionic Interactions in Semiconducting Nanopores (Score: 0.642)
 7: A zero-depth nanopore capillary for the analysis of translocating biomolecules (Score: 0.639)
 8: Anomalous Long-range Hard-wall Repulsion between Polymers in Solvent Mixtures and Its Implication for Biomolecular

In [16]:
layout = dict(
    title="2D UMAP Embeddings Colored by Similarity Score",
    width=800,
    height=600,
    plot_bgcolor="rgba(0,0,0,0)",
    hovermode="closest",
)

# Colour every point by its similarity score on a continuous scale,
# with a colour bar as the legend.
marker_style = {
    "color": papers["similarity_score"],
    "colorscale": "Viridis",
    "colorbar": {"title": "Similarity Score"},
    "size": 8,
    "line": {"width": 0.5},
    "opacity": 0.75,
}

# A single trace holding all papers, reusing the UMAP coordinates and tooltips.
trace = go.Scattergl(
    x=papers["umap_x"],
    y=papers["umap_y"],
    mode="markers",
    marker=marker_style,
    text=papers["tooltip"],
)

fig = go.Figure(layout=layout)
fig.add_trace(trace)

fig.show()