# Scientific literature mining

Using embeddings, we will mine scientific literature to identify relationships between papers and find similar papers.

We have already extracted the title and abstract of several preprints from arXiv. These papers cover multiple topics:
- *nanoporous materials*
- *many-body*
- *machine learning*
- *quantum computing*
- *biomolecular modeling*

## Load required libraries


In [84]:
from itertools import cycle
import json

from fastembed import TextEmbedding
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import plotly
import plotly.graph_objs as go
import umap.umap_ as umap

# Register tqdm's `progress_apply` on pandas objects; it is used further down
# to show a progress bar while embedding each paper.
tqdm.pandas()

## Load the model

In [12]:
# Instantiate the fastembed text-embedding model that `get_embedding` calls.
# NOTE(review): the "-Q" suffix presumably selects a quantized variant — confirm
# against the fastembed list of supported models.
model = TextEmbedding("nomic-ai/nomic-embed-text-v1.5-Q")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

## Load the data

In [31]:
# Read the pre-extracted arXiv records and flatten them into a DataFrame.
# The context manager closes the file as soon as parsing is done; a bare
# `open()` would keep the handle open for the lifetime of the kernel.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default encoding.
with open("arxiv_papers.json", encoding="utf-8") as f_in:
    data = json.load(f_in)
papers = pd.json_normalize(data)

Display number of papers and the first record of the paper dataset:

In [41]:
# Report the overall shape of the dataset, then show its first record.
print(
    f"Dimensions of papers dataframe: {papers.shape}",
    "First paper:",
    papers.iloc[0],
    sep="\n",
)

Dimensions of papers dataframe: (400, 5)
First paper:
id                          http://arxiv.org/abs/2402.01321v1
date                                     2024-02-02T11:17:55Z
title       Ionic Current Rectification in Nanopores: Effe...
abstract    Ionic Current Rectification (ICR) can appear i...
query                                    nanoporous materials
Name: 0, dtype: object


In [49]:
def get_embedding(row):
    """Embed one paper from its title and abstract.

    The ``"title"`` and ``"abstract"`` entries of ``row`` are joined with a
    single space and passed to ``model.embed``; the first item it produces is
    returned.
    """
    text_to_embed = " ".join((row["title"], row["abstract"]))
    vectors = list(model.embed(text_to_embed))
    return vectors[0]

# Embed every paper row by row, with a tqdm progress bar.
papers["embedding"] = papers.progress_apply(get_embedding, axis=1)

  0%|          | 0/400 [00:00<?, ?it/s]

In [55]:
# Show the first record again to check that the "embedding" column is present.
print(papers.iloc[0])

id                            http://arxiv.org/abs/2402.01321v1
date                                       2024-02-02T11:17:55Z
title         Ionic Current Rectification in Nanopores: Effe...
abstract      Ionic Current Rectification (ICR) can appear i...
query                                      nanoporous materials
embeddings    [1.384288, 1.9946833, -3.2957623, -0.6682733, ...
embedding     [1.384288, 1.9946833, -3.2957623, -0.6682733, ...
Name: 0, dtype: object


In [76]:
# Stack the per-paper vectors into one 2-D array (one row per paper).
embeddings_array = np.vstack(papers["embedding"].to_numpy())

# Project to two dimensions with UMAP using cosine distance; random_state is
# fixed so repeated runs give the same layout.
umap_reducer = umap.UMAP(n_components=2, metric="cosine", random_state=42)
reduce_embeddings = umap_reducer.fit_transform(embeddings_array)

# Store each projected coordinate in its own column.
papers["umap_x"] = reduce_embeddings[:, 0]
papers["umap_y"] = reduce_embeddings[:, 1]

  warn(


In [91]:
import textwrap
def wrap_text(text, width=70):
    """Wrap ``text`` to lines of at most ``width`` characters, joined by ``<br>``.

    Returns an empty string when ``text`` produces no lines.
    """
    wrapped_lines = textwrap.wrap(text, width=width)
    return "<br>".join(wrapped_lines)

def set_tooltip(row):
    """Build the HTML hover label for one paper.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a ``"title"`` entry.

    Returns
    -------
    str
        The title prefixed with a bold ``Title:`` tag and followed by ``<br>``.
    """
    # Use single quotes inside the replacement field: reusing the enclosing
    # double quotes is only accepted from Python 3.12 on and is a SyntaxError
    # on earlier versions.
    label = f"<b>Title:</b> {row['title']}<br>"
    return label

# Build the hover label for every paper.
papers["tooltip"] = papers.apply(set_tooltip, axis=1)

In [93]:
import plotly.express as px

# One colour per query topic, taken from Plotly's qualitative palette and
# cycled if there are more topics than colours. (An earlier assignment from
# the sequential Viridis palette was overwritten before use and is removed.)
colors = cycle(px.colors.qualitative.Plotly)
layout = {
    "title": "2D UMAP Embeddings",
    "width": 800,
    "height": 600,
    "hovermode": "closest",
    # Label the axes so the figure can be read on its own.
    "xaxis": {"title": {"text": "UMAP 1"}},
    "yaxis": {"title": {"text": "UMAP 2"}},
}

fig = go.Figure(layout=layout)
# zip() stops once the topics are exhausted, so pairing them with the
# infinite colour cycle is safe.
for label, color in zip(papers["query"].unique(), colors):
    subset = papers[papers["query"] == label]
    trace = go.Scattergl(
        x=subset["umap_x"],
        y=subset["umap_y"],
        name=label,
        mode="markers",
        marker=dict(
            color=color,
            size=8,
            line=dict(width=0.5),
            opacity=0.75,
        ),
        # Shown on hover for each point.
        text=subset["tooltip"],
    )
    fig.add_trace(trace)

fig.show()