# Scientific Keyword Trends

Contains detailed analysis of some scientific keyword trends relating to disease and data analysis.

## ✋Set Up

### Install libraries

In [None]:
# Install the pinned library versions used by this notebook.
# NOTE(review): the recorded output below shows pip reporting dependency conflicts between
# pandas 1.5.3 and packages preinstalled in the Colab runtime — confirm the pin is still wanted.
!pip install --quiet numpy==1.25.2 pandas==1.5.3 plotly==5.15.0 kaleido==0.2.1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m59.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m75.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 1.5.3 which is incompatible.
mizani 0.13.1 requires pandas>=2.2.0, but you have pandas 1.5.3 which is incompatible.
plotnine 0.14.5 requires pandas>=2.2.0, but you have pandas 1.5.3 which is incompatible.
cudf-cu12 24.10.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 

In [None]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.io as pio
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import kaleido
import re  # Regular expression library

import warnings
warnings.filterwarnings;

## Load Data

Load the dataset, which contains the full embeddings together with the reduced (UMAP and t-SNE) embeddings.

In [None]:
# Read the table stored under the 'embeddings' key of the HDF5 file into a DataFrame.
# NOTE(review): the path is relative, so it resolves against the current working directory — confirm.
df = pd.read_hdf('Files/embeddings_full_tSNE_uMAP_01MAR2024.h5', key='embeddings')

In [None]:
# Preview the first three rows to see which columns are available.
df.head(3)

Unnamed: 0,pmid,title,abstract,language,journal_title,pub_year,authors,predicted_category,full_embeddings,umap_2D_x,umap_2D_y,umap_3D_x,umap_3D_y,umap_3D_z,tsne_2D_x,tsne_2D_y
0,9748443,Effect of slow growth on metabolism of Escheri...,Escherichia coli growing on glucose in minimal...,eng,Journal of bacteriology,1998.0,"Tweeddale H, Notley-McRobb L, Ferenci T",Microbiology,"[[0.04921199, 0.1013429, 0.009529841, -0.08067...",7.770308,7.748135,8.087116,7.735348,6.041772,-18.861538,25.245789
1,10675895,On the optimization of classes for the assignm...,"At present, the assignment of function to nove...",eng,Trends in biotechnology,2000.0,"Kell DB, King RD",unlabeled,"[[0.074717656, 0.12005615, 0.023376802, 0.0167...",3.69291,6.724228,4.727096,6.715599,7.221296,45.593254,63.230408
2,10731098,Assessing the effect of reactive oxygen specie...,A two-dimensional thin-layer chromatographic a...,eng,Redox report : communications in free radical ...,1999.0,"Tweeddale H, Notley-McRobb L, Ferenci T",unlabeled,"[[-0.009071778, 0.013007838, -0.0069063944, -0...",8.485703,8.156181,8.033753,7.959665,5.779803,-18.892046,25.188398


## Keywords Highlights

### Version 1 - Black and White
Exploratory draft; go to Version 2 below for the publication version.

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

def plot_keyword_comparison(
    df,
    terms,
    x_col='tsne_2D_x',
    y_col='tsne_2D_y',
    text_col='abstract',
    n_cols=2,
    marker_size=3,
    fig_width=1500,
    fig_height=600,
    max_points=None
):
    """
    Plot one scatter subplot per keyword, highlighting the documents that mention it.

    Every subplot shows the same embedding; points whose text matches the keyword
    are drawn in black, all others in light grey. Traces use ``go.Scattergl`` and
    the data can optionally be downsampled. The input DataFrame is never modified.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing embedding coordinates and text data.
    terms : list of str (or a single str)
        Keywords or regex patterns to highlight in ``text_col``. Each term is
        passed to ``Series.str.contains`` as a case-insensitive regular
        expression, so ``"A|B"`` matches either alternative. A bare string is
        treated as a one-element list.
    x_col : str
        Name of the DataFrame column holding the x-coordinates.
    y_col : str
        Name of the DataFrame column holding the y-coordinates.
    text_col : str
        Name of the DataFrame column that contains the text to search.
    n_cols : int
        Number of columns in the subplot layout.
    marker_size : int
        Size of the markers in the scatter plot.
    fig_width : int
        Width of the resulting figure (in pixels).
    fig_height : int
        Height of the resulting figure (in pixels).
    max_points : int, optional
        If given and the frame has more rows than this, a random sample of
        ``max_points`` rows (fixed ``random_state=42``) is plotted instead.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The constructed Plotly figure with subplots.

    Raises
    ------
    ValueError
        If ``terms`` is empty.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(terms, str):
        terms = [terms]
    terms = list(terms)
    if not terms:
        raise ValueError("terms must contain at least one keyword or pattern.")

    # Optional downsampling (rebinds the local name only; the caller's frame is untouched)
    if max_points is not None and len(df) > max_points:
        df = df.sample(n=max_points, random_state=42)

    # Searchable text as a local Series. Missing values become empty strings so
    # that they can never match a keyword (astype(str) alone would yield "nan").
    text = df[text_col].fillna("").astype(str)

    # Compute how many rows are needed (ceiling division)
    n_rows = (len(terms) - 1) // n_cols + 1

    # Create the subplots; "|" in a pattern is shown as " or " in the title
    subplot_titles = [f'"{term.replace("|", " or ")}"' for term in terms]
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=subplot_titles,
        horizontal_spacing=0.05,
        vertical_spacing=0.12
    )

    # Two-step colour scale shared by every subplot: 0 -> lightgrey, 1 -> black
    colorscale = [
        [0, 'lightgrey'],
        [1, 'black']
    ]

    # For each keyword, create a subplot highlighting documents that mention it
    for i, term in enumerate(terms, start=1):
        # Fill the grid row by row (1-based indices as plotly expects)
        row = (i - 1) // n_cols + 1
        col = (i - 1) % n_cols + 1

        # 1 where the text mentions the keyword (case-insensitive), else 0
        presence_numeric = text.str.contains(term, case=False, na=False).astype(int)

        # Add a single Scattergl trace for this keyword
        fig.add_trace(
            go.Scattergl(
                x=df[x_col],
                y=df[y_col],
                mode='markers',
                marker=dict(
                    size=marker_size,
                    color=presence_numeric,
                    colorscale=colorscale,
                    cmin=0,
                    cmax=1,
                    showscale=False          # hide colorbar
                ),
                name=term,
                showlegend=False,
                hoverinfo='skip'             # hover disabled
            ),
            row=row,
            col=col
        )

    # Update figure layout
    fig.update_layout(
        plot_bgcolor='white',
        height=fig_height,
        width=fig_width,
        title_font=dict(size=24, family='Optima, sans-serif'),
        font=dict(size=18, family='Optima, sans-serif'),
        showlegend=False
    )

    # Hide axis labels, ticks, and grids for all subplots
    fig.update_xaxes(title='', showticklabels=False, showgrid=False)
    fig.update_yaxes(title='', showticklabels=False, showgrid=False)

    return fig



In [None]:
# Version 1 demo: one subplot per regex alternation on the t-SNE coordinates,
# plotting at most 50,000 randomly sampled points.
terms_to_plot = ["COVID-19|SARS-CoV-2", "Influenza|Flu"]

fig = plot_keyword_comparison(
    df, terms=terms_to_plot,
    x_col='tsne_2D_x', y_col='tsne_2D_y', text_col='abstract',
    n_cols=2, marker_size=3,
    fig_width=1200, fig_height=600,
    max_points=50000,
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

### Version 2 - Colored Version
Publication version

#### Plot utils

##### Boolean Query Parser

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
def parse_boolean_query(df, text_col, query):
    """
    Evaluate a flat Boolean keyword query against a text column.

    Supported syntax (operators are matched case-insensitively and must be
    surrounded by whitespace):
        (LC-MS OR LC/MS) AND metabolomics
        COVID-19 OR SARS-CoV-2
        NMR AND method

    All parentheses are removed first, then the query is split on AND, and each
    AND-part is split on OR. OR therefore always binds tighter than AND, and
    nested or OR-of-AND groupings such as ``A OR (B AND C)`` are NOT supported
    (that example is evaluated as ``(A OR B) AND C``).

    Operands are matched as literal, case-insensitive substrings, so characters
    such as ``+``, ``.`` or ``/`` need no escaping. Rows whose text is missing
    never match.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the text to search.
    text_col : str
        Name of the text column in ``df``.
    query : str
        The Boolean query.

    Returns
    -------
    pd.Series of bool
        Mask aligned with ``df.index``; True where the row satisfies the query.

    Raises
    ------
    ValueError
        If the query, or any operand around an AND/OR, is empty. An empty
        operand would otherwise match every row.
    """
    # Remove parentheses (grouping beyond "OR inside AND" is not handled)
    flat_query = query.replace("(", "").replace(")", "")
    text = df[text_col]

    combined = None
    for and_part in re.split(r"\s+AND\s+", flat_query, flags=re.IGNORECASE):
        part_mask = None
        for operand in re.split(r"\s+OR\s+", and_part, flags=re.IGNORECASE):
            operand = operand.strip()
            if not operand:
                raise ValueError(f"Empty operand in Boolean query: {query!r}")

            # Literal substring match; regex=False keeps metacharacters harmless
            hit = text.str.contains(operand, case=False, na=False, regex=False)
            # Build new Series instead of using |= so no mask is mutated in place
            part_mask = hit if part_mask is None else (part_mask | hit)

        combined = part_mask if combined is None else (combined & part_mask)

    return combined

##### Embeddings Plot

In [None]:
def plot_side_by_side_embeddings_boolean(
    df,
    x_col='tsne_2D_x',
    y_col='tsne_2D_y',
    text_col='abstract',
    year_col='year',
    queries=("COVID-19 OR SARS-CoV-2", "LC-MS AND Cancer"),
    query_colors=("orange", "red"),
    marker_size=3,
    fig_width=1200,
    fig_height=600,
    custom_title="Boolean Query Matching vs. Year",
    year_range=(1998, 2024)
):
    """
    Creates two side-by-side plots of the same 2D embeddings:
      LEFT: Colored by Boolean query presence (one color per query, grey otherwise).
      RIGHT: Colored by publication year (blue→yellow→red) ONLY for points matching
             at least one query; all other points are grey.

    The input DataFrame is not modified. Missing text never matches a query.

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing embedding coordinates, abstracts, and year data.
    x_col : str
        DataFrame column for the x-axis of the embedding.
    y_col : str
        DataFrame column for the y-axis of the embedding.
    text_col : str
        Column name containing text (e.g., 'abstract') for keyword searches.
    year_col : str
        Column containing numeric publication year.
    queries : tuple of str (or a single str)
        Each string can include "AND" / "OR" and is evaluated by
        ``parse_boolean_query``. A bare string is treated as a single query.
        Example: ("COVID-19 OR SARS-CoV-2", "LC-MS AND Cancer")
    query_colors : tuple of str (or a single str)
        Colors to represent each Boolean query match on the LEFT subplot.
        Must provide at least one color per query; extra colors are ignored.
    marker_size : int
        Size of the scatter markers.
    fig_width : int
        Width of the figure in pixels.
    fig_height : int
        Height of the figure in pixels.
    custom_title : str
        Title for the entire figure.
    year_range : tuple of (min, max) or None
        Bounds of the year color scale. Defaults to (1998, 2024). Pass None to
        use the minimum and maximum year among the matched rows.

    Returns
    -------
    fig : plotly.graph_objects.Figure
        The Plotly figure object containing the side-by-side subplots.

    Raises
    ------
    ValueError
        If ``queries`` is empty or fewer colors than queries are supplied.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(queries, str):
        queries = (queries,)
    if isinstance(query_colors, str):
        query_colors = (query_colors,)
    queries = tuple(queries)
    query_colors = tuple(query_colors)

    if not queries:
        raise ValueError("queries must contain at least one Boolean query.")
    if len(query_colors) < len(queries):
        # zip() would silently drop the surplus queries from the left subplot
        raise ValueError(
            f"Got {len(queries)} queries but only {len(query_colors)} colors; "
            "provide one color per query."
        )

    # Searchable text in a separate one-column frame so the caller's DataFrame
    # is left untouched; missing values become "" so they cannot match.
    text_frame = df[text_col].fillna("").astype(str).to_frame(name=text_col)

    # Evaluate every query exactly once and reuse the masks below
    masks = [parse_boolean_query(text_frame, text_col, q) for q in queries]

    # Union of all matches (non in-place OR keeps masks[0] intact)
    presence_any = masks[0]
    for mask in masks[1:]:
        presence_any = presence_any | mask
    none_mask = ~presence_any

    # Create figure with 1 row, 2 columns
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=["Colored by Boolean Query Presence", "Colored by Year"]
    )

    # ------------------- LEFT SUBPLOT (BOOLEAN QUERIES) -------------------
    # Docs with no match (light grey)
    fig.add_trace(
        go.Scattergl(
            x=df.loc[none_mask, x_col],
            y=df.loc[none_mask, y_col],
            mode='markers',
            marker=dict(color='lightgrey', size=marker_size),
            name='No match',
            hoverinfo='skip',  # disable hover
            showlegend=True
        ),
        row=1, col=1
    )

    # Plot each query in its own color; later queries are drawn on top of earlier ones
    for query, color, mask in zip(queries, query_colors, masks):
        fig.add_trace(
            go.Scattergl(
                x=df.loc[mask, x_col],
                y=df.loc[mask, y_col],
                mode='markers',
                marker=dict(color=color, size=marker_size),
                name=query,
                hoverinfo='skip',  # disable hover
                showlegend=True
            ),
            row=1, col=1
        )

    # ------------------- RIGHT SUBPLOT (YEAR) -------------------
    # 1) Unmatched points in grey (no legend entry; the left subplot already has one)
    fig.add_trace(
        go.Scattergl(
            x=df.loc[none_mask, x_col],
            y=df.loc[none_mask, y_col],
            mode='markers',
            marker=dict(
                size=marker_size,
                color='lightgrey'
            ),
            hoverinfo='skip',
            showlegend=False
        ),
        row=1, col=2
    )

    # 2) Matched points colored by year (blue→yellow→red)
    matched_years = df.loc[presence_any, year_col]
    if year_range is None:
        year_min, year_max = matched_years.min(), matched_years.max()
    else:
        year_min, year_max = year_range

    fig.add_trace(
        go.Scattergl(
            x=df.loc[presence_any, x_col],
            y=df.loc[presence_any, y_col],
            mode='markers',
            marker=dict(
                size=marker_size,
                color=matched_years,
                cmin=year_min,
                cmax=year_max,
                colorscale=[
                    [0, 'blue'],
                    [0.8, 'yellow'],
                    [1, 'red']
                ],
                showscale=True,
                colorbar=dict(
                    title='Year',
                    x=1.08,
                    y=0.5,
                    len=0.75
                )
            ),
            name='Matched (by year)',
            hoverinfo='skip',
            showlegend=False
        ),
        row=1, col=2
    )

    # ------------------- FIGURE LAYOUT -------------------
    fig.update_layout(
        height=fig_height,
        width=fig_width,
        title_text=custom_title,
        font=dict(family='Optima, sans-serif', size=16),
        plot_bgcolor='white',
        # Anchor the legend's top-left corner at 40% of the figure width, near the top
        legend=dict(
            x=0.40,
            y=0.98,
            xanchor='left',
            yanchor='top',
            orientation='v',
            font=dict(size=12)
        )
    )

    # Remove axis ticks & grids
    fig.update_xaxes(showticklabels=False, showgrid=False)
    fig.update_yaxes(showticklabels=False, showgrid=False)

    return fig

#### Categories Considered

##### COVID 19

In [None]:
# COVID-19 figure: a single OR query, so only the first colour of the pair is used.
example_colors = ("mediumorchid", "teal")
example_queries = ("COVID-19 OR SARS-CoV-2",)

fig = plot_side_by_side_embeddings_boolean(
    df,
    x_col='tsne_2D_x', y_col='tsne_2D_y',
    text_col='abstract', year_col='pub_year',
    queries=example_queries, query_colors=example_colors,
    marker_size=3,
    fig_width=1200, fig_height=600,
    custom_title="Boolean Query Matching vs. Year",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Export the current `fig` as static images: one PNG and one SVG.
# NOTE(review): the absolute paths assume Google Drive is mounted at /content/gdrive;
# no mount step is visible in this notebook — confirm before running.
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/covid19 by year.png')  # PNG copy
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/covid19 by year.svg')  # SVG copy
# Disabled PDF export; note that it points at a different folder ('code' rather than 'current').
#pio.write_image(fig, '/content/gdrive/My Drive/code/Metabolomics Landscape//Figures/covid19 by year.pdf')  # Saves as PDF

##### Advanced Data Analysis

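- (Machine learning OR deep learning) AND metabolomics
- (Chemometrics OR multivariate analysis) AND metabolomics <br>
Rationale: Contrasts classic approaches (chemometrics, e.g., PLS-DA, PCA) with newer AI/ML methods, demonstrating how the computational side of metabolomics has evolved. <br>
Note: the cells below run the simplified queries "Deep learning OR Neural network", "Chemometrics OR multivariate analysis", and "Deep learning" on its own, without the "AND metabolomics" clause.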

In [None]:
# Two OR queries compared on one map: neural approaches vs. classic chemometrics.
example_colors = ("mediumorchid", "teal")
example_queries = ("Deep learning OR Neural network", "Chemometrics OR multivariate analysis")

fig = plot_side_by_side_embeddings_boolean(
    df,
    x_col='tsne_2D_x', y_col='tsne_2D_y',
    text_col='abstract', year_col='pub_year',
    queries=example_queries, query_colors=example_colors,
    marker_size=4,
    fig_width=1200, fig_height=600,
    custom_title="Boolean Query Matching vs. Year",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Export the current `fig` as static images: one PNG and one SVG.
# NOTE(review): the absolute paths assume Google Drive is mounted at /content/gdrive;
# no mount step is visible in this notebook — confirm before running.
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/Deep Learning and Chemometrics.png')  # PNG copy
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/Deep Learning and Chemometrics.svg')  # SVG copy
# Disabled PDF export; note that it points at a different folder ('code' rather than 'current').
#pio.write_image(fig, '/content/gdrive/My Drive/code/Metabolomics Landscape//Figures/Deep Learning and Chemometrics.pdf')  # Saves as PDF

In [None]:
# Single-query variant: "Deep learning" alone (only the first colour is used).
example_colors = ("mediumorchid", "teal")
example_queries = ("Deep learning",)

fig = plot_side_by_side_embeddings_boolean(
    df,
    x_col='tsne_2D_x', y_col='tsne_2D_y',
    text_col='abstract', year_col='pub_year',
    queries=example_queries, query_colors=example_colors,
    marker_size=4,
    fig_width=1200, fig_height=600,
    custom_title="Boolean Query Matching vs. Year",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Export the current `fig` as static images: one PNG and one SVG.
# NOTE(review): the absolute paths assume Google Drive is mounted at /content/gdrive;
# no mount step is visible in this notebook — confirm before running.
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/Deep Learning.png')  # PNG copy
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/Deep Learning.svg')  # SVG copy
# Disabled PDF export; note that it points at a different folder ('code' rather than 'current').
#pio.write_image(fig, '/content/gdrive/My Drive/code/Metabolomics Landscape//Figures/Deep Learning.pdf')  # Saves as PDF

##### Biomarker Discovery or Metabolic Pathway

In [None]:
# Biomarker discovery vs. pathway/mechanism studies, each query in its own colour.
example_colors = ("mediumorchid", "teal")
example_queries = ("Biomarker discovery", "(Pathway analysis OR metabolic pathway) AND mechanism")

fig = plot_side_by_side_embeddings_boolean(
    df,
    x_col='tsne_2D_x', y_col='tsne_2D_y',
    text_col='abstract', year_col='pub_year',
    queries=example_queries, query_colors=example_colors,
    marker_size=4,
    fig_width=1200, fig_height=600,
    custom_title="Boolean Query Matching vs. Year",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Export the current `fig` as static images: one PNG and one SVG.
# NOTE(review): the absolute paths assume Google Drive is mounted at /content/gdrive;
# no mount step is visible in this notebook — confirm before running.
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/Biomarker and Pathways.png')  # PNG copy
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/Biomarker and Pathways.svg')  # SVG copy
# Disabled PDF export; note that it points at a different folder ('code' rather than 'current').
#pio.write_image(fig, '/content/gdrive/My Drive/code/Metabolomics Landscape//Figures/Biomarker and Pathways.pdf')  # Saves as PDF

In [None]:
# Single-query variant: "Biomarker discovery" alone (only the first colour is used).
example_colors = ("mediumorchid", "teal")
example_queries = ("Biomarker discovery",)

fig = plot_side_by_side_embeddings_boolean(
    df,
    x_col='tsne_2D_x', y_col='tsne_2D_y',
    text_col='abstract', year_col='pub_year',
    queries=example_queries, query_colors=example_colors,
    marker_size=4,
    fig_width=1200, fig_height=600,
    custom_title="Boolean Query Matching vs. Year",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Export the current `fig` as static images: one PNG and one SVG.
# NOTE(review): the absolute paths assume Google Drive is mounted at /content/gdrive;
# no mount step is visible in this notebook — confirm before running.
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/Biomarker.png')  # PNG copy
pio.write_image(fig, '/content/gdrive/My Drive/current/Metabolomics Landscape/Figures/Biomarker.svg')  # SVG copy
# Disabled PDF export; note that it points at a different folder ('code' rather than 'current').
#pio.write_image(fig, '/content/gdrive/My Drive/code/Metabolomics Landscape//Figures/Biomarker.pdf')  # Saves as PDF

In [None]:
# Single-query variant: pathway terms combined with "mechanism" (only the first colour is used).
example_colors = ("mediumorchid", "teal")
example_queries = ("(Pathway analysis OR metabolic pathway) AND mechanism",)

fig = plot_side_by_side_embeddings_boolean(
    df,
    x_col='tsne_2D_x', y_col='tsne_2D_y',
    text_col='abstract', year_col='pub_year',
    queries=example_queries, query_colors=example_colors,
    marker_size=4,
    fig_width=1200, fig_height=600,
    custom_title="Boolean Query Matching vs. Year",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.

##### Diabetes _vs_ Cancer

In [None]:
# Diabetes vs. cancer literature, drawn with smaller markers (size 2).
example_colors = ("mediumorchid", "teal")
example_queries = ("Diabetes OR type 2 diabetes", "Cancer")

fig = plot_side_by_side_embeddings_boolean(
    df,
    x_col='tsne_2D_x', y_col='tsne_2D_y',
    text_col='abstract', year_col='pub_year',
    queries=example_queries, query_colors=example_colors,
    marker_size=2,
    fig_width=1200, fig_height=600,
    custom_title="Boolean Query Matching vs. Year",
)
fig.show()

Output hidden; open in https://colab.research.google.com to view.