# Research Field Analysis

## ✋Set Up Workspace

### Set up GPUs

In [None]:
# GPU information:

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: line 1: nvidia-smi: command not found


In order to use a GPU with your notebook, select the **Runtime > Change runtime type** menu, and then set the hardware accelerator dropdown to GPU.

### High RAM

In [None]:
from psutil import virtual_memory

# Memory of the runtime in decimal gigabytes. Note that this reads `.total`
# (the machine's overall memory) even though the message below says "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# A runtime with at least 20 GB is reported as high-RAM.
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


Users who have purchased one of Colab's paid plans have access to high-memory VMs when they are available.

You can see how much memory your runtime has at any time by running the code cell above. If that cell prints "Not using a high-RAM runtime", you can enable a high-RAM runtime via **Runtime > Change runtime type** in the menu: select High-RAM in the Runtime shape dropdown, then re-execute the code cell.

### Install libraries

In [None]:
# Install for settings.
!pip install --quiet numpy==1.25.2 pandas==1.5.3 plotly==5.15.0 kaleido==0.2.1

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.5/15.5 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.10.1 requires pandas<2.2.3dev0,>=2.0, but you have pandas 1.5.3 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 1.5.3 which is incompatible.
mizani 0.13.1 requires pandas>=2.2.0, but you have pandas 1.5.3 which is incompatible.
plotnine 0.14.4 requires pandas>=2.2.0, but you have pandas 

In [None]:
# Print the scikit-learn and PyTorch versions that are actually importable in
# this runtime. If the torch version shown in the output above differs from the
# one printed by this cell, the one printed here is the correct one.
import torch
import sklearn

for library in (sklearn, torch):
    print(library.__version__)

1.2.2
2.2.1+cu121


In [None]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.io as pio
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from sklearn.manifold import TSNE

import kaleido
import re  # Regular expression library

import warnings
warnings.filterwarnings;

## Load Data

Load the dataset, which contains the full embeddings and the reduced (tSNE and UMAP) embeddings.

In [None]:
# Read the object stored under the 'embeddings' key of the HDF5 file. Later
# cells index it by the 'title', 'predicted_category', 'umap_2D_x/y' and
# 'tsne_2D_x/y' columns.
df = pd.read_hdf(
    'Files/embeddings_full_tSNE_uMAP_01MAR2024.h5',
    key='embeddings',
)

In [None]:
# Take the first three rows; the trailing semicolon suppresses the cell output.
df.head(n=3);

In [None]:
# Colour used for each `predicted_category` value in the scatter plots
# (passed as `color_discrete_map` and indexed directly by category name).
# Named research fields get opaque hex colours.
color_map = {
    'Cancer Research':               '#1f77b4',  # muted blue
    'Plant Biology':                 '#ff7f0e',  # safety orange
    'Nephrology':                    '#2ca02c',  # cooked asparagus green
    'Endocrinology':                 '#d62728',  # brick red
    'Microbiology':                  '#9467bd',  # muted purple
    'Analytical Chemistry':          '#8c564b',  # chestnut brown
    'Pharmacology':                  '#e377c2',  # raspberry yogurt pink
    'Neuroscience':                  '#7f7f7f',  # middle gray
    'Food Science & Nutrition':      '#bcbd22',  # curry yellow-green
    'Toxicology':                    '#1a55FF',  # bright blue
    'Environmental Science':         '#17becf',  # blue-teal
    'Animal Science':                '#e7298a',  # dark pink
    'Sports Science & Medicine':     '#66a61e',  # lime green
    'Epidemiology & Public Health':  '#e6ab02',  # golden
    'Developmental Biology':         '#a6761d',  # bronze
    'Aging & Gerontology':           '#666666',  # dark gray
    'Immunology & Vaccine Research': '#1b9e77',  # greenish teal
    'Computational Biology':         '#d95f02',  # burnt orange
    'Genetics & Genomics':           '#2f4f4f',  # dark slate grey
}
# The 'unlabeled' entry is nearly transparent black; adjust the last (alpha)
# component to change how strongly those points show.
color_map['unlabeled'] = 'rgba(0, 0, 0, 0.1)'

## UMAP Embeddings w/ Labeled Axes

In [None]:
# 2-D UMAP scatter: one colour per predicted category (taken from `color_map`),
# with the 'title' column added to the hover tooltip.
fig_2d = px.scatter(
    df,
    x='umap_2D_x',
    y='umap_2D_y',
    color='predicted_category',
    hover_data=['title'],
    color_discrete_map=color_map,
)

# Draw every point with marker size 4.
fig_2d.update_traces(marker={'size': 4})

# White background, fixed canvas size and Optima fonts. Both axes carry a
# title and keep their tick labels; only the grid lines are switched off.
fig_2d.update_layout(
    plot_bgcolor='white',
    width=1100,
    height=600,
    title_font={'size': 24, 'family': 'Optima, sans-serif'},
    font={'size': 18, 'family': 'Optima, sans-serif'},
    xaxis={'title': 'UMAP 1', 'showticklabels': True, 'showgrid': False},
    yaxis={'title': 'UMAP 2', 'showticklabels': True, 'showgrid': False},
)

# Render the figure.
fig_2d.show()

Output hidden; open in https://colab.research.google.com to view.

## tSNE Embeddings w/ Labeled Axes

In [None]:
# Plot 2D tSNE with color coding
fig_2d = px.scatter(df, x='tsne_2D_x', y='tsne_2D_y',
                    color='predicted_category',
                    hover_data=['title'],
                    color_discrete_map=color_map)

# Update marker size
fig_2d.update_traces(marker=dict(size=4))  # Increase default size from 3 to 4

# Update layout to hide axis names and labels
fig_2d.update_layout(
    plot_bgcolor='white',
    height=600, width=1100,
    title_font=dict(size=24, family='Optima, sans-serif'),
    font=dict(size=18, family='Optima, sans-serif'),
    xaxis=dict(
        title='tSNE 1',  # Empty title for x-axis
        showticklabels=True,  # Hide x-axis tick labels
        showgrid=False  # Optional: hide grid lines for x-axis
    ),
    yaxis=dict(
        title='tSNE 2',  # Empty title for y-axis
        showticklabels=True,  # Hide y-axis tick labels
        showgrid=False  # Optional: hide grid lines for y-axis
    )
)

# Show the plot
fig_2d.show()

## Research Fields Trends

In [None]:
# Plot 2D tSNE with color coding
fig_2d = px.scatter(df, x='tsne_2D_x', y='tsne_2D_y',
                    color='predicted_category',
                    hover_data=['title'],
                    color_discrete_map=color_map)

# Update marker size
fig_2d.update_traces(marker=dict(size=4))  # Increase default size from 3 to 4

# Update layout to hide axis names and labels
fig_2d.update_layout(
    plot_bgcolor='white',
    height=600, width=1100,
    title_font=dict(size=24, family='Optima, sans-serif'),
    font=dict(size=18, family='Optima, sans-serif'),
    xaxis=dict(
        title='',  # Empty title for x-axis
        showticklabels=False,  # Hide x-axis tick labels
        showgrid=False  # Optional: hide grid lines for x-axis
    ),
    yaxis=dict(
        title='',  # Empty title for y-axis
        showticklabels=False,  # Hide y-axis tick labels
        showgrid=False  # Optional: hide grid lines for y-axis
    )
)

# Show the plot
fig_2d.show()

### Plant Sciences & Analytical Chemistry

In [None]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Rows predicted as each of the two fields that are shown side by side.
df_plant = df[df['predicted_category'] == 'Plant Biology']
df_chem = df[df['predicted_category'] == 'Analytical Chemistry']

# One row, two panels; each panel is titled with its field name.
fig = make_subplots(rows=1, cols=2, subplot_titles=('Plant Biology', 'Analytical Chemistry'))

# Left panel: Plant Biology, right panel: Analytical Chemistry. Each trace
# plots the tSNE coordinates of its rows in the field's colour from `color_map`.
panels = [('Plant Biology', df_plant), ('Analytical Chemistry', df_chem)]
for panel_col, (field, field_df) in enumerate(panels, start=1):
    fig.add_trace(
        go.Scatter(
            x=field_df['tsne_2D_x'],
            y=field_df['tsne_2D_y'],
            mode='markers',
            marker={'color': color_map[field], 'size': 4},
            name=field,
        ),
        row=1,
        col=panel_col,
    )

# White background, wide canvas, Optima fonts; the legend is hidden because
# the subplot titles already name each panel.
fig.update_layout(
    plot_bgcolor='white',
    width=1500,
    height=600,
    title_font={'size': 24, 'family': 'Optima, sans-serif'},
    font={'size': 24, 'family': 'Optima, sans-serif'},
    showlegend=False,
)

# Strip titles, tick labels and grid lines from the axes of both panels.
for update_axes in (fig.update_xaxes, fig.update_yaxes):
    update_axes(title='', showticklabels=False, showgrid=False)

# Render the figure.
fig.show()

### Toxicology & Environmental Sciences

In [None]:
# Two subsets: rows predicted as Toxicology only, and rows predicted as either
# Toxicology or Environmental Science.
df_toxicology = df.loc[df['predicted_category'] == 'Toxicology']
df_toxicology_env = df.loc[df['predicted_category'].isin(['Toxicology', 'Environmental Science'])]

# One row, two panels, titled after the subset each one shows.
fig = make_subplots(rows=1, cols=2, subplot_titles=('Toxicology', 'Toxicology & Environmental Science'))

# Left panel: Toxicology rows in a single colour.
toxicology_trace = go.Scatter(
    x=df_toxicology['tsne_2D_x'],
    y=df_toxicology['tsne_2D_y'],
    mode='markers',
    marker={'color': color_map['Toxicology'], 'size': 4},
    name='Toxicology',
)
fig.add_trace(toxicology_trace, row=1, col=1)

# Right panel: both fields in one trace; each point takes the colour of its
# own category, looked up in `color_map`.
point_colors = df_toxicology_env['predicted_category'].map(color_map).tolist()
combined_trace = go.Scatter(
    x=df_toxicology_env['tsne_2D_x'],
    y=df_toxicology_env['tsne_2D_y'],
    mode='markers',
    marker={'color': point_colors, 'size': 4},
    name='Toxicology & Environmental Science',
)
fig.add_trace(combined_trace, row=1, col=2)

# White background, wide canvas, Optima fonts; the legend is hidden because
# the subplot titles already name each panel.
fig.update_layout(
    plot_bgcolor='white',
    width=1500,
    height=600,
    title_font={'size': 24, 'family': 'Optima, sans-serif'},
    font={'size': 24, 'family': 'Optima, sans-serif'},
    showlegend=False,
)

# Strip titles, tick labels and grid lines from the axes of both panels.
for update_axes in (fig.update_xaxes, fig.update_yaxes):
    update_axes(title='', showticklabels=False, showgrid=False)

# Render the figure.
fig.show()