In [None]:
#@title Setup & Installation { display-mode: "form" }

# System packages (installed through apt).
!sudo apt install tesseract-ocr -q
!sudo apt-get install poppler-utils -q
# Python packages used by this notebook.
!pip install -qq pytesseract pdf2image zero-shot-re neo4j boto3 datasets
# NOTE(review): boto3 is already part of the install line above, so this line is redundant.
!pip -qq install boto3
!pip -qq install pyyaml==5.4.1
!pip -qq install astrodendro
# NOTE(review): openai is left unpinned, while a later cell calls
# openai.Embedding.create; confirm the installed version still exposes that API.
!pip -qq install openai
!pip -qq install igraph
!pip -qq install trimap
!pip -qq install transformers
import boto3
from botocore.exceptions import NoCredentialsError

In [None]:
# Ask the inline matplotlib backend to render figures in "retina" format.
%config InlineBackend.figure_format = "retina"

# Downloading nodes from AWS

We're downloading these from an AWS bucket, so we can feed the JSON strings into the Code Embedding endpoint.

### Download from S3 bucket

In [None]:
import boto3
from botocore.exceptions import NoCredentialsError

from getpass import getpass

# Prompt for the AWS credentials interactively so they are never written into
# the notebook; download_from_aws reads these two module-level names.
ACCESS_KEY = getpass(prompt='AWS ACCESS KEY: ')
SECRET_KEY = getpass(prompt='AWS SECRET KEY: ')


def download_from_aws(local_file: str, bucket: str, s3_file: str):
    """Download one object from an S3 bucket to a local file.

    The session is created from the module-level ``ACCESS_KEY`` and
    ``SECRET_KEY`` values.

    Args:
        local_file: Path the object is written to.
        bucket: Name of the S3 bucket.
        s3_file: Key of the object inside the bucket.

    Returns:
        True when the download succeeded, False otherwise (a short message
        describing the failure is printed).
    """
    # Imported here so the fix does not rely on a new top-level import.
    from botocore.exceptions import ClientError

    session = boto3.Session(
        aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY
    )
    s3 = session.resource("s3")
    try:
        s3.Bucket(bucket).download_file(s3_file, local_file)
    except FileNotFoundError:
        print("The file was not found")
        return False
    except NoCredentialsError:
        print("Credentials not available")
        return False
    except ClientError as err:
        # S3-side failures (missing key or bucket, access denied, ...) are
        # reported as ClientError, not FileNotFoundError.
        print(f"Download of {s3_file} from bucket {bucket} failed: {err}")
        return False

    print(
        f"Download of {s3_file} from bucket {bucket} Successful.\n",
        f"Saved as {local_file}"
    )
    return True


In [None]:
# Fetch every backup artefact from the same bucket; each object is saved
# locally under the same name as its S3 key.
for backup_file in (
    "node_records_Biotool.json",
    "node_records_Toolset.json",
    "relationship_records_FEEDS_INTO.json",
    "biotools_node_openai_embeddings.csv",
):
    download = download_from_aws(
        backup_file,
        "knowledge-graph-backup",
        backup_file,
    )

In [None]:
import codecs
import json
import time

def load_json_dict_from_file(file_name):
    """
    Load and return the JSON document stored in ``file_name``.

    The file is decoded with the "utf-8-sig" codec, so a leading byte-order
    mark is skipped. ``strict=False`` lets the parser accept raw control
    characters (tabs, newlines, ...) inside JSON strings.
    """
    # The context manager closes the handle even when parsing fails.
    with codecs.open(file_name, 'r', 'utf-8-sig') as json_file:
        return json.load(json_file, strict=False)


# Parse the three JSON backups and report how long the parsing took.
start_time = time.time()

node_records_Biotool = load_json_dict_from_file(
    "/content/node_records_Biotool.json"
)
node_records_Toolset = load_json_dict_from_file(
    "/content/node_records_Toolset.json"
)
relationship_records_FEEDS_INTO = load_json_dict_from_file(
    "/content/relationship_records_FEEDS_INTO.json"
)

end_time = time.time()
elapsed_seconds = end_time - start_time
print(f"Time taken: {elapsed_seconds:.4g} seconds")
#named_enities


# Importing nodes into Neo4j

The guide linked below explains how to import data into Neo4j. It is one of the links shown when starting a blank sandbox.

https://neo4j.com/developer/guide-importing-data-and-etl/

In [None]:
# Creating dictionaries corresponding to each node type

# Each of the nodes in the backup are indexed.
# All the relationships have details on which indexes they go between
# This makes sure all the nodes can be easily uploaded

exclude_keys = ["identity", "labels","created_by", "created_time"]


def _index_node_records(records, alias, excluded):
    """Map each record's ``identity`` to its properties minus ``excluded``.

    ``alias`` is the key under which the node sits in every record
    ("n" for the Biotool records, "a" for the Toolset records).

    Properties keep the order they have in the backup. Filtering through a
    ``set`` instead would make the key order depend on string hashing, so
    anything derived from the dict (for example its ``json.dumps`` text)
    could differ from one kernel to the next.
    """
    excluded = set(excluded)
    return {
        record[alias]["identity"]: {
            key: value
            for key, value in record[alias]["properties"].items()
            if key not in excluded
        }
        for record in records
    }


node_records_Biotool_dict = _index_node_records(
    node_records_Biotool, "n", exclude_keys
)
node_records_Toolset_dict = _index_node_records(
    node_records_Toolset, "a", exclude_keys
)

print("Number of Biotool: ", len(node_records_Biotool_dict))
print("Number of Toolset: ", len(node_records_Toolset_dict))


In [None]:
# Keys that would be dropped if the remaining edge attributes were copied
# over; at the moment only source, target and weight are kept.
exclude_keys = ["source", "target", "caption"]

# FEEDS_INTO
relationship_records_FEEDS_INTO_list = []

feeds_into_edges = relationship_records_FEEDS_INTO[0]["edgesFEEDS_INTO"]
for rel_record in feeds_into_edges:
    source_id, target_id = rel_record["source"], rel_record["target"]
    # r_type is read from every record but not stored on the edge.
    r_type = rel_record["r_type"]
    # Stored weights are shifted up by one.
    weight = rel_record["weight"] + 1

    relationship_records_FEEDS_INTO_list.append(
        dict(source=source_id, target=target_id, weight=weight)
    )

print("Number of FEEDS_INTO: ", len(relationship_records_FEEDS_INTO_list))

### All `BioTool` nodes

In [None]:
node_records_Biotool_dict

In [None]:
node_records_Toolset_dict

In [None]:
relationship_records_FEEDS_INTO_list

In [None]:
import pandas as pd
from typing import Dict
import json

def dict_to_pandas_dataframe(dict_data: Dict[int, Dict]):
    """
    Flatten a ``{node_id: properties}`` mapping into a pandas DataFrame.

    One row is produced per entry, with the columns ``protocol_id`` (the
    dictionary key), ``protocol_name``, ``protocol_operation``,
    ``protocol_language``, ``protocol_citationCounts`` (cast to ``int``) and
    ``protocol_value`` (the whole properties dict serialised by ``json.dumps``).
    """
    column_names = [
        'protocol_id',
        'protocol_name',
        'protocol_operation',
        'protocol_language',
        'protocol_citationCounts',
        'protocol_value',
    ]
    rows = [
        [
            node_id,
            props['name'],
            props['operation'],
            props['language'],
            int(props['citationCounts']),
            json.dumps(props),
        ]
        for node_id, props in dict_data.items()
    ]
    return pd.DataFrame(rows, columns=column_names)

df_old = dict_to_pandas_dataframe(node_records_Biotool_dict)
# The token-count, pricing and embedding cells operate on `df`, which is
# otherwise first assigned only when the cached embeddings CSV is read further
# down. Define it here so the notebook runs top to bottom on a fresh kernel;
# the copy keeps `df_old` untouched by the columns those cells add.
df = df_old.copy()
df_old.head()

In [None]:
len(df_old.protocol_id.unique())

In [None]:
len(df_old)

### Getting the embeddings

https://beta.openai.com/docs/guides/embeddings/use-cases?lang=python

https://github.com/openai/openai-python/tree/0b07e1e9457d3a6252431036b725b3e891f95a40/examples/embeddings

https://stackabuse.com/hierarchical-clustering-with-python-and-scikit-learn/

In [None]:
from transformers import GPT2TokenizerFast
# GPT-2 tokenizer, used to count how many tokens each node's JSON text takes.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


def _count_tokens(text):
    """Return the number of tokens the tokenizer produces for ``text``."""
    return len(tokenizer.encode(text))


# Token count of every serialised node.
df['n_tokens'] = df.protocol_value.apply(_count_tokens)
df.head()

In [None]:
total_tokens = df['n_tokens'].sum()

# (label, price per 1,000 tokens) for each embedding model family.
embedding_prices_per_1k = (
    ("Price of tokenization using Ada Embeddings (Fastest): $", 0.0040),
    ("Price of tokenization using Babbage Embeddings: $", 0.0050),
    ("Price of tokenization using Curie Embeddings: $", 0.0200),
    ("Price of tokenization using DaVinci Embeddings (Most powerful): $", 0.2000),
)
for price_label, price_per_1k_tokens in embedding_prices_per_1k:
    print(price_label, (total_tokens/1000 ) * price_per_1k_tokens)

In [None]:
total_tokens

In [None]:
from numpy.core.numeric import True_
import openai
from getpass import getpass

CREATE_NEW_EMBEDDINGS = False #@param
EMBEDDING_ENGINE = 'code-search-ada-code-001' #@param ["code-search-ada-code-001", "code-search-ada-text-001", "code-search-babbage-code-001", "code-search-babbage-text-001", "code-search-babbage-code-001", "text-similarity-babbage-001", "text-search-babbage-doc-001"] {allow-input: false}


def get_embedding(input_text, engine=None):
    """Return the embedding vector OpenAI produces for ``input_text``.

    Args:
        input_text: Text to embed.
        engine: Name of the embedding engine; must be given.

    Raises:
        ValueError: if ``engine`` is None.
    """
    if engine is None:
        raise ValueError("Need to choose engine")

    # NOTE(review): openai.Embedding is the pre-1.0 client interface; confirm
    # the installed openai package still provides it.
    response = openai.Embedding.create(
        input=input_text,
        engine=engine
    )
    return response['data'][0]['embedding']


if CREATE_NEW_EMBEDDINGS:
    openai.api_key = getpass(prompt='OpenAI Access Key: ')

    # One API request per row: embed the serialised JSON of every node.
    df[EMBEDDING_ENGINE] = df.protocol_value.apply(
        lambda x: get_embedding(x, engine=EMBEDDING_ENGINE)
    )
    # A bare `df.head()` inside an `if` block is never rendered, so display it
    # explicitly.
    display(df.head())
else:
    print("Skipping the creation of new embeddings")
    #df = pd.read_csv('/content/biotools_node_openai_embeddings.csv')




```python
# mount it
from google.colab import drive
drive.mount('/content/drive')
# copy it there
!cp biotools_node_openai_embeddings.csv /content/drive/MyDrive
```

## Dimensionality Reduction

TriMAP is used here for the dimensionality reduction.

https://github.com/openai/openai-python/blob/main/examples/embeddings/Clustering.ipynb

In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib


# Load the cached embeddings. Each embedding cell holds the text form of a
# Python list, which is turned back into a numpy array.
# NOTE: eval executes whatever text the CSV contains; only load files you trust.
df = pd.read_csv('/content/biotools_node_openai_embeddings.csv')
for embedding_column in ('code-search-ada-code-001', 'code-search-ada-text-001'):
    df[embedding_column] = df[embedding_column].apply(eval).apply(np.array)

# Stack the per-row vectors into one matrix per embedding engine.
matrix_code = np.vstack(df['code-search-ada-code-001'].values)
matrix_text = np.vstack(df['code-search-ada-text-001'].values)
matrix_code.shape, matrix_text.shape

In [None]:
from sklearn.cluster import KMeans

# Number of clusters; the plotting palette (colors_list) has the same length.
n_clusters = 30

# Cluster the text embeddings and store each row's cluster id on the frame.
kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    random_state=42,
).fit(matrix_text)
labels = kmeans.labels_
df['Cluster'] = labels

In [None]:
# Distinct cluster ids assigned by KMeans. They are integers, so the earlier
# comparison against the string 'Get Data' could never match and was dropped.
unique_values = df['Cluster'].unique()
unique_values

In [None]:
relationship_records_FEEDS_INTO_list

In [None]:
import igraph as ig

# (source, target, weight) triple for every FEEDS_INTO relationship.
Edges = [
    (record["source"], record["target"], record["weight"])
    for record in relationship_records_FEEDS_INTO_list
]

# Directed graph whose edges carry the third tuple element as their weight.
G = ig.Graph.TupleList(Edges, directed=True, weights=True)

In [None]:
Edges

### [TriMAP Documentation](https://github.com/eamid/trimap)

In [None]:
import random

# Palette of 30 distinct hex colours, used to colour categories by position.
colors_list = [
    "#201923", "#888888", "#fcff5d", "#7dfc00", "#0ec434",
    "#228c68", "#8ad8e8", "#235b54", "#29bdab", "#3998f5",
    "#37294f", "#277da7", "#3750db", "#f22020", "#991919",
    "#ffcba5", "#e68f66", "#c56133", "#96341c", "#632819",
    "#ffc413", "#f47a22", "#2f2aa0", "#b732cc", "#772b9d",
    "#f07cab", "#d30b94", "#c3a5b4", "#946aa2", "#5d4c86",
]


# Display colour (hex "#rrggbb") for each programming language name.
# Every value is a well-formed "#rrggbb" string: the stray trailing quote on
# "Ruby" and the missing "#" on "PyMOL" have been corrected.
languages_dict = {
    "D": "#ba595e", "PHP": "#4F5D95", "Racket": "#3c5caa", "Smalltalk": "#596706",
    "Groovy": "#4298b8", "Scala": "#c22d40", "LabVIEW": "#fede06", "R": "#198CE7",
    "Pascal": "#E3F171", "Forth": "#341708", "Perl": "#0298c3", "Ruby": "#701516",
    "JSP": "#2A6277", "C++": "#f34b7d", "JavaScript": "#f1e05a", "Mathematica": "#dd1100",
    "Verilog": "#b2b7f8", "VHDL": "#adb2cb", "Lua": "#000080", "C#": "#178600",
    "AWK": "#c30e9b", "Lisp": "#3fb68b", "SQL": "#e38c00", "Java": "#b07219",
    "OCaml": "#3be133", "Haskell": "#5e5086", "Maple": "#c22d40", "MATLAB": "#e16737",
    "Prolog": "#74283c", "Other": "#cccccc", "Delphi": "#E3F171", "Elm": "#60B5CC",
    "Julia": "#a270ba", "Scheme": "#1e4aec", "Ada": "#02f88c", "PyMOL": "#807F01",
    "ActionScript": "#882B0F", "Fortran": "#4d41b1", "Visual Basic": "#945db7",
    "Bash": "#89e051", "Python": "#3572A5", "Shell": "#89e051", "SAS": "#B34936",
    "CWL": "#B5314C", "C": "#555555", "None": "#bbbbbb",
}

In [None]:
# NOTE(review): hex_to_rgb is only defined in the cell that follows this one, so
# on a fresh kernel (Restart & Run All) this call raises NameError. Move this
# sanity check below the definition.
hex_to_rgb('#c22d40')

In [None]:

def hex_to_rgb(hex_color: str):
    """Convert a hex colour string such as ``"#c22d40"`` to an ``(r, g, b)`` tuple.

    Leading ``#`` characters are optional. Only the first six characters after
    them are read, so any trailing characters are ignored.

    Args:
        hex_color: Colour as "rrggbb" or "#rrggbb".

    Returns:
        Tuple of three ints, one per channel.

    Raises:
        ValueError: if one of the three two-character channels is missing or
            is not valid hexadecimal.
    """
    h = hex_color.lstrip("#")
    try:
        r, g, b = (int(h[start : start + 2], 16) for start in (0, 2, 4))
    except ValueError as err:
        # Catch only the parsing failure (a bare except would also trap
        # KeyboardInterrupt) and keep the original cause attached.
        raise ValueError(f"Error with input {h}") from err

    return (r, g, b)


def average_rgb_colors(rgb_colors: list):
    """Return the channel-wise mean of ``rgb_colors`` as an ``(r, g, b)`` tuple.

    Each mean is truncated to an int. Only the first three entries of every
    colour are read.
    """
    count = len(rgb_colors)
    channel_totals = [
        sum(color[channel] for color in rgb_colors) for channel in range(3)
    ]
    red, green, blue = (int(total / count) for total in channel_totals)
    return (red, green, blue)


def rgb_to_hex(rgb):
    """Format an ``(r, g, b)`` sequence as a lowercase ``"#rrggbb"`` string."""
    red, green, blue = rgb[0], rgb[1], rgb[2]
    return f"#{red:02x}{green:02x}{blue:02x}"


def average_hex_colors(hex_colors: list):
    """Average several hex colours and return the result as a hex string.

    Each colour is converted to RGB, the channels are averaged, and the mean
    colour is converted back to hex.
    """
    as_rgb = list(map(hex_to_rgb, hex_colors))
    return rgb_to_hex(average_rgb_colors(as_rgb))


def correct_hex_format(hex_color_string: str):
    """Normalise a hex colour string.

    Prepends ``#`` when it is missing and drops one trailing double quote
    when the string ends with one.
    """
    fixed = (
        hex_color_string
        if hex_color_string.startswith('#')
        else '#' + hex_color_string
    )
    return fixed[:-1] if fixed.endswith('"') else fixed


def add_language_colors_to_dataframe(
    df: pd.DataFrame, language_color_dict: Dict[str, str]
):
    """Add a ``language_colors`` column derived from ``protocol_language``.

    A value found in ``language_color_dict`` gets that colour. Any other value
    is split on "," and the colours of its parts are averaged. The result is
    then passed through ``correct_hex_format``. ``df`` is modified in place
    and also returned.
    """

    def _color_for(languages):
        # Exact match first; otherwise treat the value as a comma-separated list.
        if languages in language_color_dict:
            return language_color_dict[languages]
        part_colors = [language_color_dict[lang] for lang in languages.split(",")]
        return average_hex_colors(part_colors)

    df["language_colors"] = df["protocol_language"].apply(_color_for)
    df["language_colors"] = df["language_colors"].apply(correct_hex_format)
    return df

def add_labeltext_to_dataframe(
    df: pd.DataFrame,
):
    """Add a ``hovertext`` column combining name, operation and language.

    The three parts are joined with ``<br>`` line breaks. ``df`` is modified
    in place and also returned.
    """
    name_part = "Name: " + df["protocol_name"]
    operation_part = "<br>Operation Type: " + df["protocol_operation"]
    language_part = "<br>Language(s): " + df["protocol_language"]
    df["hovertext"] = name_part + operation_part + language_part
    return df


def add_colors_to_dataframe(df: pd.DataFrame, col_name: str, palette=None):
    """
    Add a ``<col_name>_color`` column that assigns one colour per unique value.

    Unique values are numbered in order of first appearance and the n-th value
    receives the n-th palette colour. When there are more unique values than
    colours, the palette is reused from the start instead of raising
    IndexError.

    Args:
        df: Frame to annotate; it is modified in place and also returned.
        col_name: Column whose unique values are coloured.
        palette: Optional sequence of colours. Defaults to the module-level
            ``colors_list``.

    Raises:
        ValueError: if the palette is empty.
    """
    if palette is None:
        palette = colors_list
    if len(palette) == 0:
        raise ValueError("palette must contain at least one color")

    # Build the value -> colour table once instead of searching the unique
    # values again for every row.
    unique_values = df[col_name].unique()
    color_by_value = {
        value: palette[position % len(palette)]
        for position, value in enumerate(unique_values)
    }
    df[col_name + "_color"] = df[col_name].map(color_by_value)
    return df



In [None]:
df = add_language_colors_to_dataframe(df, languages_dict)

In [None]:
df = add_labeltext_to_dataframe(df)

In [None]:
df

https://github.com/eamid/examples/blob/master/TriMap.ipynb

### 3D Plotting

In [None]:
import pandas as pd
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

# Call once to configure Bokeh to display plots inline in the notebook.
output_notebook()

In [None]:
from sklearn.manifold import TSNE
import matplotlib
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
import jax.random as random
import trimap
import numpy as np
import matplotlib.pyplot as plt

#@markdown TriMAP parameters
#@markdown n_dims: Number of dimensions of the embedding (default = 2)
N_DIMS = 3 #@param
#@markdown n_inliers: Number of nearest neighbors for forming the nearest neighbor triplets (default = 12).
N_INLIERS = 12 #@param
#@markdown n_outliers: Number of outliers for forming the nearest neighbor triplets (default = 4).
N_OUTLIERS = 4 #@param
#@markdown n_random: Number of random triplets per point (default = 3).
N_RANDOM = 6 #@param
#@markdown distance: Distance measure ('euclidean' (default), 'manhattan', 'angular' (or 'cosine'), 'hamming')
DISTANCE = 'cosine' #@param
#@markdown weight_temp: Temperature of the logarithm applied to the weights. Larger temperatures generate more compact embeddings. weight_temp=0. corresponds to no transformation (default=0.5).
WEIGHT_TEMP = 0.5 #@param
#@markdown weight_adj **(deprecated)**: The value of gamma for the log-transformation (default = 500.0).
WEIGHT_ADJ = 500.0 #@param
#@markdown lr: Learning rate (default = 0.1).
LR = 0.1 #@param
#@markdown n_iters: Number of iterations (default = 400).
N_ITERS = 400 #@param
#@markdown The other parameters include:
#@markdown knn_tuple: Use the precomputed nearest-neighbors information in form of a tuple (knn_nbrs, knn_distances) (default = None)
KNN_TUPLE = None #@param
#@markdown use_dist_matrix: Use the precomputed pairwise distance matrix (default = False)
USE_DIST_MATRIX = False #@param
#@markdown apply_pca: Reduce the number of dimensions of the data to 100 if necessary before applying the nearest-neighbor search (default = True).
APPLY_PCA = True #@param
#@markdown opt_method: Optimization method {'sd' (steepest descent), 'momentum' (GD with momentum), 'dbd' (delta-bar-delta, default)}.
OPT_METHOD = 'dbd' #@param
#@markdown verbose: Print the progress report (default = False).
VERBOSE = True #@param
#@markdown return_seq: Store the intermediate results and return the results in a tensor (default = False).
RETURN_SEQ = False #@param

# TriMap
# NOTE(review): `key` is created here but is not passed to anything in this
# cell; confirm whether it is still needed.
key = random.PRNGKey(123)
# Fit TriMAP with the parameters above on the code-embedding matrix
# (matrix_code); the transformed points are stored in vis_dims3.
vis_dims3 = trimap.TRIMAP(
    n_dims=N_DIMS,
    n_inliers=N_INLIERS,
    n_outliers=N_OUTLIERS,
    n_random=N_RANDOM,
    distance=DISTANCE,
    weight_temp=WEIGHT_TEMP,
    weight_adj=WEIGHT_ADJ,
    lr=LR,
    n_iters=N_ITERS,
    knn_tuple=KNN_TUPLE,
    use_dist_matrix=USE_DIST_MATRIX,
    apply_pca=APPLY_PCA,
    opt_method=OPT_METHOD,
    verbose=VERBOSE,
    return_seq=RETURN_SEQ,
).fit_transform(matrix_code)

# Define Data
 
# Split the embedded points into one coordinate list per axis.
x = [px for px, _py, _pz in vis_dims3]
y = [py for _px, py, _pz in vis_dims3]
z = [pz for _px, _py, pz in vis_dims3]

# 3D scatter plot with one palette colour per cluster id.
# NOTE(review): the cluster ids were fitted on matrix_text while the points
# plotted here come from matrix_code; confirm that this mix is intended.
fig = plt.figure(figsize=(10, 7))
ax = plt.axes(projection="3d")

x_arr, y_arr, z_arr = np.array(x), np.array(y), np.array(z)
for category, color in enumerate(colors_list):
    in_cluster = df.Cluster == category
    xs, ys, zs = x_arr[in_cluster], y_arr[in_cluster], z_arr[in_cluster]
    ax.scatter3D(xs, ys, zs, color=color, alpha=0.3)

plt.title("Clusters identified visualized in language 3D using TriMAP")
plt.show()

In [None]:
import pandas as pd
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

# Call once to configure Bokeh to display plots inline in the notebook.
output_notebook()

In [None]:
# Node coordinates, one list per axis.
Xn, Yn, Zn = x, y, z

# Edge coordinates: for every edge the two end points followed by None, which
# separates consecutive line segments in the plot.
# NOTE(review): the source/target ids are used directly as row positions in
# vis_dims3; confirm that node ids match the row order of the embeddings.
Xe, Ye, Ze = [], [], []
for source_idx, target_idx, _weight in Edges:
    start_point, end_point = vis_dims3[source_idx], vis_dims3[target_idx]
    Xe.extend([start_point[0], end_point[0], None])
    Ye.extend([start_point[1], end_point[1], None])
    Ze.extend([start_point[2], end_point[2], None])


https://plotly.com/python/v3/3d-network-graph/

In [None]:
import plotly as py
import plotly.graph_objects as go

#rs = np.random.RandomState()
#rs.seed(0)
#
#fig = go.Figure(
#    data=go.Scatter3d(
#        x=x,
#        y=y,
#        z=z,
#        marker=dict(
#            size=2,
#            color=df.Cluster_color.values,
#            #colorscale="Viridis",
#        ),
#        line=dict(color="darkblue", width=0.01),
#        hovertext=df.protocol_description.values,
#        hoverinfo='text'
#    )
#)
#
#fig.update_layout(
#    title="3D Plotted Biotool OpenAI Ada-code embeddings (TriMAP)",
#    width=1000,
#    height=1000,
#    autosize=False,
#    scene=dict(
#        camera=dict(
#            up=dict(x=0, y=0, z=1),
#            eye=dict(
#                x=0,
#                y=1.0707,
#                z=1,
#            ),
#        ),
#        aspectratio=dict(x=1, y=1, z=0.7),
#        aspectmode="manual",
#    ),
#)
#
#fig.show()

protocol_colors_list = []

# Edges, drawn as thin grey line segments without hover information.
trace1 = go.Scatter3d(
    x=Xe,
    y=Ye,
    z=Ze,
    mode="lines",
    line={"color": "rgb(125,125,125)", "width": 0.001},
    hoverinfo="none",
)

# Nodes, coloured by language and labelled with the prepared hover text.
trace2 = go.Scatter3d(
    x=Xn,
    y=Yn,
    z=Zn,
    mode="markers",
    name="actors",
    marker={
        "symbol": "circle",
        "size": 2,
        "color": df.language_colors.values,
    },
    hovertext=df.hovertext.values,
    hoverinfo="text",
)

# Shared settings for the three scene axes; each axis receives its own copy.
axis = {
    "showbackground": True,
    "showline": True,
    "zeroline": True,
    "title": "",
}

layout = go.Layout(
    title="Network of 3D Plotted Biotool OpenAI Ada-code embeddings<br> created using TriMAP",
    width=1000,
    height=1000,
    showlegend=False,
    scene={
        "xaxis": dict(axis),
        "yaxis": dict(axis),
        "zaxis": dict(axis),
    },
    margin={"t": 100},
)

data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)

# Add dropdowns
# button_layer_1_height = 1.08
#button_layer_1_height = 1.12
#button_layer_2_height = 1.065
#
#fig.update_layout(
#    updatemenus=[
#        dict(
#            buttons=list([
#                dict(
#                    args=["colorscale", "Viridis"],
#                    label="Viridis",
#                    method="restyle"
#                ),
#                dict(
#                    args=["colorscale", "Cividis"],
#                    label="Cividis",
#                    method="restyle"
#                ),
#                dict(
#                    args=["colorscale", "Blues"],
#                    label="Blues",
#                    method="restyle"
#                ),
#                dict(
#                    args=["colorscale", "Greens"],
#                    label="Greens",
#                    method="restyle"
#                ),
#            ]),
#            type = "buttons",
#            direction="right",
#            pad={"r": 10, "t": 10},
#            showactive=True,
#            x=0.1,
#            xanchor="left",
#            y=button_layer_1_height,
#            yanchor="top"
#        ),
#        dict(
#            buttons=list([
#                dict(
#                    args=["reversescale", False],
#                    label="False",
#                    method="restyle"
#                ),
#                dict(
#                    args=["reversescale", True],
#                    label="True",
#                    method="restyle"
#                )
#            ]),
#            type = "buttons",
#            direction="right",
#            pad={"r": 10, "t": 10},
#            showactive=True,
#            x=0.13,
#            xanchor="left",
#            y=button_layer_2_height,
#            yanchor="top"
#        ),
#        dict(
#            buttons=list([
#                dict(
#                    args=[{"contours.showlines": False, "type": "contour"}],
#                    label="Hide lines",
#                    method="restyle"
#                ),
#                dict(
#                    args=[{"contours.showlines": True, "type": "contour"}],
#                    label="Show lines",
#                    method="restyle"
#                ),
#            ]),
#            type = "buttons",
#            direction="right",
#            pad={"r": 10, "t": 10},
#            showactive=True,
#            x=0.5,
#            xanchor="left",
#            y=button_layer_2_height,
#            yanchor="top"
#        ),
#    ]
#)
#
#fig.update_layout(
#    annotations=[
#        dict(text="colorscale", x=0, xref="paper", y=1.1, yref="paper",
#                             align="left", showarrow=False),
#        dict(text="Reverse<br>Colorscale", x=0, xref="paper", y=1.06,
#                             yref="paper", showarrow=False),
#        dict(text="Lines", x=0.47, xref="paper", y=1.045, yref="paper",
#                             showarrow=False)
#    ])

fig.show()

In [None]:
fig.write_html("3D_embedding_exploration_tool.html")