In [5]:
import numpy as np

In [4]:
import torch
from transformers import BertModel, BertTokenizer
import pickle
import pickle
from scipy.spatial.distance import cosine


# The tokenizer (vocabulary) and the encoder weights are loaded from the same
# pre-trained checkpoint, so the checkpoint name is spelled out only once.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Function to embed descriptions using BERT
def get_bert_embeddings(texts):
    """Embed each text with BERT by mean-pooling the last hidden state.

    Every text is tokenized and encoded on its own, truncated to at most 512
    tokens. The last hidden state is averaged over dimension 1 and the result
    is converted to a NumPy array.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        texts: Iterable of inputs accepted by ``tokenizer``.

    Returns:
        list: One NumPy array per element of ``texts``, in input order. An
        empty iterable yields an empty list.
    """
    model.eval()  # Set model to evaluation mode
    # Run on whichever device the model weights currently live on, so the
    # function keeps working if the model has been moved off the CPU.
    device = next(model.parameters()).device
    embeddings = []
    with torch.no_grad():  # No need to compute gradients
        for text in texts:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # Mean pooling over dimension 1; squeeze() then drops size-1 axes.
            pooled = outputs.last_hidden_state.mean(dim=1).squeeze()
            # .numpy() only works on CPU tensors, hence the explicit .cpu()
            # (a no-op when the tensor is already on the CPU).
            embeddings.append(pooled.cpu().numpy())
    return embeddings

In [3]:

# Function to load embeddings from a file
def load_embeddings(file_path):
    """Read a pickled object from ``file_path`` and return it.

    Args:
        file_path: Path of the pickle file; it is opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.
    """
    # NOTE(review): unpickling can execute arbitrary code, so this should only
    # be pointed at files that come from a trusted source.
    with open(file_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Function to compute cosine similarity between two embeddings
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors.

    The value is computed as ``1 - d``, where ``d`` is the cosine distance
    returned by ``scipy.spatial.distance.cosine`` -- i.e. the complement of
    the distance, not its reciprocal.

    Args:
        embedding1: First vector.
        embedding2: Second vector.

    Returns:
        The similarity ``1 - d`` for the two vectors.
    """
    distance = cosine(embedding1, embedding2)
    return 1 - distance


In [6]:
import pandas as pd

# Curated use-case table; the relative path is resolved against the current
# working directory of the kernel.
use_case_csv_path = '../Data/Curated/UseCaseDataModeling.csv'
df = pd.read_csv(use_case_csv_path)

  df = pd.read_csv('/home/matt/Proj/QSURv3/Data/Curated/UseCaseDataModeling.csv')


In [7]:
# Distinct values of the 'Harmonized Functional Use' column. Missing values are
# dropped first: a NaN entry is not a string, so it is not a usable label or
# text for the embedding step that consumes this array.
functions = df['Harmonized Functional Use'].dropna().unique()

In [11]:
import pandas as pd
import numpy as np

def get_bert_embeddings_with_labels(texts, labels):
    """Embed ``texts`` with BERT and key each embedding by its label.

    The embedding work is delegated to ``get_bert_embeddings`` so that both
    functions share a single tokenization and pooling implementation instead
    of two copies of the same loop.

    Args:
        texts: Iterable of texts to embed.
        labels: Iterable of hashable keys, paired with ``texts`` by position.

    Returns:
        dict: Maps each label to the embedding of the text at the same
        position. If a label occurs more than once, the embedding of its last
        occurrence is kept. Pairing stops at the shorter of the two inputs.
    """
    vectors = get_bert_embeddings(texts)
    return dict(zip(labels, vectors))

# Create DataFrame from embeddings
def create_embeddings_df(embeddings_dict):
    """Turn a label -> embedding mapping into a two-column DataFrame.

    Args:
        embeddings_dict: Mapping whose keys are labels and whose values are
            the corresponding embeddings.

    Returns:
        pandas.DataFrame: Column ``label`` holds the keys and column
        ``embedding`` holds the values, one row per entry in the mapping's
        iteration order.
    """
    label_column = list(embeddings_dict.keys())
    embedding_column = list(embeddings_dict.values())
    return pd.DataFrame({'label': label_column, 'embedding': embedding_column})

# Embed every functional-use name, using the name itself as the dictionary key
labels = functions
embeddings_dict = get_bert_embeddings_with_labels(functions, labels)

# Tabular view: one row per label
df_embeddings = create_embeddings_df(embeddings_dict)

# Persist the result in two formats
# 1. Pickled DataFrame
df_embeddings.to_pickle('FunctionEmbeddings.pkl')

# 2. NumPy archive holding the embeddings and their labels as separate arrays
embedding_matrix = np.array(list(embeddings_dict.values()))
label_array = np.array(list(embeddings_dict.keys()))
np.savez('FunctionEmbeddings.npz', embeddings=embedding_matrix, labels=label_array)

In [12]:
# Show the label/embedding table as this cell's output.
df_embeddings

Unnamed: 0,label,embedding
0,Solvent,"[0.102317356, 0.117060356, -0.06408139, 0.1943..."
1,Surfactant (surface active agent),"[-0.11199236, 0.025828337, 0.14529149, -0.1922..."
2,Humectant,"[0.18350892, -0.39270502, -0.20201614, 0.08241..."
3,Binder,"[-0.15956144, -0.32352757, -0.15509589, -0.122..."
4,Fragrance,"[0.20288669, -0.03972793, -0.122679375, -0.163..."
...,...,...
90,Alloying element,"[0.014706743, -0.05031349, -0.43574685, 0.1439..."
91,Anti-streaking agent,"[-0.38540557, -0.46419767, -0.4762463, -0.2515..."
92,Color scavenger (EPA),"[-0.01568828, -0.10239726, -0.45295116, -0.093..."
93,Flocculating agent,"[-0.08261341, -0.25251776, -0.0062987506, -0.0..."
