In [2]:
import torch
from transformers import BertModel, BertTokenizer
import pandas as pd
import pickle
from scipy.spatial.distance import cosine
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [11]:

# The tokenizer (vocabulary) and the encoder weights are both taken from one pre-trained checkpoint
MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
def get_bert_embeddings(texts):
    """Embed each text with BERT and return one mean-pooled vector per text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: Iterable of texts. Each one is tokenized and encoded on its own;
            tokenization truncates at 512 tokens.

    Returns:
        list: For every text, the mean over the token axis of the model's last
        hidden state, squeezed and converted to a NumPy array.
    """
    model.eval()  # evaluation mode

    def _mean_pooled(text):
        # One forward pass for a single text, then average over the token axis (dim=1)
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        hidden_states = model(**encoded).last_hidden_state
        return hidden_states.mean(dim=1).squeeze().numpy()

    with torch.no_grad():  # inference only, gradients are not needed
        return [_mean_pooled(text) for text in texts]

In [10]:

def load_embeddings(file_path):
    """Read a pickled object from ``file_path`` and return it.

    Args:
        file_path: Path of the pickle file to open (read in binary mode).

    Returns:
        Whatever object was stored in the pickle file.
    """
    # NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    SciPy's ``cosine`` returns the cosine *distance*; the similarity is its
    complement, i.e. ``1 - distance``.
    """
    distance = cosine(embedding1, embedding2)
    return 1 - distance


In [3]:


# Load the CPC code/description table and check which columns it provides
df = pd.read_csv('../CPCProcessing/CPCDescriptions.csv')
df.columns

Index(['CPCcode', 'CPCDescription'], dtype='object')

In [9]:
# Distinct description strings (in order of first appearance); these are the texts that get embedded
CPCs = df['CPCDescription'].unique()

In [12]:


def get_bert_embeddings_with_labels(texts, labels):
    """Embed each text with BERT and key the resulting vector by its label.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: Iterable of texts. Each one is tokenized and encoded on its own;
            tokenization truncates at 512 tokens.
        labels: Iterable of dictionary keys, paired with ``texts`` position by
            position. Pairing stops at the shorter of the two iterables.

    Returns:
        dict: ``{label: embedding}`` where each embedding is the mean over the
        token axis of the last hidden state, as a NumPy array. If a label occurs
        more than once, the embedding of its last occurrence is kept.
    """
    model.eval()

    def _mean_pooled(text):
        # One forward pass for a single text, then average over the token axis (dim=1)
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        hidden_states = model(**encoded).last_hidden_state
        return hidden_states.mean(dim=1).squeeze().numpy()

    with torch.no_grad():
        return {label: _mean_pooled(text) for text, label in zip(texts, labels)}

def create_embeddings_df(embeddings_dict):
    """Build a two-column DataFrame from a ``{label: embedding}`` mapping.

    Args:
        embeddings_dict: Mapping whose keys become the ``label`` column and
            whose values become the ``embedding`` column.

    Returns:
        pandas.DataFrame: One row per dictionary entry, in the dictionary's
        iteration order, with columns ``label`` and ``embedding``.
    """
    label_column = [*embeddings_dict.keys()]
    embedding_column = [*embeddings_dict.values()]
    return pd.DataFrame({'label': label_column, 'embedding': embedding_column})

# Embed every unique description; each description string doubles as its own label (dictionary key)
labels = CPCs
embeddings_dict = get_bert_embeddings_with_labels(CPCs, labels)

# Turn the {label: embedding} mapping into a two-column DataFrame
df_embeddings = create_embeddings_df(embeddings_dict)

# Saving to disk happens in a later cell, once the 'code' column has been added


In [13]:
# Preview the label/embedding table
df_embeddings

Unnamed: 0,label,embedding
0,SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMP...,"[0.18587406, 0.3988257, -0.0760595, -0.2632123..."
1,"MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT ...","[0.36048418, 0.08068754, 0.30791503, 0.2654474..."
2,ACYCLIC OR CARBOCYCLIC COMPOUNDS/ ACYCLIC OR C...,"[-0.15903594, 0.42084968, 0.19312954, 0.077889..."
3,HETEROCYCLIC COMPOUNDS / HETEROCYCLIC COMPOUNDS,"[-0.07435591, 0.391528, 0.05797588, -0.0220640..."
4,"ACYCLIC, CARBOCYCLIC OR HETEROCYCLIC COMPOUNDS...","[-0.42760912, 0.3552434, 0.22935525, 0.0432389..."
...,...,...
247,SLAUGHTERING/ SLAUGHTERING,"[0.6271786, 0.064747326, -0.05660874, 0.133950..."
248,INFORMATION AND COMMUNICATION TECHNOLOGY [ICT]...,"[0.30049437, 0.4705137, 0.17368175, 0.07256717..."
249,TIME-INTERVAL MEASURING / TIME-INTERVAL MEASUR...,"[0.1778687, -0.07610254, 0.010347518, -0.04183..."
250,"SERVICING, CLEANING, REPAIRING, SUPPORTING, LI...","[-0.07927508, 0.33253393, 0.45123893, 0.278501..."


In [14]:
# Attach the CPC code that belongs to each embedded description.
# df_embeddings has one row per *unique* description, while df has one row per entry of the CSV,
# so assigning df['CPCcode'] directly (aligned on the row index) only lines up when no description
# repeats in df. Looking the code up by description keeps the rows aligned in both cases; when
# several codes share a description, the first code listed in df is used.
code_by_description = (
    df.drop_duplicates(subset='CPCDescription')
      .set_index('CPCDescription')['CPCcode']
)
df_embeddings['code'] = df_embeddings['label'].map(code_by_description)

In [18]:
# Check the table again now that the 'code' column is attached
df_embeddings

Unnamed: 0,label,embedding,code
0,SPECIFIC THERAPEUTIC ACTIVITY OF CHEMICAL COMP...,"[0.18587406, 0.3988257, -0.0760595, -0.2632123...",A61P
1,"MATERIALS FOR MISCELLANEOUS APPLICATIONS, NOT ...","[0.36048418, 0.08068754, 0.30791503, 0.2654474...",C09K
2,ACYCLIC OR CARBOCYCLIC COMPOUNDS/ ACYCLIC OR C...,"[-0.15903594, 0.42084968, 0.19312954, 0.077889...",C07C
3,HETEROCYCLIC COMPOUNDS / HETEROCYCLIC COMPOUNDS,"[-0.07435591, 0.391528, 0.05797588, -0.0220640...",C07D
4,"ACYCLIC, CARBOCYCLIC OR HETEROCYCLIC COMPOUNDS...","[-0.42760912, 0.3552434, 0.22935525, 0.0432389...",C07F
...,...,...,...
247,SLAUGHTERING/ SLAUGHTERING,"[0.6271786, 0.064747326, -0.05660874, 0.133950...",A22B
248,INFORMATION AND COMMUNICATION TECHNOLOGY [ICT]...,"[0.30049437, 0.4705137, 0.17368175, 0.07256717...",G06Q
249,TIME-INTERVAL MEASURING / TIME-INTERVAL MEASUR...,"[0.1778687, -0.07610254, 0.010347518, -0.04183...",G04F
250,"SERVICING, CLEANING, REPAIRING, SUPPORTING, LI...","[-0.07927508, 0.33253393, 0.45123893, 0.278501...",B60S


In [19]:
# Save to file (multiple options)
# 1. Save as pickle (stores the whole DataFrame, including the 'code' column)
df_embeddings.to_pickle('CPCEmbeddings.pkl')

# 2. Save as numpy arrays with labels and codes.
#    All three arrays are taken from df_embeddings so that row i of `embeddings`, `labels`
#    and `code` always refers to the same CPC entry. `code` now holds the CPC codes; it
#    previously stored the description labels a second time.
np.savez('CPCEmbeddings.npz',
         embeddings=np.array(df_embeddings['embedding'].tolist()),
         labels=np.array(df_embeddings['label'].tolist()),
         # Forced to a string dtype so the array is never stored as a pickled object array
         code=np.array(df_embeddings['code'].tolist(), dtype=str))