In [3]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

In [4]:
# SAMPLE

# Our sentences to encode
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog."
]

# Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

# Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: This framework generates embeddings for each input sentence
Embedding: [-1.37173254e-02 -4.28516120e-02 -1.56285930e-02  1.40537210e-02
  3.95537578e-02  1.21796302e-01  2.94333138e-02 -3.17523815e-02
  3.54959927e-02 -7.93140307e-02  1.75878219e-02 -4.04369459e-02
  4.97259572e-02  2.54912656e-02 -7.18700364e-02  8.14968050e-02
  1.47067558e-03  4.79627065e-02 -4.50335555e-02 -9.92174819e-02
 -2.81769410e-02  6.45045340e-02  4.44670469e-02 -4.76217791e-02
 -3.52951847e-02  4.38672043e-02 -5.28566130e-02  4.33022535e-04
  1.01921432e-01  1.64072886e-02  3.26996781e-02 -3.45987156e-02
  1.21339392e-02  7.94871077e-02  4.58342629e-03  1.57778524e-02
 -9.68207233e-03  2.87626460e-02 -5.05807325e-02 -1.55793736e-02
 -2.87907049e-02 -9.62283742e-03  3.15556601e-02  2.27349307e-02
  8.71450230e-02 -3.85027342e-02 -8.84718671e-02 -8.75500683e-03
 -2.12343670e-02  2.08924022e-02 -9.02078301e-02 -5.25732227e-02
 -1.05638765e-02  2.88311578e-02 -1.61454566e-02  6.17833249e-03
 -1.23234

In [5]:
# Define base name
base_name = "organization_descriptions"

data_path = f"/Users/tom.willcocks/Downloads/{base_name}.csv"

if base_name == 'organization_descriptions':
    col = "description"
elif base_name == 'organizations':
    col = "short_description"

In [6]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv(data_path)

In [7]:
# Convert the 'description' column to strings, replacing NaN values with an empty string
sample_descriptions = df[col].fillna('').astype(str).tolist()

In [9]:
# Select only the specified columns
df = df[['id', 'name', col]]

In [13]:
import numpy as np

# Determine the number of documents
num_docs = len(df)

# Encode a single document to determine the embedding dimension
sample_embedding = model.encode([df[col].iloc[0]])  # Encode in a batch for consistent output shape
embedding_dim = sample_embedding.shape[1]

# Initialize a numpy array with zeros, of shape (num_docs, embedding_dim)
embeddings_array = np.zeros((num_docs, embedding_dim))

# Define the batch size
batch_size = 32  # Adjust this based on your machine's memory and the model's requirements

# Process documents in batches
for i in range(0, num_docs, batch_size):
    batch_texts = df[col].iloc[i:i+batch_size].tolist()
    batch_embeddings = model.encode(batch_texts)
    # Store the batch embeddings
    embeddings_array[i:i+batch_size] = batch_embeddings

# Now 'embeddings_array' contains all your embeddings, stored efficiently in memory


In [11]:
import numpy as np

# Number of documents
num_docs = len(df[col])

# Dimensionality of the embeddings (e.g., 768 for BERT-based models)
embedding_dim = 768

# Preallocate the array with zeros or another placeholder value
embeddings_array = np.zeros((num_docs, embedding_dim))

for i, text in enumerate(df[col]):
    embeddings_array[i] = model.encode(text)

ValueError: could not broadcast input array from shape (384,) into shape (768,)