In [1]:
import json
import torch

import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

MODELS = [
    "Alibaba-NLP/gte-large-en-v1.5",
    "intfloat/multilingual-e5-large",
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/all-MiniLM-L6-v2"
]


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/local/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/jonathanmichala/All Documents/spatial_geometry/spatgeo-env/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_ins

In [2]:
DATA_PATH = 'https://huggingface.co/datasets/matthieunlp/spatial_geometry/resolve/main/src/data/sentence-embeddings'
RELATIONS_JSON_PATH = '../data/relations.json'

In [3]:
with open(RELATIONS_JSON_PATH, 'r') as f:
    relations = json.load(f)['spatial_relations']

In [4]:
def get_relations_lookup(relations):
    relations_lookup = {}
    for category, category_pairs in relations.items():
        for first, second in category_pairs:
            relations_lookup[first] = {'category': category, 'opposite': second, 'position': 0}
            relations_lookup[second] = {'category': category, 'opposite': first, 'position': 1}
    return relations_lookup

relations_lookup = get_relations_lookup(relations)

In [5]:
def load_embeddings_for_model(model_name, datapoint='relation'):
    import requests
    from io import BytesIO

    embeddings = []
    labels = []
    url = f'{DATA_PATH}/{model_name.replace("/", "_")}.pt'
    response = requests.get(url)
    response.raise_for_status()
    raw_data = torch.load(BytesIO(response.content), weights_only=False)
    
    for data_point in raw_data:
        embeddings.append(data_point['embedding'])
        labels.append(data_point[datapoint])
    return np.array(embeddings), np.array(labels).reshape(-1, 1)

# Probes for all relations

We expect that we will find linear relations, but we also have to isolate the syntaxic elements. 

We train a couple series of probes. 

In [6]:
def train_multitask_probe_for_datapoint(results_dict, models_dict, encoder):
    for model_name in MODELS:
        print(f"Training multitask probe for {model_name} on all datapoints...")
        X, y_relation = load_embeddings_for_model(model_name, datapoint='relation')
        X, y_subject = load_embeddings_for_model(model_name, datapoint='subject')
        X, y_object = load_embeddings_for_model(model_name, datapoint='object')

        # One-hot encode all labels together
        y_combined = np.column_stack([y_relation, y_subject, y_object])  # Stack labels side by side
        y_encoded = encoder.fit_transform(y_combined).todense()
        
        y_encoded = encoder.fit_transform(y).todense()
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2)

        y_train = np.asarray(y_train)
        y_test = np.asarray(y_test)

        # Train probe
        clf = MLPClassifier(hidden_layer_sizes=(), 
                            early_stopping=True, 
                            activation='identity')
        clf.fit(X_train, y_train)
        models_dict[model_name] = clf

        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results_dict[model_name] = accuracy
        print(f"Accuracy for {model_name}: {accuracy:.2f}")

In [7]:
def train_probe_for_datapoint(datapoint, results_dict, models_dict, encoder):
    for model_name in MODELS:
        print(f"Training probe for {model_name} on {datapoint}...")
        X, y = load_embeddings_for_model(model_name, datapoint=datapoint)
        y_encoded = encoder.fit_transform(y).todense()
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2)

        y_train = np.asarray(y_train)
        y_test = np.asarray(y_test)

        # Train probe
        clf = MLPClassifier(hidden_layer_sizes=(), early_stopping=True, activation='identity')
        clf.fit(X_train, y_train)
        models_dict[model_name] = clf

        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results_dict[model_name] = accuracy
        print(f"Accuracy for {model_name}: {accuracy:.2f}")


results = {}
models = {}
one_hot_encoder = OneHotEncoder()

subject_results = {}
subject_models = {}
subject_one_hot_encoder = OneHotEncoder()

object_results = {}
object_models = {}
object_one_hot_encoder = OneHotEncoder()

multitask_results = {}
multitask_models = {}
multitask_one_hot_encoder = OneHotEncoder()

train_probe_for_datapoint('relation', results, models, one_hot_encoder)
train_probe_for_datapoint('subject', subject_results, subject_models, subject_one_hot_encoder)
train_probe_for_datapoint('object', object_results, object_models, object_one_hot_encoder)
train_multitask_probe_for_datapoint(multitask_results, multitask_models, multitask_one_hot_encoder)

Training probe for Alibaba-NLP/gte-large-en-v1.5 on relation...
Accuracy for Alibaba-NLP/gte-large-en-v1.5: 1.00
Training probe for intfloat/multilingual-e5-large on relation...
Accuracy for intfloat/multilingual-e5-large: 1.00
Training probe for sentence-transformers/all-mpnet-base-v2 on relation...
Accuracy for sentence-transformers/all-mpnet-base-v2: 0.99
Training probe for sentence-transformers/all-MiniLM-L6-v2 on relation...
Accuracy for sentence-transformers/all-MiniLM-L6-v2: 1.00
Training probe for Alibaba-NLP/gte-large-en-v1.5 on subject...
Accuracy for Alibaba-NLP/gte-large-en-v1.5: 1.00
Training probe for intfloat/multilingual-e5-large on subject...
Accuracy for intfloat/multilingual-e5-large: 0.99
Training probe for sentence-transformers/all-mpnet-base-v2 on subject...
Accuracy for sentence-transformers/all-mpnet-base-v2: 1.00
Training probe for sentence-transformers/all-MiniLM-L6-v2 on subject...
Accuracy for sentence-transformers/all-MiniLM-L6-v2: 1.00
Training probe for A



Accuracy for sentence-transformers/all-mpnet-base-v2: 0.98
Training probe for sentence-transformers/all-MiniLM-L6-v2 on object...




Accuracy for sentence-transformers/all-MiniLM-L6-v2: 0.97
Training multitask probe for Alibaba-NLP/gte-large-en-v1.5 on all datapoints...


NameError: name 'y' is not defined

In [8]:
import pickle

class PickleSaver:
    def __init__(self, data, filename):
        self.data = data
        self.filename = filename

    def save(self):
        with open(self.filename, 'wb') as f:
            pickle.dump(self.data, f)

# Create instances of PickleSaver for each data object
pickle_savers = [
    PickleSaver(results, 'results.pkl'),
    PickleSaver(models, 'models.pkl'),
    PickleSaver(one_hot_encoder, 'one_hot_encoder.pkl'),
    PickleSaver(subject_results, 'subject_results.pkl'),
    PickleSaver(subject_models, 'subject_models.pkl'),
    PickleSaver(subject_one_hot_encoder, 'subject_one_hot_encoder.pkl'),
    PickleSaver(object_results, 'object_results.pkl'),
    PickleSaver(object_models, 'object_models.pkl'),
    PickleSaver(object_one_hot_encoder, 'object_one_hot_encoder.pkl'),
    PickleSaver(multitask_results, 'multitask_results.pkl'),
    PickleSaver(multitask_models, 'multitask_models.pkl'),
    PickleSaver(multitask_one_hot_encoder, 'multitask_one_hot_encoder.pkl')
]

# Save all data using the PickleSaver instances
for saver in pickle_savers:
    saver.save()



# Finding all the elements

We then build a dictionnary with the representation

# Transformation between subject and object

In [26]:
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

relation_representations = {}
for model_name, clf in models.items():
    print(f"Processing model: {model_name}")
    weights = clf.coefs_[0]
    class_names = one_hot_encoder.get_feature_names_out()
    relation_representations[model_name] = {
        class_name.replace('x0_', ''): weights[:, i]
        for i, class_name in enumerate(class_names)
    }

subject_class_representations = {}
for model_name, clf in subject_models.items():
    print(f"Processing subject model: {model_name}")
    weights = clf.coefs_[0]
    class_names = subject_one_hot_encoder.get_feature_names_out()
    subject_class_representations[model_name] = {
        class_name.replace('x0_', ''): weights[:, i]
        for i, class_name in enumerate(class_names)
    }

# For object models
object_class_representations = {}
for model_name, clf in object_models.items():
    print(f"Processing object model: {model_name}")
    weights = clf.coefs_[0]
    class_names = object_one_hot_encoder.get_feature_names_out()
    object_class_representations[model_name] = {
        class_name.replace('x0_', ''): weights[:, i]
        for i, class_name in enumerate(class_names)
    }


Processing model: Alibaba-NLP/gte-large-en-v1.5
Processing model: intfloat/multilingual-e5-large
Processing model: sentence-transformers/all-mpnet-base-v2
Processing model: sentence-transformers/all-MiniLM-L6-v2
Processing subject model: Alibaba-NLP/gte-large-en-v1.5
Processing subject model: intfloat/multilingual-e5-large
Processing subject model: sentence-transformers/all-mpnet-base-v2
Processing subject model: sentence-transformers/all-MiniLM-L6-v2
Processing object model: Alibaba-NLP/gte-large-en-v1.5
Processing object model: intfloat/multilingual-e5-large
Processing object model: sentence-transformers/all-mpnet-base-v2
Processing object model: sentence-transformers/all-MiniLM-L6-v2


In [27]:
merged_class_representations = []

for model_name in subject_class_representations.keys():
    if model_name in object_class_representations:
        merged_class_representations.append({
            "model_name": model_name,
            "relation_embedding": relation_representations[model_name],
            "subject_embedding": subject_class_representations[model_name],
            "object_embedding": object_class_representations[model_name]
        })

merged_class_representations_df = pd.DataFrame(merged_class_representations)
print(merged_class_representations_df)

                                model_name  \
0            Alibaba-NLP/gte-large-en-v1.5   
1           intfloat/multilingual-e5-large   
2  sentence-transformers/all-mpnet-base-v2   
3   sentence-transformers/all-MiniLM-L6-v2   

                                  relation_embedding  \
0  {'above': [0.5404442, 0.021074712, 0.062373973...   
1  {'above': [2.1064472, -7.9145646, 5.6887074, 1...   
2  {'above': [-0.3481432, 0.79766315, -3.3663204,...   
3  {'above': [-11.558843, -6.817552, 2.4517975, -...   

                                   subject_embedding  \
0  {'backpack': [0.49934778, 0.23872255, 0.046050...   
1  {'backpack': [1.4004223, -2.450059, -2.2688642...   
2  {'backpack': [0.19556403, 0.6709004, 1.932555,...   
3  {'backpack': [7.311962, -3.0467436, 1.1281406,...   

                                    object_embedding  
0  {'backpack': [-0.28748193, -0.30316794, -0.330...  
1  {'backpack': [-0.45119685, -1.0294902, -0.1519...  
2  {'backpack': [-0.43592635, 1.6827934, -

In [39]:
# Create dictionaries to store embeddings for each model and class
relations_by_model = {}
subject_embeddings_by_model = {}
object_embeddings_by_model = {}

# Group by model name and class
for model_name, row in merged_class_representations_df.iterrows():
    relations_by_model[model_name] = row['relation_embedding']
    subject_embeddings_by_model[model_name] = row['subject_embedding']
    object_embeddings_by_model[model_name] = row['object_embedding']

    
    # Initialize dicts for new models
    if model_name not in relations_by_model:
        relations_by_model[model_name] = {}
    if model_name not in subject_embeddings_by_model:
        subject_embeddings_by_model[model_name] = {}
    if model_name not in object_embeddings_by_model:
        object_embeddings_by_model[model_name] = {}
    
    # Store all class representations
    for class_name, relation_embedding in row['relation_embedding'].items():
        relations_by_model[model_name][class_name] = relation_embedding
    for class_name, subject_embedding in row['subject_embedding'].items():
        subject_embeddings_by_model[model_name][class_name] = subject_embedding
    for class_name, object_embedding in row['object_embedding'].items():
        object_embeddings_by_model[model_name][class_name] = object_embedding

# Usage case

# Get embeddings for a specific model and class
model_name = 0
class_name = 'above'
relation_embedding = relations_by_model[model_name][class_name]
print(relation_embedding)

[ 0.5404442   0.02107471  0.06237397 ... -0.0452004   0.10567422
  0.2879719 ]


In [51]:
# read generated_sentences.csv
df = pd.read_csv("../../generated_sentences.csv")

# find sentences with table as subject
table_sentences = df[df['subject'] == 'table']

# find sentences with chair as subject
chair_sentences = df[df['subject'] == 'chair']

# Create X and y such that X are the table sentences, but y are the corresponding chair sentences
# The sentences should be identical except for the subject
X = []
y = []

for table_sentence in table_sentences['sentence']:
    corresponding_chair_sentence = table_sentence.replace('table', 'chair')
    if corresponding_chair_sentence in chair_sentences['sentence'].values:
        X.append(table_sentence)
        y.append(corresponding_chair_sentence)

# Convert X and y to DataFrame for better visualization
X_df = pd.DataFrame(X, columns=['table_sentence'])
y_df = pd.DataFrame(y, columns=['chair_sentence'])

# Display the DataFrames
print(X_df.head())
print(y_df.head())

                          table_sentence
0           The table is above the lamp.
1            The table is over the lamp.
2       The table is on top of the lamp.
3     The table is higher than the lamp.
4  The table is elevated above the lamp.
                          chair_sentence
0           The chair is above the lamp.
1            The chair is over the lamp.
2       The chair is on top of the lamp.
3     The chair is higher than the lamp.
4  The chair is elevated above the lamp.


In [53]:
# Retrieve sentence embeddings

models = ["Alibaba-NLP_gte-large-en-v1.5",
          "intfloat_multilingual-e5-large",
          "sentence-transformers_all-mpnet-base-v2",
          "sentence-transformers_all-MiniLM-L6-v2"]

file_paths = [f"../data/{model}.pt" for model in models]

# Dictionary to store the data
embeddings = {}

# Load the data
for model in models:
    try:
        file_path = f"../data/{model}.pt"
        data = torch.load(file_path, map_location="cpu")  # Load on CPU to avoid GPU issues
        print(f"Data loaded successfully from {file_path}!")

        # Store the data in the dictionary
        embeddings[model] = data

        # Print the type of the loaded object
        print("Loaded data type:", type(data))
        print("List length:", len(data))
        print("First item:", data[0])

    except Exception as e:
        print(f"Error loading file {file_path}: {e}")

# Now embeddings dictionary contains data from all the models

Data loaded successfully from ../data/Alibaba-NLP_gte-large-en-v1.5.pt!
Loaded data type: <class 'list'>
List length: 142100
First item: {'sentence': 'The table is above the chair.', 'relation': 'above', 'subject': 'table', 'object': 'chair', 'embedding': array([ 0.32811892,  0.24800315,  0.28813255, ..., -0.2839055 ,
       -0.95515853, -0.04943762], shape=(1024,), dtype=float32)}
Data loaded successfully from ../data/intfloat_multilingual-e5-large.pt!
Loaded data type: <class 'list'>
List length: 142100
First item: {'sentence': 'The table is above the chair.', 'relation': 'above', 'subject': 'table', 'object': 'chair', 'embedding': array([ 0.02130731, -0.01715695, -0.0045109 , ..., -0.02039933,
        0.0191653 , -0.02246715], shape=(1024,), dtype=float32)}
Data loaded successfully from ../data/sentence-transformers_all-mpnet-base-v2.pt!
Loaded data type: <class 'list'>
List length: 142100
First item: {'sentence': 'The table is above the chair.', 'relation': 'above', 'subject': 'tab

In [59]:
# Retrieve embeddings for X and y
X_embeddings = {model: [] for model in models}
y_embeddings = {model: [] for model in models}

for model in models:
    for data_point in embeddings[model]:
        if data_point['sentence'] in X_df['table_sentence'].values:
            X_embeddings[model].append(data_point['embedding'])
        if data_point['sentence'] in y_df['chair_sentence'].values:
            y_embeddings[model].append(data_point['embedding'])

# Convert lists to numpy arrays
X_embeddings = {model: np.array(embeddings) for model, embeddings in X_embeddings.items()}
y_embeddings = {model: np.array(embeddings) for model, embeddings in y_embeddings.items()}

print("Number of X embeddings per model:", {model: len(embeddings) for model, embeddings in X_embeddings.items()})
print("Number of y embeddings per model:", {model: len(embeddings) for model, embeddings in y_embeddings.items()})

Number of X embeddings per model: {'Alibaba-NLP_gte-large-en-v1.5': 2784, 'intfloat_multilingual-e5-large': 2784, 'sentence-transformers_all-mpnet-base-v2': 2784, 'sentence-transformers_all-MiniLM-L6-v2': 2784}
Number of y embeddings per model: {'Alibaba-NLP_gte-large-en-v1.5': 2784, 'intfloat_multilingual-e5-large': 2784, 'sentence-transformers_all-mpnet-base-v2': 2784, 'sentence-transformers_all-MiniLM-L6-v2': 2784}


In [65]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def train_linear_mapping_on_embeddings(X_embeddings, y_embeddings, results_dict, models_dict, model_name):

    X = X_embeddings[model_name]
    y = y_embeddings[model_name]
    # print(X.shape, y.shape)

    linear_model = LinearRegression()
    linear_model.fit(X, y)
    model_name = f"linear_model_{model_name}"
    models_dict[model_name] = linear_model

    y_pred = linear_model.predict(X)
    mse = mean_squared_error(y, y_pred)
    results_dict[model_name] = mse
    print(f"MSE for {model_name}: {mse:.2f}")

# Example usage
mapping_results = {}
mapping_models = {}

for model_name in models:
    train_linear_mapping_on_embeddings(X_embeddings, y_embeddings, mapping_results, mapping_models, model_name)

MSE for linear_model_Alibaba-NLP_gte-large-en-v1.5: 0.00
MSE for linear_model_intfloat_multilingual-e5-large: 0.00
MSE for linear_model_sentence-transformers_all-mpnet-base-v2: 0.00
MSE for linear_model_sentence-transformers_all-MiniLM-L6-v2: 0.00


# More complex transformation

Now let's try to go from sentences "[subject] [relation] [object]" to "[object] [opposite relation] [table]".
This will decrease the number of dimensions, as it's a many-to-one mapping.

In [85]:
model = models[0]
embedding_info = embeddings[model]
#List of {sentence, relation, subject, object, embedding}

RELATIONS_JSON_PATH = '../data/relations.json'
with open(RELATIONS_JSON_PATH, 'r') as f:
    relations = json.load(f)['spatial_relations']
def get_relations_lookup(relations):
    relations_lookup = {}
    for category, category_pairs in relations.items():
        for first, second in category_pairs:
            relations_lookup[first] = {'category': category, 'opposite': second, 'position': 0}
            relations_lookup[second] = {'category': category, 'opposite': first, 'position': 1}
    return relations_lookup

relations_lookup = get_relations_lookup(relations)
print(relations_lookup['below'])

embedding_dict = {info['sentence']: info['embedding'] for info in embedding_info}

X_sentences = [info['sentence'] for info in embedding_info]
X = []
y_info = embedding_info.copy()
y = []
count = 0
for info in y_info:
    sentence = info['sentence']
    ob = info['object']
    rel = info['relation']
    rel_op = relations_lookup[rel]['opposite']
    new_sentence = f"The {ob} is {rel_op} the table."
    if count % 100 == 0 and count < 4000:
        print(count,info['sentence'],new_sentence)
    if new_sentence in embedding_dict:
        X.append(embedding_dict[sentence])
        y.append(embedding_dict[new_sentence])
    count += 1



{'category': 'vertical', 'opposite': 'above', 'position': 1}
0 The table is above the chair. The chair is below the table.
100 The table is near the bed. The bed is distant from the table.
200 The table is perpendicular to the cup. The cup is parallel to the table.
300 The table is within the car. The car is beyond the table.
400 The table is oriented toward the clock. The clock is oriented away from the table.
500 The table is ahead of the television. The television is at the back of the table.
600 The table is connected to the backpack. The backpack is disconnected from the table.
700 The table is elevated above the key. The key is dropped below the table.
800 The table is on the towel. The towel is off the table.
900 The table is over the mirror. The mirror is under the table.
1000 The table is next to the camera. The camera is away from the table.
1100 The table is equidistant from the umbrella. The umbrella is closer to the table.
1200 The table is enclosed in the toy. The toy is 

In [87]:
X_embeddings = np.array(X)
y_embeddings = np.array(y)
print(X_embeddings.shape, y_embeddings.shape)

results_dict = {}
models_dict = {}

linear_model = LinearRegression()
linear_model.fit(X, y)
model_name = f"linear_model_{model_name}"
models_dict[model_name] = linear_model

y_pred = linear_model.predict(X)
mse = mean_squared_error(y, y_pred)
results_dict[model_name] = mse
print(f"MSE for {model_name}: {mse:.2f}")

(139258, 1024) (139258, 1024)
MSE for linear_model_Alibaba-NLP_gte-large-en-v1.5: 0.06
