In [2]:
import json
import torch

import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

MODELS = [
    "Alibaba-NLP/gte-large-en-v1.5",
    "intfloat/multilingual-e5-large",
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/all-MiniLM-L6-v2"
]

DATA_PATH = '../data/sentence-embeddings'
RELATIONS_JSON_PATH = '../data/relations.json'


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/local/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/Cellar/python@3.10/3.10.15/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/jonathanmichala/All Documents/spatial_geometry/spatgeo-env/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_ins

In [3]:
with open(RELATIONS_JSON_PATH, 'r') as f:
    relations = json.load(f)['spatial_relations']

In [4]:
def get_relations_lookup(relations):
    relations_lookup = {}
    for category, category_pairs in relations.items():
        for first, second in category_pairs:
            relations_lookup[first] = {'category': category, 'opposite': second, 'position': 0}
            relations_lookup[second] = {'category': category, 'opposite': first, 'position': 1}
    return relations_lookup

relations_lookup = get_relations_lookup(relations)

In [6]:
def load_embeddings_for_model(model_name):
    #label = "subject", "object", or "relation"
    embeddings = []
    sub_labels = []
    ob_labels = []
    rel_labels = []
    raw_data = torch.load(f'{DATA_PATH}/{model_name.replace("/", "_")}.pt', weights_only=False)
    for data_point in raw_data:
        embeddings.append(data_point['embedding'])
        sub_labels.append(data_point['subject'])
        ob_labels.append(data_point['object'])
        rel_labels.append(data_point['relation'])
    return np.array(embeddings), np.array(sub_labels).reshape(-1, 1), np.array(ob_labels).reshape(-1, 1), np.array(rel_labels).reshape(-1, 1)

In [12]:
results = {}
models = {}
one_hot_encoder = OneHotEncoder()

for model_name in MODELS:
    print(f"Training probe for {model_name}...")
    X, y_sub, y_ob, y_rel = load_embeddings_for_model(model_name)
    y_sub_encoded = one_hot_encoder.fit_transform(y_sub).todense()
    y_ob_encoded = one_hot_encoder.fit_transform(y_ob).todense()
    y_rel_encoded = one_hot_encoder.fit_transform(y_rel).todense()
    X_train, X_test, y_sub_train, y_sub_test, y_ob_train, y_ob_test, y_rel_train, y_rel_test = \
        train_test_split((X, y_sub_encoded, y_ob_encoded, y_rel_encoded), test_size=0.2) 

    y_sub_train = np.asarray(y_sub_train)
    y_sub_test = np.asarray(y_sub_test)
    y_ob_train = np.asarray(y_ob_train)
    y_ob_test = np.asarray(y_ob_test)
    y_rel_train = np.asarray(y_rel_train)
    y_rel_test = np.asarray(y_rel_test)
    
    # Train probes
    clf_sub = MLPClassifier(activation='identity')
    clf_sub.fit(X_train, y_sub_train)

    clf_ob = MLPClassifier(activation='identity')
    clf_ob.fit(X_train, y_ob_train)

    clf_rel = MLPClassifier(activation='identity')
    clf_rel.fit(X_train, y_rel_train)
    models[model_name] = (clf_sub, clf_ob, clf_rel)

    y_sub_pred = clf_sub.predict(X_test)
    y_ob_pred = clf_ob.predict(X_test)
    y_rel_pred = clf_rel.predict(X_test)

    accuracy_sub = accuracy_score(y_sub_test, y_sub_pred)
    accuracy_ob = accuracy_score(y_ob_test, y_ob_pred)
    accuracy_rel = accuracy_score(y_rel_test, y_rel_pred)
    joint_accuracy = np.mean((y_sub_pred == y_sub_test) & (y_ob_pred == y_ob_test)
                                                        & (y_rel_pred == y_rel_test))

    results[model_name] = {"accuracy_sub": accuracy_sub, "accuracy_ob": accuracy_ob,
                           "accuracy_rel": accuracy_rel, "joint_accuracy": joint_accuracy}
    print(f"Accuracy for {model_name}: Joint {joint_accuracy:.2f}, Subject {accuracy_sub:.2f}, \
          Object {accuracy_ob:.2f}, Relation {accuracy_rel:.2f}")

Training probe for Alibaba-NLP/gte-large-en-v1.5...


UnpicklingError: invalid load key, 'v'.