In [8]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [9]:
def plot_clusters(X_2d, cluster_labels, out_path):
    """Save a scatter plot of 2-D points coloured by cluster assignment.

    Parameters
    ----------
    X_2d : array
        Point coordinates; column 0 is drawn on the x-axis and column 1 on
        the y-axis.
    cluster_labels : array
        Cluster id per point. Only ids 0 and 1 are drawn; points carrying
        any other id are omitted from the figure.
    out_path : str
        Destination passed to ``plt.savefig``.
    """
    # (cluster id, marker colour), drawn in this order.
    palette = ((0, "darkred"), (1, "darkblue"))

    plt.figure()
    for cluster_id, colour in palette:
        mask = cluster_labels == cluster_id
        plt.scatter(
            X_2d[mask, 0],
            X_2d[mask, 1],
            label=f"Cluster {cluster_id}",
            alpha=0.3,
            s=10,
            color=colour,
        )
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

In [10]:
def plot_boundary(X_2d, labels, classifier, out_path):
    """Save a plot of a fitted classifier's decision regions over 2-D points.

    The classifier is evaluated on a regular grid covering the data (padded
    by 0.5 on every side, grid step 0.1); the predicted classes are drawn as
    filled contours with the points overlaid on top.

    Parameters
    ----------
    X_2d : array
        Point coordinates; column 0 is the x-axis, column 1 the y-axis.
    labels : array
        Class per point. Points labelled 0 are drawn as "Literal" and points
        labelled 1 as "Figurative"; any other label is not drawn.
    classifier : object
        Already-fitted estimator exposing ``predict`` on two-column input.
    out_path : str
        Destination passed to ``savefig``.
    """
    # Meshgrid for boundary
    step = 0.1
    margin = 0.5
    x_min, x_max = X_2d[:, 0].min() - margin, X_2d[:, 0].max() + margin
    y_min, y_max = X_2d[:, 1].min() - margin, X_2d[:, 1].max() + margin
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step), np.arange(y_min, y_max, step))

    # Predict before any figure exists, so a failing classifier leaves no
    # half-built figure behind.
    Z = classifier.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Draw on a dedicated figure instead of whatever pyplot considers
    # "current", and always release it, even when saving fails.
    fig, ax = plt.subplots()
    try:
        ax.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.Paired)
        ax.scatter(X_2d[labels == 0, 0], X_2d[labels == 0, 1], label="Literal", alpha=0.3, s=10)
        ax.scatter(X_2d[labels == 1, 0], X_2d[labels == 1, 1], label="Figurative", alpha=0.3, s=10)
        ax.legend()
        ax.grid(True)
        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        plt.close(fig)

In [11]:
def main(emb_dir, models, emb_types, classifiers):
    """Produce cluster and decision-boundary plots for every model/embedding pair.

    For each ``<emb_dir>/<model_name>/<emb_type>.npz`` archive the embeddings
    are standardized and projected to 2-D with PCA. Two kinds of figures are
    then written under ``<model_name>/<emb_type>/`` (relative to the current
    working directory):

    * ``kmeans_clusters.png`` - 2-cluster K-Means assignment shown on the
      PCA projection;
    * ``<clf_name>_boundary.png`` - one per classifier, fitted on the 2-D
      projection and the archive's labels.

    Parameters
    ----------
    emb_dir : str
        Root directory holding one sub-directory per model.
    models : iterable of str
        Model sub-directory names; each is printed as progress.
    emb_types : iterable of str
        Archive base names (without ``.npz``). Each archive must contain the
        keys ``"embeddings"`` and ``"labels"``.
    classifiers : dict
        Maps a name (used in the output file name) to an estimator with
        ``fit`` and ``predict``. The estimator objects are refitted in place
        for every model/embedding pair.
    """
    for model_name in models:
        print(f"{model_name}...")
        for emb_type in emb_types:
            emb_path = os.path.join(emb_dir, model_name, f"{emb_type}.npz")
            # An .npz archive stays open until explicitly closed; read the
            # arrays inside a context manager so no file handle is leaked
            # per (model, emb_type) iteration.
            with np.load(emb_path) as data:
                embeddings = data["embeddings"]
                labels = data["labels"]

            # Standardize and PCA 2D
            X_scaled = StandardScaler().fit_transform(embeddings)
            X_2d = PCA(n_components=2, random_state=42).fit_transform(X_scaled)

            out_dir = os.path.join(model_name, emb_type)
            os.makedirs(out_dir, exist_ok=True)

            # Clustering
            # NOTE(review): K-Means is fitted on the raw embeddings, not on
            # X_scaled, while the plot uses the standardized PCA projection.
            # Kept as-is; confirm this is intended.
            kmeans = KMeans(n_clusters=2, n_init=20, random_state=42)
            clusters = kmeans.fit_predict(embeddings)
            plot_clusters(X_2d, clusters, os.path.join(out_dir, "kmeans_clusters.png"))

            # Classification
            for clf_name, classifier in classifiers.items():
                classifier.fit(X_2d, labels)
                boundary_path = os.path.join(out_dir, f"{clf_name}_boundary.png")
                plot_boundary(X_2d, labels, classifier, boundary_path)

In [12]:
# Root directory of the saved embedding archives (one sub-directory per model).
emb_dir = "../2-embeddings"

# Archive base names loaded for every model.
emb_types = [
    "cls",
    "average",
    "layerwise",
]

# Classifiers fitted on the 2-D projection; each key is used in the output file name.
classifiers = dict(
    logreg=LogisticRegression(max_iter=1000, random_state=42),
    svm=SVC(kernel="linear", random_state=42),
)

In [13]:
# First batch of models: "bert" and "roberta".
models = ["bert", "roberta"]
main(
    emb_dir=emb_dir,
    models=models,
    emb_types=emb_types,
    classifiers=classifiers,
)

bert...
roberta...


In [14]:
# Second batch of models: "bert_ft" and "roberta_ft".
models = ["bert_ft", "roberta_ft"]
main(
    emb_dir=emb_dir,
    models=models,
    emb_types=emb_types,
    classifiers=classifiers,
)

bert_ft...
roberta_ft...
