In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import StandardScaler
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import adjusted_rand_score, accuracy_score, precision_score, recall_score, f1_score

In [2]:
def _scatter_by_class(ax, points_2d, labels):
    """Scatter 2-D points on ``ax`` as two series: label 0 ('Literal') and label 1 ('Figurative')."""
    for class_value, class_name in ((0, 'Literal'), (1, 'Figurative')):
        selected = labels == class_value
        ax.scatter(points_2d[selected, 0], points_2d[selected, 1], label=class_name, alpha=0.5)


def analyze_embeddings(embeddings, labels, out_path):
    """Cluster, classify and visualise a set of embeddings, saving a 3-panel figure.

    Steps performed:
      1. K-Means (k=2) on the raw embeddings, scored against ``labels`` with the
         adjusted Rand index (ARI).
      2. Logistic regression on an 80/20 train/test split of the raw embeddings,
         scored with accuracy, precision, recall and F1 on the held-out 20%.
      3. A figure with three panels drawn in the space of the first two principal
         components of the standardised embeddings: true labels, K-Means
         clusters, and a logistic-regression decision boundary.

    Note that the decision boundary in the third panel comes from a *separate*
    classifier fit on all 2-D PCA points, whereas the metrics printed in that
    panel's title come from the classifier of step 2 (full-dimensional
    embeddings, held-out test set).

    Args:
        embeddings: Array-like of shape (n_samples, n_features).
        labels: Array-like of length n_samples. Label 0 is plotted as
            'Literal' and label 1 as 'Figurative'.
        out_path: Path the figure is written to (passed to ``savefig``).

    Returns:
        dict with keys ``"ari"``, ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``. The four classification scores are rounded to 3 decimals.

    Raises:
        ValueError: If ``embeddings`` and ``labels`` differ in length, or if
            ``labels`` contains fewer than two distinct classes (a classifier
            cannot be trained in that case).
    """
    # Boolean masks such as ``labels == 0`` only behave element-wise on ndarrays;
    # a plain list would compare as a whole and yield a scalar False.
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)

    # Fail fast with an explicit message instead of an opaque error raised from
    # inside scikit-learn after part of the work has already been done.
    if len(embeddings) != len(labels):
        raise ValueError(
            f"embeddings and labels must have the same length, "
            f"got {len(embeddings)} and {len(labels)}"
        )
    if np.unique(labels).size < 2:
        raise ValueError("labels must contain at least two distinct classes")

    # Clustering
    kmeans = KMeans(n_clusters=2, n_init=20, random_state=42)
    kmeans_labels = kmeans.fit_predict(embeddings)
    ari = adjusted_rand_score(labels, kmeans_labels)

    # Classification on the full-dimensional embeddings
    X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)

    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    acc = round(accuracy_score(y_test, y_pred), 3)
    prec = round(precision_score(y_test, y_pred), 3)
    rec = round(recall_score(y_test, y_pred), 3)
    f1 = round(f1_score(y_test, y_pred), 3)

    # 2-D projection used by every panel
    X_scaled = StandardScaler().fit_transform(embeddings)
    X_2d = PCA(n_components=2, random_state=42).fit_transform(X_scaled)

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    try:
        # Panel 1: PCA projection coloured by the true labels
        _scatter_by_class(axes[0], X_2d, labels)
        axes[0].set_title("PCA")

        # Panel 2: PCA projection coloured by K-Means cluster assignment
        axes[1].scatter(X_2d[kmeans_labels == 0, 0], X_2d[kmeans_labels == 0, 1], label='Cluster 0', alpha=0.5, color='darkred')
        axes[1].scatter(X_2d[kmeans_labels == 1, 0], X_2d[kmeans_labels == 1, 1], label='Cluster 1', alpha=0.5, color='darkblue')
        axes[1].set_title(f"K-Means Clustering (ARI={ari:.2f})")

        # Panel 3: decision boundary. A dedicated estimator is used so the model
        # that produced the metrics above is not overwritten.
        boundary_clf = LogisticRegression(max_iter=1000, random_state=42)
        boundary_clf.fit(X_2d, labels)
        x_min, x_max = X_2d[:, 0].min() - 1, X_2d[:, 0].max() + 1
        y_min, y_max = X_2d[:, 1].min() - 1, X_2d[:, 1].max() + 1
        xx, yy = np.meshgrid(np.linspace(x_min, x_max, 500), np.linspace(y_min, y_max, 500))
        Z = boundary_clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

        axes[2].contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.Paired)
        _scatter_by_class(axes[2], X_2d, labels)
        axes[2].set_title(f"Logistic Regression\nAcc={acc:.2f} Prec={prec:.2f} Rec={rec:.2f} F1={f1:.2f}")

        for ax in axes:
            ax.set_xlabel("PC 1")
            ax.set_ylabel("PC 2")
            ax.grid(True)
            ax.legend()

        fig.tight_layout()
        fig.savefig(out_path)
    finally:
        # Close this specific figure even if plotting/saving failed, so repeated
        # calls in a loop do not accumulate open figures.
        plt.close(fig)

    return {"ari": ari, "accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

In [3]:
# Input archive and output directory for the per-word figures.
emb_path = "top_words_embeddings.npz"
out_dir = "top_words_analysis"

# Read the arrays inside a context manager so the underlying .npz file handle
# is released as soon as the data is in memory.
with np.load(emb_path) as data:
    words = data['words']
    embeddings = data['embeddings']
    labels = data['labels']

# Loop-invariant: create the output directory once, up front.
os.makedirs(out_dir, exist_ok=True)

# np.unique returns the distinct words in sorted order, so the figures are
# produced in a reproducible sequence (iteration over a set is not ordered).
for word in np.unique(words):
    mask = (words == word)
    filtered_embeddings = embeddings[mask]
    filtered_labels = labels[mask]
    out_path = os.path.join(out_dir, f"{word}.png")
    analyze_embeddings(filtered_embeddings, filtered_labels, out_path)