In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import adjusted_rand_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
def main(emb_dir, models, emb_types, classifiers):
    """Score saved embeddings with K-Means clustering and supervised classifiers.

    For every (model, embedding type) pair this loads
    ``<emb_dir>/<model_name>/<emb_type>.npz``, compares a 2-cluster K-Means
    partition with the stored labels via the adjusted Rand index, fits each
    classifier on an 80/20 train/test split, and writes all scores to
    ``<model_name>/<emb_type>/metrics.csv`` relative to the current working
    directory (the output directory is created if missing).

    Args:
        emb_dir: Root directory containing one sub-directory per model.
        models: Iterable of model names; each is a sub-directory of
            ``emb_dir`` and also the name of the output directory.
        emb_types: Iterable of embedding type names; each is the basename of
            an ``.npz`` archive holding ``"embeddings"`` and ``"labels"``.
        classifiers: Mapping of display name to an estimator exposing
            ``fit`` and ``predict``. The estimators are fit in place and are
            re-fit for every embedding set.

    Returns:
        None. Results are written to disk; one progress line is printed per
        model.

    Raises:
        OSError: If an embedding archive cannot be opened.
        KeyError: If an archive lacks the ``"embeddings"`` or ``"labels"``
            array.
    """
    for model_name in models:
        print(f"{model_name}...")
        for emb_type in emb_types:
            emb_path = os.path.join(emb_dir, model_name, f"{emb_type}.npz")
            # np.load on an .npz keeps the archive open; read both arrays
            # inside the context manager so the handle is released right away
            # instead of lingering until garbage collection.
            with np.load(emb_path) as data:
                embeddings = data["embeddings"]
                labels = data["labels"]

            metrics = []

            # Clustering
            kmeans = KMeans(n_clusters=2, n_init=20, random_state=42)
            pred = kmeans.fit_predict(embeddings)
            ari = adjusted_rand_score(labels, pred)
            metrics.append({
                "Method": "K-Means",
                "ARI": round(ari, 3)
            })

            # Classification
            # NOTE(review): precision/recall/F1 are called with their default
            # averaging, which presumably means labels are binary (consistent
            # with n_clusters=2 above) -- confirm against the data.
            X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)
            for clf_name, classifier in classifiers.items():
                classifier.fit(X_train, y_train)
                y_pred = classifier.predict(X_test)

                metrics.append({
                    "Method": clf_name,
                    "Accuracy": round(accuracy_score(y_test, y_pred), 3),
                    "Precision": round(precision_score(y_test, y_pred), 3),
                    "Recall": round(recall_score(y_test, y_pred), 3),
                    "F1": round(f1_score(y_test, y_pred), 3)
                })

            # Save metrics
            out_dir = os.path.join(model_name, emb_type)
            os.makedirs(out_dir, exist_ok=True)

            # The K-Means row and the classifier rows carry different keys,
            # so the resulting table has empty cells where a metric does not
            # apply to that method.
            out_path = os.path.join(out_dir, "metrics.csv")
            metrics_df = pd.DataFrame(metrics)
            metrics_df.to_csv(out_path, index=False)

In [3]:
# Supervised probes fit on every embedding set; both are seeded with a fixed
# random_state so repeated runs are comparable.
classifiers = {
    "LogReg": LogisticRegression(
        random_state=42,
        max_iter=1000,
    ),
    "SVM": SVC(
        random_state=42,
        kernel="linear",
    ),
}

# Embedding variants to evaluate; each name is the basename of an .npz file
# inside a model's sub-directory.
emb_types = [
    "cls",
    "average",
    "layerwise",
]

# Root folder containing one sub-directory of embeddings per model.
emb_dir = "../2-embeddings"

In [4]:
# First evaluation pass -- presumably the base (not fine-tuned) checkpoints;
# confirm against the embedding-extraction step.
models = [
    "bert",
    "roberta",
]
main(
    emb_dir=emb_dir,
    models=models,
    emb_types=emb_types,
    classifiers=classifiers,
)

bert...
roberta...


In [5]:
# Second evaluation pass -- the "_ft" suffix presumably marks fine-tuned
# variants; confirm against the embedding-extraction step.
models = [
    "bert_ft",
    "roberta_ft",
]
main(
    emb_dir=emb_dir,
    models=models,
    emb_types=emb_types,
    classifiers=classifiers,
)

bert_ft...
roberta_ft...
