# In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from torch.nn.functional import normalize
from torch.amp import GradScaler, autocast
from sklearn.metrics import classification_report
from tqdm import tqdm
import torch
import pandas as pd
import numpy as np
import random
import platform
import sys
import sklearn
import transformers
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder


# --- Reproducibility ---------------------------------------------------------
seed = 677
# Feed the same seed to Python's `random`, NumPy and PyTorch, in that order.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)
del _seed_fn
if torch.cuda.is_available():
    # Seed every CUDA device and switch cuDNN to deterministic mode with
    # benchmark-based autotuning turned off.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# --- Experiment configuration ------------------------------------------------
tokenizer_model = "roberta-base"  # checkpoint name used for tokenizer and models
training_group = "whole"  # AA, White, whole
percent_per_cluster = 0.25  # fraction of each KMeans cluster kept in the coreset
epochs = 2  # number of fine-tuning epochs
batch_size = 32  # batch size for embedding extraction and the data loaders
code = "KMeans"  # method tag used in the printed summary and report file names


# Load the three FDCL18 splits; the files are read as tab-separated.
training_df, validation_df, test_df = (
    pd.read_csv(csv_path, sep="\t")
    for csv_path in ("FDCL18_train.csv", "FDCL18_validation.csv", "FDCL18_test.csv")
)
print(training_df.columns)


# Feature frames: every column except the target.
X_train, X_valid, X_test = (
    frame.drop(columns='label') for frame in (training_df, validation_df, test_df)
)

# The encoder is fitted on the training labels only; validation and test labels
# are mapped with the already-fitted encoder.
label_encoder = LabelEncoder()

y_train = pd.Series(
    label_encoder.fit_transform(training_df['label']),
    index=training_df.index,
)
y_valid, y_test = (
    pd.Series(label_encoder.transform(frame['label']), index=frame.index)
    for frame in (validation_df, test_df)
)

# Show which integer id each original label was assigned.
print("\nLabel Encoding Map (index → label):")
for i, label in enumerate(label_encoder.classes_):
    print(f"{i}: {label}")


# Per-dialect views of the splits. Each boolean mask is built once and applied
# to both the feature frame and the label series that shares its index.
_train_AA_mask = X_train['dialect'].eq('AA')
_valid_AA_mask = X_valid['dialect'].eq('AA')
_test_White_mask = X_test['dialect'].eq('White')
_test_AA_mask = X_test['dialect'].eq('AA')

X_train_AA, y_train_AA = X_train.loc[_train_AA_mask].copy(), y_train.loc[_train_AA_mask]
X_valid_AA, y_valid_AA = X_valid.loc[_valid_AA_mask].copy(), y_valid.loc[_valid_AA_mask]
X_test_White, y_test_White = X_test.loc[_test_White_mask].copy(), y_test.loc[_test_White_mask]
X_test_AA, y_test_AA = X_test.loc[_test_AA_mask].copy(), y_test.loc[_test_AA_mask]


# Sanity check: every subset must contain exactly one dialect, the expected one.
for _frame, _expected_dialect in (
    (X_train_AA, 'AA'),
    (X_valid_AA, 'AA'),
    (X_test_AA, 'AA'),
    (X_test_White, 'White'),
):
    _dialects = _frame['dialect']
    assert _dialects.nunique() == 1 and _dialects.iloc[0] == _expected_dialect
del _frame, _expected_dialect, _dialects


# Choose the tweets/labels used for coreset selection, training and validation
# according to `training_group`.
if training_group == "AA":
    X_train_group, y_train_group = X_train_AA['tweet'], y_train_AA
    X_valid_group, y_valid_group = X_valid_AA['tweet'], y_valid_AA
elif training_group == "White":
    X_train_White = X_train[X_train['dialect'] == 'White'].copy()
    y_train_White = y_train[X_train['dialect'] == 'White']
    X_valid_White = X_valid[X_valid['dialect'] == 'White'].copy()
    y_valid_White = y_valid[X_valid['dialect'] == 'White']
    X_train_group, y_train_group = X_train_White['tweet'], y_train_White
    X_valid_group, y_valid_group = X_valid_White['tweet'], y_valid_White

    # Each White subset must contain exactly one dialect value: 'White'.
    assert X_train_White['dialect'].nunique() == 1 and X_train_White['dialect'].iloc[0] == 'White'
    assert X_valid_White['dialect'].nunique() == 1 and X_valid_White['dialect'].iloc[0] == 'White'

elif training_group == "whole":
    # The full splits are expected to contain exactly the two dialects.
    assert set(X_train['dialect'].unique()) == {'White', 'AA'}
    assert set(X_valid['dialect'].unique()) == {'White', 'AA'}
    X_train_group, y_train_group = X_train['tweet'], y_train
    X_valid_group, y_valid_group = X_valid['tweet'], y_valid
else:
    # Fail here with a clear message instead of a NameError on the first use
    # of X_train_group further down.
    raise ValueError(
        f"Unknown training_group {training_group!r}; expected 'AA', 'White' or 'whole'."
    )

print(f"[INFO] Training group: {training_group} — Number of available training examples: {len(X_train_group)}")


tokenizer = AutoTokenizer.from_pretrained(tokenizer_model)

def tokenize_function(texts, max_length=64):
    """Tokenize an iterable of strings with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly `max_length` tokens and
    the result is returned as PyTorch tensors (`return_tensors="pt"`).

    Args:
        texts: Iterable of strings; it is materialized with `list(texts)`.
        max_length: Fixed sequence length. Defaults to 64, the value that was
            previously hard-coded.

    Returns:
        Whatever `tokenizer(...)` returns for these arguments; callers in this
        file read its 'input_ids' and 'attention_mask' entries.
    """
    return tokenizer(list(texts), padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

class FDCL_dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-tokenized inputs with integer labels.

    `encodings` must support `encodings[key][idx]` for the keys 'input_ids'
    and 'attention_mask'; `labels` must support `len()` and positional
    `.iloc[idx]` access (e.g. a pandas Series).
    """

    _ENCODING_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Copy the encoder inputs first, then attach the label as a long tensor.
        sample = {key: self.encodings[key][idx] for key in self._ENCODING_KEYS}
        sample['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return sample


@torch.no_grad()
def compute_embeddings(texts, model, tokenizer, device):
    """Embed `texts` with `model` and return one unit-norm vector per text.

    Texts are processed in chunks of the module-level `batch_size`. For each
    chunk the last entry of `hidden_states` is taken at sequence position 0
    and normalized along dim 1 with `torch.nn.functional.normalize`.

    Args:
        texts: Sequence supporting `len()` and `.iloc` slicing (e.g. a pandas
            Series of strings).
        model: Model called with `input_ids`, `attention_mask`,
            `output_hidden_states=True` and `return_dict=True`; it is put in
            eval mode.
        tokenizer: Callable used to tokenize each chunk (fixed length 64).
        device: Device the inputs and the returned tensor are moved to.

    Returns:
        A tensor on `device` with the per-chunk embeddings concatenated along
        dim 0, in the order of `texts`.

    Raises:
        ValueError: If `texts` is empty.
    """
    # Autocast is only enabled on CUDA. Requesting CUDA autocast on a CPU run
    # would just emit a warning for every batch and be disabled anyway.
    use_amp = torch.device(device).type == "cuda"

    model.eval()
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Computing Embeddings"):
        batch = texts.iloc[i:i + batch_size]
        encodings = tokenizer(list(batch), padding="max_length", truncation=True, max_length=64, return_tensors="pt")
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)
        with autocast(device_type='cuda', enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True, return_dict=True)
            # Position 0 of the final hidden states serves as the text embedding.
            cls_embeddings = outputs.hidden_states[-1][:, 0, :]
            cls_embeddings = normalize(cls_embeddings, dim=1)
        # Collect on the CPU so per-batch results do not pile up on the device.
        embeddings.append(cls_embeddings.cpu())

    if not embeddings:
        # torch.cat on an empty list fails with an opaque error; be explicit.
        raise ValueError("compute_embeddings received no texts to embed.")
    return torch.cat(embeddings, dim=0).to(device)


def select_representative_coreset(embeddings, kmeans_model, cluster_labels, percent_per_cluster=0.25, log_path = f"KMeans_selection_{tokenizer_model}_{training_group}_{seed}.txt"):
    """Pick, from every cluster, the points closest to the cluster centre.

    For each cluster the Euclidean distance of its members to the centre is
    computed and the `percent_per_cluster` fraction with the smallest distance
    is kept (at least one point per non-empty cluster). A per-cluster summary
    is written to `log_path`, which is overwritten.

    Args:
        embeddings: Tensor with one row per sample; moved to CPU and converted
            to NumPy.
        kmeans_model: Fitted model exposing `cluster_centers_`.
        cluster_labels: Cluster id of every row of `embeddings`.
        percent_per_cluster: Fraction of each cluster to keep.
        log_path: Text file receiving the selection log. Note that the default
            is built once, when the function is defined, from the module-level
            `tokenizer_model`, `training_group` and `seed`.

    Returns:
        List of row positions (into `embeddings`) of the selected samples,
        grouped by cluster and ordered by increasing distance within a cluster.
    """
    embeddings_np = embeddings.cpu().numpy()
    # Ensure element-wise comparison below even if a plain list was passed.
    cluster_labels = np.asarray(cluster_labels)
    selected_indices = []

    with open(log_path, "w", encoding="utf-8") as f:
        for cluster_id in tqdm(range(len(kmeans_model.cluster_centers_)), desc="Selecting representative points"):
            cluster_idx = np.where(cluster_labels == cluster_id)[0]

            if cluster_idx.size == 0:
                # Nothing to select; indexing/max on an empty array would raise.
                f.write(f"\nCluster {cluster_id}:\n")
                f.write("  Total in cluster: 0\n")
                continue

            cluster_embeds = embeddings_np[cluster_idx]
            cluster_center = kmeans_model.cluster_centers_[cluster_id]
            distances = np.linalg.norm(cluster_embeds - cluster_center, axis=1)
            n_select = max(1, int(percent_per_cluster * len(cluster_idx)))

            # Sort once and reuse the order for both selection and logging.
            closest_indices = np.argsort(distances)[:n_select]
            closest_distances = distances[closest_indices]

            f.write(f"\nCluster {cluster_id}:\n")
            f.write(f"  Total in cluster: {len(cluster_idx)}\n")
            f.write(f"  Selecting {n_select} closest points\n")
            f.write(f"  Closest distances: {closest_distances}\n")
            f.write(f"  Farthest distance in selection: {closest_distances[-1]:.4f}\n")
            f.write(f"  Max possible distance in cluster: {np.max(distances):.4f}\n")

            selected_indices.extend(cluster_idx[closest_indices])

    return selected_indices


# Visualization
def project_embeddings_2d(embeddings, random_state=42, perplexity=30, max_iter=1000):
    """Project embeddings to two dimensions with t-SNE.

    Args:
        embeddings: Tensor with one row per sample; moved to CPU and converted
            to NumPy before fitting.
        random_state: Seed passed to `TSNE` (default 42, as before).
        perplexity: Requested t-SNE perplexity (default 30, as before). It is
            lowered to `n_samples - 1` when the input has too few rows,
            because t-SNE requires the perplexity to be below the sample count.
        max_iter: Iteration count passed to `TSNE` (default 1000, as before).

    Returns:
        The array returned by `TSNE.fit_transform` (two components per sample).
    """
    points = embeddings.cpu().numpy()
    # Clamp only when needed so the default call is unchanged for normal sizes.
    effective_perplexity = min(perplexity, max(points.shape[0] - 1, 1))
    tsne = TSNE(
        n_components=2,
        random_state=random_state,
        perplexity=effective_perplexity,
        max_iter=max_iter,
    )
    return tsne.fit_transform(points)


def plot_kmeans_clusters_2(embeddings_2d, selected_indices, save_path):
    """Save a scatter plot of the 2-D embeddings, selected vs. unselected.

    Args:
        embeddings_2d: Array indexed as `[rows, 0]` / `[rows, 1]` for the x and
            y coordinates.
        selected_indices: Row positions (or a boolean mask) of the selected
            samples.
        save_path: Destination passed to `plt.savefig` (written at 300 dpi).
    """
    selected = np.asarray(selected_indices)
    if selected.dtype != bool:
        # Also turns an empty list (float array) into a valid integer index.
        selected = selected.astype(np.intp)

    # The gray layer must contain only the points that were NOT selected,
    # otherwise the "Unselected" legend entry also covers the selected ones.
    unselected_mask = np.ones(len(embeddings_2d), dtype=bool)
    unselected_mask[selected] = False

    fig = plt.figure(figsize=(14, 12))
    try:
        plt.scatter(
            embeddings_2d[unselected_mask, 0],
            embeddings_2d[unselected_mask, 1],
            c='gray',
            s=10,
            alpha=0.5,
            label='Unselected',
            edgecolors='none'
        )
        plt.scatter(
            embeddings_2d[selected, 0],
            embeddings_2d[selected, 1],
            c='red',
            s=10,
            alpha=0.7,
            label='Selected',
            edgecolors='none'
        )
        plt.title("t-SNE Visualization of Embedding Space\n(Selected vs Unselected Samples)")
        plt.xlabel("t-SNE Component 1")
        plt.ylabel("t-SNE Component 2")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(save_path, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A separate, frozen copy of the model is used only to extract embeddings.
num_labels = len(label_encoder.classes_)
embedding_model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=num_labels).to(device)

for param in embedding_model.parameters():
    param.requires_grad = False

embeddings = compute_embeddings(X_train_group, embedding_model, tokenizer, device)
# Convert to NumPy once; the clustering loop below reuses the same array
# instead of copying the tensor to the host twice per candidate k.
embeddings_np = embeddings.cpu().numpy()

# Fit KMeans for several cluster counts and keep the one with the highest
# silhouette score.
candidate_ks = [2, 3, 4, 5, 10, 15] # check for the best one
scores = {}
models = {}
for k in tqdm(candidate_ks, desc="Evaluating cluster sizes"):
    km = KMeans(n_clusters=k, random_state=seed, n_init='auto')
    labels = km.fit_predict(embeddings_np)
    score = silhouette_score(embeddings_np, labels)
    scores[k] = score
    models[k] = (km, labels)
best_k = max(scores, key=scores.get)
kmeans_model, cluster_labels = models[best_k]
print(f"Best K = {best_k} with Silhouette Score = {scores[best_k]:.4f}")

# Positional indices (into X_train_group) of the samples closest to each centre.
selected_indices = select_representative_coreset(embeddings, kmeans_model, cluster_labels, percent_per_cluster)

# Persist the selection using the original DataFrame index labels.
index_df = pd.DataFrame({"selected_indices": sorted(X_train_group.iloc[selected_indices].index)})
index_df.to_csv(f"KMeans_{training_group}_{tokenizer_model}_{seed}.csv", index=False)

embeddings_2d = project_embeddings_2d(embeddings)

plot_kmeans_clusters_2(embeddings_2d, selected_indices, save_path=f"KMeans_k{best_k}_{training_group}_{tokenizer_model}_{seed}.png")

# Build the training set from the coreset; validation uses the full group.
X_train_core = X_train_group.iloc[selected_indices]
print(f"[INFO] Number of selected examples: {len(X_train_core)}")
y_train_core = y_train_group.iloc[selected_indices]
train_dataset = FDCL_dataset(tokenize_function(X_train_core), y_train_core)
valid_dataset = FDCL_dataset(tokenize_function(X_valid_group), y_valid_group)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)

# Fresh model that is fine-tuned on the selected coreset.
model = AutoModelForSequenceClassification.from_pretrained(tokenizer_model, num_labels=num_labels).to(device)

# Learning rate per supported checkpoint.
_LEARNING_RATES = {
    "bert-base-uncased": 2e-5,
    "roberta-base": 2e-5,
    "distilroberta-base": 5e-5,
}
try:
    lr = _LEARNING_RATES[tokenizer_model]
except KeyError:
    # Previously an unsupported name left `lr` undefined and surfaced as a
    # NameError on the optimizer line.
    raise ValueError(
        f"No learning rate configured for tokenizer_model {tokenizer_model!r}; "
        f"expected one of {sorted(_LEARNING_RATES)}."
    ) from None
optimizer = AdamW(model.parameters(), lr=lr)
# Gradient scaling is only used on CUDA; on CPU the scaler is created disabled
# instead of warning and disabling itself.
scaler = GradScaler(enabled=(device.type == "cuda"))

# Autocast is only enabled on CUDA; on a CPU run the hard-coded CUDA autocast
# would merely warn on every batch and be disabled.
use_amp = device.type == "cuda"

for epoch in range(epochs):
    # ---- Training pass over the coreset ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        with autocast(device_type='cuda', enabled=use_amp):
            outputs = model(input_ids, attention_mask=mask, labels=labels)
            loss = outputs.loss
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to the real
        # gradient magnitudes.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_loader):.4f}")

    # ---- Validation pass ----
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids, attention_mask=mask).logits
            pred = torch.argmax(logits, dim=1)
            preds.extend(pred.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    decoded_preds = label_encoder.inverse_transform(preds)
    decoded_labels = label_encoder.inverse_transform(true_labels)
    # Pass `labels` explicitly: without it the report raises when a class is
    # missing from both predictions and ground truth, because the number of
    # observed classes no longer matches `target_names`.
    print(classification_report(
        decoded_labels,
        decoded_preds,
        labels=label_encoder.classes_,
        target_names=label_encoder.classes_,
    ))




# Evaluate the fine-tuned model on the full test set and on each dialect subset.
test_sets = {
    "whole": (X_test['tweet'], y_test),
    "White": (X_test_White['tweet'], y_test_White),
    "AA": (X_test_AA['tweet'], y_test_AA)
}
print(f"\n*** Used code: {code}. Training group: {training_group}. Model: {tokenizer_model}. Seed: {seed} ***")
for name, (Xg, yg) in test_sets.items():
    print(f"\n--- Testing on {name.upper()} ---")
    test_data = FDCL_dataset(tokenize_function(Xg), yg)
    # Use the configured batch size, consistent with the train/valid loaders.
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)
    model.eval()
    preds, labels_all = [], []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f"Testing {name}"):
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            logits = model(input_ids, attention_mask=mask).logits
            pred = torch.argmax(logits, dim=1)
            preds.extend(pred.cpu().numpy())
            labels_all.extend(labels.cpu().numpy())
    decoded_preds = label_encoder.inverse_transform(preds)
    decoded_labels = label_encoder.inverse_transform(labels_all)
    # `labels` is passed explicitly so the report does not raise when a class
    # is absent from both the predictions and the ground truth of a subset.
    print(classification_report(
        decoded_labels,
        decoded_preds,
        labels=label_encoder.classes_,
        target_names=label_encoder.classes_,
    ))

    # SAVE TO EXCEL
    report_dict = classification_report(
        decoded_labels,
        decoded_preds,
        labels=label_encoder.classes_,
        target_names=label_encoder.classes_,
        output_dict=True
    )

    report_df = pd.DataFrame(report_dict).transpose()

    # The accuracy row carries a single value: blank out precision/recall and
    # report the total number of evaluated samples as its support.
    if "accuracy" in report_df.index:
        report_df.loc["accuracy", ["precision", "recall"]] = [float("nan"), float("nan")]
        report_df.loc["accuracy", "support"] = len(decoded_labels)

    report_df["support"] = pd.to_numeric(report_df["support"], errors="coerce").round()

    # NOTE(review): this writes Int64 values back into the existing "support"
    # column; whether the integer dtype survives the assignment depends on
    # pandas' setitem casting rules — confirm the exported values look as
    # intended.
    report_df.loc[report_df.index != "accuracy", "support"] = (
        report_df.loc[report_df.index != "accuracy", "support"].astype("Int64")
    )

    # Round the metric columns to two decimals for the spreadsheet.
    for col in ["precision", "recall", "f1-score"]:
        if col in report_df.columns:
            report_df[col] = pd.to_numeric(report_df[col], errors="coerce").round(2)

    report_df = report_df.astype(str)
    report_df.to_excel(f"classification_report_{code}_{tokenizer_model}_{seed}_{training_group}_{name}.xlsx")
    print("#####################################################")