In [2]:
import pandas as pd
import torch
import gc

In [5]:
# Pick the inference device once and report which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("CUDA is available. GPU will be used for inference.")
else:
    print("CUDA is not available. Inference will run on CPU.")


# load the datasets
# Raw string: a Windows path in a normal string literal contains invalid
# escape sequences (backslash followed by C, A, p, d, ...), which newer
# Python versions warn about and will eventually reject.
CORPUS_PATH = r"D:\CJ\course\大四下\final project\ARO\ARO Intelligent\preparation\data\corpus.xlsx"
df = pd.read_excel(CORPUS_PATH)

# Empty spreadsheet cells are read as NaN (a float), which the tokenizer
# cannot encode; drop them and make sure every remaining entry is a string.
texts = df["Content"].dropna().astype(str).tolist()

CUDA is available. GPU will be used for inference.


In [9]:

import torch
from transformers import XLMRobertaModel, XLMRobertaTokenizer
from transformers import XLMRobertaForMaskedLM


# extract embeddings
def get_embeddings(model_name, texts, max_length=128, batch_size=32):
    """Encode ``texts`` into one mean-pooled XLM-RoBERTa vector per text.

    Args:
        model_name: ``'xlm-roberta-base'`` loads that checkpoint by name; any
            other value loads the checkpoint stored at the hard-coded local
            ``model_path`` below.
        texts: List of strings to embed.
        max_length: Maximum number of tokens per text; longer inputs are
            truncated.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A CPU tensor with one row per input text (in input order) and one
        column per hidden dimension. For empty ``texts`` the tensor has
        zero rows.
    """
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    model_path = r"D:\\CJ\\course\\大四下\\final project\ARO\ARO Intelligent\\preparation\\model\\xlm_trained_model"

    if model_name == 'xlm-roberta-base':
        model = XLMRobertaModel.from_pretrained(model_name)
    else:
        model = XLMRobertaModel.from_pretrained(model_path)

    # Resolve the device locally so the function does not depend on a global
    # defined in another cell, and actually run the model on the GPU if any.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # set model to evaluation mode

    embeddings = []
    with torch.no_grad():  # do not calculate gradients to save memory
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            # Pad only to the longest text in the batch; padding positions
            # are masked out of the pooling below, so this is purely a speedup.
            inputs = tokenizer(
                batch_texts,
                return_tensors='pt',
                max_length=max_length,
                truncation=True,
                padding=True,
            ).to(device)

            hidden = model(**inputs).last_hidden_state

            # Mean pooling over real tokens only: a plain mean over the
            # sequence axis would also average in the padding positions.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            batch_embeddings = (hidden * mask).sum(dim=1) / token_counts

            # Keep results on the CPU so GPU memory is released per batch and
            # the output can be handed directly to CPU-side libraries.
            embeddings.append(batch_embeddings.cpu())

    if not embeddings:
        # torch.cat raises on an empty list; return an empty matrix instead.
        return torch.empty((0, model.config.hidden_size))

    return torch.cat(embeddings, dim=0)

# Embeddings from the locally stored XLM-R checkpoint: any model name other
# than 'xlm-roberta-base' makes get_embeddings load from its hard-coded
# model_path (this is not mBERT).
xlmr_trained_embeddings = get_embeddings('xlm-roberta-trained-base', texts)

# Embeddings from the 'xlm-roberta-base' checkpoint, loaded by name.
xlmr_embeddings = get_embeddings('xlm-roberta-base', texts)




Some weights of XLMRobertaModel were not initialized from the model checkpoint at D:\\CJ\\course\\大四下\\final project\ARO\ARO Intelligent\\preparation\\model\\xlm_trained_model and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

def cluster_embeddings(embeddings, n_clusters=5):
    """Group the embeddings with K-Means and print the silhouette score.

    Args:
        embeddings: Matrix-like object with one embedding per row.
        n_clusters: Number of K-Means clusters to fit.

    Returns:
        The cluster label assigned to each row of ``embeddings``.
    """
    # Fit K-Means with a fixed random_state.
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    clusterer.fit(embeddings)
    assignments = clusterer.labels_

    # Report the silhouette score of the resulting assignment.
    silhouette_avg = silhouette_score(embeddings, assignments)
    print(f"silhouette score: {silhouette_avg:.2f}")

    return assignments

def visualize_embeddings(embeddings, labels):
    """Project the embeddings to 2-D with t-SNE and show a scatter plot.

    Args:
        embeddings: Matrix-like object with one embedding per row.
        labels: One value per row, used to colour the points.
    """
    # Reduce to two components with a fixed random_state.
    projector = TSNE(n_components=2, random_state=42)
    points_2d = projector.fit_transform(embeddings)
    xs = points_2d[:, 0]
    ys = points_2d[:, 1]

    # Draw the projected points, coloured by label, with a colour bar.
    plt.figure(figsize=(10, 6))
    plt.scatter(xs, ys, c=labels, cmap='viridis', marker='.')
    plt.colorbar()
    plt.show()

# xlmr_embeddings holds the embeddings produced by the 'xlm-roberta-base'
# call to get_embeddings: cluster them, then plot the t-SNE projection.
xlmr_labels = cluster_embeddings(xlmr_embeddings)
visualize_embeddings(xlmr_embeddings, xlmr_labels)

# Same clustering + plot for the embeddings from the locally stored checkpoint.
xlmr_trained_labels = cluster_embeddings(xlmr_trained_embeddings)
visualize_embeddings(xlmr_trained_embeddings, xlmr_trained_labels)
