In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
# Recovery cell: list whatever holds the NVIDIA device files, then force-kill it.
# NOTE(review): in the recorded output the only holder is a `python3` process. If that
# is this notebook's own kernel, the kill below ends the session — confirm before running.
!fuser -v /dev/nvidia*   # Show all processes using the GPU
# SIGKILL the PIDs taken from the first field of each line `fuser` writes to stdout
# (its stderr is discarded).
!kill -9 $(fuser -v /dev/nvidia* 2>/dev/null | awk '{print $1}')

                     USER        PID ACCESS COMMAND
/dev/nvidia0:        root      17831 F...m python3
/dev/nvidiactl:      root      17831 F...m python3
/dev/nvidia-uvm:     root      17831 F...m python3


In [2]:
import os

# Process-wide settings. This cell should run before torch / transformers are
# imported so the libraries see these values when they initialise.
os.environ['WANDB_DISABLED'] = 'true'  # no Weights & Biases logging
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'  # CUDA allocator option against fragmentation
os.environ["HF_HOME"] = "./.huggingface"  # Hugging Face cache directory, relative to the notebook

# "./HF_token" looks like a file path, not a token. HF_TOKEN must contain the token
# string itself, so the path goes into HF_TOKEN_PATH, which huggingface_hub reads
# the token from.
os.environ["HF_TOKEN_PATH"] = "./HF_token"

In [None]:
import numpy as np
from numba import cuda
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sn
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

# **Pretrained Model**

In [2]:

# --- Baseline configuration --------------------------------------------------
BASELINE_SEED = 42              # fixes the split and the batch order
BASELINE_TRAIN_FRACTION = 0.8   # share of every class that goes into training
BASELINE_MAX_LENGTH = 512       # tokenizer truncation length
BASELINE_TRAIN_BATCH_SIZE = 4
BASELINE_EVAL_BATCH_SIZE = 16
BASELINE_NUM_EPOCHS = 3
BASELINE_LEARNING_RATE = 5e-5

torch.manual_seed(BASELINE_SEED)

# --- Load data: one CSV per sentiment class, one text per row ---------------
positive_data = pd.read_csv('positive_sentiment_output.csv', header=None, names=['text'])
negative_data = pd.read_csv('negative_sentiment_output.csv', header=None, names=['text'])
neutral_data = pd.read_csv('neutral_sentiment_output.csv', header=None, names=['text'])

# Add Labels
positive_data['label'] = 1  # Positive sentiment
negative_data['label'] = 0  # Negative sentiment
neutral_data['label'] = 2   # Neutral sentiment

# Combine Data (rows stay grouped by class: positive, negative, neutral)
data = pd.concat([positive_data, negative_data, neutral_data], ignore_index=True)
print("Dataset Loaded: ", data.shape)

# --- Tokenizer ---------------------------------------------------------------
model_name = "fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse"
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="zho_Hans")

# --- LoRA configuration ------------------------------------------------------
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=r".*llm2vec.*(self_attn\.(q|k|v|o)_proj|mlp\.(gate|up|down)_proj).*",
    bias="none",
    task_type="SEQ_CLS"
)

# --- Model -------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Half-precision weights (this is a dtype choice, not quantization). bfloat16 matches
# the fine-tuning cell further down and avoids float16's narrow range during training.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
)
model = get_peft_model(model, lora_config)
model.to(device)


# --- Tokenize data -----------------------------------------------------------
def preprocess_data(data):
    """Tokenize the ``'text'`` column of ``data``.

    Args:
        data: object whose ``data['text']`` yields the input strings.

    Returns:
        The tokenizer output as PyTorch tensors, truncated to
        ``BASELINE_MAX_LENGTH`` tokens and padded to the longest sequence.
    """
    inputs = tokenizer(
        list(data['text']),
        truncation=True,
        padding=True,
        max_length=BASELINE_MAX_LENGTH,
        return_tensors="pt",
    )
    return inputs


def _iter_batches(encodings, targets, batch_size, order=None):
    """Yield ``(inputs, labels)`` mini-batches moved to ``device``.

    ``order`` is an optional index tensor giving the visiting order; without it
    the rows are visited in their stored order.
    """
    if order is None:
        order = torch.arange(targets.shape[0])
    for start in range(0, order.shape[0], batch_size):
        rows = order[start:start + batch_size]
        batch = {key: tensor[rows].to(device) for key, tensor in encodings.items()}
        yield batch, targets[rows].to(device)


print("Tokenizing Data...")
inputs = preprocess_data(data)
labels = torch.tensor(data['label'].values)

# --- Split data into train and test -----------------------------------------
# `data` is ordered by class, so slicing off the last 20% would produce a test set
# holding a single class. Sample the training rows per class instead (stratified)
# and use every remaining row for testing.
train_index = data.groupby('label').sample(
    frac=BASELINE_TRAIN_FRACTION, random_state=BASELINE_SEED
).index
test_index = data.index.difference(train_index)
train_rows = torch.as_tensor(train_index.to_numpy())
test_rows = torch.as_tensor(test_index.to_numpy())
train_size = len(train_rows)

# Tensors stay on the CPU; only the current mini-batch is moved to the GPU.
X_train = {k: v[train_rows] for k, v in inputs.items()}
X_test = {k: v[test_rows] for k, v in inputs.items()}
y_train, y_test = labels[train_rows], labels[test_rows]

# --- Training loop -----------------------------------------------------------
print("Training Model...")
# Only the parameters PEFT left trainable need an optimizer slot.
trainable_parameters = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_parameters, lr=BASELINE_LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()
shuffle_rng = torch.Generator().manual_seed(BASELINE_SEED)
num_train_batches = (train_size + BASELINE_TRAIN_BATCH_SIZE - 1) // BASELINE_TRAIN_BATCH_SIZE

model.train()
for epoch in range(BASELINE_NUM_EPOCHS):
    # A full pass over the training rows in a fresh random order, one
    # mini-batch per optimizer step (a single full-batch forward does not fit).
    order = torch.randperm(train_size, generator=shuffle_rng)
    epoch_loss = 0.0
    batches = _iter_batches(X_train, y_train, BASELINE_TRAIN_BATCH_SIZE, order)
    for batch_inputs, batch_labels in tqdm.tqdm(batches, total=num_train_batches, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        outputs = model(**batch_inputs)
        # Compute the loss in float32 regardless of the model's compute dtype.
        loss = criterion(outputs.logits.float(), batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * batch_labels.size(0)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss / train_size:.4f}")

# --- Evaluation --------------------------------------------------------------
print("Evaluating Model...")
model.eval()
pred_chunks = []
with torch.no_grad():
    for batch_inputs, _ in _iter_batches(X_test, y_test, BASELINE_EVAL_BATCH_SIZE):
        outputs = model(**batch_inputs)
        pred_chunks.append(torch.argmax(outputs.logits, dim=-1).cpu())
preds = torch.cat(pred_chunks)

y_true = y_test.cpu().numpy()
y_pred = preds.numpy()
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

# Compute the confusion matrix; fixing `labels` keeps it 3x3 to match the tick labels.
conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1, 2])

# Plot the confusion matrix as a heatmap.
# seaborn is imported as `sn` in the import cell, so `sns` is not defined here.
plt.figure(figsize=(8, 6))
sn.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False,
           xticklabels=["Class 0", "Class 1", "Class 2"],
           yticklabels=["Class 0", "Class 1", "Class 2"])
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix - NLLB-LLM2Vec (Test Set)")
plt.show()


Dataset Loaded:  (10139, 2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of NLLBLLM2VecForSequenceClassification were not initialized from the model checkpoint at fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing Data...


# **Fine-Tune**

In [6]:
import pandas as pd
from datasets import Dataset, Features, Value, ClassLabel

# Schema shared by both datasets: free text plus a three-way sentiment class label.
features = Features({
    'text': Value('string'),
    'label': ClassLabel(num_classes=3, names=['Negative', 'Neutral', 'Positive']),
})

# Load the first dataset and rename its columns to the names the rest of the code expects.
code_switched_df = pd.read_csv('updated_sentiment_output.csv').rename(
    columns={'Text': 'text', 'Sentiment': 'label'}
)
cs_dataset = Dataset.from_pandas(code_switched_df).cast(features)

print("Code Switched Dataset:")
print(cs_dataset[:10])

# Load the second dataset and drop every column other than text / label.
columns_to_keep = ['text', 'label']
canto_df = pd.read_csv('merged_sentiment_dataset_zh.csv')[columns_to_keep]
canto_dataset = Dataset.from_pandas(canto_df).cast(features)

print("Cantonese Dataset:")
print(canto_dataset[:10])


Casting the dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Code Switched Dataset:
{'text': [' okay 哎 文 平 你 好 okay', ' okay 好 你 你 介 绍 先 啦', ' 哦 okay 我 呢 就 是 永 辉 啦 win frey 然 后 就 是 也 是 malaysian 一 个 然 后 就 是 来 自 ipoh 的 <v-noise> 然 后 呢 我 本 身 已 经 在 就 是', ' 在 年 中 已 已 经 graduate 了 了', ' okay hum', ' 哦 actually 我 已 经 就 是 erm two three erm two three months 这 样 啦 then before that 我 有 一 个 工 作 就 是 在 那 个', ' 做 那 种 customer service 的 representative lah 就 是 在 在 yio chu kang 那 边 的 hum 然 后 现 在 就 是 想 回 来 找 找 工 作', ' 哦 <v-noise> 之 前 啊 是 在 那 个 叫 apple 啊', ' 美 国 的 那 个 apple 公 司', ' 哪 里 你 知 道 吗 卖 那 个 iphone lah itouch 那 个'], 'label': [2, 2, 2, 1, 2, 1, 1, 1, 1, 1]}


Casting the dataset:   0%|          | 0/21201 [00:00<?, ? examples/s]

Cantonese Dataset:
{'text': ["购买的版本是17年第18次印刷 不算英文版的错误，中文版错误百出：漏词、曲解原意、译文不通顺、还有直接性错误（比如原文是字符串，中文版是布尔值），根本不是无意的错误，就是态度不负责，一心捞钱，都他妈第18次印刷了还能这么多错误！！ 目前看到第9章，对着英文版和o'reilly的勘误表，真尼玛一路笑着看过来的！！ 最后说一句，翻译和出版社负责人亲妈直播原地爆炸！！！操你妈逼好好一本书弄成臭狗屎，草泥马！！", '卫生 条件 差 服务 态度 差 前 台长 得很 丑 标 间里 只有 一 条 浴巾 房间 电视 打不 开 也 不 给 修 … 差差差差差差差差', '早餐 能 多 烂 就 多 烂 ! 这些年 吃 过 最 坑 爹 的 早餐 、 宵夜 也 没有 什么 吃 的 、 万般无奈 想 来 碗 泡面 都 没有 ! 住了 一 天 就 跑 了 受不了了', '太小了 ，拿到快递时，我他么都想扔了，一毛钱的硬币有你的这个钉子两个大了，这个你能指望挂什么东西。。。擦。最后悔的一次购物，垃圾亚马逊，垃圾商家。', '158元 看着便宜才选的一块表 也没指望他能给你惊喜 收到货也确实如此 关键是物流太垃圾了 延误了4天 计划周六到结果在我催了好几次的情况下周二才到 物流不显示到哪里了 到底为什么延误全靠猜 奇葩的亚马逊 没单号查不到物流 自己催也没办法催 你只有傻傻的等着 延误了4天连个为什么也不说 真想直接退了算了 想想退货也是很麻烦的 算了 这样下去我看亚马逊一辈子也赶不上 京东和天猫', '美 团团 购 的 , 看 了 好多 遍 , 还是 理解 错误 , 90 块 的 团购 卖到 158 , 丸类 还 没有 , 被 简略 归并 入 滑类 , 服务员 还 说 的 理直气壮 的 , 这 还 是 点评 必须 给 星星 , 照 我 说 一 颗 星 都 不 该 给 , 概 不 得 生意 这么 差 , 一 晚上 就 4 桌 , 整个 团 购 就 70 人 , 估计 不少 还是 我 这 种 上当 的', '如果 不 想 得病 建议 还是 不要 住 这 家 , 附近 别的 酒店 没 房间 了 选择 了 这 家 。 一 进 房间 就 有 一 股子 怪味 。 房间 很 小 , 很脏 : 枕头 发黄 , 床单 被罩 显然 没有 换 过 , 还有

In [16]:
# Checkpoint name reused by the tokenizer here and by the model-loading cell.
model_name = "fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): src_lang is "yue_Hant" here while the baseline cell used "zho_Hans";
# confirm it matches the language of the dataset that gets tokenized next.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="yue_Hant",  # change the source language accordingly
)

In [17]:
def preprocess_function(examples):
    """Tokenize a batch of rows.

    Reads ``examples['text']`` and returns the tokenizer output, with every
    sequence truncated or padded to exactly 256 tokens.
    """
    text_batch = examples['text']
    return tokenizer(
        text_batch,
        max_length=256,
        padding="max_length",
        truncation=True,
    )


# batched=True passes many rows per call instead of one row at a time.
tokenized_dataset = cs_dataset.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/10136 [00:00<?, ? examples/s]

In [19]:
# Stratified 80/20 split with a fixed seed so the held-out set is reproducible.
# The resulting DatasetDict gets its own name: binding it to `train_test_split`
# would overwrite the scikit-learn function of that name imported at the top.
dataset_splits = tokenized_dataset.train_test_split(
    test_size=0.2,
    stratify_by_column='label',
    seed=42,
)
train_dataset = dataset_splits['train']
test_dataset = dataset_splits['test']

In [18]:
import gc

from peft.tuners.lora.config import LoraConfig

# The recorded run of this cell hit a CUDA out-of-memory error while this same process
# already had ~39 GiB allocated — presumably the baseline model, its optimizer state
# and the GPU-resident inputs from the earlier stage. Drop those references and hand
# the memory back to the driver before loading another copy of the 8B checkpoint.
for _stale_name in ("model", "optimizer", "outputs", "loss", "X_train", "X_test"):
    globals().pop(_stale_name, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Adjust LoRA settings for sentiment analysis
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=r".*llm2vec.*(self_attn\.(q|k|v|o)_proj|mlp\.(gate|up|down)_proj).*",
    bias="none",
    task_type="SEQ_CLS"
)

# Load model and apply LoRA
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # Sentiment analysis has three classes
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = get_peft_model(model, lora_config)
# Assign the result instead of ending the cell on a bare expression: a trailing
# `model.to(device)` is stored in IPython's Out[] cache, which keeps the model
# alive after `model` is rebound.
model = model.to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of NLLBLLM2VecForSequenceClassification were not initialized from the model checkpoint at fdschmidt93/NLLB-LLM2Vec-Meta-Llama-31-8B-Instruct-mntp-unsup-simcse and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 30.81 MiB is free. Process 19186 has 39.53 GiB memory in use. Of the allocated memory 38.98 GiB is allocated by PyTorch, and 39.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Turn a Trainer evaluation result into a dictionary of metrics.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is either an
            array of per-class scores or a tuple whose first element is that array.

    Returns:
        dict with ``accuracy`` plus class-weighted ``precision``, ``recall`` and
        ``f1`` — the same figures the final evaluation cell prints.
    """
    predictions, labels = eval_pred

    # Some models return a tuple of outputs; the class scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]  # Extract logits

    predictions = predictions.argmax(axis=-1)  # Get predicted class index
    labels = labels.astype(int)

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 scores a class that was never predicted as 0 without a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    return {
        "accuracy": accuracy,
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }

In [None]:
# Training configuration for the LoRA fine-tune.
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,  # effective train batch = 4 * 4 per device
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,  # Evaluate every 500 steps
    save_steps=500,  # Save model every 500 steps (kept equal to eval_steps for best-model tracking)
    learning_rate=5e-5,
    # The model is loaded with torch_dtype=torch.bfloat16, so mixed precision must
    # use bfloat16 as well; fp16 autocast would not match the weights' dtype.
    bf16=True,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True,
    report_to='none',  # Disable logging to all integrations
    logging_dir="./logs",
    logging_steps=500,
)


# Wire model, data splits and metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Return PyTorch's cached-but-unused GPU blocks before the run starts.
torch.cuda.empty_cache()
trainer.train()

# Write the trained model and its tokenizer into the same directory.
finetuned_dir = "./fine_tuned_NLLB-LLM2Vec-cs"
trainer.save_model(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

In [None]:
# Run the trainer's model over the test split.
prediction_output = trainer.predict(test_dataset)

# Unpack the raw model outputs and the gold label ids for the cells that follow.
predictions, labels = prediction_output.predictions, prediction_output.label_ids

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Always start from the raw Trainer output rather than from `predictions`, which this
# cell overwrites: the cell can then be re-run without indexing its own result again.
raw_predictions = prediction_output.predictions
labels = prediction_output.label_ids

# Some models return a tuple of outputs; the class scores come first. A plain array
# is used as it is (indexing it with [0] would select the first row).
if isinstance(raw_predictions, tuple):
    raw_predictions = raw_predictions[0]
raw_predictions = np.asarray(raw_predictions)

# Convert per-class scores to class indices when the array is 2-dimensional.
if raw_predictions.ndim == 2:
    predictions = np.argmax(raw_predictions, axis=1)
else:
    predictions = raw_predictions

# Verify shapes
print("Labels shape:", labels.shape)
print("Predictions shape:", predictions.shape)

# Compute metrics; zero_division=0 scores a never-predicted class as 0 without a warning.
accuracy = accuracy_score(labels, predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    labels, predictions, average='weighted', zero_division=0
)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Per-class precision / recall / F1 table; rows are named in label-id order.
class_names = ["Negative", "Neutral", "Positive"]
report = classification_report(
    labels,
    predictions,
    target_names=class_names,
)

print(report)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Counts of true class (rows) against predicted class (columns).
conf_matrix = confusion_matrix(labels, predictions)

# Draw the counts as an annotated heatmap on an explicit figure / axes pair.
sentiment_names = ["Negative", "Neutral", "Positive"]
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=sentiment_names,
    yticklabels=sentiment_names,
    ax=ax,
)
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
ax.set_title("Confusion Matrix (NLLB-LLM2Vec)")
plt.show()