In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install transformers peft datasets evaluate

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.

## LoRA model with a mixed dataset (small size)

### Prepare data

In [3]:
source_dir = '/content/gdrive/MyDrive/Colab_Notebooks/ML-LoRA-E5/'

In [4]:
import os
import pandas as pd
import torch

In [5]:
seed = 2024
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [23]:
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(False)

In [None]:
# load Yahoo news
yahoo_news = pd.read_csv(os.path.join(source_dir, "mix_data/yahoo.csv"))
yahoo_rewritten = pd.read_csv(os.path.join(source_dir, "mix_data/yahoo_rewritten.csv"))

matching_rows = yahoo_rewritten.index[yahoo_rewritten['0'] == yahoo_rewritten['rewritten_text']]
matching_rows

yahoo_news.drop(matching_rows, axis=0, inplace=True)
yahoo_news['labels'] = 0
yahoo_news = yahoo_news[['0', 'labels']]
yahoo_news.columns = ['text', 'labels']
yahoo_news.drop_duplicates(subset='text', inplace=True)
yahoo_news.head()

yahoo_rewritten['labels'] = 1
yahoo_rewritten = yahoo_rewritten[['rewritten_text', 'labels']]
yahoo_rewritten.columns = ['text', 'labels']
yahoo_rewritten.drop_duplicates(subset='text', inplace=True)
yahoo_rewritten.head()

yahoo_df = pd.concat([yahoo_news, yahoo_rewritten], axis=0, ignore_index=True)
display(yahoo_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yahoo_news.drop_duplicates(subset='text', inplace=True)


Unnamed: 0,text,labels
0,NASA chief and Democratic lawmakers urge inves...,0
1,2024 election updates: Melania Trump speaks at...,0
2,Trump backer calls Harris the ‘antichrist’ as ...,0
3,"Yen Weakens, Japan Stocks Face Pressure on Ele...",0
4,Walz plays 'Madden' with AOC on Twitch as Harr...,0
...,...,...
5190,Exploring Apple Intelligence in iOS 18.1: Disc...,1
5191,7 Timeless French Recipes by Julia Child That ...,1
5192,"A year after Matthew Perry's passing, is Matt ...",1
5193,Disgusting ‘Friends’ Thanksgiving dish that ‘t...,1


In [None]:
# load Kaggle datasets
arguGPT = pd.read_csv(os.path.join(source_dir, "mix_data/ArguGPT_Moth.csv"))
DAIGT_DK = pd.read_csv(os.path.join(source_dir, "mix_data/DAIGT_DarekKLeczek.csv"))

arguGPT['labels'] = 1
arguGPT = arguGPT[['text', 'labels']]
arguGPT.columns = ['text', 'labels']
arguGPT.drop_duplicates(subset='text', inplace=True)
arguGPT.head()

DAIGT_DK = DAIGT_DK[['text', 'label']]
DAIGT_DK.columns = ['text', 'labels']
DAIGT_DK.drop_duplicates(subset='text', inplace=True)
DAIGT_DK.head()

kaggle_df = pd.concat([arguGPT, DAIGT_DK], axis=0, ignore_index=True)
display(kaggle_df)

In [None]:
df = pd.concat([yahoo_df, kaggle_df], axis=0, ignore_index=True)
print(df['labels'].value_counts())

labels
0    32173
1    21215
Name: count, dtype: int64


In [None]:
df = df.sample(frac=1).reset_index(drop=True)
df.to_csv(os.path.join(source_dir, 'mix_data/mix_data.csv'), index=False)

In [None]:
"""
# downsampling 0-label to balance data (needed or not?)

# Assume 'category_column' is the column with categories, and 'target_category' is the category to downsample
target_count = df['labels'].value_counts().min()  # Target count to balance data

# Separate the categories
balanced_data = []
for category in df['labels'].unique():
    category_data = df[df['labels'] == category]

    # Downsample if the category has more data than the target count
    if len(category_data) > target_count:
        category_data = category_data.sample(target_count, random_state=42)  # Set random_state for reproducibility

    balanced_data.append(category_data)

# Combine all balanced categories into one DataFrame
df = pd.concat(balanced_data).reset_index(drop=True)
"""

### Load data into dataset

In [None]:
from datasets import load_dataset

In [None]:
# loading dataset
dataset = load_dataset('csv', data_files=os.path.join(source_dir, 'mix_data/mix_data.csv'))

def is_valid_text(example):
    return example['text'] is not None and example['labels'] is not None

dataset = dataset["train"].filter(is_valid_text) # drop rows with missing value
display(dataset)
dataset = dataset.class_encode_column('labels')

Dataset({
    features: ['text', 'labels'],
    num_rows: 53388
})

In [None]:
dataset = dataset.train_test_split(test_size=0.2, stratify_by_column='labels')
print("Train label distribution:", dataset["train"].to_pandas()["labels"].value_counts())
print("Test label distribution:", dataset["test"].to_pandas()["labels"].value_counts())

Train label distribution: labels
0    25738
1    16972
Name: count, dtype: int64
Test label distribution: labels
0    6435
1    4243
Name: count, dtype: int64


### Select raw model

In [None]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Helper function to count trainable parameters
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# Helper function make inference with a given pre-trained model
def tokenize_data(example, tokenizer):
    return tokenizer(example['text'], truncation=True, max_length=512)

def custom_collate_fn(features):
    # Remove the 'text' and 'labels' fields to prevent errors
    filtered_features = [
        {k: v for k, v in feature.items() if k in ['input_ids', 'attention_mask', 'token_type_ids']}
        for feature in features
    ]
    return data_collator(filtered_features)

def inference_model(model_name, test_dataset):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    raw_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    raw_model.to(device)
    raw_model.eval()
    for module in raw_model.modules():
        if isinstance(module, (torch.nn.Dropout, torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.LayerNorm)):
            module.eval()
    #data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    test_data = test_dataset.map(lambda x: tokenize_data(x, tokenizer), batched=True)
    dataloader = DataLoader(test_data, batch_size=1, collate_fn=custom_collate_fn, shuffle=False)
    predictions = []
    with torch.no_grad():
      for batch in dataloader:
          batch = {k: v.to(raw_model.device) for k, v in batch.items()}
          outputs = raw_model(**batch)
          logits = outputs.logits
          batch_predictions = logits.argmax(dim=-1)
          predictions.extend(batch_predictions.cpu().numpy())
    true_labels = test_data["labels"]
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average="weighted")
    return accuracy, f1


In [None]:
# LoRA configuration (parameters can be adjusted)
# the rank you want to decompose matrices
r = 2 # take as an example. larger r more trainable parameters
lora_config = LoraConfig(task_type=TaskType.SEQ_CLS,
                         r=r,               # Low-rank adaptation rank
                         lora_alpha=1,     # Scaling factor
                         lora_dropout=0.1,  # Dropout for LoRA
                         target_modules = ['query', 'key'],
                         )

#### 1. BERT model with LoRA

In [None]:
b_model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-cased',
    num_labels=2
)
b_model.to(device)
bert_model = get_peft_model(b_model, lora_config)
print_trainable_parameters(bert_model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 75266 || all params: 108387076 || trainable%: 0.0694418585477848


#### RoBERTa model with LoRA

In [None]:
r_model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2
)
roberta_model = get_peft_model(r_model, lora_config)
print_trainable_parameters(roberta_model)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 665858 || all params: 125313028 || trainable%: 0.531355766137899




#### DeBERTa model with LoRA

In [None]:
from transformers import DebertaForSequenceClassification

In [None]:
d_model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/deberta-base",
    num_labels=2
)
#for name, module in d_model.named_modules():
#    print(name)
lora_config_dberta = LoraConfig(task_type=TaskType.SEQ_CLS,
                         r=r,               # Low-rank adaptation rank
                         lora_alpha=1,     # Scaling factor
                         lora_dropout=0.1,  # Dropout for LoRA
                         target_modules = ['attention.self.in_proj']
                         )
deberta_model = get_peft_model(d_model, lora_config_dberta)
print_trainable_parameters(deberta_model)

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 75266 || all params: 139269124 || trainable%: 0.05404356532033618


#### E5 model with LoRA

In [None]:
e_model = AutoModelForSequenceClassification.from_pretrained("intfloat/e5-small")

lora_config_e5 = LoraConfig(task_type=TaskType.SEQ_CLS,
                         r=r,               # Low-rank adaptation rank
                         lora_alpha=1,     # Scaling factor
                         lora_dropout=0.1,  # Dropout for LoRA
                         target_modules = ['attention.self.query','attention.self.key']
                         )
e5_model = get_peft_model(e_model, lora_config_e5)
print_trainable_parameters(e5_model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 37634 || all params: 33398404 || trainable%: 0.11268203115334494


In [None]:
inference_model("intfloat/e5-small", dataset['test'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(0.41824311668851843, 0.41355024383194977)

In [None]:
inference_model("intfloat/e5-small", dataset['test'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(0.46834613223450083, 0.4492341443571796)

### Train data with E5 model with LoRA

In [7]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, AutoModelForSequenceClassification
import evaluate
from datasets import load_dataset

import torch
import pandas as pd
import numpy as np

import os

In [8]:
source_dir = '/content/gdrive/MyDrive/Colab_Notebooks/ML-LoRA-E5/'

In [9]:
dataset = load_dataset('csv', data_files=os.path.join(source_dir, 'mix_data/mix_data.csv'))

def is_valid_text(example):
    return example['text'] is not None and example['labels'] is not None

dataset = dataset["train"].filter(is_valid_text) # drop rows with missing value
display(dataset)
dataset = dataset.class_encode_column('labels')

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/53388 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels'],
    num_rows: 53388
})

Stringifying the column:   0%|          | 0/53388 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/53388 [00:00<?, ? examples/s]

In [10]:
dataset = dataset.train_test_split(test_size=0.2, stratify_by_column='labels')
print("Train label distribution:", dataset["train"].to_pandas()["labels"].value_counts())
print("Test label distribution:", dataset["test"].to_pandas()["labels"].value_counts())

Train label distribution: labels
0    25738
1    16972
Name: count, dtype: int64
Test label distribution: labels
0    6435
1    4243
Name: count, dtype: int64


In [11]:
def tokenize_function(examples):
    try:
        return tokenizer(examples["text"], max_length=512, truncation=True)
    except Exception as e:
        print("Error during tokenization:", e)
        print("Offending examples:", examples["text"])
        raise e

tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-small")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/362 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



Map:   0%|          | 0/42710 [00:00<?, ? examples/s]

Map:   0%|          | 0/10678 [00:00<?, ? examples/s]

In [14]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [49]:
e_model = AutoModelForSequenceClassification.from_pretrained("intfloat/e5-small", num_labels=2)

#for name, module in e_model.named_modules():
#    print(name)
r = 8
lora_config_e5 = LoraConfig(task_type=TaskType.SEQ_CLS,
                         r=r,               # Low-rank adaptation rank
                         lora_alpha=2*r,     # Scaling factor
                         lora_dropout=0.1,  # Dropout for LoRA
                         target_modules = ['attention.self.query','attention.self.key']
                         )
e5_model = get_peft_model(e_model, lora_config_e5)
print_trainable_parameters(e5_model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 148226 || all params: 33508996 || trainable%: 0.44234688499768837


In [50]:
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

In [51]:
training_args = TrainingArguments(
    output_dir=os.path.join(source_dir, 'mix_data/results_LoRA_e5'),
    overwrite_output_dir=True,
    run_name='LoRA-E5-no-filter',
    #save_strategy="epoch",
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    #learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=1,
    group_by_length=True,
    num_train_epochs=10
)



In [52]:
trainer = Trainer(
    model=e5_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],  # Use test split as validation data
    data_collator=data_collator,  # Dynamic padding
    compute_metrics=compute_metrics  # Evaluation metrics
)

In [53]:
os.environ["WANDB_DISABLED"] = "false"
os.environ["WANDB_MODE"] = "dryrun"

In [54]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1268,0.147354,0.942311
2,0.1016,0.161432,0.943529
3,0.0787,0.11578,0.957483
4,0.0753,0.158045,0.948773
5,0.0692,0.108834,0.962446
6,0.0766,0.117967,0.959824
7,0.0585,0.148705,0.953549
8,0.0649,0.141469,0.955703
9,0.0557,0.124927,0.95973
10,0.0565,0.125467,0.95973


TrainOutput(global_step=53390, training_loss=0.09535855480650655, metrics={'train_runtime': 4582.3386, 'train_samples_per_second': 93.206, 'train_steps_per_second': 11.651, 'total_flos': 2.0005699680552624e+16, 'train_loss': 0.09535855480650655, 'epoch': 10.0})

In [56]:
trainer.save_model(os.path.join(source_dir, 'mix_data/final_model'))