Connect Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q -U "huggingface_hub[cli]" accelerate bitsandbytes peft transformers trl datasets einops flash_attn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m112.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m544.8/544.8 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for flash_attn (setup.py) ... [?25l[?25hdone


In [None]:
# First, install the necessary libraries.
# The 'trl' library provides the SFTTrainer, which simplifies the fine-tuning process.
# The 'accelerate' library is for distributed training.
# The 'bitsandbytes' library is for efficient quantization.
# The 'peft' library is for Parameter-Efficient Fine-Tuning (PEFT), like LoRA.
# The 'datasets' library is for handling and loading datasets.
# The 'transformers' library is the core for loading the model and tokenizer.
# The 'einops' library is required for the specific Gemma model.
# The 'flash_attn' library is highly recommended for faster training on GPUs.

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

import pandas as pd
from datasets import Dataset

Hugging Face token

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Access your Hugging Face token from Colab's Secrets Manager
# Make sure you have added a secret named 'HF_TOKEN' with your token
my_token = userdata.get('HF_TOKEN')
login(token=my_token)

Dataset

In [None]:
# Load data CSV file into a pandas DataFrame
csv_file_path = "/content/drive/MyDrive/Cyberbullying Classification/cyberbullying_tweets.csv"
df = pd.read_csv(csv_file_path)

# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Display the first few rows of the dataset to verify
print(dataset)
print(dataset[0])

Dataset({
    features: ['tweet_text', 'cyberbullying_type'],
    num_rows: 47692
})
{'tweet_text': 'In other words #katandandre, your food was crapilicious! #mkr', 'cyberbullying_type': 'not_cyberbullying'}


In [None]:
# Define a function to format the dataset into a conversational turn
def format_prompt(sample):
    # This format is a simple example for classification.
    # The goal is to provide a clear instruction for the model to classify the tweet.
    return f"Classify the following tweet as one of the cyberbullying types: 'not_cyberbullying', 'gender', 'religion', 'other_cyberbullying', 'age', or 'ethnicity'.\n\nTweet: {sample['tweet_text']}\n\nCyberbullying Type: {sample['cyberbullying_type']}"

Load the Model and Tokenizer

In [None]:
# We will use the BitsAndBytes library to quantize the model to 4-bit,
# which greatly reduces memory usage.
# We'll use 'bfloat16' for computation if the GPU supports it, for better numerical stability.
# The 'flash_attention_2' flag is enabled for faster training.

model_id = "google/gemma-3-270m"
# Check if a GPU is available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Configuration for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

In [None]:
device

'cuda'

In [None]:
# Load the base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=False
    )
model.config.pretraining_tp = 1  # Required for Gemma

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # To prevent issues with attention masks

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [None]:
# --- 3. Configure PEFT (LoRA) ---
# We define the LoraConfig to set up the fine-tuning parameters.
# 'r' is the rank of the update matrices, a lower rank means fewer trainable parameters.
# 'lora_alpha' is a scaling factor.
# 'target_modules' specifies which parts of the model to apply LoRA to. For Gemma,
# these are typically the attention projection layers.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
model = get_peft_model(model, peft_config)

# Print a summary of the trainable parameters to see the efficiency gain
model.print_trainable_parameters()


trainable params: 5,898,240 || all params: 273,996,416 || trainable%: 2.1527


Configure Training Arguments

In [None]:
# Set up the TrainingArguments. These control the training loop behavior.
training_args = TrainingArguments(
    output_dir="./gemma-dailycnn-lora",
    num_train_epochs=1,  # Number of training epochs
    per_device_train_batch_size=2,  # Reduce batch size
    gradient_accumulation_steps=8,  # Increase accumulation steps
    optim="paged_adamw_32bit",  # Optimizer to use
    save_strategy="epoch",  # Save checkpoint at the end of each epoch
    logging_steps=10,  # Log every N steps
    learning_rate=2e-4,  # Learning rate
    fp16=False,  # Set to True if your GPU supports it
    bf16=True,  # Use bfloat16 for computation if possible
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True, # Enable gradient checkpointing
    gradient_checkpointing_kwargs={'use_reentrant':False} # Recommended for Llama models
)

In [None]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    # tokenizer=tokenizer, # Remove the tokenizer argument from here
    formatting_func=format_prompt,
    args=training_args,
)



Applying formatting function to train dataset:   0%|          | 0/47692 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/47692 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/47692 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/47692 [00:00<?, ? examples/s]

In [None]:
# Start the training
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1, 'bos_token_id': 2, 'pad_token_id': 0}.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33montmerkglobal[0m ([33montmerkglobal-ontmerk[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss
10,4.2401
20,4.2972
30,3.913
40,3.2068
50,2.1212
60,2.743
70,2.3479
80,1.9768
90,1.7385
100,1.2155


TrainOutput(global_step=2981, training_loss=1.7255973730739913, metrics={'train_runtime': 9916.9343, 'train_samples_per_second': 4.809, 'train_steps_per_second': 0.301, 'total_flos': 2609966935961088.0, 'train_loss': 1.7255973730739913, 'entropy': 0.6738779246807098, 'num_tokens': 4086381.0, 'mean_token_accuracy': 0.8965421915054321, 'epoch': 1.0})

99e5711fd26e822ad5f2efcdaa238d488901059f

In [None]:
# Save the fine-tuned model and tokenizer
trainer.save_model("/content/drive/MyDrive/Cyberbullying Classification/gemma-cyberbullying-lora-final-model")
tokenizer.save_pretrained("/content/drive/MyDrive/Cyberbullying Classification/gemma-cyberbullying-lora-final-tokenizer")

('/content/drive/MyDrive/Cyberbullying Classification/gemma-cyberbullying-lora-final-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Cyberbullying Classification/gemma-cyberbullying-lora-final-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Cyberbullying Classification/gemma-cyberbullying-lora-final-tokenizer/tokenizer.model',
 '/content/drive/MyDrive/Cyberbullying Classification/gemma-cyberbullying-lora-final-tokenizer/added_tokens.json',
 '/content/drive/MyDrive/Cyberbullying Classification/gemma-cyberbullying-lora-final-tokenizer/tokenizer.json')

## Model test

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Define the paths where you saved your model and tokenizer
saved_model_path = "/content/drive/MyDrive/Cyberbullying Classification/gemma-cyberbullying-lora-final-model"
saved_tokenizer_path = "/content/drive/MyDrive/Cyberbullying Classification/gemma-cyberbullying-lora-final-tokenizer"
base_model_id = "google/gemma-3-270m"

# Configuration for 4-bit quantization (must match training)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Load the base model with quantization config
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16, # Use the same dtype as training
)

# Load the PEFT model by adding the adapter to the base model
model = PeftModel.from_pretrained(base_model, saved_model_path)

# Merge the LoRA weights with the base model weights
model = model.merge_and_unload()

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(saved_tokenizer_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Define the formatting function (must match the one used during training)
def format_prompt_inference(tweet_text):
    # This format should match the one used during fine-tuning for consistency.
    # We are providing the tweet text and asking the model to complete with the cyberbullying type.
    return f"Classify the following tweet as one of the cyberbullying types: 'not_cyberbullying', 'gender', 'religion', 'other_cyberbullying', 'age', or 'ethnicity'.\n\nTweet: {tweet_text}\n\nCyberbullying Type:"

# Example input text
input_text = "This is a test tweet about age."

# Format the input text
prompt = format_prompt_inference(input_text)

# Tokenize the input
input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate a prediction
# Set max_new_tokens to a small number since we expect a short output (the classification)
with torch.no_grad():
    outputs = model.generate(
        **input_ids,
        max_new_tokens=20, # Adjust based on expected output length
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id # Use eos_token_id for padding
    )

# Decode the output
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the formatted prompt and the generated output
print("--- Prompt ---")
print(prompt)
print("\n--- Generated Output ---")
print(decoded_output)

# Post-process the generated output to extract the classification
# This is a simple example, you might need more robust parsing depending on model output
predicted_output_raw = decoded_output.replace(prompt, "").strip()

# Attempt to extract the predicted type
# This assumes the model outputs the type name directly after "Cyberbullying Type:"
# You might need to adjust this based on the exact model output format
predicted_type = predicted_output_raw.split('\n')[0].strip() # Take the first line after the prompt

# Determine if it's cyberbullying or not
is_cyberbullying = predicted_type.lower() != 'not cyberbullying'

# Print the output in the desired format
print("\n--- Formatted Output ---")
print(f"cyberbullying: {is_cyberbullying}")
print(f"type: {predicted_type}")

--- Prompt ---
Classify the following tweet as one of the cyberbullying types: 'not_cyberbullying', 'gender', 'religion', 'other_cyberbullying', 'age', or 'ethnicity'.

Tweet: This is a test tweet about age.

Cyberbullying Type:

--- Generated Output ---
Classify the following tweet as one of the cyberbullying types: 'not_cyberbullying', 'gender', 'religion', 'other_cyberbullying', 'age', or 'ethnicity'.

Tweet: This is a test tweet about age.

Cyberbullying Type: "not_cyberbullying".

Tweet: This is a test tweet about age.

For the

--- Formatted Output ---
cyberbullying: True
type: "not_cyberbullying".
