In [None]:
# ==================== Installation ====================
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install psutil
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
!pip install flash-attn --no-build-isolation
!pip install flash-attention==2.8.2
!pip install scikit-learn  # For language-balanced sampling

In [None]:
# ==================== Imports ====================
import torch
import random
import pandas as pd
import numpy as np
import json
from collections import Counter
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from unsloth import FastModel
from unsloth.trainer import SFTTrainer
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments, TextStreamer

In [None]:
# ==================== Configuration ====================
SEED = 42

# Seed every RNG the notebook relies on: Python, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices)
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Token budget per example; must fit the chat-formatted prompt plus the JSON answer
MAX_SEQ_LENGTH = 192  # Increased from 128 to handle longer texts + JSON

In [None]:
# ==================== 1. Load & Prepare Model ====================
print("Loading model and tokenizer...")

# Base checkpoint settings: 4-bit weights, bf16 dtype, FlashAttention-2 kernels
base_model_kwargs = dict(
    model_name="unsloth/Qwen2.5-0.5B-Instruct",
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.bfloat16,
    load_in_4bit=True,
    attn_implementation="flash_attention_2",
)
model, tokenizer = FastModel.from_pretrained(**base_model_kwargs)

# Fall back to EOS as the padding token when the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# LoRA is attached to the attention projections, the MLP projections and the output head
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]
output_head = ["lm_head"]  # included so the output format can be adapted as well

model = FastModel.get_peft_model(
    model,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections + output_head,
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
)

In [None]:
# ==================== 2. Load & Prepare Dataset ====================
# Reads the raw CSV from the working directory. The rest of the notebook reads
# three columns from it: "text", "lang" and "label".
print("Loading and formatting dataset...")
df = pd.read_csv("multilingual_sentiment.csv")

# Check dataset structure: overall shape, the distinct languages, and how the labels are distributed
print(f"Dataset shape: {df.shape}")
print(f"Languages: {df['lang'].unique()}")
print(f"Label distribution:\n{df['label'].value_counts()}")

# ==================== CRITICAL: Language-Balanced Sampling ====================
# Every language is resampled to a common row count before formatting and splitting.
print("Applying language-balanced sampling...")

def balance_dataset_by_language(df, target_samples_per_lang=None, random_state=None):
    """
    Resample ``df`` so every language contributes the same number of rows.

    Args:
        df: DataFrame with a 'lang' column. Rows whose 'lang' is missing are
            ignored (they cannot be assigned to any language bucket).
        target_samples_per_lang: Rows to keep per language. Defaults to the
            median per-language count, which limits both over- and under-sampling.
        random_state: Seed for sampling and the final shuffle. When None, the
            notebook-level ``SEED`` constant is used.

    Returns:
        A shuffled DataFrame with ``target_samples_per_lang`` rows per language.
        Languages with fewer rows are over-sampled with replacement; languages
        with at least that many rows are sampled without replacement, so a
        language that already has exactly the target count keeps all its rows.
        An empty DataFrame is returned when no row has a language.
    """
    seed = SEED if random_state is None else random_state

    lang_counts = df['lang'].value_counts()
    print(f"Original language distribution:\n{lang_counts}")

    # Nothing to balance: avoid int(NaN) from the median of an empty series
    if lang_counts.empty:
        return df.dropna(subset=['lang']).copy()

    if target_samples_per_lang is None:
        # Use the median count to avoid over/under sampling too much
        target_samples_per_lang = int(lang_counts.median())

    balanced_dfs = []
    # dropna() keeps first-appearance order but skips the NaN bucket, which
    # would otherwise select zero rows and make .sample() raise
    for lang in df['lang'].dropna().unique():
        lang_df = df[df['lang'] == lang]
        # Only draw with replacement when there are genuinely too few rows;
        # with exactly enough rows, this keeps each one once.
        needs_oversampling = len(lang_df) < target_samples_per_lang
        balanced_dfs.append(
            lang_df.sample(
                n=target_samples_per_lang,
                replace=needs_oversampling,
                random_state=seed,
            )
        )

    balanced_df = pd.concat(balanced_dfs, ignore_index=True)
    balanced_df = balanced_df.sample(frac=1, random_state=seed)  # Shuffle

    print(f"Balanced language distribution:\n{balanced_df['lang'].value_counts()}")
    return balanced_df

# Balance the dataset (default target: median per-language row count)
# NOTE(review): balancing runs before the train/val/test split and may
# over-sample with replacement, so copies of the same row can land in
# different splits — confirm this is acceptable for evaluation.
df_balanced = balance_dataset_by_language(df)

# ==================== IMPROVED: Format data for instruction tuning ====================
def format_for_sft(row, chat_tokenizer=None):
    """
    Turn one dataset row into a chat-formatted training example.

    Args:
        row: Mapping / Series with "text", "lang" and "label" entries.
        chat_tokenizer: Object exposing ``apply_chat_template``. When None,
            the notebook-level ``tokenizer`` is used.

    Returns:
        ``{"text": <str>}`` where the string is the system / user / assistant
        conversation rendered through the tokenizer's chat template. The
        assistant turn is a JSON object with "language" and "sentiment" keys.
    """
    import json  # local import keeps the function self-contained

    if chat_tokenizer is None:
        chat_tokenizer = tokenizer

    # Language and sentiment are taken directly from the row's columns
    text = row["text"]
    lang = row["lang"]
    sentiment = row["label"]

    # Serialize with json.dumps so quotes/backslashes in the values are escaped
    # and the target is always valid JSON. str() keeps both values as JSON
    # strings; ensure_ascii=False leaves non-ASCII characters untouched.
    target_json = json.dumps(
        {"language": str(lang), "sentiment": str(sentiment)},
        ensure_ascii=False,
    )

    # Create instruction with system message format
    messages = [
        {"role": "system", "content": "You are a multilingual sentiment analyzer. Analyze the text and output valid JSON with language and sentiment tags. Only output the JSON, no additional text."},
        {"role": "user", "content": f"Text: {text}"},
        {"role": "assistant", "content": target_json},
    ]

    # Render the full conversation (assistant answer included, so no generation prompt)
    formatted = chat_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )

    return {"text": formatted}

# Render every balanced row into its chat-formatted training string
print("Formatting data for instruction tuning...")
formatted_data = [format_for_sft(row) for _, row in df_balanced.iterrows()]
formatted_df = pd.DataFrame(formatted_data)

# Next step: split into train / validation / test while keeping the language mix
print("Splitting dataset...")
def stratified_split_by_language(formatted_df, lang_column, test_size=0.15, val_size=0.15, random_state=42):
    """
    Split into train / validation / test while preserving the language mix.

    Args:
        formatted_df: DataFrame of formatted examples.
        lang_column: Name of the column holding the language used for
            stratification. If ``formatted_df`` already has this column it is
            used as-is; otherwise the languages are taken from the
            notebook-level ``df_balanced['lang']``, which must be row-aligned
            with ``formatted_df``.
        test_size: Fraction of all rows assigned to the test split.
        val_size: Fraction of all rows assigned to the validation split.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        ``(train_df, val_df, test_df)`` with the language column removed.
    """
    working_df = formatted_df.copy()
    if lang_column not in working_df.columns:
        # The formatted frame usually carries only "text"; re-attach languages
        # positionally from the frame it was built from.
        working_df[lang_column] = df_balanced['lang'].values

    # First split: train vs. (validation + test)
    train_df, temp_df = train_test_split(
        working_df,
        test_size=test_size + val_size,
        random_state=random_state,
        stratify=working_df[lang_column]  # Stratify by language
    )

    # Second split: divide the held-out part into validation and test
    val_ratio = val_size / (test_size + val_size)
    val_df, test_df = train_test_split(
        temp_df,
        test_size=1 - val_ratio,
        random_state=random_state,
        stratify=temp_df[lang_column]  # Stratify by language
    )

    # The language column was only needed for stratification
    train_df = train_df.drop(columns=[lang_column])
    val_df = val_df.drop(columns=[lang_column])
    test_df = test_df.drop(columns=[lang_column])

    return train_df, val_df, test_df

# Stratified 70/15/15 split (defaults of stratified_split_by_language)
train_df, val_df, test_df = stratified_split_by_language(formatted_df, lang_column='lang')

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

# Give each split a fresh 0..n-1 index so no stale index column sneaks in
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

# Wrap the three pandas frames as Hugging Face datasets
print("Creating DatasetDict...")
dataset_dict = DatasetDict({
    split_name: Dataset.from_dict(split_df.to_dict('list'))
    for split_name, split_df in (
        ('train', train_df),
        ('eval', val_df),
        ('test', test_df),
    )
})

print("Dataset created successfully!")

In [None]:
# ==================== 4. Configure Data Collator ====================
# Token ids of the opening of the JSON answer ('{"language":').
# NOTE(review): response_template is only printed here; it is not passed to the
# collator below or used anywhere else in this notebook. If the intent was to
# compute loss on the assistant answer only, that masking is not wired up — confirm.
response_template = tokenizer('{"language":', add_special_tokens=False)['input_ids']
print(f"Response template tokens: {response_template}")

# Causal-LM collator (mlm=False); batches are padded to a multiple of 8 tokens.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)

In [None]:
# ==================== 5. Training Arguments - OPTIMIZED for L4 ====================
# Hyper-parameters for the fine-tuning run; checkpoints go to ./qwen_sentiment_finetuned.
training_args = TrainingArguments(
    output_dir="./qwen_sentiment_finetuned",
    num_train_epochs=1,                # Single pass over the training split
    per_device_train_batch_size=192,     # NOTE(review): large per-device batch — confirm it fits in GPU memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,     # Effective batch size per device = 192 * 1 = 192
    learning_rate=2e-4,
    warmup_ratio=0.1,
    logging_steps=10,
    greater_is_better=False,           # NOTE(review): no metric_for_best_model is set here — confirm this is needed
    fp16=False,
    bf16=True, # bfloat16 mixed precision (fp16 is disabled above)
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=SEED,
    report_to="none",                  # No experiment-tracker reporting
    dataloader_num_workers=2,
    gradient_checkpointing=True,       # Enabled for L4 memory savings
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

In [None]:
# Pre-tokenize everything before trainer initialization
def pre_tokenize_dataset(dataset):
    """
    Tokenize the "text" column of ``dataset`` and drop the raw text.

    Each example is truncated and padded to exactly MAX_SEQ_LENGTH tokens.
    Mapping runs batched in a single process.
    """
    def _encode_batch(batch):
        return tokenizer(
            batch["text"],
            truncation=True,
            max_length=MAX_SEQ_LENGTH,
            padding="max_length",
        )

    map_options = {
        "batched": True,
        "num_proc": 1,  # Force single process
        "remove_columns": ["text"],
    }
    return dataset.map(_encode_batch, **map_options)

# Tokenize the train and validation splits up front
tokenized_train = pre_tokenize_dataset(dataset_dict["train"])
tokenized_eval = pre_tokenize_dataset(dataset_dict["eval"])

# Then use these pre-tokenized datasets
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized_train,    # Already tokenized
    # Previously tokenized_eval was built but never handed to the trainer.
    # Passing it makes trainer.evaluate() usable; periodic evaluation during
    # training still depends on the evaluation strategy in training_args.
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    max_seq_length=MAX_SEQ_LENGTH,
    # Don't use dataset_text_field since data is already tokenized
    dataset_text_field=None,
    packing=False,
)

In [None]:
# ==================== 7. Train & Save ====================
print("Starting training...")
trainer.train()

# Persist the trained model first, then the tokenizer, into the same directory
print("Training complete. Saving model...")
final_model_dir = "./final_sentiment_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)
print("Model saved.")


In [None]:
from unsloth import FastModel
import torch
import json

# Load model: reload the fine-tuned model and tokenizer from the directory
# written by the training cell. max_seq_length repeats the training value (192)
# and 4-bit loading is turned off for this inference check.
model, tokenizer = FastModel.from_pretrained(
    model_name="./final_sentiment_model",
    max_seq_length=192,
    load_in_4bit=False,
)

def test_model(text):
    """
    Run the fine-tuned model on one piece of text and return its answer.

    The prompt is built with the tokenizer's chat template and the same system
    message and "Text: ..." user turn that ``format_for_sft`` uses for
    training, so inference sees the format the model was tuned on.

    Args:
        text: The raw text to classify.

    Returns:
        The decoded model output for the assistant turn only (prompt tokens
        are stripped), expected to be a JSON string.
    """
    messages = [
        {"role": "system", "content": "You are a multilingual sentiment analyzer. Analyze the text and output valid JSON with language and sentiment tags. Only output the JSON, no additional text."},
        {"role": "user", "content": f"Text: {text}"},
    ]

    # add_generation_prompt=True appends the assistant header so the model
    # starts writing the answer instead of continuing the user turn
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Follow the model's device rather than assuming "cuda"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)

    # generate() returns prompt + continuation; keep only the new tokens
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Return (rather than print) so callers doing print(test_model(...)) show
    # the answer instead of "None"
    return response.strip()

# Quick test: one English and one Spanish sample
for sample_text in ("I'm very happy with this!", "No me gusta nada esto."):
    print(test_model(sample_text))

# Saving Models in different precision

In [None]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Paths
lora_path = "/content/final_sentiment_model"      # adapter + tokenizer saved by the training cell
base_model_name = "unsloth/Qwen2.5-0.5B-Instruct"

# ==================== 1. FP16 (Default Precision) ====================
# The base model is loaded in plain float16 here. Previously load_in_4bit=True
# was also passed, which quantized the base and contradicted the FP16 export.
# NOTE(review): tie_word_embeddings=False is only set for this variant, not for
# the INT8/INT4 loads below — confirm which setting the adapter expects.
print("Saving in FP16...")
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    tie_word_embeddings=False,
)
peft_model = PeftModel.from_pretrained(base_model, lora_path)
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained("/content/qwen_langsenti_merged_fp16")

# ==================== 2. INT8 (8-bit Quantization) ====================
# Only load_in_8bit is needed; the former bnb_8bit_quant_type /
# bnb_8bit_compute_dtype arguments are not BitsAndBytesConfig options.
print("Saving in INT8...")
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
)

int8_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto"
)
peft_model_int8 = PeftModel.from_pretrained(int8_model, lora_path)
# NOTE(review): the adapter is merged into already-quantized weights here and
# in the INT4 section — verify output quality against the FP16 merge.
merged_int8 = peft_model_int8.merge_and_unload()
merged_int8.save_pretrained("/content/qwen_langsenti_merged_int8")

# ==================== 3. INT4 (4-bit Quantization) ====================
print("Saving in INT4...")
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

int4_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config_4bit,
    device_map="auto"
)
peft_model_int4 = PeftModel.from_pretrained(int4_model, lora_path)
merged_int4 = peft_model_int4.merge_and_unload()
merged_int4.save_pretrained("/content/qwen_langsenti_merged_int4")

# ==================== Save Tokenizer ====================
# The same tokenizer files are copied next to each exported model
tokenizer = AutoTokenizer.from_pretrained(lora_path)
tokenizer.save_pretrained("/content/qwen_langsenti_merged_fp16")
tokenizer.save_pretrained("/content/qwen_langsenti_merged_int8")
tokenizer.save_pretrained("/content/qwen_langsenti_merged_int4")

print("All precision models saved!")

In [None]:
# Archive the INT8 and INT4 export directories for download. The paths are
# relative, so this must run from the directory that contains them.
# NOTE(review): the FP16 export is not zipped here — confirm that is intended.
!zip -r qwen_langsenti_merged_int8.zip qwen_langsenti_merged_int8
!zip -r qwen_langsenti_merged_int4.zip qwen_langsenti_merged_int4