In [1]:
!pip install -q -U bitsandbytes
!!pip install -q -U accelerate
!pip install peft

Installing collected packages: peft
Successfully installed peft-0.14.0


In [3]:
import pandas as pd

In [2]:
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from skmultilearn.model_selection import iterative_train_test_split
from datasets import Dataset, DatasetDict
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)


In [4]:
# Load the tab-separated subtask-1 annotation file. The file is read without a
# header row, so the six column names are supplied explicitly.
# NOTE(review): on_bad_lines="skip" makes pandas drop lines it cannot parse
# without reporting them — confirm that no annotations are lost this way.
annotations = pd.read_csv(
    '/kaggle/input/subtaskqkk/subtask-1-annotations.txt',
    names=["article_id", "entity", "start_offset", "end_offset", "main_role", "fine_grained_roles"],
    header=None,
    sep="\t",
    on_bad_lines="skip",
)

In [6]:
# Write the parsed annotations to annotations.csv in the current directory,
# comma-separated and without the DataFrame index column.
annotations.to_csv(path_or_buf='annotations.csv', index=False)

In [12]:
import os
import csv
import random
import numpy as np
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import shuffle

# Define paths
data_file = 'annotations.csv'  # Your dataset
text_path = '/kaggle/input/subtask1/raw-documents/raw-documents/'  # Folder containing the article_id text files

# Fixed seed so that the shuffle and the train/validation split below are
# reproducible across kernel restarts (previously both were unseeded).
SEED = 0

# Label mappings
fine_grained_roles = [
    "Guardian", "Martyr", "Peacemaker", "Rebel", "Underdog", "Virtuous", "Instigator",
    "Conspirator", "Tyrant", "Foreign Adversary", "Traitor", "Spy", "Saboteur",
    "Corrupt", "Incompetent", "Terrorist", "Deceiver", "Bigot", "Forgotten",
    "Exploited", "Victim", "Scapegoat"
]
# Role name -> column index in the multi-hot label vector
label_mapping = {label: i for i, label in enumerate(fine_grained_roles)}

# Read data: one example per annotation row, using the full article text as input
data = []
with open(data_file, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Read the text content from the file
        file_path = os.path.join(text_path, row['article_id'])
        if not file_path.endswith('EN_UA_$.txt'):
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()

            # Create a multi-hot encoded label for fine_grained_roles;
            # role names that are not in label_mapping are ignored.
            labels = [0] * len(fine_grained_roles)
            for role in row['fine_grained_roles'].split(','):  # Assume comma-separated labels
                role = role.strip()
                if role in label_mapping:
                    labels[label_mapping[role]] = 1

            data.append({
                'text': text,
                'labels': labels
            })

# Shuffle the data (seeded, see SEED above)
random.seed(SEED)
random.shuffle(data)

# Extract texts and labels
texts = [item['text'] for item in data]
labels = np.array([item['labels'] for item in data], dtype=int)

# Compute label weights (optional): one minus each label's share of all positive labels
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# Train/validation split, stratified on the number of active labels per example.
# NOTE(review): every annotation row of the same article yields the same text,
# so identical texts may land in both splits — confirm this is acceptable.
row_ids = np.arange(len(labels))
train_idx, val_idx = train_test_split(
    row_ids, test_size=0.1, stratify=labels.sum(axis=1), random_state=SEED
)
x_train = [texts[i] for i in train_idx]
x_val = [texts[i] for i in val_idx]
y_train = labels[train_idx]
y_val = labels[val_idx]

# Create Hugging Face Dataset
ds = DatasetDict({
    'train': Dataset.from_dict({'text': x_train, 'labels': y_train.tolist()}),
    'val': Dataset.from_dict({'text': x_val, 'labels': y_val.tolist()})
})

# Model name (replace with your own model)
model_name = 'mistralai/Mistral-7B-v0.1'

# Print Dataset Details
print(ds)


DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 349
    })
    val: Dataset({
        features: ['text', 'labels'],
        num_rows: 39
    })
})


In [None]:
# NOTE(review): this cell has no execution count. As written it does not assign
# train_idx, y_train, x_val or y_val itself (the split call further down is
# commented out), so it can only run on values left in the kernel by an earlier
# cell. It also overwrites data, labels, label_weights and ds.
# set random seed
random.seed(0)

# load data
with open('/kaggle/working/annotations.csv', newline='') as csvfile:
    data = list(csv.reader(csvfile, delimiter=','))
    header_row = data.pop(0)  # first CSV row is removed from data and kept as the header

# shuffle data
random.shuffle(data)

# reshape
# Each row is unpacked as: column 0 -> int id, columns 1 and 2 -> a "Title/Abstract"
# text string, columns 3 onward -> label values cast to int.
# NOTE(review): the annotations frame written to annotations.csv earlier in this
# notebook has the columns article_id, entity, start_offset, end_offset, main_role,
# fine_grained_roles. If this is the same file, int(row[0]) and the integer cast of
# row[3:] presumably fail on that schema — verify before running.
idx, text, labels = list(zip(*[(int(row[0]), f'Title: {row[1].strip()}\n\nAbstract: {row[2].strip()}', row[3:]) for row in data]))
labels = np.array(labels, dtype=int)

# create label weights
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# stratified train test split for multilabel ds
row_ids = np.arange(len(labels))
train_ids = row_ids  # assigned here but not read anywhere else in this cell
#train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:,np.newaxis], labels, test_size = 0.1)
# train_idx is not assigned in this cell (see the commented-out line above)
x_train = [text[i] for i in train_idx.flatten()]

# create hf dataset
# y_train, x_val and y_val are likewise not assigned in this cell
ds = DatasetDict({
    'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
    'val': Dataset.from_dict({'text': x_val, 'labels': y_val})
})


In [17]:
#model_name = "meta-llama/Llama-3.1-7b"
# Read the Hugging Face access token from the environment instead of hardcoding
# it: a token stored in the notebook is exposed to everyone who can read the file.
# The token that was previously inlined here must be considered leaked and revoked.
# Set HF_TOKEN before running; the value is None when the variable is unset.
access_token = os.environ.get("HF_TOKEN")

'''tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=access_token)
'''

'tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=access_token)\nmodel = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=access_token)\n'

In [None]:

# tokenization helper for the dataset preprocessing step
def tokenize_examples(examples, tokenizer):
    """Tokenize the 'text' field of a batch and carry its 'labels' along.

    Args:
        examples: mapping with a 'text' entry and a 'labels' entry.
        tokenizer: callable applied to examples['text']; its return value must
            support item assignment.

    Returns:
        The tokenizer's output with examples['labels'] stored under 'labels'.
    """
    encoded = tokenizer(examples['text'])
    encoded['labels'] = examples['labels']
    return encoded

# `token=` replaces the deprecated `use_auth_token=` keyword of from_pretrained
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

# quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
    bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # whether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

# load model: one output logit per label column
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, token=access_token,
    quantization_config=quantization_config,
    num_labels=labels.shape[1]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# The model config must use the same padding id as the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id

# define training args
training_args = TrainingArguments(
    output_dir = 'multilabel_classification',
    learning_rate = 1e-4,
    per_device_train_batch_size = 8, # tested with 16gb gpu ram
    per_device_eval_batch_size = 8,
    num_train_epochs = 10,
    weight_decay = 0.01,
    eval_strategy = 'epoch', # current name of the deprecated `evaluation_strategy` argument
    save_strategy = 'epoch',
    load_best_model_at_end = True
)

# train
class CustomTrainer(Trainer):
    """Trainer that computes a binary cross-entropy loss with per-label weights.

    Args:
        label_weights: optional tensor passed to the loss as ``pos_weight``.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, label_weights=None, *args, **kwargs):
        # Trainer.__init__ does not know `label_weights`, so it is taken out of
        # the arguments here and kept on the instance for compute_loss.
        super().__init__(*args, **kwargs)
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Custom loss computation.

        Removes "labels" from `inputs`, runs the model on the remaining inputs
        and applies BCE-with-logits to the logits, using `self.label_weights`
        as `pos_weight`. Returns `(loss, outputs)` when `return_outputs` is
        true, otherwise only the loss.
        """
        # Extract labels from inputs
        labels = inputs.pop("labels")
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        # Compute custom loss; `torch` is imported at the top of the notebook,
        # whereas the bare name `nn` is not available when this cell runs.
        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=self.label_weights)
        loss = loss_fct(logits, labels.float())
        return (loss, outputs) if return_outputs else loss

def collate_fn(batch, tokenizer):
    """Pad a list of tokenized examples into one batch of tensors.

    Args:
        batch: list of examples, each with 'input_ids', 'attention_mask' and 'labels'.
        tokenizer: tokenizer whose `pad_token_id` is used to pad 'input_ids'.

    Returns:
        dict with right-padded 'input_ids' and 'attention_mask' and stacked 'labels'.
    """
    # as_tensor accepts both tensors and plain lists
    input_ids = [torch.as_tensor(example['input_ids']) for example in batch]
    attention_mask = [torch.as_tensor(example['attention_mask']) for example in batch]
    return {
        'input_ids': torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=tokenizer.pad_token_id
        ),
        # padded positions get attention 0
        'attention_mask': torch.nn.utils.rnn.pad_sequence(
            attention_mask, batch_first=True, padding_value=0
        ),
        'labels': torch.stack([torch.as_tensor(example['labels']) for example in batch]),
    }


def compute_metrics(eval_pred):
    """Compute micro, macro and weighted F1 for multilabel predictions.

    Args:
        eval_pred: pair of (logits, label_ids) arrays.

    Returns:
        dict with the keys 'f1_micro', 'f1_macro' and 'f1_weighted'.
    """
    logits, label_ids = eval_pred
    # a logit above 0 corresponds to a sigmoid probability above 0.5
    predicted = logits > 0
    return {
        f'f1_{average}': f1_score(label_ids, predicted, average=average, zero_division=0)
        for average in ('micro', 'macro', 'weighted')
    }


# collate_fn and compute_metrics are defined above because nothing else in the
# notebook defines them; without them the call below fails with a NameError.
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_ds['train'],
    eval_dataset = tokenized_ds['val'],
    tokenizer = tokenizer,
    data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
    compute_metrics = compute_metrics,
    label_weights = torch.tensor(label_weights, device=model.device)
)

trainer.train()

# save model
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

# load model
peft_model_id = 'multilabel_mistral'
model = AutoModelForSequenceClassification.from_pretrained(peft_model_id)

In [None]:
from transformers import Trainer
import torch.nn as nn

class CustomTrainer(Trainer):
    """Trainer variant whose loss is BCE-with-logits with an optional pos_weight."""

    def __init__(self, label_weights=None, *args, **kwargs):
        """Forward all other arguments to Trainer and remember `label_weights`."""
        super().__init__(*args, **kwargs)
        # Read again in compute_loss, where it is passed as pos_weight.
        self.label_weights = label_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted BCE loss, optionally together with the model outputs.

        "labels" is removed from `inputs` before the remaining entries are
        passed to the model as keyword arguments.
        """
        targets = inputs.pop("labels").float()

        model_outputs = model(**inputs)

        # Functional form of BCEWithLogitsLoss with its default mean reduction.
        loss = nn.functional.binary_cross_entropy_with_logits(
            model_outputs.logits,
            targets,
            pos_weight=self.label_weights,
        )

        if return_outputs:
            return loss, model_outputs
        return loss

# Build the trainer, run training, then save the trained model and the tokenizer.
# NOTE(review): collate_fn and compute_metrics are not defined in this cell;
# they have to exist in the kernel already when it runs.
pos_weight = torch.tensor(label_weights, device=model.device)  # explicitly passed label weights
batch_collator = functools.partial(collate_fn, tokenizer=tokenizer)

trainer = CustomTrainer(
    label_weights=pos_weight,
    model=model,
    args=training_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['val'],
    tokenizer=tokenizer,
    data_collator=batch_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Model and tokenizer are written to the same directory.
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

