In [None]:
# install Hugging Face Libraries
!pip install  git+https://github.com/huggingface/peft.git
!pip install "transformers==4.27.2" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
# install additional dependencies needed for training
!pip install rouge-score tensorboard py7zr

In [None]:
# Select CUDA device index
import os

# Set before importing torch so the restriction is guaranteed to be in place
# before anything can initialise CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "google/flan-t5-xl"

# The checkpoint is loaded with 8-bit weights and placed automatically
# (device_map="auto"); the tokenizer comes from the same checkpoint name.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
from peft import prepare_model_for_int8_training

# Rebind `model` to the result of peft's int8-training preparation helper.
# NOTE(review): re-running this cell applies the helper to an already-prepared
# model; presumably harmless, but restart from the loading cell if in doubt.
model = prepare_model_for_int8_training(model)

In [None]:
from peft import LoraConfig, get_peft_model, TaskType


def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Walks ``model.named_parameters()``, sums ``numel()`` over every parameter
    and, separately, over those with ``requires_grad`` set, then prints both
    totals and the trainable share as a percentage.

    Args:
        model: Any object exposing ``named_parameters()`` whose items provide
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. The summary line is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )


# LoRA adapter settings: the adapters target the modules named "q" and "v";
# every value below is passed to LoraConfig as given.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the adapters, then report how much of it is trainable.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 9437184 || all params: 2859194368 || trainable%: 0.33006444422319176


In [None]:
import pandas as pd

# Map each split name to the CSV file it is read from.
csv_splits = {
    'train': '/content/teknofest_preprocessed_train_df.csv',
    'validation': '/content/teknofest_preprocessed_val_df.csv',
}
dataset = load_dataset('csv', data_files=csv_splits)



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-dacf2746f7174e1f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-dacf2746f7174e1f/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Show the loaded splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'is_offensive', 'target'],
        num_rows: 9968
    })
    validation: Dataset({
        features: ['id', 'text', 'is_offensive', 'target'],
        num_rows: 1745
    })
})

In [None]:
# data preprocessing
text_column = "text"  # dataset column tokenized as the model input
label_column = "target"  # dataset column tokenized as the generation target
max_length = 32  # inputs are padded/truncated to this many tokens


def preprocess_function(examples):
    """
    Tokenize one batch of examples into model inputs plus labels.

    The ``text_column`` values are padded/truncated to ``max_length`` tokens and
    the ``label_column`` values to 5 tokens, both returned as PyTorch tensors.
    Padding positions in the label ids are overwritten with -100
    (NOTE(review): presumably the ignore index skipped by the loss — confirm).

    Args:
        examples: A batch as handed over by ``Dataset.map(batched=True)``,
            indexable by ``text_column`` and ``label_column``.

    Returns:
        The tokenizer output for the inputs with an extra ``"labels"`` entry.
    """
    encoded = tokenizer(
        examples[text_column],
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    target_ids = tokenizer(
        examples[label_column],
        max_length=5,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]

    pad_positions = target_ids == tokenizer.pad_token_id
    target_ids[pad_positions] = -100
    encoded["labels"] = target_ids
    return encoded


# Tokenize every split in batches; the original columns are dropped so only the
# tokenizer outputs and labels remain, and the on-disk cache is bypassed.
map_options = dict(
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
processed_datasets = dataset.map(preprocess_function, **map_options)

train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["validation"]

Running tokenizer on dataset:   0%|          | 0/10 [00:00<?, ?ba/s]

Running tokenizer on dataset:   0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
from transformers import TrainingArguments, Trainer

# silence the warnings. Please re-enable for inference!
model.config.use_cache = False

training_args = TrainingArguments(
    output_dir="teknofest-flan-t5-xl",
    # optimisation
    learning_rate=1e-3,
    num_train_epochs=5,
    gradient_accumulation_steps=1,
    # batch sizes
    # NOTE(review): with auto_find_batch_size the train batch size is presumably
    # only a starting point that is lowered on out-of-memory errors — confirm.
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    auto_find_batch_size=True,
    # evaluation and checkpointing
    evaluation_strategy="epoch",
    save_steps=100,
    save_total_limit=8,
)
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Fine-tune with the Trainer configured above (5 epochs, evaluated once per epoch).
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.192907
2,No log,0.136595
3,No log,0.13241
4,0.291200,0.114305
5,0.291200,0.116222




TrainOutput(global_step=780, training_loss=0.22553551747248723, metrics={'train_runtime': 3658.7278, 'train_samples_per_second': 13.622, 'train_steps_per_second': 0.213, 'total_flos': 2.673077102051328e+16, 'train_loss': 0.22553551747248723, 'epoch': 5.0})

In [None]:
# The training setup cell sets use_cache to False and asks for it to be
# re-enabled for inference; do that here, before any generate() call.
model.config.use_cache = True
# Switch to evaluation mode (disables dropout, including the LoRA dropout).
model.eval()

In [None]:
# The test CSV is read here because this is the first cell that uses `test_df`;
# without it the cell fails with NameError when the notebook is run top to bottom.
test_df = pd.read_csv("/content/teknofest_preprocessed_test_df.csv")

input_text = test_df.text[4]
# Put the encoded prompt on the same device as the model instead of leaving it
# on the CPU.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)

print("input sentence: ", input_text)
print(" output prediction: ", tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

input sentence:  kadının tek görevi erkeklerin ihtiyacını gidermektir
 output prediction:  ['SEXIST']


In [None]:
# Read the preprocessed test CSV into a DataFrame.
test_df = pd.read_csv("/content/teknofest_preprocessed_test_df.csv")

In [None]:
# Display the test DataFrame for a quick visual check.
test_df

In [None]:
from tqdm import tqdm

test_predictions = []

# One generate() call per test sentence; the decoded text of the single
# returned sequence is collected in row order.
for text in tqdm(test_df['text'].tolist()):
    # Put the encoded prompt on the same device as the model instead of
    # leaving it on the CPU for every call.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)

    predicted_text = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]
    test_predictions.append(predicted_text)

100%|██████████| 748/748 [10:45<00:00,  1.16it/s]


In [None]:
# Preview the first predictions only; the full list has one entry per test row
# and would flood the cell output.
test_predictions[:10]

In [None]:
from sklearn.metrics import f1_score, classification_report, accuracy_score

# Compare the generated label strings with the gold labels of the test set.
gold = test_df['target'].tolist()
preds = test_predictions

accuracy = accuracy_score(gold, preds)
macro_f1 = f1_score(gold, preds, average="macro")

print("Accuracy score: ", accuracy)
print("f1_score score: ", macro_f1)

Accuracy score:  0.8877005347593583
f1_score score:  0.8921329545966845


In [None]:
# Store the generated labels next to the gold ones as a new column.
# Note: this mutates test_df in place.
test_df["prediction_target"] = test_predictions

In [None]:
# Rows where the generated label differs from the gold label.
misclassified = test_df[test_df.target != test_df.prediction_target]
# Show at most 20 of them: capping n avoids the ValueError that sample() raises
# when fewer rows are available, and the fixed seed makes the selection
# repeatable across runs.
misclassified.sample(n=min(20, len(misclassified)), random_state=42)

Unnamed: 0,id,text,is_offensive,target,prediction_target
635,f7c0966a-48f8-4fa3-a44a-98fb3573546e,bülent altun prof,0,OTHER,INSULT
0,ce129457-8bef-4c2c-89fc-56c2be682c17,anasını satayım böyle,1,SEXIST,PROFANITY
491,40dea8ae-c22c-4278-8f46-2f04cda41c23,i̇şi resmen batırdın dostum böyle kötü iş yapı...,1,INSULT,OTHER
194,fce01790-2f1d-4080-ba98-0100402e7cf3,dikkat et kesinlikle ben seni müslümanlıktan a...,0,OTHER,RACIST
79,7e897501-e519-41b2-a1e4-cf64eb0b3f25,biraz mert ol korkaklık yapma,1,INSULT,OTHER
599,47c02b90-3144-47dc-98f9-fceaf3915603,onun çocuğu tam tembel teneke,1,INSULT,OTHER
77,d6d43925-3ded-4068-92c8-97dafd0b616c,ayı oğlu ayı,1,INSULT,SEXIST
158,7990f33b-5f8d-4652-b1ab-6602bf475b04,yolun karsisina gecmeye calisan kadina tramvay...,0,OTHER,SEXIST
19,d2056985-73f2-4a22-aaac-9fc5d8abefcb,i̇nsanlara haksızlık yapan kötü niyetli adamdı,1,INSULT,OTHER
541,98ae095b-04e5-4500-8ee9-9169d29313ee,asla yalniz icki icmem,0,OTHER,RACIST
