# Fine-Tuning LLMs on a fake-news dataset

*(FYI: this notebook is designed to be run on Kaggle.)*

In [2]:
! pip install -U trl peft accelerate bitsandbytes einops --quiet

In [3]:
import pandas as pd
import numpy as np
import torch

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

from datasets import Dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

from pathlib import Path

2024-06-01 14:06:28.489002: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-01 14:06:28.489103: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-01 14:06:28.606993: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Dataset Creation

In [4]:
# Location of the pre-cleaned WELFake training split (path is relative to the notebook's working directory).
CLEAN_TRAIN_DATASET_PATH = Path("../input/welfake-clean/WELFake_clean_train.csv")
# NOTE(review): RANDOM_SEED is defined here but is not referenced by any other cell visible in this notebook.
RANDOM_SEED = 42

# The first CSV column is used as the DataFrame index rather than as a data column.
dataset_df = pd.read_csv(CLEAN_TRAIN_DATASET_PATH, index_col=0)
# Bare last expression: renders the frame as the cell output.
dataset_df

Unnamed: 0,title,text,label
45905,Exclusive: Foreign Isis Fighters Defend Mosul ...,\nForeign fighters for Isis are choosing to ...,1
37291,JUDGE JEANINE UNLOADS On Hillary: “How Did You...,You don t want to miss a second of Judge Jeani...,1
46730,Gunman attacks Saudi security forces at gate o...,RIYADH (Reuters) - Two Saudi guards were shot ...,0
66327,Indian Software Mogul: Hire Americans Now Beca...,A leading Indian software entrepreneur says In...,0
58329,Rep. Diaz-Balart: Liberals Against Trump Who F...,Florida Congressman Mario attacked the “doub...,0
...,...,...,...
37847,"To applause and boos, Kerry urges Congress to ...",CHICAGO (Reuters) - Failure to approve the Tra...,0
6384,TINGLE UP HIS LEG? NBC Paid Off Chris Matthews...,Here s yet another claim that s really iffy be...,1
55885,U.S. government shares technical details on No...,WASHINGTON (Reuters) - The U.S. government on ...,0
881,Trumps history of corruption is mind-boggling....,"In the heat of a presidential campaign, youd t...",0


In [5]:
# Keep only articles whose body has at most 100 whitespace-separated tokens.
short_dataset_df = dataset_df.loc[
    lambda frame: frame["text"].str.split().str.len() <= 100
]
short_dataset_df

Unnamed: 0,title,text,label
1290,'There appear to be no rules anymore',There is an path for Democrats to regain the p...,0
68068,Spain to control Catalan spending as long as '...,MADRID (Reuters) - The Spanish government said...,0
48833,House committee postpones hearing on Puerto Rico,NEW YORK (Reuters) - The U.S. House of Represe...,0
52423,Former New York City Mayor Bloomberg to endors...,WASHINGTON (Reuters) - Former New York City Ma...,0
37654,Newly Approved GM Potatoes Have Potential to S...,"By Whitney Webb Late last week, the US Departm...",1
...,...,...,...
9876,HOW BAD IS IT IN VENEZUELA? Socialism’s Endgam...,How bad is it in Venezuela? People are eating ...,1
36400,Wow! Must Watch Video Of Grilling Of Congressm...,Fournier totally hammers these two-great to see!,1
26093,Trump's New Ad Portraying 'Every Mother's Wors...,Share on Twitter The Wildfire is an opinion pl...,1
5407,BREAKING: FEDERAL COURT RULES ON NSA’S WARRANT...,Another positive step towards restoring our fr...,1


In [6]:
# Wrap the short-article frame in a Hugging Face Dataset, then keep only the rows whose label equals 1.
# NOTE(review): the variable is called "real", but the sample rows printed earlier in this
# notebook pair Reuters wire stories with label 0 — confirm which class label == 1 denotes.
train_ds = Dataset.from_pandas(short_dataset_df)
real_train_ds = train_ds.filter(
    lambda x: x["label"] == 1
)

Filter:   0%|          | 0/6040 [00:00<?, ? examples/s]

## 1. Phi-2 + LoRA
Source: https://medium.com/@prasadmahamulkar/fine-tuning-phi-2-a-step-by-step-guide-e672e7f1d009

In [7]:
# Base checkpoint on the Hugging Face Hub. The recorded output of this cell shows the
# Phi-2 files being downloaded, and no cell in this notebook creates a local
# "./phi-2-finetuned-lora" directory, so the Hub id is what a fresh kernel can load.
# To start from a locally saved checkpoint instead, point MODEL_NAME at that directory.
MODEL_NAME = "microsoft/phi-2"

# 4-bit NF4 quantisation with float16 compute; double quantisation is turned off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# LoRA adapter settings. The target module names ("Wqkv", "out_proj") must exist in the
# modelling code that is loaded below (remote code at the pinned revision).
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "out_proj"],
)

# Load the quantised base model; `trust_remote_code=True` executes the repository's own
# modelling files, and `revision` pins which version of the repository is used.
orig_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map="auto",
    revision="refs/pr/23",
)
orig_model.config.use_cache = False
# Set on the config, next to `use_cache`; the original assigned this attribute to the
# model object itself. NOTE(review): confirm this option is meaningful for Phi-2 at all.
orig_model.config.pretraining_tp = 1

# The end-of-sequence token doubles as the padding token; sequences are padded on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Prepare the quantised model for training (with gradient checkpointing) and attach the LoRA adapter.
orig_model = prepare_model_for_kbit_training(orig_model, use_gradient_checkpointing=True)
model = get_peft_model(orig_model, peft_config=lora_config)

config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

modeling_phi.py:   0%|          | 0.00/33.7k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/577M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


In [8]:
# Print the trainable vs. total parameter counts of the LoRA-wrapped model.
model.print_trainable_parameters()

trainable params: 31,457,280 || all params: 2,811,141,120 || trainable%: 1.1190


In [25]:
# Prompt scaffolding shared by training-time formatting and inference-time generation:
# PREFIX introduces the title, RESPONSE_TEMPLATE marks where the article body begins.
PREFIX = "### Title:"
RESPONSE_TEMPLATE = "\n### Article:"

def formatting_prompts_func(example):
    """Render a batch of rows into training prompts.

    Args:
        example: batch mapping with parallel sequences under "title" and "text".

    Returns:
        A list with one "<PREFIX> <title>. <RESPONSE_TEMPLATE> <text>" string per title.
    """
    return [
        f"{PREFIX} {headline}. {RESPONSE_TEMPLATE} {example['text'][row]}"
        for row, headline in enumerate(example["title"])
    ]


def generate_causal(model, tokenizer, title: str, max_new_tokens: int | None = None):
    """Generate an article for ``title`` with a causal LM and return the decoded text.

    The prompt has the same "<PREFIX> <title>. <RESPONSE_TEMPLATE>" layout that
    ``formatting_prompts_func`` uses for training examples.

    Args:
        model: model exposing ``.device`` and ``.generate``.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        title: article title to condition on.
        max_new_tokens: forwarded unchanged to ``model.generate``.

    Returns:
        The decoded first generated sequence (special tokens are not stripped).
    """
    prompt = f"{PREFIX} {title}. {RESPONSE_TEMPLATE}"
    encoded = tokenizer(prompt, return_tensors="pt")
    input_tokens = encoded["input_ids"].to(device=model.device)
    attention_mask = encoded["attention_mask"].to(device=model.device)
    # Inputs are passed by keyword, and the attention mask / pad token id are given
    # explicitly so generation does not have to guess them.
    output_tokens = model.generate(
        input_ids=input_tokens,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(output_tokens[0])


# Completion-only collator keyed on RESPONSE_TEMPLATE; presumably this restricts the
# training loss to the tokens that follow "### Article:" — see the TRL docs to confirm.
collator = DataCollatorForCompletionOnlyLM(RESPONSE_TEMPLATE, tokenizer=tokenizer)

In [10]:
# Optimisation settings for the LoRA run: 3 epochs, per-device batch size 1 with
# 2 gradient-accumulation steps, cosine learning-rate schedule with a 3% warm-up,
# and no external experiment tracker.
training_args = TrainingArguments(
    output_dir="./training_results",
    report_to="none",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    max_grad_norm=0.3,
    learning_rate=2e-4,
    weight_decay=0.001,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    group_by_length=True,
)

# Hand the trainer the tokenizer that was already configured (pad token, padding side)
# and that the collator uses. Without it the trainer fetched its own tokenizer files
# (visible in the recorded output of this cell), which need not match the collator's.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=real_train_ds,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    args=training_args,
)

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/2764 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [11]:
# Run the Phi-2 LoRA fine-tuning with the SFT trainer built in the previous cell.
trainer.train()

Step,Training Loss
500,3.0206
1000,2.7723
1500,2.6772
2000,2.54
2500,2.4897
3000,2.3924
3500,2.3544
4000,2.277


config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.
You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.
You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.
You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.
You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.
You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.
You are using a model of type phi to instantiate a model of type phi-msft. This is not s

TrainOutput(global_step=4146, training_loss=2.5547550199575833, metrics={'train_runtime': 4162.2281, 'train_samples_per_second': 1.992, 'train_steps_per_second': 0.996, 'total_flos': 1.310613297214464e+16, 'train_loss': 2.5547550199575833, 'epoch': 3.0})

In [27]:
# Qualitative check: generate up to 150 new tokens for one title and print the decoded
# text (the prompt itself is part of the printed string).
print(generate_causal(model, tokenizer, "Belarus under new hard-hitting sanctions", max_new_tokens=150))

### Title: Belarus under new hard-hitting sanctions. 
### Article: By David Icke The US and EU have imposed new sanctions on Belarus, in response to the country’s recent presidential election. The US has imposed sanctions on the country’s president Alexander Lukashenko, and the EU has imposed sanctions on the country’s prime minister, Svetlana Tsikhanouskaya. The sanctions are aimed at pressuring Lukashenko to step down, and to support Tsikhanouskaya, who is the first female president in the country’s history. The sanctions are a significant escalation in the US and EU’s efforts to pressure Lukashenko to step down. The sanctions are also a sign of the growing tension between the US and Russia, as well as between the


In [16]:
# Persist the trainer's model to the "phi-2-finetuned-lora-new" directory.
trainer.save_model("phi-2-finetuned-lora-new")

You are using a model of type phi to instantiate a model of type phi-msft. This is not supported for all configurations of models and can yield errors.


## 2. Flan-T5

In [10]:
# NOTE: this cell rebinds MODEL_NAME, tokenizer and model, which the Phi-2 section
# above also uses — after running it, those names refer to the Flan-T5 objects.
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
# Seq2seq collator built from the tokenizer and model; used by the Seq2SeqTrainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# Instruction placed in front of every title before it is tokenized as model input.
# NOTE: this rebinds PREFIX, which the Phi-2 prompt helpers above also read.
PREFIX = "Please write an article based on the title: "

# Preprocessing applied to each batch of the dataset

def preprocess_function(examples):
    """Tokenize a batch: prefixed titles become the inputs, article texts become the labels.

    Args:
        examples: batch mapping with parallel sequences under "title" and "text".

    Returns:
        The tokenizer output for the prefixed titles (truncated to 128 tokens),
        with a "labels" entry holding the token ids of the article texts
        (truncated to 512 tokens).
    """
    # Encoder side: instruction prefix + title.
    encoded = tokenizer(
        [PREFIX + headline for headline in examples["title"]],
        max_length=128,
        truncation=True,
    )

    # Decoder side: the article body, tokenized as the target text.
    target_ids = tokenizer(
        text_target=examples["text"],
        max_length=512,
        truncation=True,
    )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

# Tokenize the label == 1 subset in batches, then keep only the two columns the
# trainer is given: "input_ids" and "labels" (every other column is dropped).
tokenized_dataset = (
    real_train_ds
    .map(preprocess_function, batched=True)
    .select_columns(["input_ids", "labels"])
)
# Bare last expression: shows the dataset summary as the cell output.
tokenized_dataset

Map:   0%|          | 0/2764 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 2764
})

In [25]:
def generate_t5(model, tokenizer, title: str, max_new_tokens=100):
    """Generate an article for ``title`` with a seq2seq model.

    Args:
        model: model exposing ``.device`` and ``.generate``.
        tokenizer: tokenizer used to encode the prompt and decode the output.
        title: article title; it is appended to the module-level PREFIX.
        max_new_tokens: forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    prompt = PREFIX + title
    encoded_prompt = tokenizer(prompt, return_tensors="pt")
    prompt_ids = encoded_prompt["input_ids"].to(device=model.device)
    generated = model.generate(prompt_ids, max_new_tokens=max_new_tokens)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [44]:
# Training configuration for Flan-T5: 10 epochs, per-device train batch size 8, at most
# 3 checkpoints kept, nothing pushed to the Hub and no external experiment tracker.
# No evaluation strategy is set, so this run is configured for training only.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    push_to_hub=False,
    report_to="none"
)

# The trainer receives only a training set: no eval dataset and no compute_metrics
# callback are passed.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  deprecated_dl_args["split_batches"] = split_batches


In [52]:
# Run the Flan-T5 fine-tuning with the Seq2SeqTrainer built in the previous cell.
trainer.train()

Step,Training Loss
500,0.7169
1000,0.665
1500,0.5294
2000,0.4482
2500,0.3764
3000,0.325


TrainOutput(global_step=3460, training_loss=0.48189539826674266, metrics={'train_runtime': 1068.0379, 'train_samples_per_second': 25.879, 'train_steps_per_second': 3.24, 'total_flos': 1953575278866432.0, 'train_loss': 0.48189539826674266, 'epoch': 10.0})

In [12]:
# Qualitative check: generate an article for one title with the default token budget.
# Requires the cell defining generate_t5 to have been executed first.
article_text = generate_t5(
    model, 
    tokenizer, 
    "China calls for restraint when asked about Norway"
)
# Bare last expression: shows the generated text as the cell output.
article_text

NameError: name 'generate_t5' is not defined

In [53]:
# Persist the trainer's model to the "flan-t5-base-finetuned" directory.
trainer.save_model("flan-t5-base-finetuned")