# Fine-Tuning LLMs on fake-news dataset

*(FYI: this notebook is designed to be run in Kaggle)*

In [1]:
! pip install -U trl peft accelerate --quiet
! pip install -i https://pypi.org/simple/ bitsandbytes --quiet

In [6]:
import pandas as pd
import numpy as np

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
    T5Tokenizer, 
    T5ForConditionalGeneration, 
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

from datasets import Dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model

from pathlib import Path

## Dataset Creation

In [7]:
CLEAN_TRAIN_DATASET_PATH = Path("../input/welfake-clean/WELFake_clean_train.csv")
RANDOM_SEED = 42

dataset_df = pd.read_csv(CLEAN_TRAIN_DATASET_PATH, index_col=0)
dataset_df

Unnamed: 0,title,text,label
45905,Exclusive: Foreign Isis Fighters Defend Mosul ...,\nForeign fighters for Isis are choosing to ...,1
37291,JUDGE JEANINE UNLOADS On Hillary: “How Did You...,You don t want to miss a second of Judge Jeani...,1
46730,Gunman attacks Saudi security forces at gate o...,RIYADH (Reuters) - Two Saudi guards were shot ...,0
66327,Indian Software Mogul: Hire Americans Now Beca...,A leading Indian software entrepreneur says In...,0
58329,Rep. Diaz-Balart: Liberals Against Trump Who F...,Florida Congressman Mario attacked the “doub...,0
...,...,...,...
37847,"To applause and boos, Kerry urges Congress to ...",CHICAGO (Reuters) - Failure to approve the Tra...,0
6384,TINGLE UP HIS LEG? NBC Paid Off Chris Matthews...,Here s yet another claim that s really iffy be...,1
55885,U.S. government shares technical details on No...,WASHINGTON (Reuters) - The U.S. government on ...,0
881,Trumps history of corruption is mind-boggling....,"In the heat of a presidential campaign, youd t...",0


In [8]:
short_dataset_df = dataset_df[dataset_df["text"].str.split().str.len() <= 100]
short_dataset_df

Unnamed: 0,title,text,label
1290,'There appear to be no rules anymore',There is an path for Democrats to regain the p...,0
68068,Spain to control Catalan spending as long as '...,MADRID (Reuters) - The Spanish government said...,0
48833,House committee postpones hearing on Puerto Rico,NEW YORK (Reuters) - The U.S. House of Represe...,0
52423,Former New York City Mayor Bloomberg to endors...,WASHINGTON (Reuters) - Former New York City Ma...,0
37654,Newly Approved GM Potatoes Have Potential to S...,"By Whitney Webb Late last week, the US Departm...",1
...,...,...,...
9876,HOW BAD IS IT IN VENEZUELA? Socialism’s Endgam...,How bad is it in Venezuela? People are eating ...,1
36400,Wow! Must Watch Video Of Grilling Of Congressm...,Fournier totally hammers these two-great to see!,1
26093,Trump's New Ad Portraying 'Every Mother's Wors...,Share on Twitter The Wildfire is an opinion pl...,1
5407,BREAKING: FEDERAL COURT RULES ON NSA’S WARRANT...,Another positive step towards restoring our fr...,1


In [9]:
train_ds = Dataset.from_pandas(short_dataset_df)
real_train_ds = train_ds.filter(
    lambda x: x["label"] == 1
)

Filter:   0%|          | 0/6040 [00:00<?, ? examples/s]

## 1. CausalLM Models

In [6]:
MODEL_NAME = "facebook/opt-350m"

#b_and_b_config = BitsAndBytesConfig(load_in_4bit=True)
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    use_rslora=True
)

orig_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)#, quantization_config=b_and_b_config)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = get_peft_model(orig_model, peft_config=lora_config)

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

In [7]:
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 332,769,280 || trainable%: 0.4727


In [8]:
def generate_causal(model, tokenizer, prompt: str):
    input_tokens = tokenizer.encode(prompt, return_tensors="pt")
    input_tokens = input_tokens.to(device=model.device)
    output_tokens = model.generate(input_tokens, max_new_tokens=100)
    return tokenizer.decode(output_tokens[0])

In [9]:
print(generate_causal(model, tokenizer, "Joe Biden passes another bill"))

</s>Joe Biden passes another bill to help Americans with disabilities
WASHINGTON (AP) — Joe Biden passed another bill Thursday to help Americans with disabilities, including a $1,400 stimulus check and a $1,400 unemployment benefit.
The $1.9 trillion coronavirus relief package passed by the Senate on a party-line vote. It includes $1,400 checks for most Americans, $300 weekly unemployment benefits and a $600 weekly federal jobless benefit.
The bill also includes $1,400 checks for


In [10]:
RESPONSE_TEMPLATE = "\n### Article:"

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example["title"])):
        text = f"### Title: {example['title'][i]}. {RESPONSE_TEMPLATE} {example['text'][i]}"
        output_texts.append(text)
    return output_texts


collator = DataCollatorForCompletionOnlyLM(RESPONSE_TEMPLATE, tokenizer=tokenizer)

In [13]:
training_args = TrainingArguments(
    output_dir="./training_output", 
    report_to="none",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2
)

trainer = SFTTrainer(
    model=model,
    train_dataset=real_train_ds,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    args=training_args
)



Map:   0%|          | 0/2764 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [14]:
! wget https://raw.githubusercontent.com/huggingface/trl/main/examples/scripts/sft.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


--2024-05-30 15:54:23--  https://raw.githubusercontent.com/huggingface/trl/main/examples/scripts/sft.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4614 (4.5K) [text/plain]
Saving to: 'sft.py'


2024-05-30 15:54:23 (47.4 MB/s) - 'sft.py' saved [4614/4614]



In [15]:
trainer.train()



Step,Training Loss
500,3.3746
1000,3.2149
1500,3.1795
2000,3.0955




TrainOutput(global_step=2073, training_loss=3.214072124887881, metrics={'train_runtime': 1696.5383, 'train_samples_per_second': 4.888, 'train_steps_per_second': 1.222, 'total_flos': 1902266985578496.0, 'train_loss': 3.214072124887881, 'epoch': 3.0})

In [22]:
print(generate_causal(model, tokenizer, "My name is Nikita."))

</s>My name is Nikita.
I'm a young man from the United States.
I'm a student at the University of California, Berkeley.
I'm a student at the University of California, Berkeley.
I'm a student at the University of California, Berkeley.
I'm a student at the University of California, Berkeley.
I'm a student at the University of California, Berkeley.
I'm a student at the University of California, Berkeley.
I'm a student at the University of California,


## 2. Flan-T5

In [10]:
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
# We prefix our tasks with "answer the question"
PREFIX = "Please write an article based on the title: "

# Define the preprocessing function

def preprocess_function(examples):
    """Add prefix to the sentences, tokenize the text, and set the labels"""
    # The "inputs" are the tokenized answer:
    inputs = [PREFIX + doc for doc in examples["title"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)

    # The "labels" are the tokenized outputs:
    labels = tokenizer(text_target=examples["text"], 
                      max_length=512,         
                      truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Map the preprocessing function across our dataset
tokenized_dataset = (
    real_train_ds
    .map(preprocess_function, batched=True)
    .select_columns(["input_ids", "labels"])
)
tokenized_dataset

Map:   0%|          | 0/2764 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'labels'],
    num_rows: 2764
})

In [25]:
def generate_t5(model, tokenizer, title: str, max_new_tokens=100):
    input_tokens = tokenizer(PREFIX + title, return_tensors="pt")["input_ids"]
    input_tokens = input_tokens.to(device=model.device)
    output_tokens = model.generate(input_tokens, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_tokens[0], skip_special_tokens=True)

In [44]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
#     evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    push_to_hub=False,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  deprecated_dl_args["split_batches"] = split_batches


In [52]:
trainer.train()

Step,Training Loss
500,0.7169
1000,0.665
1500,0.5294
2000,0.4482
2500,0.3764
3000,0.325


TrainOutput(global_step=3460, training_loss=0.48189539826674266, metrics={'train_runtime': 1068.0379, 'train_samples_per_second': 25.879, 'train_steps_per_second': 3.24, 'total_flos': 1953575278866432.0, 'train_loss': 0.48189539826674266, 'epoch': 10.0})

In [51]:
article_text = generate_t5(
    model, 
    tokenizer, 
    "China calls for restraint when asked about Norway"
)
article_text

'November 1, 2016 - Bailout Denmark - Foreign Minister Sergej Stockman asked: China calls for restraint when asked about Norway The EU-Mexico relations are on track, with China insisting that the country should seek to maintain its sovereignty in the event of an emergency.'

In [53]:
trainer.save_model("flan-t5-base-finetuned")

In [59]:
! zip -r flan-t5-base-finetuned.zip flan-t5-base-finetuned

  adding: flan-t5-base-finetuned/ (stored 0%)
  adding: flan-t5-base-finetuned/training_args.bin (deflated 51%)
  adding: flan-t5-base-finetuned/tokenizer_config.json (deflated 94%)
  adding: flan-t5-base-finetuned/spiece.model (deflated 48%)
  adding: flan-t5-base-finetuned/generation_config.json (deflated 29%)
  adding: flan-t5-base-finetuned/added_tokens.json (deflated 83%)
  adding: flan-t5-base-finetuned/config.json (deflated 62%)
  adding: flan-t5-base-finetuned/special_tokens_map.json (deflated 85%)
  adding: flan-t5-base-finetuned/model.safetensors (deflated 7%)


In [63]:
from IPython.display import FileLink
FileLink("flan-t5-base-finetuned.zip")