# SAMSum summarization Mistral 7B LoRA


In [None]:
!pip install rouge_score
!pip install datasets
!pip install transformers
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install peft
!pip install trl
!pip install tqdm
!pip install pd
!pip install huggingface_hub
# !pip install dataclasses
# !pip install typing

Looking in indexes: https://pypi.org/simple/


In [None]:
# import tensorflow as tf
# from tensorflow.contrib import tpu
# from tensorflow.contrib.cluster_resolver import TPUClusterResolver

In [None]:
import torch
import transformers
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline, BitsAndBytesConfig, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from trl import SFTTrainer

import pandas as pd

import time
from tqdm import tqdm
# from dataclasses import dataclass, field
# from typing import Optional, Sequence, Dict

In [None]:
# Hugging Face Hub identifiers of the dataset and of the base model to fine-tune.
DATASET = 'samsum'
# TODO: Change model
MODEL_CHECKPOINT = "mistralai/Mistral-7B-v0.1"

## 1. Data preprocessing

In [None]:
# Load every split of the dataset named by DATASET.
data = load_dataset(DATASET)

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:
# Keep only the first 7366 training examples (half of the 14732 rows reported above);
# the test and validation splits are left untouched.
data["train"] = data["train"].select(range(7366))

In [None]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 7366
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:
print(data["train"][0]["dialogue"])

Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)


In [None]:
print(data["train"][0]["summary"])

Amanda baked cookies and will bring Jerry some tomorrow.


In [None]:
MODEL_CHECKPOINT


'mistralai/Mistral-7B-v0.1'

In [None]:
# Load the tokenizer published with the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Append padding tokens after the real tokens (right padding).
tokenizer.padding_side = "right"

In [None]:
# Use the unknown token as the padding token rather than EOS (the alternative kept
# commented out below) — presumably so that real EOS tokens are not treated as padding.
# tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token = tokenizer.unk_token

In [None]:
# # TODO: add prompt formatting
# def preprocess_data(batch):
#     start_prompt = 'Summarize the following conversation.\n\n'
#     end_prompt = '\n\nSummary: '
#     prompts = [start_prompt + dialogue + end_prompt for dialogue in batch["dialogue"]]
#     return {"text": prompts}

In [None]:
data["train"][0]

{'id': '13818513',
 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)",
 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}

In [None]:
# def transform_dialogue_to_prompt(dialogue):
#   prompt = f"""<s>[INST] You are a helpful assistant. Your task is to generate the following dialogue summarization:
# {dialogue}[/INST]
# </s>"""
#   return prompt


In [None]:
# print(transform_dialogue_to_prompt(data["train"][0]["dialogue"]))

In [None]:
print(data["train"][0]["dialogue"])

Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)


In [None]:
print(data["train"][0]["summary"])

Amanda baked cookies and will bring Jerry some tomorrow.


In [None]:
def tokenize_data(batch):
    """Turn a batch of dialogue/summary pairs into causal-LM training examples.

    A decoder-only model learns from a single token stream, so each example is
    laid out as ``<dialogue tokens><summary tokens><eos>`` and right-padded to a
    fixed length. The previous version put only the dialogue in ``input_ids``
    and a separately padded 128-token summary in ``labels``; those labels did
    not line up with the 1024-token inputs, and the summary text was therefore
    never part of the sequence the model was trained on.

    Args:
        batch: mapping with a ``"dialogue"`` and a ``"summary"`` list of strings
            (the batched form passed by ``Dataset.map(..., batched=True)``).

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; every entry
        is a list of 1024-long integer lists. ``labels`` repeats the summary
        tokens (plus EOS) at their positions and holds -100 on the dialogue and
        padding positions so a loss that honours the labels scores only the
        summary.

    Note:
        The Trainer cell of this notebook collates with
        ``DataCollatorForLanguageModeling(tokenizer, mlm=False)``, which
        (per the transformers documentation) rebuilds ``labels`` from
        ``input_ids``. With that collator the loss covers dialogue and summary
        alike; the masked labels returned here take effect with a collator
        that keeps the provided labels.
    """
    max_length = 1024          # total sequence budget, same as the original input length
    max_summary_tokens = 128   # part of the budget reserved for the summary, EOS included
    ignore_index = -100        # label value excluded from the cross-entropy loss

    # No padding / tensors here: sequences are assembled and padded by hand below.
    dialogue_ids = tokenizer(
        batch["dialogue"],
        max_length=max_length - max_summary_tokens,
        truncation=True,
    )["input_ids"]
    summary_ids = tokenizer(
        batch["summary"],
        max_length=max_summary_tokens - 1,  # leave room for the EOS token
        truncation=True,
        add_special_tokens=False,           # BOS is already at the start of the dialogue
    )["input_ids"]

    encodings = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt, target in zip(dialogue_ids, summary_ids):
        target = target + [tokenizer.eos_token_id]
        n_real = len(prompt) + len(target)
        n_pad = max_length - n_real
        encodings["input_ids"].append(prompt + target + [tokenizer.pad_token_id] * n_pad)
        encodings["attention_mask"].append([1] * n_real + [0] * n_pad)
        encodings["labels"].append(
            [ignore_index] * len(prompt) + target + [ignore_index] * n_pad
        )
    return encodings

In [None]:
# # TODO: Add lengths to the training
# def preprocess_data(batch):
#   input_encodings = tokenizer(batch["dialogue"], max_length=1024, truncation=True,
#                         padding="max_length", return_tensors="pt")
#   with tokenizer.as_target_tokenizer():
#     target_encodings = tokenizer(batch["summary"], max_length=128, truncation=True,
#                         padding="max_length", return_tensors="pt")
#   return {"input_ids": input_encodings["input_ids"],
#           "attention_mask": input_encodings["attention_mask"],
#           "labels": target_encodings["input_ids"]}

In [None]:
# Tokenize every split in batches and drop the raw columns, so only the fields
# returned by tokenize_data remain.
data_tokenized = data.map(tokenize_data, batched=True, remove_columns=["id", "dialogue", "summary"])

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
data_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7366
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [None]:
# data_tokenized["train"]

In [None]:
# print(data_tokenized["train"][0]["test"])

In [None]:
# print(data_tokenized["train"][0]["text"])

In [None]:
# Have the tokenized datasets hand out torch tensors when indexed.
data_tokenized.set_format(type="torch")

## 2. Model

In [None]:
# Switches consumed by the bitsandbytes quantization config.
use_4bit = True
compute_dtype = torch.float16

In [None]:
# Quantization recipe: NF4 4-bit weights, single (not double) quantization,
# with computations carried out in compute_dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=compute_dtype,
    load_in_4bit=use_4bit,
)

In [None]:
MODEL_CHECKPOINT

'mistralai/Mistral-7B-v0.1'

In [None]:
# Automatic device placement (uses the available GPU resources, if any).
device_map = "auto"
# model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# Load the base checkpoint quantized to 4 bits according to bnb_config.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_CHECKPOINT,
    device_map=device_map,
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Configure the pad token in the model so that it matches the tokenizer's pad token id.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# TODO: should AutoModelForSeq2SeqLM be used here instead?
# LoRA settings: rank-32 adapters (alpha 64, dropout 0.1, no bias terms) attached to
# the attention projections, the MLP projections and the LM head.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # attention projections
        "gate_proj", "up_proj", "down_proj",      # MLP projections
        "lm_head",                                # output head
    ],
)

In [None]:
# The base model was loaded in 4-bit, so run PEFT's preparation step for k-bit
# training first (see the PEFT quantization guide for what it adjusts), then
# attach the LoRA adapters described by peft_config.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [None]:
model.print_trainable_parameters()

trainable params: 85,041,152 || all params: 7,326,773,248 || trainable%: 1.1606903765339511


In [None]:
# TrainingArguments is already imported in the notebook's import cell.
run_name = "peft-dialogue-summary-training"
# The Unix-timestamp suffix gives every run its own checkpoint directory.
output_dir = f'./peft-dialogue-summary-training-{int(time.time())}'

training_arguments = TrainingArguments(
    output_dir=output_dir,
    run_name=run_name,
    # report_to='wandb',
    # --- schedule ---
    num_train_epochs=3,
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    warmup_steps=50,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- memory ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=6,  # Increase, if still giving OOM error
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",  # paged_adamw_8bit for lower memory
    # fp16=True, # Enabled causes error: ValueError: Attempting to unscale FP16 gradients.
    # --- checkpointing / logging ---
    save_steps=500,
    # logging_steps=600,
    # evaluation_strategy="steps",
)

In [None]:
data_tokenized

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 7366
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})

In [None]:
data_tokenized["train"][0]

{'input_ids': tensor([    1, 26413, 28747,  ...,     0,     0,     0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'labels': tensor([    1, 26413,   287,  6343, 15648,   304,   622,  2968, 19176,   741,
         10759, 28723,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,    

In [None]:
# Assemble the Hugging Face Trainer for the LoRA-wrapped model.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=data_tokenized["train"],
    # eval_dataset=data_tokenized["validation"], # leave out if you have low VRAM and get OOM errors
    tokenizer=tokenizer, # maybe remove this
    # mlm=False selects causal (next-token) language modelling instead of masked LM.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
500,2.1658
1000,2.0938
1500,2.0675
2000,2.0309
2500,2.0148
3000,1.9705
3500,1.964




TrainOutput(global_step=3681, training_loss=2.0398051938878488, metrics={'train_runtime': 4487.7307, 'train_samples_per_second': 4.924, 'train_steps_per_second': 0.82, 'total_flos': 9.764306396978872e+17, 'train_loss': 2.0398051938878488, 'epoch': 3.0})

In [None]:
# Local directory that receives the trained PEFT model and the tokenizer files.
peft_model_path="./peft-dialogue-summary-mistral-checkpoint-local"

# Save the model held by the trainer, then the tokenizer next to it; the last call's
# return value (the written tokenizer file paths) is displayed as the cell output.
trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



('./peft-dialogue-summary-mistral-checkpoint-local/tokenizer_config.json',
 './peft-dialogue-summary-mistral-checkpoint-local/special_tokens_map.json',
 './peft-dialogue-summary-mistral-checkpoint-local/tokenizer.model',
 './peft-dialogue-summary-mistral-checkpoint-local/added_tokens.json',
 './peft-dialogue-summary-mistral-checkpoint-local/tokenizer.json')

In [None]:
MODEL_CHECKPOINT

'mistralai/Mistral-7B-v0.1'

In [None]:
peft_model_path

'./peft-dialogue-summary-mistral-checkpoint-local'

In [None]:
# Build a standalone fine-tuned model:
#   1. load the base checkpoint with default precision,
#   2. load and activate the saved PEFT adapter on top of it,
#   3. merge the adapter into the base weights and drop the PEFT wrapper.
# model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# adapter = "Enter the path of your LoRA adapter"
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT),
    peft_model_path,
).merge_and_unload()

# Write the merged model (safetensors format) and the custom tokenizer to one directory.
model_dir = "./models/merged-peft-dialogue-summary-mistral/"
model.save_pretrained(model_dir, safe_serialization=True)
tokenizer.save_pretrained(model_dir)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('./models/merged-peft-dialogue-summary-mistral/tokenizer_config.json',
 './models/merged-peft-dialogue-summary-mistral/special_tokens_map.json',
 './models/merged-peft-dialogue-summary-mistral/tokenizer.model',
 './models/merged-peft-dialogue-summary-mistral/added_tokens.json',
 './models/merged-peft-dialogue-summary-mistral/tokenizer.json')

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Upload the merged model to the Hugging Face Hub repository of this name
# (needs the login performed in the previous cell).
model.push_to_hub("Mistral-7B-v0.1-Samsum-Dialogue-Summary")

model-00005-of-00006.safetensors:   0%|          | 0.00/4.83G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.25G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/msznajder/Mistral-7B-v0.1-Samsum-Dialogue-Summary/commit/44dd27dd0781f2ed197bd500959dd9052e02705b', commit_message='Upload MistralForCausalLM', commit_description='', oid='44dd27dd0781f2ed197bd500959dd9052e02705b', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer to the same Hub repository as the model.
tokenizer.push_to_hub("Mistral-7B-v0.1-Samsum-Dialogue-Summary")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/msznajder/Mistral-7B-v0.1-Samsum-Dialogue-Summary/commit/2dfcf707dafcbfdc34a80ef8ac1431496ce13579', commit_message='Upload tokenizer', commit_description='', oid='2dfcf707dafcbfdc34a80ef8ac1431496ce13579', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Reload the merged, fine-tuned model from the Hub repository it was pushed to.
# (The stray second closing quote made this line a syntax error.)
model = AutoModelForCausalLM.from_pretrained("msznajder/Mistral-7B-v0.1-Samsum-Dialogue-Summary")

In [None]:
# Reload the tokenizer from the same Hub repository.
# (The stray second closing quote made this line a syntax error.)
tokenizer = AutoTokenizer.from_pretrained("msznajder/Mistral-7B-v0.1-Samsum-Dialogue-Summary")

## 3. Evaluation

In [None]:
# TODO: remove this
# Keeps only the examples whose index is a multiple of 100. Re-running the cell
# filters the already-filtered data again.
data_tokenized = data_tokenized.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [None]:
# TODO: Reimplement this evaluation - like in the book plus like in the LLM course

In [None]:
# Run on the GPU whenever PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def chunks(list_of_elements, batch_size):
    """Lazily split ``list_of_elements`` into consecutive slices of ``batch_size`` items.

    The final slice is shorter when the length is not a multiple of
    ``batch_size``; an empty input yields nothing.
    """
    starts = range(0, len(list_of_elements), batch_size)
    yield from (list_of_elements[start : start + batch_size] for start in starts)

def evaluate_summaries_pegasus(dataset, metric, model, tokenizer,
                               batch_size=16, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """Generate a summary for every row of ``dataset`` and score them with ``metric``.

    Works for encoder-decoder models (the original use case) and for
    decoder-only models such as Mistral. Decoder-only generation needs three
    adjustments, applied only when ``model.config.is_encoder_decoder`` is falsy:

    * prompts are left-padded for the duration of the call (the tokenizer's
      ``padding_side`` is restored afterwards), so generation continues from
      real tokens rather than from padding;
    * the length limit is expressed as ``max_new_tokens`` because ``max_length``
      also counts the prompt, which can be up to 1024 tokens long;
    * the prompt tokens are stripped from the generated sequences, which would
      otherwise echo the whole dialogue into the prediction.

    Args:
        dataset: object whose ``dataset[column]`` returns the list of strings
            stored in that column.
        metric: object providing ``add_batch(predictions=, references=)`` and
            ``compute()``.
        model: model exposing ``generate``; it must already live on ``device``.
        tokenizer: tokenizer matching ``model``.
        batch_size: number of texts generated per forward pass.
        device: device the input tensors are moved to.
        column_text: name of the column holding the texts to summarize.
        column_summary: name of the column holding the reference summaries.

    Returns:
        Whatever ``metric.compute()`` returns.
    """
    article_batches = list(chunks(dataset[column_text], batch_size))
    target_batches = list(chunks(dataset[column_summary], batch_size))

    is_encoder_decoder = getattr(getattr(model, "config", None),
                                 "is_encoder_decoder", False)
    if is_encoder_decoder:
        length_kwargs = {"max_length": 128}       # limit on the decoder output
    else:
        length_kwargs = {"max_new_tokens": 128}   # limit on the continuation only

    original_padding_side = tokenizer.padding_side
    if not is_encoder_decoder:
        tokenizer.padding_side = "left"
    try:
        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):

            # Pad only up to the longest text in the batch; the attention mask
            # hides the padding either way.
            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="longest", return_tensors="pt")
            input_ids = inputs["input_ids"].to(device)

            summaries = model.generate(input_ids=input_ids,
                             attention_mask=inputs["attention_mask"].to(device),
                             length_penalty=0.8, num_beams=8,
                             pad_token_id=tokenizer.pad_token_id,
                             **length_kwargs)

            if not is_encoder_decoder:
                # Output is prompt + continuation; keep the continuation only.
                summaries = summaries[:, input_ids.shape[1]:]

            decoded_summaries = [
                tokenizer.decode(s, skip_special_tokens=True,
                                 clean_up_tokenization_spaces=True).replace("<n>", " ")
                for s in summaries]
            metric.add_batch(predictions=decoded_summaries, references=target_batch)
    finally:
        tokenizer.padding_side = original_padding_side

    score = metric.compute()
    return score

In [None]:
# ROUGE scorer plus the names of the variants reported in the result tables.
rouge_metric = load_metric("rouge", cache_dir=None)
rouge_names = ["rouge" + variant for variant in ("1", "2", "L", "Lsum")]

In [None]:
data_tokenized

In [None]:
# Score the fine-tuned model on the raw test split. The tokenized dataset cannot be
# used here: its "dialogue" and "summary" columns were removed when it was built,
# and the evaluation helper tokenizes the text itself. The model is moved to
# `device` because the helper places its input tensors there.
score = evaluate_summaries_pegasus(
    data["test"], rouge_metric, model.to(device), tokenizer, # data["test"].select(range(0, 20))
    batch_size=2, column_text="dialogue", column_summary="summary"
)

In [None]:
# Score the fine-tuned model on the raw test split; the tokenized dataset no longer
# has the "dialogue" and "summary" text columns this helper reads.
score = evaluate_summaries_pegasus(
    data["test"], rouge_metric, model.to(device), tokenizer, # data["test"].select(range(0, 20))
    batch_size=2, column_text="dialogue", column_summary="summary"
)

In [None]:
# Mid F-measure of every ROUGE variant for the fine-tuned model. The row label names
# the fine-tuned model explicitly so this table cannot be confused with the
# base-model table further down.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
pd.DataFrame(rouge_dict, index=["Mistral 7B (fine-tuned)"])

In [None]:
MODEL_CHECKPOINT

In [None]:
# Load the base checkpoint again, without any adapter, to serve as the baseline.
raw_model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Baseline: score the base model on the raw test split.
score = evaluate_summaries_pegasus(
    dataset=data["test"],  # data["test"].select(range(0, 20))
    metric=rouge_metric,
    model=raw_model.to(device),
    tokenizer=tokenizer,
    batch_size=2,
    column_text="dialogue",
    column_summary="summary",
)

In [None]:
# Mid F-measure of every ROUGE variant for the base model. The row label names the
# base model explicitly so this table cannot be confused with the fine-tuned one.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
pd.DataFrame(rouge_dict, index=["Mistral 7B (base)"])

In [None]:
# Only show transformers log messages of level ERROR from here on.
transformers.logging.set_verbosity_error()

In [None]:
# Beam-search settings. `max_new_tokens` limits only the generated summary; with a
# decoder-only model `max_length` would also count the (long) dialogue prompt.
gen_kwargs = {"length_penalty": 0.8, "num_beams": 8, "max_new_tokens": 128}
sample_text = data["test"][0]["dialogue"]
reference = data["test"][0]["summary"]

# Mistral is a causal LM, so it is served by the "text-generation" pipeline (the
# "summarization" task is meant for sequence-to-sequence models). `model` is the
# merged fine-tuned model; the previously referenced `peft_model` was never defined.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)


def pipe(text, **generate_kwargs):
    """Summarize ``text`` with the fine-tuned model.

    Thin wrapper that keeps the summarization-pipeline calling convention used in
    this notebook: returns a list of dicts with a ``"summary_text"`` key.
    ``return_full_text=False`` drops the prompt from the generated text.
    """
    outputs = generator(text, return_full_text=False, **generate_kwargs)
    return [{"summary_text": output["generated_text"].strip()} for output in outputs]


print("Dialogue:")
print(sample_text)
print("\nReference Summary:")
print(reference)
print("\nModel Summary:")
print(pipe(sample_text, **gen_kwargs)[0]["summary_text"])

In [None]:
# Ad-hoc dialogue, one utterance per line with a trailing newline, summarized with
# the same generation settings as the sample above.
custom_dialogue = "\n".join([
    "Thom: Hi guys, have you heard of transformers?",
    "Lewis: Yes, I used them recently!",
    "Leandro: Indeed, there is a great library by Hugging Face.",
    "Thom: I know, I helped build it ;)",
    "Lewis: Cool, maybe we should write a book about it. What do you think?",
    "Leandro: Great idea, how hard can it be?!",
    "Thom: I am in!",
    "Lewis: Awesome, let's do it together!",
]) + "\n"
print(pipe(custom_dialogue, **gen_kwargs)[0]["summary_text"])