In [1]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from huggingface_hub import interpreter_login

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
hugging_face_dataset = "neil-code/dialogsum-test"
dataset = load_dataset(hugging_face_dataset)

Using the latest cached version of the dataset since neil-code/dialogsum-test couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\sharm\.cache\huggingface\datasets\neil-code___dialogsum-test\default\0.0.0\f0524dd2e0267dc8102109ce1b14bae8f97976d3 (last modified on Thu Oct 17 16:08:44 2024).


In [3]:
dataset['train'][0]

{'id': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smith'

In [4]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
    load_in_8bit_fp32_cpu_offload=True
)

Unused kwargs: ['load_in_8bit_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [5]:
model_name = 'microsoft/phi-2'
device_map = {"":0}
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config = bnb_config,
    token = True
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.57s/it]


In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name, 
    trust_remote_code = True,
    padding_side = "left",
    add_eos_token = True,
    add_bos_token = True,
    use_fast = False
)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
from transformers import GenerationConfig

# Define the gen function
def gen(model, prompt, max_new_tokens=1024):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.1,
    )
    with torch.no_grad():
        outputs = model.generate(**inputs, generation_config=generation_config)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [8]:
from transformers import set_seed
seed = 42
set_seed(seed)

index = 51

prompt = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

formatted_prompt = f"Instruct: Summarize the following conversation.\n{prompt}\nOutput:\n"
res = gen(original_model, formatted_prompt, 100,)

output = res[0].split('Output:\n')[1]

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{formatted_prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Summarize the following conversation.
#Person1#: What does your sister look like, Mike?
#Person2#: Well, she's tall and pretty.
#Person1#: Is she like you?
#Person2#: I suppose so. We're both friendly and easy-going.
#Person1#: Is she as clever as you?
#Person2#: No, she's not as clever as me.
#Person1#: Big head!
Output:

---------------------------------------------------------------------------------------------------
BASELINE HUMAN SUMMARY:
Mike is describing his sister to #Person1#.

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
Person 1 asks about Person 2's sister and describes her physical appearance, personality traits, and intelligence level in comparison to Person 2. Person 2 responds that their sister is tall and pretty but not as clever as them. Person 1 concludes by s

In [9]:
def create_prompt_formats(sample):
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruct: Summarize the below conversation."
    RESPONSE_KEY = "### Output:"
    END_KEY = "### End"

    blurb = f"\n{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f"{sample['dialogue']}" if sample["dialogue"] else None
    response = f"{RESPONSE_KEY}\n{sample["summary"]}"
    end = f"{END_KEY}"

    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    formatted_prompt = "\n\n".join(parts)
    sample["text"] = formatted_prompt
    return sample

In [10]:
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max length : {max_length}")
            break
        if not max_length:
            max_length = 1024
            print(f"Using default max length: {max_length}")
        return max_length


In [11]:
def preprocess_batch(batch, tokenizer, max_length):
    return tokenizer(
        batch['text'],
        max_length = max_length, 
        truncation = True
    )

In [12]:
from functools import partial

def preprocess_dataset(tokenizer:AutoTokenizer, max_length : int, seed, dataset):
    print("Preprocessing dataset......")

    dataset = dataset.map(create_prompt_formats)

    preprocessing_function = partial(preprocess_batch, max_length = max_length, tokenizer = tokenizer)

    dataset = dataset.map(
        preprocessing_function,
        batched = True,
        remove_columns = ['id', 'topic', 'dialogue', 'summary']
    )

    dataset = dataset.filter(lambda sample : len(sample["input_ids"]) < max_length)

    dataset = dataset.shuffle(seed=seed)
    return dataset

In [13]:
max_length = get_max_length(original_model)
print(max_length)

Using default max length: 1024
1024


In [14]:
train_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length, seed, dataset['validation'])

Preprocessing dataset......


Map: 100%|██████████| 1999/1999 [00:00<00:00, 17574.62 examples/s]
Map: 100%|██████████| 1999/1999 [00:01<00:00, 1362.56 examples/s]
Filter: 100%|██████████| 1999/1999 [00:00<00:00, 7369.48 examples/s]


Preprocessing dataset......


Map: 100%|██████████| 499/499 [00:00<00:00, 11636.59 examples/s]
Map: 100%|██████████| 499/499 [00:00<00:00, 1212.08 examples/s]
Filter: 100%|██████████| 499/499 [00:00<00:00, 5958.45 examples/s]


In [15]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

original_model = prepare_model_for_kbit_training(original_model)

config = LoraConfig(
    r = 32, #rank
    lora_alpha = 32,
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "dense"
    ],
    bias = "none",
    lora_dropout = 0.05, #conventional
    task_type = "CAUSAL_LM"

    )
original_model.gradient_checkpointing_enable()

peft_model = get_peft_model(original_model, config)
peft_model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 2,800,655,360 || trainable%: 0.7488


In [16]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

In [17]:
import transformers

peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps = 3,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    max_steps = 1000,
    learning_rate = 2e-4,
    optim = "paged_adamw_8bit",
    logging_steps = 50,
    logging_dir = "./logs",
    save_strategy = "steps",
    save_steps = 100,
    evaluation_strategy = 'steps',
    eval_steps = 100,
    do_eval = True,
    gradient_checkpointing = True,
    report_to = "none",
    overwrite_output_dir = True,
    group_by_length = True
    )



In [18]:
peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model = peft_model,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    args = peft_training_args,
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm = False)
)

max_steps is given, it will override any value given in num_train_epochs


In [19]:
peft_trainer.train()

  5%|▌         | 50/1000 [06:49<1:47:06,  6.76s/it]

{'loss': 1.477, 'grad_norm': 0.14942295849323273, 'learning_rate': 0.00019057171514543633, 'epoch': 0.4}


 10%|█         | 100/1000 [13:05<2:29:36,  9.97s/it]

{'loss': 1.3076, 'grad_norm': 0.14999301731586456, 'learning_rate': 0.00018054162487462388, 'epoch': 0.8}


                                                    
 10%|█         | 100/1000 [15:26<2:29:36,  9.97s/it]

{'eval_loss': 1.3367149829864502, 'eval_runtime': 140.4978, 'eval_samples_per_second': 3.552, 'eval_steps_per_second': 0.448, 'epoch': 0.8}


 15%|█▌        | 150/1000 [21:35<1:18:04,  5.51s/it] 

{'loss': 1.2937, 'grad_norm': 0.15707220137119293, 'learning_rate': 0.00017051153460381145, 'epoch': 1.2}


 20%|██        | 200/1000 [28:10<1:43:20,  7.75s/it]

{'loss': 1.3065, 'grad_norm': 0.13852399587631226, 'learning_rate': 0.00016048144433299902, 'epoch': 1.6}


                                                    
 20%|██        | 200/1000 [30:07<1:43:20,  7.75s/it]

{'eval_loss': 1.3244941234588623, 'eval_runtime': 116.9292, 'eval_samples_per_second': 4.268, 'eval_steps_per_second': 0.539, 'epoch': 1.6}


 25%|██▌       | 250/1000 [36:05<2:03:02,  9.84s/it]

{'loss': 1.2449, 'grad_norm': 0.13445858657360077, 'learning_rate': 0.00015045135406218657, 'epoch': 2.0}


 30%|███       | 300/1000 [42:46<1:17:57,  6.68s/it]

{'loss': 1.2994, 'grad_norm': 0.15030372142791748, 'learning_rate': 0.00014042126379137414, 'epoch': 2.4}


                                                    
 30%|███       | 300/1000 [44:44<1:17:57,  6.68s/it]

{'eval_loss': 1.3230178356170654, 'eval_runtime': 118.149, 'eval_samples_per_second': 4.223, 'eval_steps_per_second': 0.533, 'epoch': 2.4}


 35%|███▌      | 350/1000 [51:09<1:46:06,  9.79s/it]

{'loss': 1.2623, 'grad_norm': 0.15852603316307068, 'learning_rate': 0.00013039117352056168, 'epoch': 2.81}


 40%|████      | 400/1000 [57:04<56:03,  5.61s/it]  

{'loss': 1.241, 'grad_norm': 0.1846274584531784, 'learning_rate': 0.00012036108324974927, 'epoch': 3.21}


                                                  


{'eval_loss': 1.3211191892623901, 'eval_runtime': 117.9497, 'eval_samples_per_second': 4.231, 'eval_steps_per_second': 0.534, 'epoch': 3.21}


 45%|████▌     | 450/1000 [1:05:38<1:11:35,  7.81s/it]

{'loss': 1.2621, 'grad_norm': 0.16903765499591827, 'learning_rate': 0.00011033099297893681, 'epoch': 3.61}


 50%|█████     | 500/1000 [1:11:30<1:23:12,  9.99s/it]

{'loss': 1.2086, 'grad_norm': 0.16520389914512634, 'learning_rate': 0.00010030090270812438, 'epoch': 4.01}


                                                      
 50%|█████     | 500/1000 [1:13:28<1:23:12,  9.99s/it]

{'eval_loss': 1.3232418298721313, 'eval_runtime': 117.5302, 'eval_samples_per_second': 4.246, 'eval_steps_per_second': 0.536, 'epoch': 4.01}


 55%|█████▌    | 550/1000 [1:20:06<48:26,  6.46s/it]  

{'loss': 1.2554, 'grad_norm': 0.19089876115322113, 'learning_rate': 9.027081243731194e-05, 'epoch': 4.41}


 60%|██████    | 600/1000 [1:26:19<1:01:46,  9.27s/it]

{'loss': 1.2118, 'grad_norm': 0.21383000910282135, 'learning_rate': 8.024072216649951e-05, 'epoch': 4.81}


                                                      
 60%|██████    | 600/1000 [1:28:17<1:01:46,  9.27s/it]

{'eval_loss': 1.3225795030593872, 'eval_runtime': 117.5416, 'eval_samples_per_second': 4.245, 'eval_steps_per_second': 0.536, 'epoch': 4.81}


 65%|██████▌   | 650/1000 [1:34:17<32:29,  5.57s/it]  

{'loss': 1.1942, 'grad_norm': 0.22171078622341156, 'learning_rate': 7.021063189568707e-05, 'epoch': 5.21}


 70%|███████   | 700/1000 [1:40:55<39:40,  7.93s/it]  

{'loss': 1.2195, 'grad_norm': 0.2069230079650879, 'learning_rate': 6.018054162487463e-05, 'epoch': 5.61}


                                                    
 70%|███████   | 700/1000 [1:42:53<39:40,  7.93s/it]

{'eval_loss': 1.3229470252990723, 'eval_runtime': 118.4158, 'eval_samples_per_second': 4.214, 'eval_steps_per_second': 0.532, 'epoch': 5.61}


 75%|███████▌  | 750/1000 [1:48:49<43:00, 10.32s/it]  

{'loss': 1.1663, 'grad_norm': 0.18576709926128387, 'learning_rate': 5.015045135406219e-05, 'epoch': 6.01}


 80%|████████  | 800/1000 [1:55:22<21:37,  6.49s/it]

{'loss': 1.2166, 'grad_norm': 0.22938580811023712, 'learning_rate': 4.0120361083249755e-05, 'epoch': 6.41}


                                                    
 80%|████████  | 800/1000 [1:57:20<21:37,  6.49s/it]

{'eval_loss': 1.3249585628509521, 'eval_runtime': 117.8948, 'eval_samples_per_second': 4.233, 'eval_steps_per_second': 0.534, 'epoch': 6.41}


 85%|████████▌ | 850/1000 [2:03:42<23:06,  9.25s/it]  

{'loss': 1.1719, 'grad_norm': 0.24068766832351685, 'learning_rate': 3.0090270812437316e-05, 'epoch': 6.81}


 90%|█████████ | 900/1000 [2:09:42<09:01,  5.42s/it]

{'loss': 1.1688, 'grad_norm': 0.2611583173274994, 'learning_rate': 2.0060180541624878e-05, 'epoch': 7.21}


                                                    
 90%|█████████ | 900/1000 [2:11:40<09:01,  5.42s/it]

{'eval_loss': 1.3280491828918457, 'eval_runtime': 118.1582, 'eval_samples_per_second': 4.223, 'eval_steps_per_second': 0.533, 'epoch': 7.21}


 95%|█████████▌| 950/1000 [2:18:17<06:13,  7.47s/it]  

{'loss': 1.194, 'grad_norm': 0.2562006413936615, 'learning_rate': 1.0030090270812439e-05, 'epoch': 7.62}


100%|██████████| 1000/1000 [2:24:15<00:00, 11.27s/it]

{'loss': 1.1244, 'grad_norm': 0.2056654542684555, 'learning_rate': 0.0, 'epoch': 8.02}


                                                     
100%|██████████| 1000/1000 [2:26:13<00:00, 11.27s/it]

{'eval_loss': 1.3294334411621094, 'eval_runtime': 118.2628, 'eval_samples_per_second': 4.219, 'eval_steps_per_second': 0.533, 'epoch': 8.02}


100%|██████████| 1000/1000 [2:26:15<00:00,  8.78s/it]

{'train_runtime': 8775.3003, 'train_samples_per_second': 1.823, 'train_steps_per_second': 0.114, 'train_loss': 1.2413030166625976, 'epoch': 8.02}





TrainOutput(global_step=1000, training_loss=1.2413030166625976, metrics={'train_runtime': 8775.3003, 'train_samples_per_second': 1.823, 'train_steps_per_second': 0.114, 'total_flos': 7.450303140421632e+16, 'train_loss': 1.2413030166625976, 'epoch': 8.016032064128256})

In [20]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model_id = "microsoft/phi-2"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map = device_map,
    quantization_config = bnb_config,
    trust_remote_code = True,
    use_auth_token = True,
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:46<00:00, 23.43s/it]


In [21]:
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token = True,
    use_fast = False
    )

eval_tokenizer.pad_token = eval_tokenizer.eos_token

In [23]:
from peft import PeftModel

peft_model = PeftModel.from_pretrained(
    base_model,
    r"peft-dialogue-summary-training-1729186817\checkpoint-1000",
    torch_dtype = torch.float16,
    is_trainable = False
    )

In [24]:
from transformers import set_seed
set_seed(seed)

index = 5
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"Instruct: Summarize the following conversation.\n{dialogue}\nOutput:\n"

peft_model_res = gen(peft_model,prompt,100,)
peft_model_output = peft_model_res[0].split('Output:\n')[1]
#print(peft_model_output)
prefix, success, result = peft_model_output.partition('###')

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'PEFT MODEL:\n{prefix}')

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Summarize the following conversation.
#Person1#: You're finally here! What took so long?
#Person2#: I got stuck in traffic again. There was a terrible traffic jam near the Carrefour intersection.
#Person1#: It's always rather congested down there during rush hour. Maybe you should try to find a different route to get home.
#Person2#: I don't think it can be avoided, to be honest.
#Person1#: perhaps it would be better if you started taking public transport system to work.
#Person2#: I think it's something that I'll have to consider. The public transport system is pretty good.
#Person1#: It would be better for the environment, too.
#Person2#: I know. I feel bad about how much my car is adding to the pollution problem in this city.
#Person1#: Taking the subway would be a lot less stressful than driving as well.
#Person2#: The only problem is that I'm going to really 

In [25]:
original_model = AutoModelForCausalLM.from_pretrained(base_model_id, 
                                                      device_map='auto',
                                                      quantization_config=bnb_config,
                                                      trust_remote_code=True,
                                                      use_auth_token=True)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]


In [26]:
import pandas as pd

dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    human_baseline_text_output = human_baseline_summaries[idx]
    prompt = f"Instruct: Summarize the following conversation.\n{dialogue}\nOutput:\n"
    
    original_model_res = gen(original_model,prompt,100,)
    original_model_text_output = original_model_res[0].split('Output:\n')[1]
    
    peft_model_res = gen(peft_model,prompt,100,)
    peft_model_output = peft_model_res[0].split('Output:\n')[1]
    print(peft_model_output)
    peft_model_text_output, success, result = peft_model_output.partition('###')

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))
 
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


#Person1# needs #Person2# to take a dictation for him to send out an announcement restricting office communications to email and official memos, which will apply to both internal and external communications.

### End of Output

#Person1# thinks instant messaging wastes too much time. He asks #Person2# about its application. #Person1# reminds #Person2# to distribute the memo to all employees before 4pm.

### End of Input

#Person


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


#Person1# asks Ms. Dawson to take a dictation for #Person1# to send out as an intra-office memorandum to all employees as a restriction on Instant Messaging. #Person2# suggests that some employees use Instant Messaging to communicate with their clients, so #Person1# insists it should apply to all communications. #Person2# confirms it and then continues taking the dictation.

End of Output.

### End of Text

### Output:


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


#Person1# asks Ms. Dawson to take a dictation for #Person1#. #Person1# requests Ms. Dawson to write a memo about a new policy restricting communications to email and official memos. #Person1# thinks the new policy will waste time if used by employees. #Person1# tells Ms. Dawson to keep updating the memo and gives some other information.

### End of Output

### Instruction: Summarize the below conversation.

#Person


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


#Person1# suggests #Person2# take public transportation or bike instead of driving because #Person2#'s car contributes to pollution and stress. #Person2# agrees.

End of output.



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


#Person2# gets stuck in traffic and #Person1# suggests finding a new way to get home, like taking public transportation or biking. #Person2# decides to stop driving to work because it's not good for #Person2# or the environment.

#Person3# joins the conversation.
#Person3#: Why do you need to decide whether to stop driving or not?
#Person1#: Because #Person2#'s car adds to the pollution


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


#Person2# got stuck in traffic and #Person1# suggests #Person2# starts taking public transportation or biking to work instead of driving. #Person2# agrees and decides to stop driving to work.

### End of Output

#Person1#: You're finally here! What took so long?
#Person2#: I got stuck in traffic again. There was a terrible traffic jam near the Carrefour intersection.
#Person1#: It's


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Masha tells Kate that her parents are getting divorced after two-month separation. They've worked out everything except the divorce itself.

#Person1#: That's the change from all the back stepping we usually hear about. Well, I still can't believe it, Masha and Hero, the perfect couple. When would they divorce be final?

#Person2#: Early in the New Year I guess.



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Masha tells Kate her parents Masha and Hero will divorce after two months of separation.

#Person1#'s parents decided to have a quiet divorce with no fighting over details. #Person1# is surprised.

#Person1# tells Kate that the divorce will be finalized early in the new year.

###End of Output.



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Masha tells Kate that her husband Hero is divorcing her after two months of separation without any conflict over custody of their children.

End of output.



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Brian invites #Person1# to have a dance at his birthday party. They compliment each other after having a drink together.

### End of Output

The conversation between Brian and #Person1# describes their compliments on each other after having a drink together.

### End of Input

### 


Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,Ms. Dawson instructed Mr. Johnson to summarize...,#Person1# needs #Person2# to take a dictation ...
1,In order to prevent employees from wasting tim...,Ms. Dawson instructed Person 2 to summarize th...,#Person1# asks Ms. Dawson to take a dictation ...
2,Ms. Dawson takes a dictation for #Person1# abo...,Ms. Dawson informs all staff members about a n...,#Person1# asks Ms. Dawson to take a dictation ...
3,#Person2# arrives late because of traffic jam....,\nThe conversation between Person 1 and Person...,#Person1# suggests #Person2# take public trans...
4,#Person2# decides to follow #Person1#'s sugges...,# Person1 and #Person2# are discussing whether...,#Person2# gets stuck in traffic and #Person1# ...
5,#Person2# complains to #Person1# about the tra...,"Person 2 has been caught in traffic, and they ...",#Person2# got stuck in traffic and #Person1# s...
6,#Person1# tells Kate that Masha and Hero get d...,Kate informed Person2 that Masha and Hero have...,Masha tells Kate that her parents are getting ...
7,#Person1# tells Kate that Masha and Hero are g...,Kate informs Person2 that Masha and Hero have ...,Masha tells Kate her parents Masha and Hero wi...
8,#Person1# and Kate talk about the divorce betw...,Kate informs Person2 that Masha and Hero have ...,Masha tells Kate that her husband Hero is divo...
9,#Person1# and Brian are at the birthday party ...,Brian thanked his friend for remembering his b...,Brian invites #Person1# to have a dance at his...


In [27]:
import evaluate

rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('PEFT MODEL:')
print(peft_model_results)

print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

ORIGINAL MODEL:
{'rouge1': 0.25399343006628494, 'rouge2': 0.07524937651267982, 'rougeL': 0.17981781715544354, 'rougeLsum': 0.17890899386281736}
PEFT MODEL:
{'rouge1': 0.32593791675457295, 'rouge2': 0.0989859486679662, 'rougeL': 0.22189633024448713, 'rougeLsum': 0.2443493201016658}
Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL
rouge1: 7.19%
rouge2: 2.37%
rougeL: 4.21%
rougeLsum: 6.54%
