In [9]:
# Standard-library modules needed for the import-path tweak below.
import sys 
import os
# Put src/Measurement on the module search path before the project imports that follow.
sys.path.append(os.path.abspath("src/Measurement"))
# NOTE(review): importing a module called `globals` rebinds the name of the
# built-in `globals()` function for the rest of this notebook.
import globals
from src.Measurement.measurement import Measurement
from src.utils import *
# Presumably populates the module-level settings (e.g. API_URL, printed on the
# next line) -- confirm against the globals module.
globals.init()
print(globals.API_URL)

import os
import sys
import argparse
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np


from datasets import concatenate_datasets, load_dataset, DatasetDict, Dataset
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import platform
import evaluate
from transformers import AutoTokenizer, BartModel, GPT2Model, T5ForConditionalGeneration
print(platform.platform())
from transformers import pipeline


from huggingface_hub import login

macOS-14.1-arm64-arm-64bit
https://api-inference.huggingface.co/models/s-nlp/roberta-base-formality-ranker
macOS-14.1-arm64-arm-64bit


In [10]:
def init():
    """Log this session in to the Hugging Face Hub.

    The access token is read from ``globals.hug_token`` and handed to
    ``huggingface_hub.login``.
    """
    hub_token = globals.hug_token
    login(token=hub_token)
    

def make_output_base_path_str():
    """Build the run's output directory path and confirm reuse of an existing one.

    The path is ``<output_dir>/<base_model>/<dataset>``, with all three parts
    read from the module-level ``args`` mapping. If that directory already
    exists the user is asked on stdin whether to overwrite it; any answer
    other than "y" (case-insensitive) terminates via ``sys.exit()``.

    Returns:
        The output directory path as a string.
    """
    output_root = args['output_dir']
    model_name = args['base_model']
    dataset_name = args['dataset']
    path = os.path.join(output_root, model_name, dataset_name)
    print(f"Output directory path: {path}")

    # Nothing to confirm when the directory does not exist yet.
    if not os.path.isdir(path):
        return path

    print(f"The supplied output directory, {[path]}, already exists. Do you wish to overwrite this directory's contents? [y/n]: ")
    answer = str(input()).lower()
    if answer != "y":
        sys.exit()
    return path

In [11]:
def preprocess_text_function(examples, tokenizer, prefix = "summarize: "):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping with a "document" and a "summary" column.
        tokenizer: Callable tokenizer; invoked once on the prefixed documents
            and once with ``text_target`` set to the summaries.
        prefix: Text prepended to every document before tokenization.

    Returns:
        The tokenizer output for the documents, with the summaries'
        ``input_ids`` stored under the "labels" key.

    Truncation is explicitly disabled in both tokenizer calls, so sequences
    longer than the configured maximum come back at full length.
    """
    prefixed_documents = []
    for doc in examples["document"]:
        prefixed_documents.append(prefix + doc)

    # Length limits are looked up per dataset in the project's globals module.
    source_limit = globals.max_src_length[args['dataset']]
    model_inputs = tokenizer(prefixed_documents, max_length=source_limit, truncation=False)

    target_limit = globals.max_target_length[args['dataset']]
    target_encoding = tokenizer(text_target=examples["summary"], max_length=target_limit, truncation=False)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


def load_huggingface_dataset(data_config):
    """Load a Hub dataset, merge all of its splits and normalise the columns.

    Args:
        data_config: Positional arguments forwarded to ``load_dataset``.

    Returns:
        One dataset holding every split concatenated together. Columns listed
        in ``globals.new_col_names`` are renamed accordingly, and only
        'document' and 'summary' are kept (plus 'id' unless the module-level
        ``args['dataset']`` is 'reddit').
    """
    splits = load_dataset(*data_config)
    dataset = concatenate_datasets(list(splits.values()))

    rename_map = globals.new_col_names
    # Iterate over the column names as they were before any renaming.
    for original_name in dataset.column_names:
        if original_name not in rename_map:
            continue
        dataset = dataset.rename_column(original_name, rename_map[original_name])

    if args['dataset'] == 'reddit':
        kept_columns = ['document', 'summary']
    else:
        kept_columns = ['id', 'document', 'summary']
    return dataset.select_columns(kept_columns)
    
    
def train_val_test_split(dataset, holdout_size=0.3, seed=None):
    """Split a dataset into train / validation / test partitions.

    The dataset is first split into train and hold-out parts; the hold-out
    part is then halved into validation and test.

    Args:
        dataset: Object exposing ``train_test_split`` (e.g. a ``datasets.Dataset``).
        holdout_size: Share of the data reserved for validation + test,
            forwarded as ``test_size`` to the first split. Defaults to 0.3,
            the value that used to be hard-coded.
        seed: Optional seed forwarded to both splits so the partitioning can
            be reproduced across runs. ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        A ``DatasetDict`` with 'train', 'validation' and 'test' entries.
    """
    first_split = dataset.train_test_split(test_size=holdout_size, seed=seed)
    # Halve the hold-out portion: one half for validation, one half for test.
    holdout_split = first_split['test'].train_test_split(test_size=0.5, seed=seed)
    return DatasetDict({
        'train': first_split['train'],
        'validation': holdout_split['train'],
        'test': holdout_split['test']
    })


def compute_metrics(eval_pred):
    """Compute ROUGE, mean summary length and BERTScore for one evaluation pass.

    Relies on the module-level ``tokenizer``, ``rouge`` and ``bert_score``
    objects.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        Dict of metric name -> value rounded to 4 decimals: the ROUGE scores,
        "summary_length" (mean count of non-pad tokens per prediction) and the
        BERTScore "precision", "recall" and "f1" averaged over all examples.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple whose first item holds the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored/padded positions and cannot be decoded, so swap it for
    # the pad token in both arrays before decoding.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_res = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    rouge_res["summary_length"] = np.mean(prediction_lens)
    results = {k: round(v, 4) for k, v in rouge_res.items()}

    # BERTScore needs a language (or explicit model type) and returns one score
    # per example plus a non-numeric "hashcode"; average the numeric lists
    # instead of rounding the raw values.
    bert_res = bert_score.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    for key in ("precision", "recall", "f1"):
        results[key] = round(float(np.mean(bert_res[key])), 4)

    return results

In [12]:
def load_pickle(file_path, file_name):
    """Read and return the object pickled at ``file_path/file_name``.

    NOTE: unpickling can execute arbitrary code; only use on trusted files.
    """
    pickle_path = os.path.join(file_path, file_name)
    with open(pickle_path, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload
    
    
def load_synthetic_dataset(synthetic_data_path, file_name):
    """Load a pickled pandas object and wrap it in a ``Dataset``.

    Args:
        synthetic_data_path: Directory containing the pickle file.
        file_name: Name of the pickle file inside that directory.

    Returns:
        The result of ``Dataset.from_pandas`` on the unpickled frame.
    """
    pickle_path = os.path.join(synthetic_data_path, file_name)
    return Dataset.from_pandas(pd.read_pickle(pickle_path))


def save_files_to_pkl(path_dict, parent_dir):
    """Pickle every value of ``path_dict`` to a file below ``parent_dir``.

    Args:
        path_dict: Mapping of relative file path -> object to pickle.
        parent_dir: Directory that the relative paths are resolved against.

    Missing intermediate directories are created, and each file is closed as
    soon as it has been written.
    """
    # Iterate over (path, value) pairs; iterating over the keys alone cannot be
    # unpacked into two names.
    for path, values in path_dict.items():
        target_path = os.path.join(parent_dir, path)
        target_dir = os.path.dirname(target_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with open(target_path, "wb") as out_file:
            pickle.dump(values, out_file)
    print("Files saved 💃🕺")

In [13]:
def perform_summarization(sample, summarization_pipeline=None, max_length=30):
    """Summarize the documents of one batch.

    Args:
        sample: Mapping with a 'document' entry holding the texts to summarize.
        summarization_pipeline: Callable that performs the summarization. When
            omitted, a module-level ``summarizer`` is used.
        max_length: Maximum summary length forwarded to the pipeline.

    Returns:
        Whatever the pipeline returns for the given documents.
    """
    active_pipeline = summarizer if summarization_pipeline is None else summarization_pipeline
    # truncation=True keeps over-long documents from exceeding the model's input limit.
    return active_pipeline(sample['document'], max_length=max_length, truncation=True)


def _summary_text(pipeline_output):
    """Extract the generated text from one pipeline result (a dict, or a list wrapping one)."""
    if isinstance(pipeline_output, list):
        pipeline_output = pipeline_output[0]
    return pipeline_output['summary_text']


def generate_new_synthetic_dataset(generation_path, source_dataset=None, summarization_pipeline=None):
    """Build a synthetic (document, summary) table using a saved checkpoint.

    Args:
        generation_path: Checkpoint directory used to create the summarization
            pipeline when ``summarization_pipeline`` is not supplied.
        source_dataset: Dataset providing the 'document' column (and optionally
            'id'). Defaults to the module-level ``dataset``.
        summarization_pipeline: Optional ready-made pipeline; mainly useful to
            avoid reloading a model or to inject a stub.

    Returns:
        A pandas DataFrame with 'document' and 'summary' columns, plus 'id'
        when the source dataset has one.
    """
    if source_dataset is None:
        source_dataset = dataset
    if summarization_pipeline is None:
        summarization_pipeline = pipeline("summarization", model=generation_path)

    documents = list(source_dataset['document'])
    if documents:
        outputs = perform_summarization(
            {'document': documents},
            summarization_pipeline=summarization_pipeline,
        )
    else:
        # Nothing to summarize; skip the pipeline call entirely.
        outputs = []

    synthetic_df = pd.DataFrame({
        'document': documents,
        'summary': [_summary_text(output) for output in outputs]
        })
    if 'id' in source_dataset.column_names:
        synthetic_df['id'] = list(source_dataset['id'])
    return synthetic_df

In [16]:
# parser = argparse.ArgumentParser(description='running parameter experiments')
# parser.add_argument('--output_dir', type=str, default='test')
# parser.add_argument('--base_model', type=str, default='t5')
# parser.add_argument('--dataset', type=str, default='news')
# parser.add_argument('--num_generations', type=int, default=3)

# Hard-coded run configuration standing in for the command-line parser above.
parser = dict(
    output_dir='test',
    base_model='t5',
    dataset='dialogue',
    num_generations=3,
)

In [18]:
# What each generation of the loop below does:
#     1. fine-tune the model (NOTE(review): trainer.train() is currently disabled)
#     2. compute evaluation metrics on the held-out test split
#     3. save the new checkpoint
#     4. generate a new synthetic dataset with that checkpoint
#     5. run the measurement analysis on the synthetic dataset and persist the results
init()

# The notebook uses the plain `parser` dict in place of parsed CLI arguments.
args = parser

base_models = {
    't5': 't5-small',
    'bart': 'facebook/bart-base',
    'gpt': 'gpt2'
}
ModelConstructor = {
    't5':T5ForConditionalGeneration,
    # The bare BartModel has no language-modelling head and does not accept
    # `labels`; the auto class resolves to BART's conditional-generation model.
    'bart': AutoModelForSeq2SeqLM,
    # NOTE(review): GPT2Model is a headless, decoder-only model and is unlikely
    # to work with Seq2SeqTrainer as-is -- confirm before running with 'gpt'.
    'gpt': GPT2Model
}
# Keys must match those of `base_models`; the learning rate is looked up by model key.
learning_rates = {
    't5': 1e-4,
    'bart': 2e-5,
    'gpt': 2e-5
}
dataset_configs = {
    'news': ["cnn_dailymail", "2.0.0"],
    'reddit': ["reddit_tifu", 'long'],
    'dialogue': ['samsum']
}

assert(args['base_model'] in base_models.keys()), "Invalid 'base_model' supplied"
assert(args['dataset'] in dataset_configs.keys()), "Invalid 'dataset' suplied"

base_path = make_output_base_path_str()
print("base_path =", base_path)

dataset_key, dataset_config = args['dataset'], dataset_configs[args['dataset']]
base_model = args['base_model']
base_model_checkpoint = base_models[base_model]

tokenizer = AutoTokenizer.from_pretrained(base_model_checkpoint)

# The metric objects do not change between generations, so load them once.
# compute_metrics reads both names from module scope.
rouge = evaluate.load("rouge")
bert_score = evaluate.load("bertscore")

for gen_num in tqdm(range(args['num_generations'])):
    print(f"Generation {gen_num}")
    generation_path = os.path.join(base_path, f"generation{gen_num}")
    assert(not os.path.exists(generation_path))
    os.makedirs(generation_path)
    os.makedirs(os.path.join(generation_path, 'synthetic_data'))

    if gen_num == 0:
        # First generation: start from the pretrained checkpoint and the real data.
        model_checkpoint = base_model_checkpoint
        dataset = load_huggingface_dataset(dataset_config)
    else:
        # Later generations: continue from the previous checkpoint and train on
        # the synthetic data that generation produced.
        model_checkpoint = os.path.join(base_path, f"generation{gen_num-1}")
        dataset = load_synthetic_dataset(os.path.join(model_checkpoint, "synthetic_data"), "synthetic_data.pkl")

    data_dict = train_val_test_split(dataset)

    tokenized_data = data_dict.map(lambda data_x: preprocess_text_function(data_x, tokenizer), batched=True)
    # Tokenization does not truncate, so drop examples over the configured limits.
    tokenized_data = tokenized_data.filter(lambda example: len(example['input_ids']) <= globals.max_src_length[args['dataset']])
    tokenized_data = tokenized_data.filter(lambda example: len(example['labels']) <= globals.max_target_length[args['dataset']])

    # NOTE(review): these compare the split objects themselves, not their rows;
    # confirm this is the intended leakage check.
    assert(tokenized_data['train'] != tokenized_data['test'])
    assert(tokenized_data['train'] != tokenized_data['validation'])
    assert(tokenized_data['validation'] != tokenized_data['test'])

    model = ModelConstructor[base_model].from_pretrained(model_checkpoint)

    # The collator is given the model object (not the checkpoint path) so it can
    # use the model's helper for building decoder inputs from the labels.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    training_args = Seq2SeqTrainingArguments(
        output_dir=generation_path,
        evaluation_strategy="epoch",
        learning_rate=learning_rates[base_model],
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=1,
        predict_with_generate=True,
        push_to_hub=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # NOTE(review): training is disabled, so the saved checkpoint equals the one
    # that was loaded. Re-enable the next line for real fine-tuning runs.
    # trainer.train()
    trainer.save_model()

    test_data_eval_results = trainer.evaluate(tokenized_data["test"])
    print(test_data_eval_results)

    # Generate new synthetic dataset
    new_dataset = generate_new_synthetic_dataset(generation_path)
    dataset_specs = {
        'generation':gen_num, 
        'subject':args['dataset']
    }
    synthetic_dataset_measurements = Measurement(new_dataset, dataset_specs, DEBUG=True)
    synthetic_dataset_results = synthetic_dataset_measurements.measure()

    files_to_save = {
        'test_data_eval_results.pkl': test_data_eval_results,
        'synthetic_data/synthetic_data.pkl': new_dataset,
        'synthetic_data/config.pkl': synthetic_dataset_results['config'],
        'synthetic_data/measurements.pkl': synthetic_dataset_results['metrics']
    }
    # Persist this generation's artifacts; the next generation loads
    # synthetic_data/synthetic_data.pkl from this directory.
    for relative_path, payload in files_to_save.items():
        with open(os.path.join(generation_path, relative_path), "wb") as out_file:
            pickle.dump(payload, out_file)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/madisonthantu/.cache/huggingface/token
Login successful
Output directory path: test/t5/dialogue
base_path = test/t5/dialogue


  0%|          | 0/3 [00:00<?, ?it/s]

Generation 0
['id', 'dialogue', 'summary']
dialogue document
['id', 'document', 'summary']

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'summary'],
        num_rows: 11458
    })
    test: Dataset({
        features: ['id', 'document', 'summary'],
        num_rows: 4911
    })
})

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'summary'],
        num_rows: 2455
    })
    test: Dataset({
        features: ['id', 'document', 'summary'],
        num_rows: 2456
    })
})


Map:   0%|          | 0/11458 [00:00<?, ? examples/s]

Map:   0%|          | 0/2455 [00:00<?, ? examples/s]

Map:   0%|          | 0/2456 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11458 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2455 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2456 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11338 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2418 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2425 [00:00<?, ? examples/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.66k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]