## Library Imports

In [None]:
from datasets import load_dataset
import random
import time
import vertexai
from vertexai.preview.tuning import sft
import json
import utils
import mercury as mr
import openai
import os

## Loading Dataset From HuggingFace

In [None]:
# Load the "knkarthick/dialogsum" dataset with the `datasets` library's
# load_dataset and print the returned object to inspect what was loaded.
dataset = load_dataset("knkarthick/dialogsum")
print(dataset)

In [None]:
# Materialize the "train" and "validation" splits as plain Python lists, in
# that order, so they can be iterated record by record.
train, validation = (
    dataset[split_name].to_list() for split_name in ("train", "validation")
)

## Gemini 1.5 Tuning

In [None]:
# Prepare data for Gemini 1.5 tuning.
# Zero-shot summarization instruction. It is only defined in this cell; the
# tuning examples built below carry the dialogue and summary text alone.
base_prompt_zero_shot="Summarize the dialogue:"


def _gemini_example(record):
  """Wrap one dialogue/summary record as a two-turn Gemini tuning example."""
  user_turn = {"role": "user", "parts": [{"text": record["dialogue"]}]}
  model_turn = {"role": "model", "parts": [{"text": record["summary"]}]}
  return {'contents': [user_turn, model_turn]}


# One example per record, kept in the same order as the source lists.
train_messages = [_gemini_example(record) for record in train]
validation_messages = [_gemini_example(record) for record in validation]

# Final expression of the cell: both list sizes plus one sample training example.
len(train_messages), len(validation_messages), train_messages[2]

In [None]:
# Save to JSON locally
# Hands the Gemini tuning examples to the project helper utils.dicts_to_jsonl.
# The upload step later in this file reads "dialogsum_train.jsonl" and
# "dialogsum_valid.jsonl", so the helper presumably appends ".jsonl" to the
# name passed here -- TODO confirm in utils.
# NOTE(review): the meaning of the third positional argument (False) is not
# visible from this file; check utils.dicts_to_jsonl.
utils.dicts_to_jsonl(train_messages, "dialogsum_train", False)
utils.dicts_to_jsonl(validation_messages, "dialogsum_valid", False)

In [None]:
# Delete & Overwrite files to upload to GCS
# Order matters: the old objects are deleted first, then the local JSONL
# files are uploaded under the same object names.
# Presumably the arguments are (bucket, blob name) for delete_blob and
# (bucket, local source file, destination blob name) for upload_blob --
# TODO confirm against utils.
# NOTE(review): how delete_blob behaves when the object does not exist yet
# (e.g. on a first run) is not visible here -- confirm in utils.

utils.delete_blob("mchrestkha-sample-data","dialogsum/dialogsum_train.jsonl")
utils.delete_blob("mchrestkha-sample-data","dialogsum/dialogsum_valid.jsonl")
utils.upload_blob("mchrestkha-sample-data","dialogsum_train.jsonl","dialogsum/dialogsum_train.jsonl")
utils.upload_blob("mchrestkha-sample-data","dialogsum_valid.jsonl","dialogsum/dialogsum_valid.jsonl")

In [None]:
# Submit the Gemini supervised tuning job on Vertex AI.

# Timestamped display name (e.g. "dialogsum20240821-153000") so that repeated
# runs of this cell produce tuned models that can be told apart.
timestr = time.strftime("%Y%m%d-%H%M%S")
model_name="dialogsum"+timestr
print(model_name)

vertexai.init(project="mchrestkha-sandbox", location="us-central1")

sft_tuning_job = sft.train(
    source_model="gemini-1.5-flash-001",
    train_dataset="gs://mchrestkha-sample-data/dialogsum/dialogsum_train.jsonl",
    # The following parameters are optional
    validation_dataset="gs://mchrestkha-sample-data/dialogsum/dialogsum_valid.jsonl",
    epochs=5,
    adapter_size=4,
    learning_rate_multiplier=1.0,
    # Use the timestamped name computed above; it was previously printed but
    # never passed, so every run was registered under the same "dialogsum" name.
    tuned_model_display_name=model_name,
)

## OpenAI Tuning

In [None]:
# Prepare data for OpenAI tuning.
# Instruction placed in the system message of every example.
system_prompt="Summarize the dialogue:"


def _openai_example(record):
  """Wrap one dialogue/summary record as a system/user/assistant chat example."""
  system_turn = {"role": "system", "content": system_prompt}
  user_turn = {"role": "user", "content": record["dialogue"]}
  assistant_turn = {"role": "assistant", "content": record["summary"]}
  return {'messages': [system_turn, user_turn, assistant_turn]}


# One example per record, kept in the same order as the source lists.
train_messages = [_openai_example(record) for record in train]
validation_messages = [_openai_example(record) for record in validation]

# Final expression of the cell: both list sizes plus one sample training example.
len(train_messages), len(validation_messages), train_messages[2]

In [None]:
# Save to JSON locally
# Hands the OpenAI chat-format examples to the project helper
# utils.dicts_to_jsonl. The upload cell later in this file opens
# "openai_dialogsum_train.jsonl" and "openai_dialogsum_valid.jsonl", so the
# helper presumably appends ".jsonl" to the name passed here -- TODO confirm.
# NOTE(review): the meaning of the third positional argument (False) is not
# visible from this file; check utils.dicts_to_jsonl.
utils.dicts_to_jsonl(train_messages, "openai_dialogsum_train", False)
utils.dicts_to_jsonl(validation_messages, "openai_dialogsum_valid", False)

In [18]:
# Create the OpenAI API client used by the upload and tuning cells below.
# No arguments are passed, so the client relies on its own defaults.
# NOTE(review): presumably credentials come from the environment
# (OPENAI_API_KEY) -- confirm against the openai package documentation.
from openai import OpenAI
client = OpenAI()

In [21]:
# Register & upload the tuning files to OpenAI storage.
# Each file is opened in a `with` block so its handle is closed as soon as the
# upload call returns, and both responses are kept in variables so their ids
# stay available instead of being lost with the cell output.
with open("openai_dialogsum_train.jsonl", "rb") as train_fh:
  openai_train_file = client.files.create(
    file=train_fh,
    purpose="fine-tune"
  )

with open("openai_dialogsum_valid.jsonl", "rb") as valid_fh:
  openai_valid_file = client.files.create(
    file=valid_fh,
    purpose="fine-tune"
  )

# Keep the validation upload as the cell's final expression so the notebook
# still displays it, as before.
openai_valid_file

FileObject(id='file-6dP710zFkj7deIIVBlgcAtSa', bytes=504860, created_at=1724279641, filename='openai_dialogsum_valid.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [24]:
# Submit the OpenAI fine-tuning job.
# The request names the training and validation files by their file ids and
# the base model to tune.
tuning_request = {
  "training_file": "file-hn28HMGpYecHguGp3JiVKnrz",
  "validation_file": "file-6dP710zFkj7deIIVBlgcAtSa",
  "model": "gpt-4o-mini-2024-07-18",
}

# Final expression of the cell, so the notebook displays the returned job.
client.fine_tuning.jobs.create(**tuning_request)

FineTuningJob(id='ftjob-EuC6PXka7HAyD9gp5z2EHT6t', created_at=1724279844, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-rt7XBH1XjAWqIfKVhHpllhPn', result_files=[], seed=1406416728, status='validating_files', trained_tokens=None, training_file='file-hn28HMGpYecHguGp3JiVKnrz', validation_file='file-6dP710zFkj7deIIVBlgcAtSa', estimated_finish=None, integrations=[], user_provided_suffix=None)