# Template for Fine-Tuning Classification with OpenAI Models

### Import all Modules

In [None]:
import os
import glob
import pprint
import time
import argparse
from ast import literal_eval

from sklearn.metrics import accuracy_score
from tqdm import tqdm

import wandb
import openai
import pandas as pd

from utils import (
    dataset_has_format_errors,
    write_jsonl,
)
from utils_src import task_num_to_task_name, dataset_num_to_dataset_name, plot_count_and_normalized_confusion_matrix, \
    task_to_display_labels, load_dataset_task_prompt_mappings

# Directory that contains this file. `__file__` is not defined when the code
# runs inside a notebook kernel, so fall back to the current working directory.
try:
    module_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    module_dir = os.getcwd()

# Read the API key from the first line of the key file. The line terminator is
# stripped so that it does not become part of the key sent to the API.
with open('src/OpenAI_key.txt') as f:
    openai.api_key = f.readline().strip()


### Setup Arguments and Data

In the following code block, you are required to set up several key parameters that will define the behavior and environment of your fine-tuning process:

1. **WandB Project Name (`WANDB_PROJECT_NAME`)**: This is the name of the project in Weights & Biases (WandB) where your training run will be logged. WandB is a tool that helps track experiments, visualize data, and share insights. By setting the project name here, you ensure that all the metrics, outputs, and logs from your training process are organized under a single project for easy access and comparison. Specify a meaningful name that reflects the nature of your training session or experiment.

2. **Model Name (`MODEL_NAME`)**: Here, you select the specific model from OpenAI's suite that you wish to fine-tune. The model name, such as `'gpt-3.5-turbo-0613'`, refers to a particular configuration and version of the model. This selection dictates the starting point of your fine-tuning process, leveraging the pre-trained weights and architecture of the specified model. Ensure that the model name corresponds to an existing and available model in OpenAI's library. You can find more models here:  https://platform.openai.com/docs/models 

3. **Completion Retries (`COMPLETION_RETRIES`)**: This parameter defines the number of retry attempts for generating a completion in case the initial attempts fail. When interacting with the model, especially in a fine-tuning context, certain queries may not succeed on the first try due to various reasons (e.g., network issues, API errors). This setting provides resilience, allowing the process to attempt the generation multiple times before considering it a failure.


In [None]:
# WandB project and OpenAI model settings

# WandB project under which the run is logged (passed to wandb.init)
WANDB_PROJECT_NAME = "chatGPT_template_1"
# Model identifier stored in the WandB run config and used in the default run name
MODEL_NAME = 'gpt-3.5-turbo-0613'
# Bound used by the retry loop around the prediction requests
COMPLETION_RETRIES = 10

In the next code block, you are required to set up various configuration variables that will dictate how the inference processes are executed. These variables are crucial as they define the nature of the task, the data, and the specific behaviors during the model's training and evaluation.

1. **Task (`task`)**: Specify the type of task you want to run inference on. The task is represented by an integer, with each number corresponding to a different type of task (e.g., 1, 2, 3, etc.). You must select from the predefined choices, which are typically mapped to specific NLP tasks or scenarios.

2. **Dataset (`dataset`)**: Choose the dataset on which you want to run inference. Like tasks, datasets are identified by integers, and each number corresponds to a different dataset. Ensure that the dataset selected is relevant to the task at hand.

3. **Output Directory (`output_dir`)**: Define the path to the directory where you want to store the generated samples. This is where the output of your training and inference processes will be saved.

4. **Random Seed (`seed`)**: Setting a random seed ensures that the results are reproducible. By using the same seed, you can achieve the same outcomes on repeated runs under identical conditions.

5. **Data Directory (`data_dir`)**: Specify the path to the directory containing the datasets you plan to use for training and evaluation.

6. **Label Usage (`not_use_full_labels`)**: This boolean variable determines whether to use the full label descriptions or abbreviated labels during training and inference. Setting it to `False` means full labels will be used.

7. **Dataset-Task Mappings File Path (`dataset_task_mappings_fp`)**: Define the path to the file containing mappings between datasets and tasks. This file is crucial for ensuring the correct dataset is used for the specified task.

8. **Rewrite DataFrame (`rewrite_df_in_openai`)**: A boolean that controls whether datasets are uploaded to OpenAI again. If set to `True`, every dataset is uploaded, even if an earlier upload of it is already recorded in the local metadata file. If set to `False`, datasets that are already recorded there are skipped.

9. **Number of Epochs (`n_epochs`)**: Specify the number of epochs for training the model. An epoch refers to one complete pass through the entire training dataset.

10. **Run Name (`run_name`)**: Give a unique name to your run, which will help you identify it later, especially when tracking multiple experiments or runs.

11. **Temperature (`temp`)**: Set the temperature for text generation. Temperature controls the randomness of the output; a lower temperature results in less random completions.

12. **Few-shot (`few_shot`)**: A boolean indicating whether to use a few-shot learning approach. When set to `True`, the model will be fine-tuned with only a few examples.

13. **System-User Prompt Division (`system_user_division`)**: This integer defines the separation between system and user prompts, which is critical for structuring the input data correctly for the model.

**Customizing for Your Own Tasks:**
If you plan to run a custom task or use a dataset that is not predefined, you will need to make modifications to the `utils_src` file. This file contains all mappings for different datasets and tasks. Adding your custom task or dataset involves defining the new task or dataset number and specifying its characteristics and mappings in the `utils_src` file. This ensures that your custom task or dataset integrates seamlessly with the existing framework for training and inference.


In [None]:
# Configuration Variables

# Type of task to run inference on
task = 1  # Choices: [1,2,3,4,5,6]

# Dataset to run inference on
dataset = 1  # Choices: [1, 2, 3, 4]

# Size of the training sample to load; it selects the file
# ds_<dataset>__task_<task>_train_set_<sample_size>.csv ('all' loads every training file)
sample_size = '250'  # Choices: ['50','100','250','500','1000','1500']

# Path to the directory to store the generated samples
output_dir = '../../data'

# Random seed to use
seed = 2019

# Path to the directory containing the datasets
data_dir = '../../data'

# Whether to use the full label
not_use_full_labels = False

# Path to the dataset-task mappings file
dataset_task_mappings_fp = os.path.normpath(os.path.join(module_dir, '..', 'dataset_task_mappings.csv'))

# Whether to rewrite the dataframe in OpenAI format
rewrite_df_in_openai = True

# Number of epochs to train the model
n_epochs = 3

# Name of the run
run_name = 'finetune_chatGPT_3.5_template'

# Temperature to use when generating text
temp = 0.0

# Number of trailing prompt lines that form the user prompt; the lines before
# them form the system prompt (3 lines for every task except task 3, which uses 15)
system_user_division = 3 if task != 3 else 15

### Define Utility Functions

In [None]:
def load_train_and_eval_sets(data_dir: str, dataset_num: int, task_num: int, sample_size: "int | str",
                             dataset_eval: "str | None" = None) -> dict[str, pd.DataFrame]:
    """Load the evaluation set(s) and the requested training set(s) of a dataset/task pair.

    Files are read from ``data_dir`` and are expected to be named
    ``ds_<dataset_num>__task_<task_num>_eval_set.csv`` and
    ``ds_<dataset_num>__task_<task_num>_train_set_<sample_size>.csv``.

    Args:
        data_dir: Directory that contains the CSV files.
        dataset_num: Dataset number used in the file names.
        task_num: Task number used in the file names.
        sample_size: Size suffix of the training file to load, or ``'all'`` to
            load every training file found for the dataset/task pair.
        dataset_eval: Optional dataset identifier of an additional evaluation
            set (``ds_<dataset_eval>__task_<task_num>_full_eval.csv``). Nothing
            extra is loaded when it is ``None``.

    Returns:
        A dict that maps each set name (file name without directory and
        without the ``.csv`` extension) to its DataFrame. The evaluation
        set(s) come first, the training set(s) last.

    Raises:
        ValueError: If no training file exists for the requested sample size.
    """
    datasets = dict()

    eval_set_name = f'ds_{dataset_num}__task_{task_num}_eval_set'
    datasets[eval_set_name] = pd.read_csv(os.path.join(data_dir, eval_set_name + '.csv'))

    # Load the additional evaluation dataset specified by dataset_eval
    if dataset_eval is not None:
        second_eval_set_name = f'ds_{dataset_eval}__task_{task_num}_full_eval'
        datasets[second_eval_set_name] = pd.read_csv(os.path.join(data_dir, second_eval_set_name + '.csv'))

    # Map every available training set name to its path. os.path.splitext removes
    # only the extension; str.strip('.csv') would also eat leading/trailing
    # '.', 'c', 's' and 'v' characters of the name itself.
    train_dataset_task_files = sorted(
        glob.glob(os.path.join(data_dir, f'ds_{dataset_num}__task_{task_num}_train_set*.csv')))
    train_files_by_name = {os.path.splitext(os.path.basename(fn))[0]: fn for fn in train_dataset_task_files}

    if sample_size == 'all':
        for train_name, train_fp in train_files_by_name.items():
            datasets[train_name] = pd.read_csv(train_fp)
    else:
        train_df_fn = f'ds_{dataset_num}__task_{task_num}_train_set_{sample_size}'
        # Validate before reading so that a missing sample size is reported
        # with this message instead of a generic file-not-found error.
        if train_df_fn not in train_files_by_name:
            raise ValueError(f"Sample size {sample_size} not found for"
                             f" dataset {dataset_num} and task {task_num}")
        datasets[train_df_fn] = pd.read_csv(train_files_by_name[train_df_fn])

    return datasets

In [None]:
def create_training_example(system_prompt, user_prompt_format, user_prompt_text, completion):
    """Build one chat-format example with a system, a user and an assistant message.

    The user message is ``user_prompt_format`` with its ``{text}`` placeholder
    filled with ``user_prompt_text``; the assistant message is ``completion``.
    Returns a dict with the single key ``'messages'``.
    """
    user_content = user_prompt_format.format(text=user_prompt_text)
    role_content_pairs = (
        ('system', system_prompt),
        ('user', user_content),
        ('assistant', completion),
    )
    return {'messages': [{'role': role, 'content': content} for role, content in role_content_pairs]}

In [None]:
def upload_datasets_to_openai(output_dir, not_use_full_labels, rewrite_df_in_openai, datasets):
    """Write each dataset to a jsonl file, upload it to OpenAI and record its file id.

    Args:
        output_dir: Base directory; the jsonl files are written to ``<output_dir>/temp``.
        not_use_full_labels: When true, the recorded dataset name gets the
            suffix ``_single_letter_labels``.
        rewrite_df_in_openai: When false, datasets whose name is already
            recorded in ``hatespeech_open_ai_metadata.csv`` are skipped.
        datasets: Dict mapping dataset name to a DataFrame that has an
            ``openai_instance_format`` column.

    Returns:
        DataFrame with the columns ``df_name`` and ``file_id``, one row per
        dataset name (the most recent upload wins). The same table is written
        to ``hatespeech_open_ai_metadata.csv`` in the working directory.

    Raises:
        RuntimeError: If OpenAI reports that processing an uploaded file failed.
    """
    metadata_fp = 'hatespeech_open_ai_metadata.csv'
    try:
        df_id_metadata = pd.read_csv(metadata_fp) if os.path.exists(metadata_fp) else pd.DataFrame()
    except pd.errors.EmptyDataError:
        # A metadata file written from an empty table has no columns to parse
        df_id_metadata = pd.DataFrame()
    already_uploaded = set(df_id_metadata['df_name']) if 'df_name' in df_id_metadata.columns else set()

    hatespeech_open_ai_metadata = list()
    temp_dir = os.path.join(output_dir, 'temp')

    for df_name, df in datasets.items():
        df_jsonl_filename = os.path.join(temp_dir, df_name + '.jsonl')
        upload_name = df_name + '_single_letter_labels' if not_use_full_labels else df_name

        if not rewrite_df_in_openai and upload_name in already_uploaded:
            print(f"Dataset {upload_name} already uploaded to OpenAI")
            continue

        # The jsonl file is only needed for datasets that are actually uploaded
        os.makedirs(temp_dir, exist_ok=True)
        write_jsonl(data_list=df['openai_instance_format'].tolist(), filename=df_jsonl_filename)

        print(f"Uploading {upload_name} to OpenAI")
        # Context manager so the file handle is closed once the upload request returns
        with open(df_jsonl_filename, "rb") as jsonl_file:
            df_response = openai.File.create(file=jsonl_file, purpose="fine-tune")
        df_file_id = df_response["id"]

        # Wait until the file is processed; stop instead of polling forever
        # when processing failed
        while True:
            file_status = openai.File.retrieve(df_file_id)["status"]
            if file_status == "processed":
                break
            if file_status == "error":
                raise RuntimeError(f"OpenAI could not process the uploaded file for {upload_name}")
            time.sleep(15)
        hatespeech_open_ai_metadata.append({'df_name': upload_name, 'file_id': df_file_id})

    if hatespeech_open_ai_metadata:
        df_id_metadata = pd.concat(
            [df_id_metadata, pd.DataFrame(hatespeech_open_ai_metadata)], ignore_index=True)
    if 'df_name' in df_id_metadata.columns:
        # Keep only the newest file id per dataset so that lookups by name
        # never return the id of a superseded upload
        df_id_metadata = (df_id_metadata.drop_duplicates(subset='df_name', keep='last')
                          .reset_index(drop=True))
    df_id_metadata.to_csv(metadata_fp, index=False)

    return df_id_metadata


In [None]:
def fine_tune_chat_gpt(evaluation_file_id, training_file_id, model_name, n_epochs, base_model="gpt-3.5-turbo"):
    """Start a fine-tuning job and block until it has finished.

    Args:
        evaluation_file_id: OpenAI file id used as the validation file.
        training_file_id: OpenAI file id used as the training file.
        model_name: Suffix given to the fine-tuned model (not the base model).
        n_epochs: Number of training epochs.
        base_model: Name of the model to fine-tune.

    Returns:
        The id of the fine-tuning job once its status is ``succeeded``.

    Raises:
        Exception: If the job ends with status ``failed`` or ``cancelled``.
    """
    response = openai.FineTuningJob.create(
        training_file=training_file_id,
        validation_file=evaluation_file_id,
        model=base_model,
        suffix=model_name,
        hyperparameters={"n_epochs": n_epochs}
    )

    job_id = response["id"]
    print("Job ID:", response["id"])
    print("Status:", response["status"])

    # Poll until the job reaches a terminal state. A cancelled job never
    # becomes "succeeded" or "failed", so it has to end the loop as well.
    while True:
        job = openai.FineTuningJob.retrieve(job_id)
        status = job["status"]
        if status == "succeeded":
            break
        if status == "failed":
            raise Exception("Training failed: %s" % job["error"])
        if status == "cancelled":
            raise Exception("Training cancelled: job %s" % job_id)
        time.sleep(30)

    return job_id

In [None]:
def print_and_log_finetuning_event_history(job_id):
    """Print the event messages of a fine-tuning job and log its metrics to WandB.

    The events are handled in the reverse of the order in which the API
    returns them. Every event message is printed; for events of type
    ``metrics`` the loss and token-accuracy values are logged with
    ``wandb.log`` at the step stored in the event.

    Args:
        job_id: Id of the fine-tuning job.
    """
    # NOTE(review): list_events is called without paging arguments; if the API
    # caps the number of events per request, older events are not included - confirm.
    response = openai.FineTuningJob.list_events(id=job_id)
    events = list(reversed(response["data"]))
    for event in events:
        print(event["message"])

    # Log events
    metric_names = (
        "train_loss",
        "valid_loss",
        "train_mean_token_accuracy",
        "valid_mean_token_accuracy",
    )
    for event in events:
        if event['type'] != 'metrics':
            continue
        data = event['data']
        # Log only the metrics present in this event, so an event without
        # validation values does not abort the logging with a KeyError
        logged_metrics = {name: data[name] for name in metric_names if name in data}
        if logged_metrics:
            wandb.log(logged_metrics, step=data['step'])


## Main Implementation

In [None]:
# Start the Weights and Biases run that collects the logs of this session

# Name of the run: the configured run name, or a name derived from the settings
if run_name != '':
    wandb_run_name = run_name
else:
    wandb_run_name = (f'{MODEL_NAME}_ds_{dataset}_task_{int(task)}'
                      f'_sample_{sample_size}_epochs_{n_epochs}'
                      f'_full_label_names_{str(not not_use_full_labels)}'
                      f'_temp_{temp}')

# Hyperparameters and run metadata tracked with the run
wandb_run_config = dict(
    model=MODEL_NAME,
    dataset=dataset_num_to_dataset_name[int(dataset)],
    task=task_num_to_task_name[int(task)],
    epochs=n_epochs,
    temp=temp,
)

wandb.init(project=WANDB_PROJECT_NAME, name=wandb_run_name, config=wandb_run_config)

### Load and Process Data

In [None]:
# Load the dataset-task mappings for the selected dataset and task.
# `dataset_idx` is used below as the row label of `dataset_task_mappings`
# when reading the label column and the prompts (`.loc[dataset_idx, ...]`).
dataset_idx, dataset_task_mappings = load_dataset_task_prompt_mappings(
    dataset_num=dataset, task_num=task, dataset_task_mappings_fp=dataset_task_mappings_fp)

In [None]:
# Load the train and eval datasets of the selected dataset/task pair from `data_dir`.
# The result is a dict that maps each set name to its DataFrame.
# No `dataset_eval` argument is supplied in this call.
datasets = load_train_and_eval_sets(
    data_dir=data_dir, dataset_num=dataset, task_num=task, sample_size=sample_size)

In [None]:
# Get information specific to the dataset
label_column = dataset_task_mappings.loc[dataset_idx, "label_column"]
system_prompt = dataset_task_mappings.loc[dataset_idx, 'zero_shot_prompt']
user_prompt_format = dataset_task_mappings.loc[dataset_idx, 'user_prompt']

# Number of trailing prompt lines that belong to the user prompt. Use the
# configured `system_user_division` when it is defined; otherwise fall back to
# 3 lines for every task except task 3, which uses 15.
system_user_prompt_division_line = globals().get('system_user_division', 3 if task != 3 else 15)

# Split the full prompt into the system part (leading lines) and the user part
# (trailing lines). With a division of 0 the slices would yield an empty
# system prompt, so in that case the values read from the mappings are kept.
# NOTE(review): the prompt that is split is assumed to be the `zero_shot_prompt`
# column - confirm against the mappings file.
if system_user_prompt_division_line > 0:
    prompt_lines = system_prompt.split('\n')
    system_prompt = ('\n'.join(prompt_lines[:-system_user_prompt_division_line])).strip()
    user_prompt_format = ('\n'.join(prompt_lines[-system_user_prompt_division_line:])).strip()
print(user_prompt_format)


# Log the system prompt and user_prompt_format as files in wandb
prompts_artifact = wandb.Artifact('prompts', type='prompts')
with prompts_artifact.new_file('system_prompt.txt', mode='w', encoding='utf-8') as f:
    f.write(system_prompt)
with prompts_artifact.new_file('user_prompt_format.txt', mode='w', encoding='utf-8') as f:
    f.write(user_prompt_format)
wandb.run.log_artifact(prompts_artifact)

In [None]:
# Folder for the preprocessed datasets; the sub-folder name tells which kind
# of labels (full names or single letters) the stored examples use
if not_use_full_labels:
    labels_subdir = 'single_letter_labels'
else:
    labels_subdir = 'full_name_labels'
preprocessed_output_dir = os.path.join(output_dir, 'preprocessed', labels_subdir)

In [None]:
# Make sure the folder for the preprocessed files exists before writing to it
os.makedirs(preprocessed_output_dir, exist_ok=True)

# Convert every dataset to the OpenAI chat format, validate it and save it.
# Validation and saving are part of the loop so that they cover every
# dataset, not just the last one that was iterated.
for df_name, df in datasets.items():
    # NOTE(review): `map_label_to_completion` is not imported or defined in the
    # visible part of this file - confirm where it comes from.
    df['completion_label'] = df[label_column].map(
        lambda label: map_label_to_completion(label=label, task_num=task,
                                              full_label=not not_use_full_labels)
        )
    df['openai_instance_format'] = df.apply(
        lambda row: create_training_example(
            system_prompt=system_prompt, user_prompt_format=user_prompt_format,
            user_prompt_text=row['text'],
            completion=row['completion_label']
        ),
        axis=1
    )
    # Same messages without the final (assistant) message; used as model input for predictions
    df['openai_instance_without_completion'] = df['openai_instance_format'].map(lambda x: x['messages'][:-1])

    print(f'Check for errors {df_name} set: ')
    # Explicit raise instead of assert, so the check also runs under `python -O`
    if dataset_has_format_errors(df['openai_instance_format'].tolist()):
        raise ValueError(f"Errors found in {df_name}")

    df.to_csv(os.path.join(preprocessed_output_dir, df_name + '.csv'), index=False)

### Upload Training Dataset to OpenAI

In [None]:
# Write every dataset to a jsonl file, upload it to OpenAI and keep the
# resulting table of dataset names and file ids
df_id_metadata = upload_datasets_to_openai(
    output_dir=output_dir,
    not_use_full_labels=not_use_full_labels,
    rewrite_df_in_openai=rewrite_df_in_openai,
    datasets=datasets,
)

In [None]:
# Delete the temp folder together with all files in it. shutil.rmtree is used
# instead of a shell command: it does not depend on `rm` being available and
# the path is never interpreted by a shell. A missing folder is ignored.
import shutil

shutil.rmtree(os.path.join(output_dir, 'temp'), ignore_errors=True)

### Finetune chatGPT Model with Training Set

In [None]:
# Select the evaluation set and the training set used by the cells below
eval_set_name = f'ds_{dataset}__task_{task}_eval_set'
eval_df = datasets[eval_set_name]

# Pick the training set explicitly instead of relying on the variables left
# over from a loop over `datasets`, so an evaluation set can never be used
# for training by accident.
train_set_names = [name for name in datasets if '_train_set' in name]
if not train_set_names:
    raise ValueError(f"No training set loaded for dataset {dataset} and task {task}")
if len(train_set_names) > 1:
    print(f"Several training sets are loaded; the cells below fine-tune on {train_set_names[-1]} only")
df_name = train_set_names[-1]
df = datasets[df_name]

In [None]:
# Announce which training set is used, followed by a separator line
print(f"Finetuning {df_name}", '-' * 50, sep='\n')

In [None]:
# Store the size of the training sample in wandb.config; the size is the part
# of the training set name after the last underscore
sample_size = df_name.rsplit('_', 1)[-1]
wandb.config['trainset_size'] = sample_size

In [None]:
# Look up the OpenAI file ids of the training and evaluation sets.
# The upload step records the datasets under their name plus
# '_single_letter_labels' when single-letter labels are used, so the same
# suffix is needed here. The last match is taken because the metadata table
# can contain older uploads of the same dataset before the newest one.
uploaded_name_suffix = '_single_letter_labels' if not_use_full_labels else ''
training_file_id = df_id_metadata.loc[
    df_id_metadata['df_name'] == df_name + uploaded_name_suffix, 'file_id'].values[-1]
evaluation_file_id = df_id_metadata.loc[
    df_id_metadata['df_name'] == eval_set_name + uploaded_name_suffix, 'file_id'].values[-1]

In [None]:
# Shorten the training set name into the suffix used for the fine-tuned model
model_name = df_name
for _old_part, _new_part in (
        ('__', '_'),
        ('train_set', 'trn'),
        ('task', 't'),
        ('_single_letter_labels', '_sl'),
):
    model_name = model_name.replace(_old_part, _new_part)

# Start the fine-tuning job and wait for it to finish
job_id = fine_tune_chat_gpt(evaluation_file_id, training_file_id, model_name=model_name, n_epochs=n_epochs)

In [None]:
# Retrieve the finished job to read the name of the fine-tuned model and print it
response = openai.FineTuningJob.retrieve(job_id)
full_model_name = response["fine_tuned_model"]
print(f'The model {full_model_name} has been successfully fine-tuned')
# Record the model name, the job id and the ids of the files used in the WandB run config
wandb.config['model_name_openai'] = full_model_name
wandb.config['finetuning_jobid'] = job_id
wandb.config['training_file_openai_id'] = response['training_file']
wandb.config['validation_file_openai_id'] = response['validation_file']

In [None]:
# Print the events (training history of the model) and log the metric events to WandB
print_and_log_finetuning_event_history(job_id)

## Evaluate Model on Validation Set

In [None]:
# Evaluate the model on the evaluation set and store the predictions
print("\n" + "#" * 50)
print("Getting predictions on the evaluation set")
# Collects one model completion per row of eval_df, in row order
predictions = []

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second set of predictions to the first one
predictions = []

for messages in tqdm(eval_df['openai_instance_without_completion'].tolist()):
    # Try the completion at most COMPLETION_RETRIES times (at least once)
    max_attempts = max(1, COMPLETION_RETRIES)
    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = openai.ChatCompletion.create(
                model=full_model_name,
                messages=messages,
                temperature=temp,
                n=1
            )
            break
        except Exception:
            # Any request error is retried; the last failure is re-raised
            if attempt == max_attempts:
                print('Maximum amount of retires reached')
                raise
            print('Error getting predictions. Retrying...')
            time.sleep(5)
    predictions.append(response['choices'][0]['message']['content'])

In [None]:
# Add predictions to df (one prediction per row, in the order the rows were queried)
eval_df['prediction'] = predictions

In [None]:
# Save the evaluation set together with the predictions as a CSV file
predictions_output_dir = os.path.join(output_dir, 'predictions', f'dataset_{dataset}_task_{task}')
os.makedirs(predictions_output_dir, exist_ok=True)

predictions_fp = os.path.join(predictions_output_dir, f"{model_name}-{run_name}.csv")
datasets[eval_set_name].to_csv(predictions_fp, index=False)

In [None]:
# Get performance metrics: expected completions versus the completions returned by the model
y_true = eval_df['completion_label']
y_pred = eval_df['prediction']

In [None]:
# Choose the label names that match the label style used for the completions
if not_use_full_labels:
    label_type = 'short_name'
else:
    label_type = 'full_name'

display_labels = task_to_display_labels[task][label_type]
labels = display_labels

In [None]:
# Compute the confusion-matrix figure, the classification report and the metrics.
# `metrics` is iterated with .items() and `classification_report` is converted
# to a DataFrame when they are logged below.
cm_plot, classification_report, metrics = plot_count_and_normalized_confusion_matrix(
    y_true, y_pred, display_labels, labels, xticks_rotation='horizontal')

In [None]:
# Log metrics (each metric is sent in its own wandb.log call)
for metric_name, metric_value in metrics.items():
    wandb.log({metric_name: metric_value})

In [None]:
# Log the confusion matrix matplotlib figure as an image
wandb.log({'confusion_matrix': wandb.Image(cm_plot)})

In [None]:
# Turn the classification report (without its 'accuracy' entry) into a table
# with one row per report entry
report_without_accuracy = dict(
    entry for entry in classification_report.items() if entry[0] != 'accuracy')
classification_report = pd.DataFrame(report_without_accuracy).T.reset_index()

# Log the table itself ...
report_table = wandb.Table(dataframe=classification_report)
wandb.log({'classification_report': report_table})

# ... and additionally store its text representation as an artifact
classification_report_artifact = wandb.Artifact(
    f'classification_report_{model_name}', type='classification_report')
with classification_report_artifact.new_file('classification_report.txt', mode='w') as f:
    f.write(pprint.pformat(classification_report))
wandb.run.log_artifact(classification_report_artifact)

## Terminate WandB

In [None]:
# Mark the end of the WandB run
wandb.finish()