In [2]:
import os
import csv
import torch
import random
import pandas as pd
import pyarrow as pa
from tqdm import tqdm
from trl import SFTTrainer
from datasets import Dataset
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
from transformers import TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [3]:
# Directory holding one "<synset_id>.csv" metadata file per shape category
# (read by build_shapenetuid_to_wnlemmas).
shapenet_core_dir = "/scratch/noam/ShapeNetCore.v1"
# Working CSV: every stage of this notebook reads it, adds columns, and writes it back.
data_csv_path = "/home/noamatia/repos/PEPSI/data/shapetalk.csv"
# Cache of the manually annotated sample used for fine-tuning.
finetune_csv_path = "/home/noamatia/repos/PEPSI/data/finetune_llama3.csv"
# Input CSV that the first version of the dataset is filtered from.
shapetalk_csv_path = (
    "/scratch/noam/shapetalk/language/shapetalk_preprocessed_public_version_0.csv"
)
# Supported shape categories and the synset id that names each category's
# metadata CSV inside shapenet_core_dir.
shape_category_to_synset_id = {
    "chair": "03001627",
    "table": "04379243",
    "lamp": "03636649",
}
# Per category: part name -> keywords that are substring-matched against an
# utterance to decide which part it talks about. The part names (the inner
# keys) are also the answer options offered in the Llama 3 part prompts.
shape_category_to_part_to_key_words = {
    "chair": {
        "back": ["back", "spindle"],
        "leg": ["leg", "feet", "foot", "stretcher", "wheel"],
        "arm": ["arm", "handle"],
        "seat": ["seat", "base", "apron"],
    },
    "table": {
        "top": ["top", "apron", "surface", "skirt"],
        "leg": ["leg", "feet", "foot", "wheel"],
        "support": ["support", "brace", "stretcher", "crossbar"],
    },
    "lamp": {
        "shade": ["shade", "top"],
        "base": ["base"],
        "tube": [
            "tube",
            "leg",
            "rod",
            "arm",
            "neck",
            "pole",
            "strut",
            "body",
            "column",
        ],
        "bulb": ["bulb", "light"],
    },
}

# Create first version of the dataset

In [11]:
# Keep only rows of a supported shape category that belong to the train or
# test split, then store them as the working CSV with the row index saved
# under the "id" column.
df = pd.read_csv(shapetalk_csv_path)
df = df.loc[
    df["source_object_class"].isin(list(shape_category_to_synset_id))
    & df["changeit_split"].isin(["train", "test"])
]
df.to_csv(data_csv_path, index_label="id")

# Add wnlemmas to samples

In [None]:

# Reload the working CSV and start all four wnlemma columns as empty strings;
# the loop further down fills them row by row.
df = pd.read_csv(data_csv_path, index_col="id").assign(
    source_wnlemmas="",
    target_wnlemmas="",
    source_random_wnlemma="",
    target_random_wnlemma="",
)


def build_shapenetuid_to_wnlemmas(synset_id):
    """Map every model uid listed for one synset to its raw ``wnlemmas`` string.

    Reads ``{shapenet_core_dir}/{synset_id}.csv``. For each row the key is the
    ``fullId`` column with a leading ``"3dw."`` removed (ids without that
    prefix are kept as they are) and the value is the ``wnlemmas`` column,
    unmodified.

    Args:
        synset_id: Name (without extension) of the CSV file to read from
            ``shapenet_core_dir``.

    Returns:
        dict[str, str]: uid -> wnlemmas. If a uid appears in several rows the
        last row wins.

    Raises:
        FileNotFoundError: If the CSV file does not exist.
        KeyError: If the CSV has no ``fullId`` or ``wnlemmas`` column.
    """
    # newline="" hands line splitting to the csv module, as its documentation
    # requires; without it, line breaks inside quoted fields are rewritten
    # before the parser sees them.
    with open(
        f"{shapenet_core_dir}/{synset_id}.csv", mode="r", newline=""
    ) as csv_file:
        return {
            row["fullId"].removeprefix("3dw."): row["wnlemmas"]
            for row in csv.DictReader(csv_file)
        }


# One uid -> wnlemmas lookup table per supported shape category.
shape_category_to_shapenetuid_to_wnlemmas = {
    category: build_shapenetuid_to_wnlemmas(synset_id)
    for category, synset_id in shape_category_to_synset_id.items()
}

for i, row in tqdm(df.iterrows(), total=len(df)):
    # Source is handled before target so random.choice is consumed in the
    # same order for every row.
    for side in ("source", "target"):
        object_class = row[f"{side}_object_class"]
        if row[f"{side}_dataset"] == "ShapeNet":
            # ShapeNet models: look the lemmas up by model name and pick one
            # of the comma-separated entries at random.
            wnlemmas = shape_category_to_shapenetuid_to_wnlemmas[object_class][
                row[f"{side}_model_name"]
            ]
            random_wnlemma = random.choice(wnlemmas.split(","))
        else:
            # Any other dataset: fall back to the plain category name.
            wnlemmas = object_class
            random_wnlemma = object_class
        df.at[i, f"{side}_wnlemmas"] = wnlemmas
        df.at[i, f"{side}_random_wnlemma"] = random_wnlemma

df.to_csv(data_csv_path, index_label="id")

# Add part annotation by keywords

In [5]:
# Reload the working CSV, using its "id" column as the index.
df = pd.read_csv(data_csv_path, index_col="id")

def utterance_and_shape_category_to_part(utterance: str, shape_category: str) -> str:
    """Return the single part of ``shape_category`` that ``utterance`` mentions.

    A part counts as mentioned when at least one of its keywords occurs in the
    utterance as a plain substring. The part name is returned only when
    exactly one part matches; with zero or several matches the result is the
    string ``"none"``.
    """
    matched_parts = []
    for part, keywords in shape_category_to_part_to_key_words[shape_category].items():
        for keyword in keywords:
            if keyword in utterance:
                matched_parts.append(part)
                break  # one hit is enough for this part
    return matched_parts[0] if len(matched_parts) == 1 else "none"
    
def _row_to_part_keywords(row):
    """Keyword-based part for one row, using its utterance and source category."""
    return utterance_and_shape_category_to_part(
        row.utterance, row.source_object_class
    )


df["part_keywords"] = df.apply(_row_to_part_keywords, axis=1)
df.to_csv(data_csv_path, index_label="id")

# Llama3
[source](https://colab.research.google.com/github/A3597466/unsloth-T4/blob/main/Alpaca_%2B_Llama_3_8b_full_example.ipynb#scrollTo=kR3gIAX-SM2q)

Create the manually annotated data used for fine-tuning

In [3]:
# Load the working dataset and the manually annotated fine-tuning sample.
# The sample is annotated interactively once and cached at finetune_csv_path;
# later runs just reload the cached file.
df = pd.read_csv(data_csv_path, index_col="id")
if os.path.exists(finetune_csv_path):
    finetune_df = pd.read_csv(finetune_csv_path, index_col="id")
else:
    finetune_df = df.sample(300)
    # Both annotation columns must exist before the loop fills them. The
    # second one is "part_llama3": that is the column the loop writes and the
    # one the fine-tuning cells read (initialising "part" instead left
    # "part_llama3" missing and the first write raised KeyError).
    finetune_df["utterance_llama3"], finetune_df["part_llama3"] = "", ""
    for i, row in tqdm(finetune_df.iterrows(), total=len(finetune_df)):
        print(row.target_random_wnlemma, row.utterance)
        # .at writes straight into finetune_df; chained indexing
        # (finetune_df[col][i] = ...) may write to a temporary copy instead.
        finetune_df.at[i, "utterance_llama3"] = input("Enter the correct utterance: ")
        finetune_df.at[i, "part_llama3"] = input("Enter the part of the utterance: ")
        print()
    finetune_df.to_csv(finetune_csv_path, index=True, index_label="id")

In [4]:
class Llama3Dataset(Dataset):
    """``Dataset`` subclass that is constructed directly from a pandas DataFrame."""

    def __init__(self, df):
        """Convert ``df`` to a pyarrow table and pass it to the parent constructor."""
        super().__init__(arrow_table=pa.Table.from_pandas(df))


# Prompt template with three positional "{}" slots, filled in this order:
# instruction, input, response. Training examples fill all three; the
# inference cells pass "" for the response so the model generates it.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def formatting_prompts_func(examples, eos_token):
    """Render a batch of examples into complete training prompts.

    Args:
        examples: Mapping with parallel sequences under the keys
            ``"instruction"``, ``"input"`` and ``"output"``.
        eos_token: String appended to the end of every rendered prompt.

    Returns:
        dict with a single key ``"text"`` holding one rendered prompt per
        example, in input order.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + eos_token
            for instruction, context, response in columns
        ],
    }


def build_dataset(df, fn_instruction, fn_input, fn_output, eos_token):
    """Build a prompt dataset from the rows of ``df``.

    Args:
        df: DataFrame whose rows are turned into examples.
        fn_instruction: Callable mapping a row to the instruction text.
        fn_input: Callable mapping a row to the input text.
        fn_output: Callable mapping a row to the expected response text.
        eos_token: String appended to every rendered prompt.

    Returns:
        A ``Llama3Dataset`` with the columns ``instruction``, ``input`` and
        ``output`` plus the ``text`` column added by
        ``formatting_prompts_func``.
    """
    records = [
        [fn_instruction(row), fn_input(row), fn_output(row)]
        for _, row in df.iterrows()
    ]
    frame = pd.DataFrame(records, columns=["instruction", "input", "output"])

    def add_text_column(examples):
        return formatting_prompts_func(examples, eos_token)

    return Llama3Dataset(frame).map(add_text_column, batched=True)


def build_model_and_tokenizer():
    """Load the 4-bit base model and wrap it with ``get_peft_model``.

    Loads ``unsloth/llama-3-8b-bnb-4bit`` with a maximum sequence length of
    2048 and ``load_in_4bit=True``, then applies
    ``FastLanguageModel.get_peft_model`` with the settings listed below.

    Returns:
        tuple: ``(model, tokenizer)``, where ``model`` is the wrapped model.
    """
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/llama-3-8b-bnb-4bit",
        max_seq_length=2048,
        dtype=None,
        load_in_4bit=True,
    )
    peft_settings = dict(
        r=16,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )
    peft_model = FastLanguageModel.get_peft_model(base_model, **peft_settings)
    return peft_model, tokenizer


def build_trainer(model, tokenizer, finetune_dataset):
    """Create the ``SFTTrainer`` used for every fine-tuning run in this notebook.

    Args:
        model: Model to train.
        tokenizer: Tokenizer passed to the trainer.
        finetune_dataset: Training dataset; its ``"text"`` column is used as
            the training text.

    Returns:
        The configured ``SFTTrainer`` (training is not started here).
    """
    # Use bf16 when supported, otherwise fall back to fp16.
    use_bf16 = is_bfloat16_supported()
    training_args = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    )
    return SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=finetune_dataset,
        dataset_text_field="text",
        max_seq_length=2048,
        dataset_num_proc=2,
        packing=False,
        args=training_args,
    )

Fine-tune Llama 3 to deduce which part an utterance describes

In [None]:
def part_fn_instruction(row):
    """Instruction listing the row's category parts, with "none" as the fallback answer."""
    part_options = ", ".join(
        shape_category_to_part_to_key_words[row.source_object_class]
    )
    return f"What part of a {row.source_object_class} does the next utterance describe? Return a single word using only one of the following options: {part_options}. If none of the parts is described in the utterance return none."


def part_fn_input(row):
    """The prompt input is the row's utterance."""
    return row.utterance


# Train on the full annotated sample, including rows labelled "none".
model, tokenizer = build_model_and_tokenizer()
finetune_dataset = build_dataset(
    df=finetune_df,
    fn_instruction=part_fn_instruction,
    fn_input=part_fn_input,
    fn_output=lambda row: row.part_llama3,
    eos_token=tokenizer.eos_token,
)
trainer = build_trainer(model, tokenizer, finetune_dataset)
trainer.train()

In [None]:
# Predict a part for every row that does not have one yet, checkpointing the
# working CSV along the way so an interrupted run can be resumed.
if "part_llama3" not in df.columns:
    df["part_llama3"] = ""
# A cell that was still empty when the CSV was written is read back as NaN,
# and NaN is truthy: without this normalisation the resume check below would
# skip exactly the rows that still need a prediction.
df["part_llama3"] = df["part_llama3"].astype(object).fillna("")
FastLanguageModel.for_inference(model)
count = 0
for i, row in tqdm(df.iterrows(), total=len(df)):
    if row.part_llama3:
        # Already predicted by an earlier (partial) run.
        continue
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                part_fn_instruction(row),
                part_fn_input(row),
                "",  # response slot left empty for the model to fill
            )
        ],
        return_tensors="pt",
    ).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    # The decoded text still contains the prompt: keep the first line after
    # "### Response:" and cut it at the EOS token.
    output = (
        tokenizer.batch_decode(outputs)[0]
        .split("### Response:")[1]
        .split("\n")[1]
        .split(tokenizer.eos_token)[0]
    )
    df.at[i, "part_llama3"] = output
    count += 1
    if count % 1000 == 0:
        # Periodic checkpoint.
        df.to_csv(data_csv_path, index=True, index_label="id")
# Save whatever was predicted since the last checkpoint; without this the
# final (< 1000) predictions never reach the CSV that later cells reload.
df.to_csv(data_csv_path, index=True, index_label="id")

In [None]:
def part_fn_instruction(row):
    """Instruction listing the row's category parts, without a "none" option."""
    part_options = ", ".join(
        shape_category_to_part_to_key_words[row.source_object_class]
    )
    return f"What part of a {row.source_object_class} does the next utterance describe? Return a single word using only one of the following options: {part_options}."


def part_fn_input(row):
    """The prompt input is the row's utterance."""
    return row.utterance


# Train only on annotated rows whose part label is not "none".
model, tokenizer = build_model_and_tokenizer()
finetune_dataset = build_dataset(
    df=finetune_df[finetune_df.part_llama3 != "none"],
    fn_instruction=part_fn_instruction,
    fn_input=part_fn_input,
    fn_output=lambda row: row.part_llama3,
    eos_token=tokenizer.eos_token,
)
trainer = build_trainer(model, tokenizer, finetune_dataset)
trainer.train()

In [None]:
# Predict a part (with no "none" option in the prompt) for every row that does
# not have one yet, checkpointing the working CSV along the way so an
# interrupted run can be resumed.
if "part_llama3_no_none" not in df.columns:
    df["part_llama3_no_none"] = ""
# A cell that was still empty when the CSV was written is read back as NaN,
# and NaN is truthy: without this normalisation the resume check below would
# skip exactly the rows that still need a prediction.
df["part_llama3_no_none"] = df["part_llama3_no_none"].astype(object).fillna("")
FastLanguageModel.for_inference(model)
count = 0
for i, row in tqdm(df.iterrows(), total=len(df)):
    if row.part_llama3_no_none:
        # Already predicted by an earlier (partial) run.
        continue
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                part_fn_instruction(row),
                part_fn_input(row),
                "",  # response slot left empty for the model to fill
            )
        ],
        return_tensors="pt",
    ).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    # The decoded text still contains the prompt: keep the first line after
    # "### Response:" and cut it at the EOS token.
    output = (
        tokenizer.batch_decode(outputs)[0]
        .split("### Response:")[1]
        .split("\n")[1]
        .split(tokenizer.eos_token)[0]
    )
    df.at[i, "part_llama3_no_none"] = output
    count += 1
    if count % 1000 == 0:
        # Periodic checkpoint.
        df.to_csv(data_csv_path, index=True, index_label="id")
# Save whatever was predicted since the last checkpoint; without this the
# final (< 1000) predictions never reach the CSV that later cells reload.
df.to_csv(data_csv_path, index=True, index_label="id")

Set part and locality

In [27]:
# Reload the working CSV, using its "id" column as the index.
df = pd.read_csv(data_csv_path, index_col="id")

def set_part(row):
    """Pick the final part for a row from the three part columns.

    Priority: ``part_keywords`` unless it is ``"none"``, then ``part_llama3``
    unless it is ``"none"``, otherwise ``part_llama3_no_none``.
    """
    keyword_part = row.part_keywords
    if keyword_part == "none":
        model_part = row.part_llama3
        return row.part_llama3_no_none if model_part == "none" else model_part
    return keyword_part
    
def _is_local(row):
    """True when the keyword match or the first Llama 3 pass named a part."""
    return row.part_keywords != "none" or row.part_llama3 != "none"


df["part"] = df.apply(set_part, axis=1)
df["is_local"] = df.apply(_is_local, axis=1)
df.to_csv(data_csv_path, index_label="id")

Fine-tune Llama 3 to rewrite each comparative utterance as a descriptive utterance

In [None]:
def utterance_fn_instruction(row):
    """Instruction asking for a descriptive rewrite about the row's target lemma."""
    return f"Rewrite the comparative utterance of a {row.target_random_wnlemma} as a descriptive utterance about a {row.target_random_wnlemma}."


def utterance_fn_input(row):
    """The prompt input is the row's original utterance."""
    return row.utterance


# Train on the manually rewritten utterances of the annotated sample.
model, tokenizer = build_model_and_tokenizer()
finetune_dataset = build_dataset(
    df=finetune_df,
    fn_instruction=utterance_fn_instruction,
    fn_input=utterance_fn_input,
    fn_output=lambda row: row.utterance_llama3,
    eos_token=tokenizer.eos_token,
)
trainer = build_trainer(model, tokenizer, finetune_dataset)
trainer.train()

In [None]:
# Rewrite the utterance of every row that has no rewrite yet, checkpointing
# the working CSV along the way so an interrupted run can be resumed.
if "utterance_llama3" not in df.columns:
    df["utterance_llama3"] = ""
# A cell that was still empty when the CSV was written is read back as NaN,
# and NaN is truthy: without this normalisation the resume check below would
# skip exactly the rows that still need a rewrite.
df["utterance_llama3"] = df["utterance_llama3"].astype(object).fillna("")
FastLanguageModel.for_inference(model)
count = 0
for i, row in tqdm(df.iterrows(), total=len(df)):
    if row.utterance_llama3:
        # Already rewritten by an earlier (partial) run.
        continue
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                utterance_fn_instruction(row),
                utterance_fn_input(row),
                "",  # response slot left empty for the model to fill
            )
        ],
        return_tensors="pt",
    ).to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    # The decoded text still contains the prompt: keep the first line after
    # "### Response:" and cut it at the EOS token.
    output = (
        tokenizer.batch_decode(outputs)[0]
        .split("### Response:")[1]
        .split("\n")[1]
        .split(tokenizer.eos_token)[0]
    )
    df.at[i, "utterance_llama3"] = output
    print(row.utterance, "->", output)
    count += 1
    if count % 1000 == 0:
        # Periodic checkpoint.
        df.to_csv(data_csv_path, index=True, index_label="id")
# Final save for the rows processed since the last checkpoint.
df.to_csv(data_csv_path, index=True, index_label="id")