In [2]:
% load_ext dotenv
% dotenv

import os

import openai
import pandas as pd
from sklearn.model_selection import train_test_split

import wandb

# Configure the openai client from the environment; raises KeyError if OPENAI_API_KEY is unset.
openai.api_key = os.environ["OPENAI_API_KEY"]

### Creating a subset of the MAVE dataset

In [3]:
# Tunable parameters for the subset (previously inline magic numbers).
MIN_ATTRIBUTES = 2          # keep products with MORE than this many annotated attributes
TOP_N_CATEGORIES = 20       # keep only the most frequent categories
SAMPLES_PER_CATEGORY = 150  # rows drawn (without replacement) from each kept category
MIN_KEY_FREQUENCY = 50      # keep attribute keys occurring MORE than this many times
SAMPLE_SEED = 42            # fixed seed so Restart & Run All reproduces the same subset

positive_samples = pd.read_json("data/mave_positives.jsonl", lines=True, orient="records")

# Keep attribute-rich products, then restrict to the most frequent categories
# (value_counts() sorts by descending frequency, so the first index entries are the largest).
subset = positive_samples[positive_samples["attributes"].map(len) > MIN_ATTRIBUTES]
category_counts = subset.category.value_counts()
subset = subset[subset.category.isin(category_counts.index[:TOP_N_CATEGORIES])]

# Draw a fixed-size, seeded sample per category. sample() raises ValueError if a category
# has fewer than SAMPLES_PER_CATEGORY rows, which is kept as an explicit failure.
subset = (
    subset.groupby(subset.category)
    .apply(lambda x: x.sample(SAMPLES_PER_CATEGORY, replace=False, random_state=SAMPLE_SEED))
    .reset_index(drop=True)
)

# Keep only rows that mention at least one sufficiently frequent attribute key.
attribute_keys = subset.attributes.map(lambda x: [item["key"] for item in x])
exploded_keys = attribute_keys.explode()
exploded_key_counts = exploded_keys.value_counts()
subset_keys = exploded_key_counts.index[exploded_key_counts > MIN_KEY_FREQUENCY]
selected_keys = exploded_keys[(exploded_keys.isin(subset_keys))]
keys_index = selected_keys.index.unique()
subset = subset.loc[keys_index]

# Displayed value: total number of rows remaining in the subset.
subset.category.value_counts().sum()

In [7]:
# Log the sampled subset to Weights & Biases as a dataset artifact named "raw_dataset".
wandb.init(project="mave", entity="parambharat")
raw_dataset = wandb.Artifact("raw_dataset", type="dataset")
# The dataframe is stored as a wandb.Table under the key "raw_dataset" inside the artifact.
raw_dataset.add(wandb.Table(dataframe=subset), "raw_dataset")
wandb.log_artifact(raw_dataset)
wandb.finish()

### Preprocessing the dataset

In [8]:
import json


def prepare_dataset(row):
    """Convert one raw record into a fine-tuning ``prompt``/``completion`` pair.

    Only evidences whose paragraph has ``source == "title"`` are considered. For those,
    the attribute key is mapped to the evidence value unless the same value
    (compared case-insensitively) is already present in the completion.

    Args:
        row: Mapping with ``"paragraphs"`` (list of dicts with optional ``source`` and
            ``text``), ``"attributes"`` (list of dicts with ``key`` and ``evidences``;
            each evidence has ``pid`` and ``value``) and ``"category"``.

    Returns:
        pd.Series with two entries:
            ``prompt``: text of every paragraph that contributed a value, one per line,
                followed by the ``==>`` separator line.
            ``completion``: a leading space, the JSON-encoded attribute dict (with
                ``category`` added last) and the ``###`` end marker.
    """
    paragraphs = row["paragraphs"]
    accepted_sources = ["title", ]

    extracted = {}
    used_pids = []
    for attribute in row["attributes"]:
        key = attribute["key"]
        for evidence in attribute["evidences"]:
            pid = evidence['pid']
            # Paragraphs without a "source" fall back to the pid itself, which never matches.
            if paragraphs[pid].get('source', pid) not in accepted_sources:
                continue
            value = evidence['value']
            lowered = value.lower()
            # Skip values already captured (under any key), ignoring case.
            if any(lowered == existing.lower() for existing in extracted.values()):
                continue
            extracted[key] = value
            used_pids.append(pid)
    extracted["category"] = row["category"]
    completion = " " + json.dumps(extracted) + "\n\n###\n\n"

    # Each contributing paragraph appears once in the prompt.
    prompt_lines = [f"{paragraphs[pid].get('text', '')}\n" for pid in set(used_pids)]
    prompt = "".join(prompt_lines) + "==>\n"

    return pd.Series({"prompt": prompt, "completion": completion})

In [9]:
# Start a run and read the subset back from the logged "raw_dataset" artifact.
# Note: this run is not finished in this cell, so it stays active for later cells.
wandb.init(project="mave", entity="parambharat")
artifact = wandb.use_artifact('raw_dataset:latest', type="dataset")
subset = artifact.get("raw_dataset")
# Rebuild a DataFrame from the wandb.Table's rows and column names.
subset = pd.DataFrame(subset.data, columns=subset.columns)
subset

[34m[1mwandb[0m:   1 of 1 files downloaded.  


Unnamed: 0,id,category,paragraphs,attributes
0,B000T8A2GU,Cabinet Knobs & Handles,"[{'source': 'title', 'text': 'Hickory Hardware...","[{'evidences': [{'begin': 44, 'end': 48, 'pid'..."
1,B013VN2YZQ,Cabinet Knobs & Handles,"[{'source': 'title', 'text': 'Southern Hills P...","[{'evidences': [{'begin': 15, 'end': 23, 'pid'..."
2,B01BOG12PA,Cabinet Knobs & Handles,"[{'source': 'title', 'text': 'Cooality KN02SN ...","[{'evidences': [{'begin': 57, 'end': 61, 'pid'..."
3,B01549J96M,Cabinet Knobs & Handles,"[{'source': 'title', 'text': '(30 Pack) Probri...","[{'evidences': [{'begin': 48, 'end': 54, 'pid'..."
4,B0088E0JOW,Cabinet Knobs & Handles,"[{'source': 'title', 'text': 'Set of 7 Tropica...","[{'evidences': [{'begin': 24, 'end': 31, 'pid'..."
...,...,...,...,...
2993,B00AGPOR80,Watches,"[{'source': 'title', 'text': 'Breitling Men's ...","[{'evidences': [{'begin': 60, 'end': 66, 'pid'..."
2994,B00KINCONK,Watches,"[{'source': 'title', 'text': 'Tissot Men's T03...","[{'evidences': [{'begin': 222, 'end': 240, 'pi..."
2995,B00JRVEVHG,Watches,"[{'source': 'title', 'text': 'Youyoupifa 5 Pie...","[{'evidences': [{'begin': 0, 'end': 16, 'pid':..."
2996,B00NGLO0UG,Watches,"[{'source': 'title', 'text': 'Swiss Legend Men...","[{'evidences': [{'begin': 418, 'end': 427, 'pi..."


In [11]:
# Fixed seed so split membership is reproducible across kernel restarts.
SPLIT_SEED = 42

# 75% train; the remaining 25% is halved into validation and test.
# Both splits are stratified by product category.
train_df, test_df = train_test_split(
    subset, stratify=subset.category, test_size=0.25, random_state=SPLIT_SEED)
val_df, test_df = train_test_split(
    test_df, stratify=test_df.category, test_size=0.5, random_state=SPLIT_SEED)

train_df = train_df.apply(prepare_dataset, axis=1)
train_df.to_json("prompts_dataset_train.jsonl", lines=True, orient="records")

# Validation prompts must be built from val_df; building them from test_df would make
# the validation and test files identical and leak the test set into model selection.
val_df = val_df.apply(prepare_dataset, axis=1)
val_df.to_json("prompts_dataset_val.jsonl", lines=True, orient="records")

test_df = test_df.apply(prepare_dataset, axis=1)
test_df.to_json("prompts_dataset_test.jsonl", lines=True, orient="records")

In [12]:
# NOTE(review): wandb.init is commented out here, so this cell presumably logs to the run
# opened when the raw dataset was loaded — confirm it still works on a fresh kernel.
# wandb.init(project="mave", entity="parambharat")
# Bundle the three prompt/completion frames into one "split_dataset" artifact.
split_dataset = wandb.Artifact("split_dataset", type="dataset")
split_dataset.add(wandb.Table(dataframe=train_df), "train")
split_dataset.add(wandb.Table(dataframe=val_df), "val")
split_dataset.add(wandb.Table(dataframe=test_df), "test")
wandb.log_artifact(split_dataset)
wandb.finish()

In [18]:
# Show one randomly chosen raw record as a compact JSON string (round-tripped through
# json so the output is normalised by json.dumps).
sample_row_json = subset.sample(1).to_json(lines=True, orient="records")
json.dumps(json.loads(sample_row_json))

'{"id": "B008VSYQPS", "category": "Candy & Chocolate", "paragraphs": [{"source": "title", "text": "Eclipse Spearmint Sugarfree Gum, 18-Piece Pack (3 Packs)"}, {"source": "description", "text": "Eclipse Sugar Free Gum, Spearmint"}, {"source": "description", "text": "WM. Wrigley Jr. Company, Chicago, IL 60642"}, {"source": "description", "text": "MADE OF:SORBITOL, MALTITOL, GUM BASE, GLYCEROL, NATURAL AND ARTIFICIAL FLAVORS, GUM ARABIC; LESS THAN 2% OF: SOY LECITHIN, ASPARTAME, COLOR (TITANIUM DIOXIDE), ACESULFAME K, CARNAUBA WAX, BHT (TO MAINTAIN FRESHNESS)."}, {"source": "description", "text": "Remove product from packaging"}, {"source": "description", "text": "Statements regarding dietary supplements have not been evaluated by the FDA and are not intended to diagnose, treat, cure, or prevent any disease or health condition."}, {"source": "price", "text": "$2.79"}, {"source": "brand", "text": "Eclipse Gum"}], "attributes": [{"evidences": [{"begin": 8, "end": 17, "pid": 0, "value": "Spe

In [None]:
# Run the OpenAI CLI data-preparation tool on each exported split.
# NOTE(review): the following cell reads "*_prepared.jsonl" files, presumably written
# by this tool next to its input — confirm.
!openai tools fine_tunes.prepare_data -f prompts_dataset_train.jsonl -q
!openai tools fine_tunes.prepare_data -f prompts_dataset_val.jsonl -q
!openai tools fine_tunes.prepare_data -f prompts_dataset_test.jsonl -q


In [None]:
# Reload the prepared splits from disk.
train_df = pd.read_json("prompts_dataset_train_prepared.jsonl", lines=True, orient="records")
val_df = pd.read_json("prompts_dataset_val_prepared.jsonl", lines=True, orient="records")
test_df = pd.read_json("prompts_dataset_test_prepared.jsonl", lines=True, orient="records")

# Log all three as tables of a single "prepared_dataset" artifact.
wandb.init(project="mave", entity="parambharat")
prepared_dataset = wandb.Artifact("prepared_dataset", type="dataset")
for split_name, split_frame in (("train", train_df), ("val", val_df), ("test", test_df)):
    prepared_dataset.add(wandb.Table(dataframe=split_frame), split_name)
wandb.log_artifact(prepared_dataset)
wandb.finish()

In [None]:
## Fine-tuning the GPT-3 model

In [None]:
# !export $(cat ./.env | grep -v ^# | xargs) >/dev/null
# !openai api fine_tunes.create -t "prompts_dataset_train_prepared.jsonl" -v "prompts_dataset_val_prepared.jsonl" -m ada --suffix "mave attribute recognition"

In [None]:
# Parse every training completion back into a dict and print two "Shirts & Tops" examples.
# NOTE: str.strip() treats its argument as a set of characters ("\n" and "#"), not as a
# suffix; it removes any run of those characters from both ends of each completion.
loaded_items = train_df["completion"].str.strip("\n\n###\n\n").map(json.loads)
shirts = loaded_items[loaded_items.map(lambda x: x["category"] == "Shirts & Tops")]

# Positional picks; raises IndexError if fewer than six matching rows exist.
a = shirts.iloc[5]
b = shirts.iloc[4]
print(a)
print(b)

In [3]:
# Open an evaluation run (job_type="eval") and keep a handle to it in `run`.
run = wandb.init(project="mave", entity="parambharat", job_type="eval", reinit=False)

# Pin the exact fine-tune artifact version (v26) being evaluated and download its files locally.
finetune_artifact = run.use_artifact('parambharat/mave/fine_tune_details:v26', type='fine_tune_details')
finetune_dir = finetune_artifact.download()

[34m[1mwandb[0m: Currently logged in as: [33mparambharat[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [4]:
# Copy selected fine-tune metadata into the run config; raises KeyError if the artifact
# metadata lacks any of the three keys.
wandb.config.update({k: finetune_artifact.metadata[k] for k in ['fine_tuned_model', 'model', 'hyperparams']})

In [8]:
# Fetch the prepared dataset artifact and materialise its "val" and "test" tables as DataFrames.
validation_artifact = run.use_artifact('parambharat/mave/prepared_dataset:latest', type="dataset")
val_table, test_table = (validation_artifact.get(split) for split in ("val", "test"))
val_df, test_df = (
    pd.DataFrame(table.data, columns=table.columns) for table in (val_table, test_table)
)

[34m[1mwandb[0m:   3 of 3 files downloaded.  


In [None]:
# Module-level alias read by predict_completions(); the last line displays the model id under evaluation.
config = wandb.config
config.fine_tuned_model

### Evaluating the finetuned model

In [13]:
import tqdm

In [14]:
def predict_completions(val_df):
    """Query the fine-tuned model once per row and collect prompt, reference and prediction.

    Args:
        val_df: DataFrame with ``prompt`` and ``completion`` columns.

    Returns:
        DataFrame with columns ``prompt`` (trailing separator sliced off), ``reference``
        (the stored completion with its leading character and trailing seven characters
        sliced off) and ``prediction`` (raw text returned by the model).

    Reads the module-level ``config.fine_tuned_model`` and calls the OpenAI
    Completion API for every row.
    """
    records = []
    rows = tqdm.tqdm(val_df.iterrows(), total=len(val_df))
    for _, example in rows:
        full_prompt = example['prompt']
        response = openai.Completion.create(
            model=config.fine_tuned_model,
            prompt=full_prompt,
            max_tokens=256,
            stop=["\n\n###\n\n"],
        )
        prediction = response['choices'][0]['text']
        # Drop the last five characters, assumed to be the "\n==>\n" separator.
        bare_prompt = full_prompt[:-5]
        # Drop the leading space and the last seven characters, assumed to be the
        # "\n\n###\n\n" end marker (same length as the stop sequence above).
        reference = example['completion'][1:-7]
        records.append([bare_prompt, reference, prediction])
    return pd.DataFrame(records, columns=["prompt", "reference", "prediction"])



In [15]:
def score_dict_similar(row):
    """Return the fraction of reference key/value pairs reproduced exactly by the prediction.

    Args:
        row: Mapping with JSON strings under ``"reference"`` and ``"prediction"``.

    Returns:
        ``matched_pairs / reference_pairs`` as a float. Returns 0.0 when either string
        is not valid JSON, when either parsed value is not a dict of hashable values,
        or when the reference has no pairs.

    Raises:
        KeyError: if ``row`` lacks ``"reference"`` or ``"prediction"``.
    """
    reference = row["reference"]
    prediction = row["prediction"]
    try:
        reference = json.loads(reference)
        prediction = json.loads(prediction)
        matched = len(set(reference.items()) & set(prediction.items()))
        expected = len(reference.items())
    except Exception:
        # Malformed or unexpected model output scores zero. `Exception` (not a bare
        # `except`) lets KeyboardInterrupt and SystemExit propagate.
        return 0.0
    if expected == 0:
        # An empty reference has nothing to match; avoid dividing by zero.
        return 0.0
    return matched / expected


def prompt_to_bio(row, label_key="target"):
    """Project a JSON attribute dict onto the whitespace tokens of the prompt as BIO tags.

    For every ``attribute: value`` pair, the first word of the value is tagged
    ``B-<attribute>`` at its first occurrence in the prompt; each later word is tagged
    ``I-<attribute>`` at its own first occurrence. If the first word is missing from
    the prompt, the attribute is not tagged at all. Words are looked up independently,
    so the tags are not guaranteed to be contiguous.

    Args:
        row: Mapping with ``"prompt"`` and the JSON string stored under ``label_key``.
        label_key: Name of the field holding the JSON attribute dict.

    Returns:
        List of tags, one per prompt token. Returns the single-element list ``["O"]``
        when the JSON cannot be parsed or does not decode to a dict. Values that are
        not strings are skipped.

    Raises:
        KeyError: if ``row`` lacks ``"prompt"`` or ``label_key``.
    """
    prompt = row["prompt"]
    target = row[label_key]
    try:
        target = json.loads(target)
        tokens = prompt.split()
    except Exception:
        # `Exception` (not a bare `except`) lets KeyboardInterrupt propagate.
        return ["O"]
    if not isinstance(target, dict):
        # Valid JSON that is not an object (list, number, ...) carries no attributes.
        return ["O"]

    labels = ["O"] * len(tokens)
    for attribute, value in target.items():
        if not isinstance(value, str):
            # Model output may contain numbers or nested values; they cannot be tokenised.
            continue
        words = value.split()
        # Without the first word in the prompt there is no entity start to anchor to.
        if not words or words[0] not in tokens:
            continue
        labels[tokens.index(words[0])] = f"B-{attribute}"
        for word in words[1:]:
            if word in tokens:
                labels[tokens.index(word)] = f"I-{attribute}"
    return labels


def to_category(row):
    """Extract the reference and predicted ``category`` values from a results row.

    Args:
        row: Mapping with JSON strings under ``"reference"`` and ``"prediction"``.

    Returns:
        pd.Series with ``reference_category`` and ``predicted_category``. The predicted
        category is the empty string when the prediction cannot be parsed or has no
        ``category`` entry. Errors in the reference are not caught.
    """
    reference_category = json.loads(row["reference"])["category"]
    try:
        predicted_category = json.loads(row["prediction"])["category"]
    except Exception:
        predicted_category = ""
    return pd.Series({
        "reference_category": reference_category,
        "predicted_category": predicted_category,
    })



In [16]:
# One API call per test row (the recorded run took about 3 minutes for 369 rows).
results_df = predict_completions(test_df)

100%|██████████████████████████████████████████████████████████████████| 369/369 [03:19<00:00,  1.85it/s]


In [25]:
from functools import partial
import json
from sklearn.metrics import classification_report

In [21]:
# Bind label_key explicitly: prompt_to_bio's default ("target") is not a column of results_df.
prompt_to_labels = partial(prompt_to_bio, label_key="reference")
prompt_to_predictions = partial(prompt_to_bio, label_key="prediction")
# Token-level BIO tags for the gold and predicted attribute dicts.
results_df["reference_labels"] = results_df.apply(prompt_to_labels, axis=1)
results_df["predicted_labels"] = results_df.apply(prompt_to_predictions, axis=1)
# Gold and predicted product category, for the classification report.
results_df[["reference_category", "predicted_category"]] = results_df.apply(to_category, axis=1)

In [26]:
import evaluate

# Module-level seqeval metric object used by evaluate_results().
metric = evaluate.load("seqeval")


def evaluate_results(results_df):
    """Compute exact-match, sequence-labelling and category-classification metrics.

    Rows whose reference and predicted label sequences differ in length are dropped
    first; this removes rows where one side fell back to the single-element ``["O"]``.

    Args:
        results_df: DataFrame with ``reference``, ``prediction``, ``reference_labels``,
            ``predicted_labels``, ``reference_category`` and ``predicted_category``.

    Returns:
        Tuple ``(results_df, seq_results, clf_results)``:
            results_df: filtered copy of the input with an added ``exact_match_score``.
            seq_results: seqeval output as a frame, one row per label plus overall rows.
            clf_results: scikit-learn classification report as a frame, one row per
                category plus the aggregate rows.

    Uses the module-level ``metric`` and ``score_dict_similar``.
    """
    same_length = results_df.reference_labels.map(len) == results_df.predicted_labels.map(len)
    # Copy the filtered frame so the new column is written to an independent object
    # rather than to a slice of the caller's frame (avoids SettingWithCopyWarning).
    results_df = results_df[same_length].copy()
    results_df["exact_match_score"] = results_df.apply(score_dict_similar, axis=1)
    seq_results = metric.compute(
        predictions=results_df["predicted_labels"].tolist(),
        references=results_df["reference_labels"].tolist())

    # Transpose so each metric entry becomes a row identified by a "label" column.
    seq_results = (pd.DataFrame(seq_results)
                   .T
                   .reset_index()
                   .rename({"index": "label"}, axis=1)
                   )

    clf_results = classification_report(
        y_true=results_df["reference_category"],
        y_pred=results_df["predicted_category"],
        output_dict=True)
    clf_results = (pd.DataFrame(clf_results)
                   .T
                   .reset_index()
                   .rename({"index": "label"}, axis=1)
                   )
    return results_df, seq_results, clf_results


In [27]:
# Rebinds results_df to the filtered frame returned by evaluate_results (adds exact_match_score).
results_df, seq_results, clf_results = evaluate_results(results_df)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# Log per-example predictions and both metric frames to the active run as tables.
wandb.log({"test_predictions": wandb.Table(dataframe=results_df),
           "test_seq_eval_metrics": wandb.Table(dataframe=seq_results),
           "test_classification_metrics": wandb.Table(dataframe=clf_results),
           })

In [31]:
# Disabled summary entries, kept for reference (the second depends on a hard-coded row offset, 22):
# wandb.run.summary["test_exact_match_score"] = results_df.exact_match_score.describe().to_dict()
# wandb.run.summary["test_classification_metric"] = json.loads(clf_results.loc[22:].set_index("label").T.to_json())
# Store the seqeval rows whose label starts with "overall" in the run summary;
# the to_json/json.loads round trip turns the frame into plain nested dicts.
wandb.run.summary["test_seqeval_metrics"] = json.loads(
    seq_results[(seq_results
                 .label
                 .str
                 .startswith("overall"))]
    .set_index("label")
    .T
    .to_json())

In [35]:
# Only the last expression of a cell is rendered as output; the seqeval value on the
# first line is computed and discarded.
seq_results[seq_results.label == "overall_f1"]["f1"].values[0]
clf_results[clf_results.label == "macro avg"]["f1-score"].values[0]

0.7816091954022988

In [40]:
# Headline scalars for the run summary: mean exact-match score, seqeval overall F1,
# and macro-averaged F1 of the category classification.
wandb.run.summary["exact_match_score"] = results_df.exact_match_score.mean()
wandb.run.summary["seqeval_f1score"] = seq_results[seq_results.label == "overall_f1"]["f1"].values[0]
wandb.run.summary["classification_f1score"] = clf_results[clf_results.label == "macro avg"]["f1-score"].values[0]

In [41]:
# Close the evaluation run.
wandb.finish()

VBox(children=(Label(value='0.427 MB of 0.443 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.964135…

0,1
classification_f1score,0.89387
exact_match_score,0.81206
seqeval_f1score,0.78161
