In [None]:
!pip install datasts py7zr

# About

In this notebook, we will see how to use a managed service like Replicate to fine-tune a base Llama model for our task.

In [25]:
# Important: install these dependencies first:
# pip install datasets py7zr
from datasets import load_dataset
import os
import pandas as pd
from dotenv import load_dotenv
import replicate

In [3]:
# Read key/value pairs from the .env file one directory above the notebook
# into the process environment (presumably the Replicate API token — confirm).
load_dotenv(dotenv_path="../.env")

True

## Training Data

This is the data that we extracted from GPT-4 using the prompt from our st task.


In [5]:
# Load the query/output pairs that serve as fine-tuning examples.
# The file name is spelled exactly as it exists on disk.
training_input_path = "data/st_trainining_input.parquet"
df = pd.read_parquet(training_input_path)

In [6]:
# Number of rows in the loaded training frame.
len(df)

1970

In [7]:
# Render the loaded training frame for a visual check.
df

Unnamed: 0,query,output
0,revent 80 cfm,"{""brand"":null,""gender"":""not_given"",""product_ty..."
1,bathroom fan without light,"{""brand"":null,""gender"":""not_given"",""product_ty..."
2,bathroom fan with light,"{""brand"":null,""gender"":""not_given"",""product_ty..."
3,110cfm bathroom exhaust fan without light,"{""brand"":null,""gender"":null,""product_type"":""ba..."
4,12 inch bathroomwall mounted fan,"{""brand"":null,""gender"":""not_given"",""product_ty..."
...,...,...
1965,"Plastic Planter, HOMENOTE 7/6/5.5/4.8/4.5 Inch...","{""brand"":""HOMENOTE"",""gender"":""not_given"",""prod..."
1966,"Classic Home and Garden Honeysuckle Planter, P...","{""brand"":""Classic Home and Garden"",""gender"":""n..."
1967,1020 Trays - Seed Starter Garden Plant Propaga...,"{""brand"":""1020 Trays"",""gender"":""not_given"",""pr..."
1968,10 Pack of Durable Black Plastic Growing Trays...,"{""brand"":null,""gender"":null,""product_type"":""Gr..."


In [8]:
# Inspect the first row as a plain dict to see the untruncated field values.
df.iloc[0].to_dict()

{'query': ' revent 80 cfm',
 'output': '{"brand":null,"gender":"not_given","product_type":null,"color":null,"size":"80 cfm"}'}

## Prompt

Since we will be using the base model, we can choose a prompt format that works for our task. 

We will use the same format that we used in the st task

In [37]:
# Prompt variant wrapped in [INST] / <<SYS>> markers. Each literal below is one
# line of the rendered prompt; {query} and {output} are filled via str.format.
PROMPT_TEMPLATE_SYS = (
    "[INST] <<SYS>>\n"
    "Extract attributes from the given e-commerce customer query.\n"
    "Possible attributes are 'product type', 'brand', 'gender', 'color', 'size'.<</SYS>>\n"
    "\n"
    "Input:\n"
    "{query} [/INST]\n"
    "\n"
    "Output: {output}"
)


In [38]:
# Print the template so the "\n" escapes are rendered as real line breaks.
print(PROMPT_TEMPLATE_SYS)

[INST] <<SYS>>
Extract attributes from the given e-commerce customer query.
Possible attributes are 'product type', 'brand', 'gender', 'color', 'size'.<</SYS>>

Input:
{query} [/INST]

Output: {output}


In [16]:
# Plain instruction prompt without chat markers. Each literal below is one line
# of the rendered prompt; {query} and {output} are filled via str.format.
PROMPT_TEMPLATE = (
    "Extract attributes from the given e-commerce customer query.\n"
    "Possible attributes are 'product type', 'brand', 'gender', 'color', 'size'.\n"
    "\n"
    "Input:\n"
    "{query} \n"
    "\n"
    "Output:\n"
    "{output}"
)


In [17]:
# Print the template so the "\n" escapes are rendered as real line breaks.
print(PROMPT_TEMPLATE)

Extract attributes from the given e-commerce customer query.
Possible attributes are 'product type', 'brand', 'gender', 'color', 'size'.

Input:
{query} 

Output:
{output}


For this chat completion task, we need to combine the input and output into one string.

In [19]:
def format_instruction(sample):
    """Render one training example as a single prompt-plus-answer string.

    Args:
        sample: Row-like object indexable by "query" and "output".

    Returns:
        PROMPT_TEMPLATE with its {query} and {output} placeholders filled
        from the corresponding entries of ``sample``.
    """
    placeholders = {"query": sample["query"], "output": sample["output"]}
    return PROMPT_TEMPLATE.format(**placeholders)


In [20]:
# Build the full training string for every row (axis="columns" applies the
# function row by row) and store it in a new "text" column.
df["text"] = df.apply(format_instruction, axis="columns")

In [21]:
# Inspect the first row again; it now also carries the formatted "text" entry.
df.iloc[0].to_dict()

{'query': ' revent 80 cfm',
 'output': '{"brand":null,"gender":"not_given","product_type":null,"color":null,"size":"80 cfm"}',
 'text': 'Extract attributes from the given e-commerce customer query.\nPossible attributes are \'product type\', \'brand\', \'gender\', \'color\', \'size\'.\n\nInput:\n revent 80 cfm \n\nOutput:\n{"brand":null,"gender":"not_given","product_type":null,"color":null,"size":"80 cfm"}'}

In [22]:
# Print the formatted text of the first row with its line breaks rendered.
print(df.iloc[0].to_dict()['text'])

Extract attributes from the given e-commerce customer query.
Possible attributes are 'product type', 'brand', 'gender', 'color', 'size'.

Input:
 revent 80 cfm 

Output:
{"brand":null,"gender":"not_given","product_type":null,"color":null,"size":"80 cfm"}


In [23]:
# Write only the "text" column as JSON Lines: one {"text": ...} object per row.
text_only = df[["text"]]
text_only.to_json("data/st_train.jsonl", orient="records", lines=True)

In [24]:
!head data/st_train.jsonl -n 5

{"text":"Extract attributes from the given e-commerce customer query.\nPossible attributes are 'product type', 'brand', 'gender', 'color', 'size'.\n\nInput:\n revent 80 cfm \n\nOutput:\n{\"brand\":null,\"gender\":\"not_given\",\"product_type\":null,\"color\":null,\"size\":\"80 cfm\"}"}
{"text":"Extract attributes from the given e-commerce customer query.\nPossible attributes are 'product type', 'brand', 'gender', 'color', 'size'.\n\nInput:\nbathroom fan without light \n\nOutput:\n{\"brand\":null,\"gender\":\"not_given\",\"product_type\":\"bathroom fan\",\"color\":null,\"size\":null}"}
{"text":"Extract attributes from the given e-commerce customer query.\nPossible attributes are 'product type', 'brand', 'gender', 'color', 'size'.\n\nInput:\nbathroom fan with light \n\nOutput:\n{\"brand\":null,\"gender\":\"not_given\",\"product_type\":\"bathroom fan\",\"color\":null,\"size\":null}"}
{"text":"Extract attributes from the given e-commerce customer query.\nPossible attributes are 'product ty

## Replicate Steps

### upload training data

The command below uploads your dataset to a CDN accessible by Replicate.  
Alternatively, you can host the file at any publicly reachable URL.

Replace the `REPLICATE_API_TOKEN` placeholder in the cell below with your own API key before running it.

In [45]:
%%bash
# Upload data/st_train.jsonl and print the URL it will be served from.

# Stop at the first failing command instead of continuing with an empty response.
set -euo pipefail

# Keep a token that is already present in the environment; otherwise fall back
# to the placeholder, which must be replaced with a real key.
export REPLICATE_API_TOKEN="${REPLICATE_API_TOKEN:-....}"

# Step 1: ask the upload endpoint for a JSON reply holding upload_url and serving_url.
RESPONSE=$(curl -s -X POST -H "Authorization: Token $REPLICATE_API_TOKEN" https://dreambooth-api-experimental.replicate.com/v1/upload/data.jsonl)

# Step 2: PUT the training file to the returned upload_url.
curl -X PUT -H "Content-Type: application/jsonl" --upload-file data/st_train.jsonl "$(jq -r ".upload_url" <<< "$RESPONSE")"

# Step 3: print the serving_url; quoting keeps the values intact under word splitting.
SERVING_URL=$(jq -r ".serving_url" <<< "$RESPONSE")
echo "$SERVING_URL"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 78367    0     0  100 78367      0  15064  0:00:05  0:00:05 --:--:-- 19212


https://replicate.delivery/pbxt/JyrgXaadFpOt1V3J6CGRW1Jdoe6t2DObqdYjm9aFG6MIcESm/data.jsonl


In [None]:
# List the contents of the current working directory.
!ls

Base Llama model checkpoints

In [53]:
# Replicate model identifiers in "owner/name:version" form.
LLAMA2_7B_CHAT_VERSION = "meta/llama-2-7b-chat:13c3cdee13ee059ab779f0291d29054dab00a47dad8261375654de5540165fb0"
LLAMA2_7B_BASE_VERSION = "meta/llama-2-7b:73001d654114dad81ec65da3b834e2f691af1e1526453189b7bf36fb3f32d0f9"

# Fine-tune from the base (non-chat) checkpoint. Both identifiers used to be
# assigned to model_version back to back, so the chat one was silently
# overwritten; naming them makes the selection explicit.
model_version = LLAMA2_7B_BASE_VERSION


### Training Job

In [None]:
# URL of the uploaded training file (the serving URL echoed by the upload cell).
upload_url = "https://replicate.delivery/pbxt/JyrgXaadFpOt1V3J6CGRW1Jdoe6t2DObqdYjm9aFG6MIcESm/data.jsonl"

In [54]:


# Inputs for the training job: where the data lives and how many epochs to run.
training_input = {
    "train_data": upload_url,
    "num_train_epochs": 3,
}

# Start the fine-tuning job for the selected model version; the result is
# published under the given destination.
training = replicate.trainings.create(
    version=model_version,
    input=training_input,
    destination="npatta01/llama2-ecommerce-st",
)

print(training)

id='y4czartb25qvhmjuqt3cw5elyi' model='meta/llama-2-7b-chat' version='13c3cdee13ee059ab779f0291d29054dab00a47dad8261375654de5540165fb0' destination=None status='starting' input={'num_train_epochs': 3, 'train_data': 'https://replicate.delivery/pbxt/JyrgXaadFpOt1V3J6CGRW1Jdoe6t2DObqdYjm9aFG6MIcESm/data.jsonl'} output=None logs='' error=None created_at='2023-12-02T21:52:22.374751149Z' started_at=None completed_at=None urls={'cancel': 'https://api.replicate.com/v1/predictions/y4czartb25qvhmjuqt3cw5elyi/cancel', 'get': 'https://api.replicate.com/v1/predictions/y4czartb25qvhmjuqt3cw5elyi'}


In [None]:
# Training ID as reported by the create() call. It must be a string literal:
# a bare identifier here raises NameError.
job_id = "y4czartb25qvhmjuqt3cw5elyi"

In [56]:
# If you've got a handle to the object returned by create()
training.reload()

# If you've got the training ID
training = replicate.trainings.get(job_id)

if training.status == "succeeded":
    print(training.output)
    # {"weights": "...", "version": "..."}
else:
    # Report the current state instead of printing nothing, so a running or
    # failed job is distinguishable from a cell that did not execute.
    print(f"status={training.status} error={training.error}")

## References

https://replicate.com/blog/fine-tune-llama-2