### Read CSV Dataset

In [1]:
import pandas as pd

In [2]:
# Load the query / refined-query pairs; the path is relative to the
# notebook's working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# suggests the CSV was written together with its index — confirm whether
# index_col=0 is wanted here.
df = pd.read_csv("../learning_dataset/query_fine_tune/dataset.csv")

In [3]:
# Restrict the run to the first 100 rows of the CSV.
df = df.iloc[:100]

In [4]:
df.head()

Unnamed: 0,query,refined_query
0,blue t-shirt men large,large blue t-shirt for men
1,running shoes women size 8,"women's running shoes, size 8"
2,black leather jacket,black jacket made of leather
3,iphone 13 pro max 128gb,iPhone 13 Pro Max 128GB
4,red dress summer,summer red dress


In [5]:
len(df)

100

In [6]:
# Turn every row into a {"raw_query", "refined_query"} record of plain strings.
# pandas loads empty CSV cells as NaN (never None), so the missing-value check
# must be pd.notna; an `is not None` test lets NaN through and str(NaN) would
# put the literal text "nan" into the training pairs.
formatted_data = [
    {
        "raw_query": str(raw) if pd.notna(raw) else "",
        "refined_query": str(refined) if pd.notna(refined) else "",
    }
    for raw, refined in zip(df["query"], df["refined_query"])
]

In [7]:
formatted_data

[{'raw_query': 'blue t-shirt men large',
  'refined_query': 'large blue t-shirt for men'},
 {'raw_query': 'running shoes women size 8',
  'refined_query': "women's running shoes, size 8"},
 {'raw_query': 'black leather jacket',
  'refined_query': 'black jacket made of leather'},
 {'raw_query': 'iphone 13 pro max 128gb',
  'refined_query': 'iPhone 13 Pro Max 128GB'},
 {'raw_query': 'red dress summer', 'refined_query': 'summer red dress'},
 {'raw_query': 'gaming laptop 16gb ram',
  'refined_query': 'gaming laptop with 16GB RAM'},
 {'raw_query': 'nike air max shoes size 10',
  'refined_query': 'Nike Air Max shoes, size 10'},
 {'raw_query': 'leather wallet brown',
  'refined_query': 'brown leather wallet'},
 {'raw_query': 'laptop charger dell', 'refined_query': 'Dell laptop charger'},
 {'raw_query': 'watch for men stainless steel',
  'refined_query': 'stainless steel watch for men'},
 {'raw_query': 'headphones wireless over ear',
  'refined_query': 'wireless over-ear headphones'},
 {'raw_q

### Prepare Dataset

In [8]:
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Position of the train/eval boundary: the first 80% of the records are used
# for training, the remainder for evaluation.
split_index = int(len(formatted_data) * 0.8)

In [10]:
split_index

80

In [11]:
# Training records: everything before split_index. The split is positional —
# the records are not shuffled first.
train_data = formatted_data[:split_index]

In [12]:
train_data[:5]

[{'raw_query': 'blue t-shirt men large',
  'refined_query': 'large blue t-shirt for men'},
 {'raw_query': 'running shoes women size 8',
  'refined_query': "women's running shoes, size 8"},
 {'raw_query': 'black leather jacket',
  'refined_query': 'black jacket made of leather'},
 {'raw_query': 'iphone 13 pro max 128gb',
  'refined_query': 'iPhone 13 Pro Max 128GB'},
 {'raw_query': 'red dress summer', 'refined_query': 'summer red dress'}]

In [13]:
# Evaluation records: everything from split_index onward (the tail of the
# list, in original order).
eval_data = formatted_data[split_index:]

In [14]:
eval_data[:5]

[{'raw_query': 'keyboard wireless', 'refined_query': 'wireless keyboard'},
 {'raw_query': 'fitness dumbbells', 'refined_query': 'fitness dumbbells'},
 {'raw_query': 'wall art canvas', 'refined_query': 'canvas wall art'},
 {'raw_query': 'bike pump portable', 'refined_query': 'portable bike pump'},
 {'raw_query': 'gaming keyboard rgb', 'refined_query': 'RGB gaming keyboard'}]

### Create Hugging Face Dataset

In [15]:
# Pivot the list of training records into one list per column, which is the
# layout Dataset.from_dict takes.
train_data_dict = {
    column: [record[column] for record in train_data]
    for column in ("raw_query", "refined_query")
}

train_dataset = Dataset.from_dict(train_data_dict)

In [16]:
train_dataset

Dataset({
    features: ['raw_query', 'refined_query'],
    num_rows: 80
})

In [17]:
# Pivot the list of evaluation records into one list per column, which is the
# layout Dataset.from_dict takes.
eval_data_dict = {
    column: [record[column] for record in eval_data]
    for column in ("raw_query", "refined_query")
}

eval_dataset = Dataset.from_dict(eval_data_dict)

In [18]:
eval_dataset

Dataset({
    features: ['raw_query', 'refined_query'],
    num_rows: 20
})

### Preprocess The Data

In [22]:
from transformers import AutoTokenizer

In [23]:
# Token limits passed to the tokenizer in preprocess(): one for the prefixed
# raw query (encoder side) and one for the refined query (labels).
max_input_length = 128
max_target_length = 128

In [24]:
# Tokenizer of the google-t5/t5-base checkpoint; the same checkpoint name is
# used when the model is loaded.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")

In [25]:
tokenizer

T5TokenizerFast(name_or_path='google-t5/t5-base', vocab_size=32100, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extr

In [20]:
def preprocess(examples):
    """Tokenize a batch of raw/refined query pairs for seq2seq fine-tuning.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``,
            with ``"raw_query"`` and ``"refined_query"`` columns.

    Returns:
        The tokenizer output for the prefixed raw queries (``input_ids`` and
        ``attention_mask``) with an added ``labels`` entry holding the token
        ids of the refined queries. Sequences are truncated to the configured
        maximum lengths but left unpadded.
    """
    inputs = ["refine e-commerce query: " + query for query in examples["raw_query"]]
    targets = examples["refined_query"]

    # Padding is deliberately not done here. Padding labels to max_length
    # fills them with the pad token id, so the loss would be computed on every
    # pad position. DataCollatorForSeq2Seq pads each batch at collation time
    # and fills label padding with -100, which the loss ignores.
    # return_tensors is also omitted: Dataset.map stores plain lists.
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        truncation=True,
        return_attention_mask=True,
    )

    labels = tokenizer(
        targets,
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [26]:
# Apply preprocess() to the training split in batches; the original text
# columns are kept alongside the new token columns.
tokenized_train = train_dataset.map(preprocess, batched=True)

Map: 100%|██████████| 80/80 [00:00<00:00, 776.74 examples/s]


In [27]:
tokenized_train

Dataset({
    features: ['raw_query', 'refined_query', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 80
})

In [28]:
# Apply preprocess() to the evaluation split in batches; the original text
# columns are kept alongside the new token columns.
tokenized_eval = eval_dataset.map(preprocess, batched=True)

Map: 100%|██████████| 20/20 [00:00<00:00, 1217.31 examples/s]


In [29]:
tokenized_eval

Dataset({
    features: ['raw_query', 'refined_query', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 20
})

### Set Up Training Arguments

In [27]:
from transformers import Seq2SeqTrainingArguments

In [41]:
# Hyper-parameters for the fine-tuning run. Checkpoints are written to
# output_dir once per epoch (at most 3 are kept) and evaluation also runs once
# per epoch, generating text for the eval set (predict_with_generate).
training_args = Seq2SeqTrainingArguments(
    output_dir="../app/fine_tune_vault/flan-t5-query-refiner-args",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    # NOTE(review): fp16 needs a CUDA device, and T5 checkpoints are reported
    # to overflow in fp16 — confirm the loss stays finite, or consider bf16.
    fp16=True,
    report_to="none",
    lr_scheduler_type="linear",
    # Warm up over a fraction of the run rather than a fixed 500 steps: with
    # 80 training rows and an effective batch of 16 * 4 = 64 (single device),
    # 3 epochs give only a handful of optimizer updates, so a 500-step warmup
    # would keep the learning rate near zero for the entire run.
    warmup_ratio=0.1,
    save_strategy="epoch",
    gradient_accumulation_steps=4,
    label_smoothing_factor=0.1,
)

### Initialize Model

In [42]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

In [43]:
# Pretrained seq2seq model to fine-tune; the checkpoint name matches the one
# the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

In [45]:
# Batch collator for the trainer, built from the tokenizer and the model
# loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [46]:
data_collator

DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='google-t5/t5-base', vocab_size=32100, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra

### Create Trainer and Start Training

In [47]:
from transformers import Seq2SeqTrainer

In [48]:
# Wire the model, hyper-parameters, tokenized splits and collator into a
# trainer. The tokenizer is passed as processing_class: the tokenizer= keyword
# is deprecated for Seq2SeqTrainer and emits a warning when used.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


In [49]:
trainer

<transformers.trainer_seq2seq.Seq2SeqTrainer at 0x704e972a6350>

In [None]:
# Run the fine-tuning loop configured in training_args.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Persist the model weights and config to the application's model directory.
# NOTE(review): the directory name says "flan-t5", but the checkpoint loaded in
# this notebook is google-t5/t5-base — confirm the intended name.
model.save_pretrained("../app/fine_tune_vault/flan-t5-query-refiner-model")

In [39]:
# Persist the tokenizer files to their own directory, separate from the model
# directory.
tokenizer.save_pretrained("../app/fine_tune_vault/flan-t5-query-refiner-token")

('../app/fine_tune_vault/flan-t5-query-refiner-token/tokenizer_config.json',
 '../app/fine_tune_vault/flan-t5-query-refiner-token/special_tokens_map.json',
 '../app/fine_tune_vault/flan-t5-query-refiner-token/tokenizer.json')