### Read CSV Dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../learning_dataset/query_fine_tune/dataset.csv")

In [3]:
df.head()

Unnamed: 0,query,refined_query
0,blue t-shirt men large,large blue t-shirt for men
1,running shoes women size 8,"women's running shoes, size 8"
2,black leather jacket,black jacket made of leather
3,iphone 13 pro max 128gb,iPhone 13 Pro Max 128GB
4,red dress summer,summer red dress


In [4]:
len(df)

2000

In [5]:
formatted_data = [
    {
        "raw_query": str(row["query"]) if row["query"] is not None else "",
        "refined_query": (
            str(row["refined_query"]) if row["refined_query"] is not None else ""
        ),
    }
    for index, row in df.iterrows()
]

In [6]:
formatted_data

[{'raw_query': 'blue t-shirt men large',
  'refined_query': 'large blue t-shirt for men'},
 {'raw_query': 'running shoes women size 8',
  'refined_query': "women's running shoes, size 8"},
 {'raw_query': 'black leather jacket',
  'refined_query': 'black jacket made of leather'},
 {'raw_query': 'iphone 13 pro max 128gb',
  'refined_query': 'iPhone 13 Pro Max 128GB'},
 {'raw_query': 'red dress summer', 'refined_query': 'summer red dress'},
 {'raw_query': 'gaming laptop 16gb ram',
  'refined_query': 'gaming laptop with 16GB RAM'},
 {'raw_query': 'nike air max shoes size 10',
  'refined_query': 'Nike Air Max shoes, size 10'},
 {'raw_query': 'leather wallet brown',
  'refined_query': 'brown leather wallet'},
 {'raw_query': 'laptop charger dell', 'refined_query': 'Dell laptop charger'},
 {'raw_query': 'watch for men stainless steel',
  'refined_query': 'stainless steel watch for men'},
 {'raw_query': 'headphones wireless over ear',
  'refined_query': 'wireless over-ear headphones'},
 {'raw_q

### Prepare Dataset

In [7]:
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
split_index = int(0.8 * len(formatted_data))

In [9]:
split_index

1600

In [10]:
train_data = formatted_data[:split_index]

In [11]:
train_data[:5]

[{'raw_query': 'blue t-shirt men large',
  'refined_query': 'large blue t-shirt for men'},
 {'raw_query': 'running shoes women size 8',
  'refined_query': "women's running shoes, size 8"},
 {'raw_query': 'black leather jacket',
  'refined_query': 'black jacket made of leather'},
 {'raw_query': 'iphone 13 pro max 128gb',
  'refined_query': 'iPhone 13 Pro Max 128GB'},
 {'raw_query': 'red dress summer', 'refined_query': 'summer red dress'}]

In [12]:
eval_data = formatted_data[split_index:]

In [13]:
eval_data[:5]

[{'raw_query': 'folding camping chair with cup holder',
  'refined_query': 'camping chair folding with cup holder'},
 {'raw_query': 'puzzle mat for exercise and playroom',
  'refined_query': 'exercise playroom puzzle mat'},
 {'raw_query': 'compact blender for smoothies and shakes',
  'refined_query': 'smoothies shakes compact blender'},
 {'raw_query': 'wooden jewelry box with mirror',
  'refined_query': 'jewelry box wooden with mirror'},
 {'raw_query': 'rechargeable headlamp for outdoor activities',
  'refined_query': 'outdoor activities rechargeable headlamp'}]

### Create Hugging Face Dataset

In [14]:
train_data_dict = {
    "raw_query": [item["raw_query"] for item in train_data],
    "refined_query": [item["refined_query"] for item in train_data],
}

train_dataset = Dataset.from_dict(train_data_dict)

In [15]:
train_dataset

Dataset({
    features: ['raw_query', 'refined_query'],
    num_rows: 1600
})

In [16]:
eval_data_dict = {
    "raw_query": [item["raw_query"] for item in eval_data],
    "refined_query": [item["refined_query"] for item in eval_data],
}

eval_dataset = Dataset.from_dict(eval_data_dict)

In [17]:
eval_dataset

Dataset({
    features: ['raw_query', 'refined_query'],
    num_rows: 400
})

### Preprocess The Data

In [18]:
from transformers import AutoTokenizer

In [19]:
max_input_length = 128
max_target_length = 128

In [22]:
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

In [23]:
def preprocess(examples):
    inputs = ["refine e-commerce query: " + query for query in examples["raw_query"]]
    targets = examples["refined_query"]

    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )

    labels = tokenizer(
        targets, max_length=max_target_length, truncation=True, padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [24]:
tokenized_train = train_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map: 100%|██████████| 1600/1600 [00:00<00:00, 3540.24 examples/s]


In [25]:
tokenized_train

Dataset({
    features: ['raw_query', 'refined_query', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1600
})

In [26]:
tokenized_eval = eval_dataset.map(preprocess, batched=True)

Map: 100%|██████████| 400/400 [00:00<00:00, 4057.13 examples/s]


In [27]:
tokenized_eval

Dataset({
    features: ['raw_query', 'refined_query', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 400
})

### Set Up Training Arguments

In [28]:
from transformers import Seq2SeqTrainingArguments

In [29]:
training_args = Seq2SeqTrainingArguments(
    output_dir="../app/fine_tune_vault/flan-t5-query-refiner-args",
    eval_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=True,
    report_to="none",
)

### Initialize Model

In [30]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

In [31]:
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

In [32]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [33]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [34]:
data_collator

DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='google-t5/t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', 

### Create Trainer and Start Training

In [35]:
from transformers import Seq2SeqTrainer

In [36]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Seq2SeqTrainer(


In [37]:
trainer

<transformers.trainer_seq2seq.Seq2SeqTrainer at 0x782ad038fe00>

In [38]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,0.201883


TrainOutput(global_step=100, training_loss=1.2525546264648437, metrics={'train_runtime': 973.6158, 'train_samples_per_second': 1.643, 'train_steps_per_second': 0.103, 'total_flos': 54136720588800.0, 'train_loss': 1.2525546264648437, 'epoch': 1.0})

In [39]:
model.save_pretrained("../app/fine_tune_vault/flan-t5-query-refiner-model")

In [40]:
tokenizer.save_pretrained("../app/fine_tune_vault/flan-t5-query-refiner-token")

('../app/fine_tune_vault/flan-t5-query-refiner-token/tokenizer_config.json',
 '../app/fine_tune_vault/flan-t5-query-refiner-token/special_tokens_map.json',
 '../app/fine_tune_vault/flan-t5-query-refiner-token/tokenizer.json')