In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset, DatasetDict
import numpy as np
import torch
from torch.profiler import profile, record_function, ProfilerActivity
import time
import re
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm
2024-10-29 11:19:54.203832: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-29 11:19:54.204913: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-29 11:19:54.225065: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Function to read JSONL files
def read_jsonl(file_path, nrows=None):
    return pd.read_json(file_path, lines=True, nrows=nrows)

# Read data
train_data = read_jsonl('/home/dysl-ai/Desktop/indoml_datathon/datathon_phase_2_data/training_data/train.features')
train_solution = read_jsonl('/home/dysl-ai/Desktop/indoml_datathon/datathon_phase_2_data/training_data/train.labels')
test_data = read_jsonl('/home/dysl-ai/Desktop/indoml_datathon/final_test_data/final_test_data.features')


In [3]:
def preprocess_data(data, solution):
    merged = pd.merge(data, solution, on='indoml_id')

    merged['input_text'] = merged.apply(lambda row: f"description: {row['description']} retailer: {row['retailer']} price: {row['price']}", axis=1)
    merged['target_text'] = merged.apply(lambda row: f"supergroup: {row['supergroup']} group: {row['group']} module: {row['module']} brand: {row['brand']}", axis=1)
    
    return merged[['input_text', 'target_text']]


train_processed = preprocess_data(train_data, train_solution)

In [4]:
train = Dataset.from_pandas(train_processed)

In [5]:
from datasets import load_dataset

# Load your dataset
dataset = train

# If your dataset is already split into train and test, you might need to select one
# For example, if you want to split the training set:
# dataset = dataset["train"]

# Split the dataset
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Now you have your splits
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']  # Note: This is actually our validation set

In [6]:
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

In [7]:
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('/home/dysl-ai/Desktop/indoml_datathon/final_final_results/checkpoint-252828')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
def preprocess_function(examples):
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)
    labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████| 449470/449470 [00:38<00:00, 11776.47 examples/s]
Map: 100%|██████████| 112368/112368 [00:09<00:00, 11859.22 examples/s]


In [9]:
training_args = TrainingArguments(
    output_dir='./final_final_results',
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.0001,
    save_total_limit=3,
    logging_dir='./logs',
    logging_steps=20,
    report_to='none'
)
# Custom optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.0001)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=1000)

# Custom callback
class CustomCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            print(f"Step: {state.global_step}")
            for key, value in logs.items():
                print(f"{key}: {value}")
            print("\n")

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    callbacks=[CustomCallback()]
)



In [10]:
training_args = TrainingArguments(
    output_dir='./final_final_results',
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.0001,
    save_total_limit=3,
    logging_dir='./logs',
    logging_steps=20,
    report_to='none'
)

# Custom optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.0001)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, total_iters=1000)

# Custom callback
class CustomCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            print(f"Step: {state.global_step}")
            for key, value in logs.items():
                print(f"{key}: {value}")
            print("\n")

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    callbacks=[CustomCallback()],
    optimizers=(optimizer, scheduler)
)

# Training with profiling for one sample and one epoch
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_flops=True) as prof:
    # Take the first batch (one sample)
    for batch in trainer.get_train_dataloader():
        inputs = batch
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        break  # Only one batch


STAGE:2024-10-29 11:20:50 350654:350654 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-10-29 11:20:51 350654:350654 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-10-29 11:20:51 350654:350654 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [11]:
# Print number of model parameters
print("Number of model parameters that are used for training")
print(sum(p.numel() for p in model.parameters()))

Number of model parameters that are used for training
737668096


In [12]:
# Print profile log for training
print(prof.key_averages().table(sort_by="flops", row_limit=10))
print("GFLOPs during training")  # GigaFLOPs
print(sum(k.flops for k in prof.key_averages()) / 1e9)


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               aten::mm         2.15%      11.548ms         7.13%      38.231ms      33.100us     198.654ms        40.05%     234.801ms     203.291us          1155   9062917.865  
                                              aten::bmm         0.60%       3.243ms         1.74%       9.327ms      21.590us      11.034ms         2.2

In [13]:
def preprocess_data1(data):
    # Create the input_text column
    data['input_text'] = data.apply(lambda row: f"description: {row['description']} retailer: {row['retailer']} price: {row['price']}", axis=1)

    # Return the dictionary format with only input_text
    return {
        'input_text': data['input_text'].tolist()
    }

# Process the test data
test_processed = preprocess_data1(test_data)

# Convert the processed dictionary to a Hugging Face Dataset
test_dataset = Dataset.from_dict(test_processed)

In [14]:
# Load fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('/home/dysl-ai/Desktop/indoml_datathon/fine_tuned_t5_large_4').to('cuda')
tokenizer = T5Tokenizer.from_pretrained('/home/dysl-ai/Desktop/indoml_datathon/fine_tuned_t5_large_4')

model.eval()

# Helper function to generate text
def generate_text(inputs):
    inputs = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True, truncation=True, max_length=352)
    inputs = {key: value.to('cuda') for key, value in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128)
    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return generated_texts

# Inference with profiling for one sample
start = time.time()
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], with_flops=True) as prof:
    test_data = test_dataset['input_text']
    # Take the first sample
    sample_input = [test_data[0]]
    generated_text = generate_text(sample_input)

# Inference time
inference_time = time.time() - start
print(f"Inference time: {inference_time} seconds")

# Print profile log for inference
print(prof.key_averages().table(sort_by="flops", row_limit=10))
print("GFLOPs during inference")  # GigaFLOPs
print(sum(k.flops for k in prof.key_averages()) / 1e9)


STAGE:2024-10-29 11:20:55 350654:350654 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-10-29 11:20:56 350654:350654 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-10-29 11:20:56 350654:350654 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


Inference time: 5.8283538818359375 seconds
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  Total MFLOPs  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               aten::mm        11.01%      29.830ms        14.32%      38.813ms       7.184us      53.949ms        79.26%      54.490ms      10.085us          5403     33485.488  
                                              aten::bmm         5.45%      14.773ms         7.30%      19.77