In [1]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from datasets import Dataset, DatasetDict
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
2024-10-28 14:32:09.841714: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-28 14:32:09.842855: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-28 14:32:09.863081: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import torch
torch.cuda.empty_cache()
import gc

gc.collect()
torch.cuda.empty_cache()

In [3]:
def read_jsonl(file_path, nrows=None):
    return pd.read_json(file_path, lines=True, nrows=nrows)


train_data = read_jsonl('/home/dysl-ai/Desktop/indoml_datathon/datathon_phase_2_data/training_data/train.features')
train_solution = read_jsonl('/home/dysl-ai/Desktop/indoml_datathon/datathon_phase_2_data/training_data/train.labels')
test_data=read_jsonl('/home/dysl-ai/Desktop/indoml_datathon/final_test_data/final_test_data.features')

In [4]:
def preprocess_data(data, solution):
    merged = pd.merge(data, solution, on='indoml_id')

    merged['input_text'] = merged.apply(lambda row: f"description: {row['description']} retailer: {row['retailer']} price: {row['price']}", axis=1)
    merged['target_text'] = merged.apply(lambda row: f"supergroup: {row['supergroup']} group: {row['group']} module: {row['module']} brand: {row['brand']}", axis=1)
    
    return merged[['input_text', 'target_text']]


train_processed = preprocess_data(train_data, train_solution)

In [5]:
train = Dataset.from_pandas(train_processed)

In [6]:
from datasets import load_dataset

# Load your dataset
dataset = train

# If your dataset is already split into train and test, you might need to select one
# For example, if you want to split the training set:
# dataset = dataset["train"]

# Split the dataset
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Now you have your splits
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']  # Note: This is actually our validation set

In [7]:
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

In [8]:
tokenizer = T5Tokenizer.from_pretrained('t5-large')
model = T5ForConditionalGeneration.from_pretrained('/home/dysl-ai/Desktop/indoml_datathon/final_final_results/checkpoint-224736')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
def preprocess_function(examples):
    inputs = examples['input_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)
    labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████| 449470/449470 [00:38<00:00, 11621.70 examples/s]
Map: 100%|██████████| 112368/112368 [00:09<00:00, 11821.50 examples/s]


In [10]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 449470
    })
    validation: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 112368
    })
})

In [11]:
training_args = TrainingArguments(
    output_dir='./final_final_results',
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=9,
    weight_decay=0.0001,
    save_total_limit=3,
    logging_dir='./logs',
    logging_steps=20,
    report_to='none'
)



In [12]:
class CustomCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            print(f"Step: {state.global_step}")
            for key, value in logs.items():
                print(f"{key}: {value}")
            print("\n")

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    callbacks=[CustomCallback()]
)

trainer.train(resume_from_checkpoint="/home/dysl-ai/Desktop/indoml_datathon/final_final_results/checkpoint-224736")

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss
9,0.001,0.010128


Step: 224740
loss: 0.0009
grad_norm: 0.012383080087602139
learning_rate: 3.332858702358916e-05
epoch: 8.000142389292325


Step: 224760
loss: 0.0008
grad_norm: 0.02467884123325348
learning_rate: 3.330485547486829e-05
epoch: 8.000854335753951


Step: 224780
loss: 0.001
grad_norm: 0.016249921172857285
learning_rate: 3.3281123926147415e-05
epoch: 8.001566282215578


Step: 224800
loss: 0.0012
grad_norm: 0.01644115336239338
learning_rate: 3.325739237742655e-05
epoch: 8.002278228677204


Step: 224820
loss: 0.0007
grad_norm: 0.00980929285287857
learning_rate: 3.323366082870568e-05
epoch: 8.00299017513883


Step: 224840
loss: 0.0013
grad_norm: 0.01944376528263092
learning_rate: 3.320992927998481e-05
epoch: 8.003702121600456


Step: 224860
loss: 0.0009
grad_norm: 0.016858242452144623
learning_rate: 3.3186197731263936e-05
epoch: 8.004414068062081


Step: 224880
loss: 0.0007
grad_norm: 0.010275577194988728
learning_rate: 3.316246618254307e-05
epoch: 8.005126014523707


Step: 224900
loss: 0.0007
gr

TrainOutput(global_step=252828, training_loss=0.0001263577391465667, metrics={'train_runtime': 11075.9946, 'train_samples_per_second': 365.225, 'train_steps_per_second': 22.827, 'total_flos': 2.18952856829952e+18, 'train_loss': 0.0001263577391465667, 'epoch': 9.0})

In [14]:
val_results = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])
print(f"Validation Loss: {val_results['eval_loss']}")

Step: 196644
eval_loss: 0.009085921570658684
eval_runtime: 703.1566
eval_samples_per_second: 159.805
eval_steps_per_second: 9.988
epoch: 7.0


Validation Loss: 0.009085921570658684


In [14]:
model.save_pretrained('./fine_tuned_t5_large_4')
tokenizer.save_pretrained('./fine_tuned_t5_large_4')

('./fine_tuned_t5_large_4/tokenizer_config.json',
 './fine_tuned_t5_large_4/special_tokens_map.json',
 './fine_tuned_t5_large_4/spiece.model',
 './fine_tuned_t5_large_4/added_tokens.json')

In [15]:
def preprocess_data1(data):
    # Create the input_text column
    data['input_text'] = data.apply(lambda row: f"description: {row['description']} retailer: {row['retailer']} price: {row['price']}", axis=1)

    # Return the dictionary format with only input_text
    return {
        'input_text': data['input_text'].tolist()
    }

# Process the test data
test_processed = preprocess_data1(test_data)

# Convert the processed dictionary to a Hugging Face Dataset
test_dataset = Dataset.from_dict(test_processed)

In [17]:
import re
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = T5ForConditionalGeneration.from_pretrained('/home/dysl-ai/Desktop/indoml_datathon/fine_tuned_t5_large_4').to(device)
tokenizer = T5Tokenizer.from_pretrained('/home/dysl-ai/Desktop/indoml_datathon/fine_tuned_t5_large_4')

model.eval()

test_data = test_dataset['input_text']
#test_label = test_dataset['target_text']

def generate_text(inputs):
    inputs = tokenizer.batch_encode_plus(inputs, return_tensors="pt", padding=True, truncation=True, max_length=352)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=128)

    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return generated_texts

def extract_details(text):
    pattern = r'supergroup: (.*?) group: (.*?) module: (.*?) brand: (.*)'
    match = re.match(pattern, text)
    if match:
        return tuple(item if item is not None else 'na' for item in match.groups())
    return 'na', 'na', 'na', 'na'

def clean_repeated_patterns(text):
    cleaned_data = text.split(' brand')[0]
    return cleaned_data

In [18]:
batch_size = 128
generated_details = []

for i in tqdm(range(0, len(test_data), batch_size), desc="Processing test data"):
    batch_inputs = test_data[i:i+batch_size]  # Get a batch of inputs

    # Generate texts based on the batch inputs
    generated_texts = generate_text(batch_inputs)

    # Extract details from the generated texts and store them
    for generated_text in generated_texts:
        generated_details.append(extract_details(generated_text))

print('Generated info extracted.............')

Processing test data: 100%|██████████| 1443/1443 [14:43<00:00,  1.63it/s]

Generated info extracted.............





In [19]:
import json
categories = ['supergroup', 'group', 'module', 'brand']

with open('attrebute_test_baseline_200dp.predict', 'w') as file:

    for indoml_id, details in enumerate(generated_details):
        result = {"indoml_id": indoml_id}
        for category, value in zip(categories, details):
            result[category] = value

        file.write(json.dumps(result) + '\n')

In [20]:
import zipfile

file_to_zip = 'attrebute_test_baseline_200dp.predict'
zip_file_name = 'codalab_new_final_3.zip'

with zipfile.ZipFile(zip_file_name, 'w') as zipf:
     zipf.write(file_to_zip, arcname=file_to_zip)