## Carregando FlanT5

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, PreTrainedTokenizerFast

model_name = 'google/flan-t5-small'
model = T5ForConditionalGeneration.from_pretrained(model_name)

smiles_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./data_to_train/tokenizer-smiles-20000.json"
)
flan_t5_tokenizer = T5Tokenizer.from_pretrained(model_name)

flan_t5_vocab = flan_t5_tokenizer.get_vocab()
smiles_vocab = smiles_tokenizer.get_vocab()

flan_t5_tokens = set(flan_t5_vocab.keys())
smiles_tokens = set(smiles_vocab.keys())

new_tokens = list(smiles_tokens - flan_t5_tokens)
print(len(new_tokens))

num_added_tokens = flan_t5_tokenizer.add_tokens(new_tokens)

model.resize_token_embeddings(len(flan_t5_tokenizer))

tokenizer = flan_t5_tokenizer

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


1926


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


## Carregando Bart

Obs: escolher entre FlanT5 ou Bart, se carregar os dois um sobrescrevera a variavel "model" e "tokenizer" do outro



In [5]:
from transformers import BartForConditionalGeneration, PreTrainedTokenizerFast, BartTokenizer

model_name = 'facebook/bart-base'
model = BartForConditionalGeneration.from_pretrained(model_name)

smiles_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="./data_to_train/tokenizer-smiles-20000.json"
)

bart_tokenizer = BartTokenizer.from_pretrained(model_name)

bart_vocab = bart_tokenizer.get_vocab()
smiles_vocab = smiles_tokenizer.get_vocab()

bart_tokens = set(bart_vocab.keys())
smiles_tokens = set(smiles_vocab.keys())

new_tokens = list(smiles_tokens - bart_tokens)
print(len(new_tokens))

num_added_tokens = bart_tokenizer.add_tokens(new_tokens)


model.resize_token_embeddings(len(bart_tokenizer))

tokenizer = bart_tokenizer

1863


## Carregando e processando dados

In [2]:
import json
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

with open("./data/moleculas.json", "r", encoding='utf-8') as f:
    data = json.load(f)

def is_valid_record(entry):
    """Validate a record to ensure required fields are present and valid."""
    if not isinstance(entry.get('Name'), str) or not entry['Name'].strip():
        return False
    if not isinstance(entry.get('Canonical SMILES'), str) or not entry['Canonical SMILES'].strip():
        return False
    return True

processed_data = [
    {
        'name': entry['Name'].strip(),
        'smiles': entry['Canonical SMILES'].strip()
    }
    for entry in data if is_valid_record(entry)
]

if not processed_data:
    raise ValueError("No valid data found after cleaning!")

train_data, val_data = train_test_split(processed_data, test_size=0.1, random_state=42)

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['name', 'smiles'],
        num_rows: 250768
    })
    validation: Dataset({
        features: ['name', 'smiles'],
        num_rows: 27864
    })
})


In [3]:
def preprocess_function(examples):
    # inputs = examples['smiles'] # remover comentario se esta usando o BART
    inputs = [f"Translate SMILES into molecule name: {smile}" for smile in examples['smiles']] # remover comentario se esta usando o FlanT5
    targets = examples['name']
    
    model_inputs = tokenizer(
        inputs, max_length=256, truncation=True, padding='max_length'
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=256, truncation=True, padding='max_length'
        )'
    
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset['train'].column_names
)

Map:   0%|          | 0/250768 [00:00<?, ? examples/s]



Map:   0%|          | 0/27864 [00:00<?, ? examples/s]

## Treinamento

In [4]:
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import torch


lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir='./results_lora',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_steps=100,
    load_best_model_at_end=True,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
)

True


  trainer = Trainer(


In [5]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.5953,1.483167
2,1.5743,1.469656
3,1.5732,1.462873




TrainOutput(global_step=94038, training_loss=1.6153049812583815, metrics={'train_runtime': 20199.1644, 'train_samples_per_second': 37.244, 'train_steps_per_second': 4.656, 'total_flos': 7.144358948949197e+16, 'train_loss': 1.6153049812583815, 'epoch': 3.0})

In [6]:
model.save_pretrained("./models/flant5-mini-smiles-v1")



In [6]:
tokenizer.save_pretrained("./models/flant5-mini-smiles-v1")

('./models/bart-smiles-v1/tokenizer_config.json',
 './models/bart-smiles-v1/special_tokens_map.json',
 './models/bart-smiles-v1/vocab.json',
 './models/bart-smiles-v1/merges.txt',
 './models/bart-smiles-v1/added_tokens.json')

## Testando modelo nos respectivos dados de teste

In [None]:
from transformers import AutoTokenizer, BartTokenizer, AutoModelForSeq2SeqLM

# bart
bart_tunning_tokenizer = AutoTokenizer.from_pretrained('./models/bart-smiles-v1')
bart_tunning_model =  AutoModelForSeq2SeqLM.from_pretrained('./models/bart-smiles-v1')

# flanT5
flant5_tunning_tokenizer = AutoTokenizer.from_pretrained('./models/flant5-mini-smiles-v1')
flant5_tunning_model = AutoModelForSeq2SeqLM.from_pretrained('./models/flant5-mini-smiles-v1')

In [None]:
import json
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

with open("./moleculas.json", "r", encoding='utf-8') as f:
    data = json.load(f)

def is_valid_record(entry):
    """Validate a record to ensure required fields are present and valid."""
    if not isinstance(entry.get('Name'), str) or not entry['Name'].strip():
        return False
    if not isinstance(entry.get('Canonical SMILES'), str) or not entry['Canonical SMILES'].strip():
        return False
    return True

processed_data = [
    {
        'name': entry['Name'].strip(),
        'smiles': entry['Canonical SMILES'].strip()
    }
    for entry in data if is_valid_record(entry)
]

if not processed_data:
    raise ValueError("No valid data found after cleaning!")

_, val_data = train_test_split(processed_data, test_size=0.1, random_state=42)

def clean_val_data(data):
    return [
        item for item in data
        if item.get('name') and item.get('smiles')
        and len(item['name']) <= 250
        and len(item['smiles']) <= 250
    ]

val_data = clean_val_data(val_data)

print(f"Size of validation database: {len(val_data)}")
print(val_data[0])

In [13]:
import csv
from Levenshtein import distance as levenshtein_distance
from tqdm import tqdm
import torch

def generate_inferences(model, tokenizer, val_data, output_csv, device=None, batch_size=16, max_length=250, num_beams=3):
    device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    with open(output_csv, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["input_text", "expected_output", "generated_output", "model_name"])

        for i in tqdm(range(0, len(val_data), batch_size), desc="Processing Batches", unit="batch"):
            try:
                batch = val_data[i:i + batch_size]

                input_texts = [record['smiles'] for record in batch]
                expected_outputs = [record['name'] for record in batch]
                print(batch)
                print("####")

                inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True).to(device)

                outputs = model.generate(
                    **inputs,
                    max_length=max_length,
                    num_beams=num_beams,
                    early_stopping=True
                )

                generated_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

                for input_text, expected_output, generated_output in zip(input_texts, expected_outputs, generated_outputs):
                    writer.writerow([input_text, expected_output, generated_output, model.config.name_or_path])
            except Exception as e:
                print(f"Error generating output for batch {i}: {e}")
                continue

generate_inferences(
    model=bart_tunning_model,
    tokenizer=bart_tunning_tokenizer,
    val_data=val_data,
    output_csv="bart_results.csv"
)

generate_inferences(
    model=flant5_tunning_model,
    tokenizer=flant5_tunning_tokenizer,
    val_data=[{'smiles': f"Translatse SMILES into molecule name: {entry['smiles']}", 'name': entry['name']} for entry in val_data],
    output_csv="flant5_results.csv"
)

Generated Name: 1-Fluoro-4-[4-[4-(4-propylcyclohexyl)cyclohexyl]cyclohexyl]benzene
