In [1]:
%%capture
! pip install datasets
! pip install imblearn
! pip install rdkit-pypi
! git clone https://github.com/ChangyunCho/ChemAP
! pip install bitsandbytes
! pip install trl

In [2]:
# Imports
import pandas as pd
from datasets import load_dataset , Dataset
#from transformers import  TrainingArguments , Trainer,TrainerCallback,AutoModelForSequenceClassification,AutoTokenizer
from huggingface_hub import HfApi, create_repo
from sklearn.metrics import precision_recall_fscore_support , accuracy_score
import torch
import numpy as np
import torch.nn as nn
from imblearn.over_sampling import SMOTE
from datasets import Dataset, Features, Value,Sequence
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import RDKFingerprint
from rdkit.Avalon.pyAvalonTools import GetAvalonFP

In [3]:
from transformers import BitsAndBytesConfig


nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct", quantization_config=nf4_config, trust_remote_code=True)#, quantization_config=nf4_config

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

In [4]:
num_added_toks = tokenizer.add_tokens(["<Approved>", "<NotApproved>"])
num_added_toks = tokenizer.add_special_tokens({"additional_special_tokens": ["<[lbl]>", "<[mr]>"]})
print("We have added", num_added_toks, "tokens")
model.resize_token_embeddings(len(tokenizer))

We have added 2 tokens


Embedding(32015, 3072, padding_idx=32000)

In [21]:
%%capture
df_git = pd.read_csv("/content/ChemAP/dataset/DrugApp/All_training_feature_vectors.csv")
df = df_git[["SMILES","Label"]]
#df.rename(columns={"Label":"labels"}, inplace=True)


#-----------------------------------
#split dataset into train , test and val(just useing in the training process)
from sklearn.model_selection import train_test_split
test_size = 0.2
val_size = 0.5
train_df , temp = train_test_split(df , stratify = df.Label , test_size = test_size , random_state=1234)


test_df , val_df = train_test_split(temp , stratify = temp.Label , test_size = val_size , random_state=1234)
#-----------------------------------
# reset index
train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [29]:

def convert_to_fingerprint(smiles, target):
    mol = Chem.MolFromSmiles(smiles)

    # Generate the Morgan fingerprint
    morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, useChirality=True, radius=2, nBits=2048)
    morgan = ''.join(str(int(bit)) for bit in morgan_fp)
    if int(target) == 1:
        target = "<Approved>"
    else:
        target = "<NotApproved>"

    return smiles+ "<[mr]>" + morgan + "<[lbl]>" + str(target)+ tokenizer.eos_token


def convert_to_fingerprint_for_eval(smiles):
    mol = Chem.MolFromSmiles(smiles)

    # Generate the Morgan fingerprint
    morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, useChirality=True, radius=2, nBits=2048)
    morgan = ''.join(str(int(bit)) for bit in morgan_fp)

    return smiles+ "<[mr]>" + morgan + "<[lbl]>"

# building the final dataset
train_df["text"] = train_df.apply(lambda row: convert_to_fingerprint(row["SMILES"], row["Label"]), axis=1)
val_df["text"] = val_df.apply(lambda row: convert_to_fingerprint_for_eval(row["SMILES"]), axis=1)
test_df["text"] = test_df.apply(lambda row: convert_to_fingerprint_for_eval(row["SMILES"]), axis=1)


In [64]:
dataset_train = Dataset.from_pandas(train_df[["text"]])
dataset_val = Dataset.from_pandas(val_df[["text"]])
dataset_test = Dataset.from_pandas(test_df[["text"]])


def tokenize_function(examples):
    return tokenizer(examples['text'], padding=True, truncation=True )#,max_length=166


tokenized_train = dataset_train.map(tokenize_function)
tokenized_val = dataset_val.map(tokenize_function)
tokenized_test = dataset_test.map(tokenize_function)

Map:   0%|          | 0/2497 [00:00<?, ? examples/s]

Map:   0%|          | 0/313 [00:00<?, ? examples/s]

Map:   0%|          | 0/312 [00:00<?, ? examples/s]

In [65]:
from trl import SFTConfig , SFTTrainer

In [72]:
args = SFTConfig(
    output_dir='new',
    seed = 42 ,
    data_seed = 42,
    do_eval=True,
    do_train=True,
    #Sizes
    num_train_epochs= 5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # evaluations  :
    eval_strategy = "steps",
    eval_steps=20,
    eval_on_start= False,
    #Gradient
    prediction_loss_only = False,
    gradient_accumulation_steps=4,
    # eval_accumulation_steps= ,
    # gradient_checkpointing= ,
    # gradient_checkpointing_kwargs= ,
    weight_decay=0.1,
    # Learning rare
    learning_rate=2e-3,
    lr_scheduler_type='cosine',
    warmup_steps=50,
    # Logging and Saving:
    logging_dir = "new",
    logging_strategy = 'steps',
    # logging_first_step = ,
    logging_steps= 5 ,
    save_strategy='steps',
    save_steps = 20,
    save_total_limit = 5,
    save_safetensors = False ,
    # device :
    torch_empty_cache_steps= 4 ,
    remove_unused_columns=False ,
    load_best_model_at_end= True,
    # split_batches= ,
    include_tokens_per_second= True,
    include_num_input_tokens_seen= True,
    auto_find_batch_size= False,
    # Metrics
    # Torch compile
    #torch_compile= True,
    max_seq_length=1,
)


In [73]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["o_proj", "qkv_proj"],
    lora_dropout=0.1,
)

# Apply LoRA to the model
model = get_peft_model(model, config)

In [74]:
trainer = SFTTrainer(
    model = model,
    train_dataset=dataset_train,
    eval_dataset = dataset_val ,
    args=args,
)


Converting train dataset to ChatML:   0%|          | 0/2497 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2497 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2497 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2497 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/313 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/313 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/313 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/313 [00:00<?, ? examples/s]

In [75]:

trainer.train()

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).