In [1]:
!pip install transformers==4.40.1 datasets peft accelerate bitsandbytes scipy pandas


Collecting transformers==4.40.1
  Using cached transformers-4.40.1-py3-none-any.whl.metadata (137 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.40.1)
  Using cached tokenizers-0.19.1-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting numpy>=1.17 (from transformers==4.40.1)
  Downloading numpy-2.2.6-cp312-cp312-win_amd64.whl.metadata (60 kB)
Using cached transformers-4.40.1-py3-none-any.whl (9.0 MB)
Using cached tokenizers-0.19.1-cp312-none-win_amd64.whl (2.2 MB)
Downloading numpy-2.2.6-cp312-cp312-win_amd64.whl (12.6 MB)
   ---------------------------------------- 0.0/12.6 MB ? eta -:--:--
   - -------------------------------------- 0.5/12.6 MB 2.8 MB/s eta 0:00:05
   ---- ----------------------------------- 1.3/12.6 MB 4.0 MB/s eta 0:00:03
   ---- ----------------------------------- 1.3/12.6 MB 4.0 MB/s eta 0:00:03
   ----- ---------------------------------- 1.8/12.6 MB 2.2 MB/s eta 0:00:05
   ----- ---------------------------------- 1.8/12.6 MB 2.2 MB/s eta 0:00:05
 

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlflow 2.15.1 requires numpy<2, but you have numpy 2.2.6 which is incompatible.
mlflow 2.15.1 requires pyarrow<16,>=4.0.0, but you have pyarrow 20.0.0 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.
scikit-learn 1.3.2 requires numpy<2.0,>=1.17.3, but you have numpy 2.2.6 which is incompatible.
sentence-transformers 3.4.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.40.1 which is incompatible.
streamlit 1.44.0 requires packaging<25,>=20, but you have packaging 25.0 which is incompatible.
tensorflow-intel 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.
torchvision 0.20.1 requires torch==

In [None]:
!pip install numpy==1.24.4 --force-reinstall


In [2]:
import pandas as pd

# Read the cleaned dataset and keep only rows where SMILES, target and pIC50
# are all present (minimal cleaning). Other columns are not checked here and
# may still contain NaN.
df = pd.read_csv("data/cleaned_clinical_drugs_dataset.csv").dropna(
    subset=["SMILES", "target", "pIC50"]
)

# Construct prompt–response pairs (can be tweaked)
def create_prompt(row):
    """Build the text prompt for one record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by the keys
            'SMILES', 'organism', 'target', 'logP', 'psa' and 'toxicity_alert'.

    Returns:
        A multi-line string: a task header, one "Label: value" line per
        field, and a closing instruction asking for the pIC50 value.
    """
    # (label shown in the prompt, key looked up in the row), in output order.
    labelled_fields = (
        ("SMILES", "SMILES"),
        ("Organism", "organism"),
        ("Target", "target"),
        ("LogP", "logP"),
        ("PSA", "psa"),
        ("Toxicity Alert", "toxicity_alert"),
    )
    field_lines = [f"{label}: {row[key]}" for label, key in labelled_fields]
    return "\n".join(
        ["Drug Discovery Task:", *field_lines, "Predict the pIC50 value:"]
    )

def create_completion(row):
    """Return the record's 'pIC50' value as text with four decimal places.

    Args:
        row: Mapping-like record indexable by the key 'pIC50'; the value must
            support the '.4f' format specification.
    """
    target_value = row["pIC50"]
    return format(target_value, ".4f")

# Derive the text pair for every row (row-wise apply of the two builders above).
df["prompt"] = df.apply(create_prompt, axis=1)
df["completion"] = df.apply(create_completion, axis=1)

# Optional: persist just the prompt/completion pair, then preview two rows.
prompt_pairs = df[["prompt", "completion"]]
prompt_pairs.to_csv("data/formatted_drug_prompts.csv", index=False)

print(prompt_pairs.head(2))


                                              prompt completion
0  Drug Discovery Task:\nSMILES: O=C1Nc2ccc(Cl)cc...     9.1249
1  Drug Discovery Task:\nSMILES: O=C1Nc2ccc(Cl)cc...     9.1249


In [4]:
from transformers import AutoTokenizer
from datasets import Dataset

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("saved_models/biogpt-large")

# Reload formatted prompts.
# Both columns are read as text on purpose: with default dtype inference pandas
# parses `completion` as float64, so a stored "9.1000" comes back as 9.1 and
# stringifies as "9.1", silently dropping the fixed four-decimal target format
# produced when the file was written.
formatted_df = pd.read_csv(
    "data/formatted_drug_prompts.csv",
    dtype={"prompt": str, "completion": str},
)

# Ensure prompt and completion are strings (critical fix).
# Empty cells are still read as NaN even with dtype=str, so blank completions
# are normalised to "" before the cast.
formatted_df["prompt"] = formatted_df["prompt"].astype(str)
formatted_df["completion"] = formatted_df["completion"].fillna("").astype(str)

# Convert to Hugging Face dataset
hf_dataset = Dataset.from_pandas(formatted_df)

# Tokenization function
def tokenize(example, max_prompt_length=256, max_completion_length=16):
    """Encode one prompt/completion pair as a single, label-aligned sequence.

    The prompt and completion are concatenated into one token sequence, and
    `labels` is built with the same length as `input_ids`. Prompt positions
    and padding positions are set to -100 (the index ignored by the
    Transformers cross-entropy loss), so only completion tokens contribute
    to the loss. Previously `labels` held a separately padded 16-token
    encoding of the completion, which does not line up with the 256-token
    `input_ids` and includes unmasked pad tokens.

    NOTE(review): this assumes the dataset is used to fine-tune a causal LM
    on the loaded BioGPT checkpoint — confirm against the training cell.

    Args:
        example: Mapping with string values under 'prompt' and 'completion'.
        max_prompt_length: Maximum number of prompt tokens, including any
            special tokens the tokenizer adds.
        max_completion_length: Maximum number of completion tokens, including
            the trailing EOS token when the tokenizer defines one.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels', each a list of
        length `max_prompt_length + max_completion_length`.
    """
    # Prompt: keep the tokenizer's own special tokens, cap the length.
    prompt_ids = tokenizer(
        example["prompt"],
        truncation=True,
        max_length=max_prompt_length,
    )["input_ids"]

    # Completion: no special tokens, so it directly continues the prompt.
    # One slot is reserved for EOS so the model can learn where to stop.
    eos_id = tokenizer.eos_token_id
    reserved = 1 if eos_id is not None else 0
    completion_ids = tokenizer(
        example["completion"],
        add_special_tokens=False,
    )["input_ids"][: max(max_completion_length - reserved, 0)]
    if eos_id is not None:
        completion_ids = completion_ids + [eos_id]

    # Fall back to EOS (then 0) if the tokenizer has no dedicated pad token;
    # padded positions are masked out in both attention_mask and labels anyway.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id if eos_id is not None else 0

    used = len(prompt_ids) + len(completion_ids)
    pad_count = max(max_prompt_length + max_completion_length - used, 0)

    return {
        "input_ids": prompt_ids + completion_ids + [pad_id] * pad_count,
        "attention_mask": [1] * used + [0] * pad_count,
        "labels": [-100] * len(prompt_ids) + completion_ids + [-100] * pad_count,
    }

# Run `tokenize` over every example; the raw text columns are dropped from
# the result so only the tokenizer outputs remain.
tokenized_dataset = hf_dataset.map(
    tokenize,
    remove_columns=["prompt", "completion"],
)

# Persist the tokenized dataset for later cells / sessions.
tokenized_dataset.save_to_disk("data/tokenized_dataset_biogpt")
print("✅ Tokenization complete and dataset saved to disk.")


Map:   0%|          | 0/553987 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/553987 [00:00<?, ? examples/s]

✅ Tokenization complete and dataset saved to disk.
