In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import prepare_model_for_kbit_training
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import transformers

In [2]:
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cpu"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [4]:
model.eval() # model in evaluation mode (dropout modules are deactivated)

# craft prompt
comment = "What is the pre-metastatic niche concept?"
prompt=f'''[INST] {comment} [/INST]'''

# tokenize input
inputs = tokenizer(prompt, return_tensors="pt")

# generate output
outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=140)

print(tokenizer.batch_decode(outputs)[0])

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[INST] What is the pre-metastatic niche concept? [/INST] [OUTPUT] The pre-metastatic niche (PMN) concept refers to a specific area within an organ or tissue where cancer cells can initially settle and establish themselves before spreading to other parts of the body. This concept was first proposed by researchers in 2015 as part of their efforts to better understand how metastasis occurs.

The PMN is characterized by its unique cellular composition, including higher levels of extracellular matrix proteins and altered gene expression patterns compared to normal surrounding tissues. These changes make it easier for cancer cells to survive and proliferate in this environment, increasing the likelihood that they will eventually spread to distant sites throughout the body.

Understanding the PMN has been crucial in developing


In [3]:
# Step 2: Prepare the model for QLoRA fine-tuning
model = prepare_model_for_kbit_training(model)  # Prepares the model for LoRA fine-tuning


In [8]:
#confirming potential task types
from peft.mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING
print(MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys())

dict_keys(['SEQ_CLS', 'SEQ_2_SEQ_LM', 'CAUSAL_LM', 'TOKEN_CLS', 'QUESTION_ANS', 'FEATURE_EXTRACTION'])


In [9]:
#identifying potential modules for use in Lora configuration
for name, module in model.named_modules():
    print(name)


model
model.embed_tokens
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.self_attn.rotary_emb
model.layers.0.mlp
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.0.mlp.act_fn
model.layers.0.input_layernorm
model.layers.0.post_attention_layernorm
model.layers.1
model.layers.1.self_attn
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.self_attn.rotary_emb
model.layers.1.mlp
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.1.mlp.act_fn
model.layers.1.input_layernorm
model.layers.1.post_attention_layernorm
model.layers.2
model.layers.2.self_attn
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.

In [4]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=32,  # Scaling factor
    target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],  # Attention layers
    lora_dropout=0.1,  # Dropout for LoRA layers
    bias="none",  # No bias adjustment
    task_type="CAUSAL_LM"  # Task type is causal language modeling
)

In [5]:
# Apply LoRA to the model
model = get_peft_model(model, lora_config)


The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [6]:
# Step 3: Prepare the datasets
# Load question-answer pairs
import json
with open("cancer_qa.txt", "r") as f:
    qa_data = json.load(f)

In [7]:
from datasets import Dataset  # Import Dataset from the datasets library

# Create a Dataset object from the question-answer pairs
qa_dataset = Dataset.from_dict({
    "prompt": [item["question"] for item in qa_data],
    "response": [item["answer"] for item in qa_data]
})


In [8]:
# Load unstructured course notes and create prompts
with open("cancer_data.txt", "r") as f:
    course_notes = f.readlines()

In [9]:
# Combine course notes into prompt-response pairs (synthetic data generation)
unstructured_data = [
    {"prompt": f"Explain: {note.strip()}", "response": note.strip()} for note in course_notes
]

In [10]:
#merge the datasets
all_data = Dataset.from_dict({
    "prompt": qa_dataset["prompt"] + [item["prompt"] for item in unstructured_data],
    "response": qa_dataset["response"] + [item["response"] for item in unstructured_data],
})

In [11]:
# Define the tokenize function
def tokenize_function(example):
    combined_texts = [p + " " + r for p, r in zip(example["prompt"], example["response"])]
    return tokenizer(
        combined_texts,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

# Tokenize the dataset
tokenized_dataset = all_data.map(tokenize_function, batched=True)


Map:   0%|          | 0/83 [00:00<?, ? examples/s]

In [14]:
# Step 4: Define training arguments
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./qwen2.5-c-tuned",  # Directory to save the fine-tuned model
    per_device_train_batch_size=1,  # Batch size for CPU
    num_train_epochs=3,  # Number of training epochs
    save_strategy="steps",  # Save checkpoint after a certain number of steps
    save_steps=500,  # Save the model every 500 steps
    logging_dir="./logs",  # Directory for logs
    logging_steps=10,  # Log every 10 steps
    eval_strategy="no",  # No evaluation during training
    fp16=False,  # Disable mixed-precision
    push_to_hub=False  # Do not push to Hugging Face Hub
)


In [16]:
# Step 5: Train the model
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

trainer.train()

  trainer = Trainer(


ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

In [20]:
# Tokenize and preprocess the dataset
def preprocess_function(examples):
    # Concatenate prompt and response element-wise
    combined_texts = [p + " " + r for p, r in zip(examples["prompt"], examples["response"])]
    # Tokenize the concatenated texts
    inputs = tokenizer(combined_texts, max_length=512, truncation=True, padding="max_length")
    inputs["labels"] = inputs["input_ids"].copy()  # Set input_ids as labels
    return inputs

# Apply the preprocessing to the dataset
tokenized_dataset = all_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/83 [00:00<?, ? examples/s]

In [21]:
trainer = Trainer(
    model=model,  # The model to train
    args=training_args,  # Training arguments
    train_dataset=tokenized_dataset,  # The tokenized dataset
    tokenizer=tokenizer  # Tokenizer for preprocessing
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
10,7.2888
20,6.5727
30,5.49
40,3.5453
50,2.0398
60,0.5353
70,0.5111
80,0.6078
90,0.3593
100,0.5545


TrainOutput(global_step=249, training_loss=1.2162665707998008, metrics={'train_runtime': 7465.7666, 'train_samples_per_second': 0.033, 'train_steps_per_second': 0.033, 'total_flos': 1004597150220288.0, 'train_loss': 1.2162665707998008, 'epoch': 3.0})

In [22]:
trainer.save_model("./qwen2.5-c-tuned")  # Save the model
tokenizer.save_pretrained("./qwen2.5-c-tuned")  # Save the tokenizer

('./qwen2.5-c-tuned\\tokenizer_config.json',
 './qwen2.5-c-tuned\\special_tokens_map.json',
 './qwen2.5-c-tuned\\vocab.json',
 './qwen2.5-c-tuned\\merges.txt',
 './qwen2.5-c-tuned\\added_tokens.json',
 './qwen2.5-c-tuned\\tokenizer.json')

In [23]:
# Load the fine-tuned model
fine_tuned_model_dir = "./qwen2.5-c-tuned"
model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_dir)
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_dir)

In [24]:
# Ensure model is in evaluation mode
model.eval()

# Define comment and create prompt
comment = "What is the pre-metastatic niche concept?"
prompt = f'''[INST] {comment} [/INST]'''

# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # Move inputs to the correct device

# Generate output
outputs = model.generate(
    input_ids=inputs["input_ids"],
    max_new_tokens=140,
    pad_token_id=tokenizer.pad_token_id  # Avoid warnings for missing pad token
)

# Decode and print the output
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(response)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[INST] What is the pre-metastatic niche concept? [/INST] [INST] Could you please explain further? [/INST]


In [25]:
model.eval()
comment = "What is the pre-metastatic niche concept?"
prompt = f"{comment}"  # Simplified prompt without [INST] tags

inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)

outputs = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # Explicit attention mask
    max_new_tokens=140,
    pad_token_id=tokenizer.pad_token_id
)

response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(response)

What is the pre-metastatic niche concept? The pre-metastatic niche concept refers to a specific region within an organ or tissue that is primed and ready for metastasis, which is the spread of cancer cells from their primary site to other parts of the body. This concept was first proposed by researchers in the 1980s and has since been widely accepted as a key factor in understanding how cancer spreads.

The pre-metastatic niche concept suggests that certain environmental cues, such as inflammation, immune cell infiltration, and changes in blood flow, can create a favorable microenvironment within the primary tumor that promotes the survival and proliferation of cancer cells. These cancer cells then secrete growth factors and cytokines that further promote their


In [26]:
print(unstructured_data)

[{'prompt': 'Explain: Section 1: Tumor Microenvironment (TME)', 'response': 'Section 1: Tumor Microenvironment (TME)'}, {'prompt': 'Explain: Definition and Importance', 'response': 'Definition and Importance'}, {'prompt': 'Explain: The tumor microenvironment (TME) refers to the ecosystem surrounding cancer cells, which includes non-cancerous cells, blood vessels, immune cells, fibroblasts, extracellular matrix (ECM), and signaling molecules. The TME plays a critical role in all stages of cancer development, from onset to metastasis.', 'response': 'The tumor microenvironment (TME) refers to the ecosystem surrounding cancer cells, which includes non-cancerous cells, blood vessels, immune cells, fibroblasts, extracellular matrix (ECM), and signaling molecules. The TME plays a critical role in all stages of cancer development, from onset to metastasis.'}, {'prompt': 'Explain: Components of the TME', 'response': 'Components of the TME'}, {'prompt': 'Explain: 1. Blood Vessels: Provide nutrie