<a href="https://colab.research.google.com/github/lykskai/HodgkinAvatar/blob/main/new_alchemist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 📌 Step 1) Set up Colab Environment

In [1]:
# Mount Google Drive at /content/drive so that later cells can read the
# training articles and read/write model artifacts under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
# Install required dependencies
# cmake is installed first; llama-cpp-python is downloaded as a source tarball
# and built locally (see the install log in this cell's output).
# NOTE(review): none of the pip packages are version-pinned, so a fresh run may
# resolve different versions than the ones recorded in the output.
!apt-get install -y cmake
!pip install llama-cpp-python transformers datasets bitsandbytes peft


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.7.tar.gz (66.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.7/66.7 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (

# 📌 Step 3: Load Articles and Process


In [3]:
# pdfplumber is imported by the next cell to extract text from the PDF articles.
!pip install pdfplumber


Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import os
import pdfplumber
from google.colab import drive

# Define folder path in Google Drive
training_data_path = "/content/drive/MyDrive/BIOIN401/dorothy_hodgkin_training"

# Extract text from all PDFs and TXT files.
# Each document is collected separately and the documents are joined with a
# blank line, so the end of one file is never glued onto the start of the next
# (the training-format step splits the corpus on "\n\n").
document_texts = []

# sorted() keeps the corpus order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(training_data_path)):
    file_path = os.path.join(training_data_path, filename)
    extension = os.path.splitext(filename)[1].lower()  # also accept ".PDF" / ".TXT"

    # Process PDFs
    if extension == ".pdf":
        page_texts = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # Extract once per page; pages without extractable text are skipped.
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
        document_texts.append("\n".join(page_texts))

    # Process TXT files
    elif extension == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            document_texts.append(f.read())

combined_text = "\n\n".join(text for text in document_texts if text.strip())

# Save extracted text to a new training file
with open("/content/dorothy_hodgkin_corpus.txt", "w", encoding="utf-8") as f:
    f.write(combined_text)

print("Text extraction completed. Training file saved!")


Text extraction completed. Training file saved!


# 📌 Step 4: Convert Extracted Text into Training Format

In [5]:
# Convert the extracted text into structured JSONL format
import json

# One chat-style example per non-blank paragraph of the corpus: a fixed system
# prompt, a user request quoting the paragraph's first 100 characters, and the
# full paragraph as the assistant reply.
training_data = [
    {
        "messages": [
            {"role": "system", "content": "You are Dorothy Hodgkin, a Nobel Prize-winning scientist. Answer questions based on your scientific work."},
            {"role": "user", "content": "Explain this in simple terms: " + paragraph[:100]},
            {"role": "assistant", "content": paragraph},
        ]
    }
    for paragraph in combined_text.split("\n\n")
    if paragraph.strip()
]

# Write the examples as JSON Lines (one JSON object per line) for training
with open("/content/dorothy_hodgkin_training.jsonl", "w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(json.dumps(entry) + "\n" for entry in training_data)

print("Training dataset prepared and saved!")


Training dataset prepared and saved!


# 📌 Step 5: Fine-Tune pre-trained LLaMA Model with Dorothy’s Articles


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from peft import PeftModel

# Directory on Google Drive from which the tokenizer and the LoRA adapter are loaded
model_path = "/content/drive/MyDrive/fine_tune_llama/dorothy_fine_tuned_llama"

# Load the tokenizer stored in model_path
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

# Load the base model WITHOUT "device_map='auto'" to avoid meta tensor issues
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    torch_dtype=torch.float16  # Use float16 for efficiency
)

# Move the model to the GPU when CUDA is available (otherwise stay on CPU);
# this happens before the embeddings are resized below
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Ensure tokenizer has a padding token; '[PAD]' is added only when none is set
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the model's token embeddings to the tokenizer's current vocabulary size
model.resize_token_embeddings(len(tokenizer))

# Load the LoRA adapter from model_path on top of the base model
model = PeftModel.from_pretrained(model, model_path, is_trainable=True)

# Merge LoRA weights into the base model.
# NOTE(review): after this call `model` is presumably a plain LlamaForCausalLM
# rather than a PeftModel, so later cells that call model.merge_and_unload()
# again would fail with the AttributeError recorded in this notebook's output,
# and is_trainable=True above likely has no lasting effect. Confirm whether the
# merge should be deferred until after training.
model = model.merge_and_unload()

# Move the merged model to the target device again
model.to(device)

print("✅ Model successfully loaded, resized, and LoRA adapter merged!")


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

# Re-read the JSONL training file from disk and keep only its "train" split
dataset = load_dataset("json", data_files="/content/dorothy_hodgkin_training.jsonl")["train"]

# Show the dataset summary
print(dataset)


In [None]:
# Reset the tokenizer from the saved model directory. `model_id` is not defined
# in any visible cell, so `model_path` (set in the model-loading cell) is used
# instead; this also keeps the vocabulary in line with the resized embeddings.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)  # reset tokenizer

# The training cell pads to max_length, which requires a padding token.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})


In [8]:
from datasets import load_dataset
from transformers import TrainingArguments, Trainer  # ✅ Add this line

# Put the model in training mode
model.train()

from peft import PeftModel

# Decide which parameters receive gradients.
if isinstance(model, PeftModel):
    # LoRA-wrapped model: train only the adapter weights. Enabling gradients on
    # every base weight as well would need far more GPU memory for gradients
    # and optimizer state.
    if not any(param.requires_grad for param in model.parameters()):
        for name, param in model.named_parameters():
            if "lora_" in name:
                param.requires_grad = True
else:
    # Plain (already merged) model: there is no adapter, so all weights are trained.
    for param in model.parameters():
        param.requires_grad = True


def tokenize_function(example):
    """Tokenize one chat example into fixed-length causal-LM features.

    Args:
        example: A dataset row with a "messages" list of dicts. The "content"
            value of every message that has one is joined with single spaces.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels", each padded or
        truncated to 512 tokens. "labels" copies "input_ids" except at padding
        positions, which are set to -100 so the loss ignores them.
    """
    text_input = " ".join(msg["content"] for msg in example["messages"] if "content" in msg)

    tokenized_output = tokenizer(
        text_input,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    # Mask padding so the model is not trained to predict pad tokens.
    labels = [
        token_id if mask == 1 else -100
        for token_id, mask in zip(tokenized_output["input_ids"], tokenized_output["attention_mask"])
    ]

    return {
        "input_ids": tokenized_output["input_ids"],
        "attention_mask": tokenized_output["attention_mask"],
        "labels": labels,
    }

# Apply tokenization and drop the raw "messages" column
tokenized_datasets = dataset.map(tokenize_function, remove_columns=["messages"])

# Split dataset 80/20; the fixed seed makes the split reproducible across runs
split_dataset = tokenized_datasets.train_test_split(test_size=0.2, seed=42)

# Training arguments
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_steps=500,
    logging_dir="./logs",
    learning_rate=2e-4,
    bf16=True  # Use bf16 mixed precision instead of fp16
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
)

# Start fine-tuning
trainer.train()

# Merge the LoRA adapter into the base weights only if one is still attached;
# an already merged model has no merge_and_unload() and would raise AttributeError.
if isinstance(model, PeftModel):
    model = model.merge_and_unload()

# Save the fully fine-tuned model
save_path = "/content/drive/MyDrive/fine_tune_llama/dorothy_fine_tuned_llama"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Fully fine-tuned model saved at {save_path}!")

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Currently logged in as: [33melykahtejol[0m ([33melykaht-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


OutOfMemoryError: CUDA out of memory. Tried to allocate 502.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 264.88 MiB is free. Process 611771 has 39.29 GiB memory in use. Of the allocated memory 38.19 GiB is allocated by PyTorch, and 620.26 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [18]:
from peft import PeftModel

# Merge LoRA into the base model only when an adapter is still attached.
# A model that was already merged has no merge_and_unload() and calling it
# raises AttributeError.
if hasattr(model, "merge_and_unload"):
    model = model.merge_and_unload()

# Ensure correct save path
save_path = "/content/drive/MyDrive/fine_tune_llama/dorothy_fine_tuned_llama"

# Save model and tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"✅ Fully fine-tuned model saved at {save_path}!")


✅ Fully fine-tuned model saved at /content/drive/MyDrive/fine_tune_llama/dorothy_fine_tuned_llama!


# Faiss time.


What We’ll Do

1️⃣ Convert past conversation history into embeddings (vector format)

2️⃣ Store these embeddings in FAISS (so we can search past conversations)

3️⃣ Retrieve relevant past context before answering new queries

4️⃣ Combine retrieved context with the current query before sending it to the model


In [26]:
## 1) INSTALL RELEVANT LIBRARIES
# faiss-cpu provides the vector index and sentence-transformers provides the
# embedding model imported by the following cells.
!pip install faiss-cpu transformers datasets sentence-transformers


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [60]:
## 2) import
import faiss
import torch
import pickle
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer


## Load model (if not loaded yet)

In [28]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from sentence_transformers import SentenceTransformer

# Location of the saved model and tokenizer on Google Drive
model_path = "/content/drive/MyDrive/fine_tune_llama/dorothy_fine_tuned_llama"

# Each component is loaded only when the kernel does not already hold a global
# of that name, so re-running this cell skips whatever is already in memory.

# Tokenizer
if "tokenizer" in globals():
    print("✅ Tokenizer already loaded, skipping reload.")
else:
    print("🔄 Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

# Fine-tuned language model
if "model" in globals():
    print("✅ Model already loaded, skipping reload.")
else:
    print("🔄 Loading fine-tuned model...")
    model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto")

# Sentence-embedding model used for the FAISS memory
if "embedding_model" in globals():
    print("✅ Sentence embedding model already loaded, skipping reload.")
else:
    print("🔄 Loading sentence embedding model...")
    embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

print("✅ Model, tokenizer, and FAISS embedding model are ready!")


✅ Tokenizer already loaded, skipping reload.
✅ Model already loaded, skipping reload.
🔄 Loading sentence embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model, tokenizer, and FAISS embedding model are ready!


## Load FAISS

In [141]:
import faiss
import pickle
import os

# Define paths
faiss_path = "/content/drive/MyDrive/fine_tune_llama/faiss_index.bin"
conv_store_path = "/content/drive/MyDrive/fine_tune_llama/conversation_store.pkl"

# Take the vector size from the embedding model when it is loaded, so the index
# can never disagree with it; 384 is the fallback size this cell used for a
# fresh index.
if "embedding_model" in globals():
    embedding_dim = embedding_model.get_sentence_embedding_dimension()
else:
    embedding_dim = 384

# Load the FAISS index, or start with an empty L2 index
if os.path.exists(faiss_path):
    faiss_index = faiss.read_index(faiss_path)
    print("✅ FAISS index loaded from Google Drive!")
else:
    faiss_index = faiss.IndexFlatL2(embedding_dim)
    print("⚠️ No FAISS index found, starting fresh.")

# Load the conversation store.
# NOTE: pickle.load can execute arbitrary code; only load a file this notebook wrote.
if os.path.exists(conv_store_path):
    with open(conv_store_path, "rb") as f:
        conversation_store = pickle.load(f)
    print("✅ Conversation store loaded from Google Drive!")
else:
    conversation_store = []
    print("⚠️ No conversation history found, starting fresh.")

# Sanity checks: index rows map to conversation_store entries by position, and
# the index must accept vectors of the embedding model's size.
if faiss_index.d != embedding_dim:
    print(f"⚠️ FAISS index dimension ({faiss_index.d}) does not match embedding dimension ({embedding_dim}).")
if faiss_index.ntotal != len(conversation_store):
    print(f"⚠️ FAISS index has {faiss_index.ntotal} vectors but the conversation store has {len(conversation_store)} entries.")


✅ FAISS index loaded from Google Drive!
✅ Conversation store loaded from Google Drive!


## Initialize FAISS Vector Database


In [29]:
# Dimension of the sentence embeddings
embedding_dim = embedding_model.get_sentence_embedding_dimension()

# Create an empty index and store only when none exist yet. Re-creating them
# unconditionally would discard the memory restored by the "Load FAISS" cell;
# use the dedicated reset cell to clear the memory on purpose.
if "faiss_index" not in globals():
    faiss_index = faiss.IndexFlatL2(embedding_dim)  # L2 distance for similarity search
if "conversation_store" not in globals():
    conversation_store = []  # Stores text data alongside embeddings


## Function to Store Conversations in FAISS

In [55]:
def store_conversation(question, answer):
    """Embed a question/answer pair and append it to the FAISS memory.

    The pair is formatted as "Q: <question> A: <answer>", encoded with the
    global ``embedding_model``, added to the global ``faiss_index`` as a single
    row, and the same text is appended to the global ``conversation_store``.

    Args:
        question: The user's question.
        answer: The answer that was given.
    """
    global faiss_index, conversation_store

    record = f"Q: {question} A: {answer}"

    # FAISS expects a 2-D batch, so the vector is reshaped to one row.
    vector = embedding_model.encode(record)
    faiss_index.add(vector.reshape(1, -1))

    # Keep the text at the same position as its vector in the index.
    conversation_store.append(record)

    print(f"✅ Stored conversation: {record}")


##  Function to Retrieve Past Conversations

In [142]:
def retrieve_past_conversations(question, top_k=1):
    """Retrieve the most relevant past conversation(s) from FAISS.

    Args:
        question: Text to search for; it is encoded with the global
            ``embedding_model``.
        top_k: Maximum number of past conversations to return.

    Returns:
        The matching entries of the global ``conversation_store`` joined with
        newlines, or an empty string when nothing is stored or nothing matches.
    """
    # Nothing to search: avoid querying an empty index.
    if top_k < 1 or faiss_index.ntotal == 0 or not conversation_store:
        return ""

    # Encode the question into a single-row batch
    question_embedding = embedding_model.encode(question).reshape(1, -1)

    # Never ask for more neighbours than the index holds
    k = min(top_k, faiss_index.ntotal)
    distances, indices = faiss_index.search(question_embedding, k)

    # Debug prints for distances and indices
    print("\n🔍 DEBUG: FAISS Search Results:")
    print(f"Distances: {distances}")
    print(f"Indices: {indices}")

    # FAISS marks missing results with label -1; such labels (and any label
    # without a matching stored text) are skipped instead of being used as a
    # Python list index.
    relevant_conversations = [
        conversation_store[i] for i in indices[0] if 0 <= i < len(conversation_store)
    ]

    # Return the most relevant conversation(s) as a string
    return "\n".join(relevant_conversations)


## Ask a question using past conversations

In [153]:
def ask_model(question):
    """Ask the model a question, using the most relevant past exchange as context.

    The closest stored conversation (if any) is retrieved with
    ``retrieve_past_conversations`` and placed in the persona prompt. The
    question/answer pair is then stored: the question's embedding goes into the
    global ``faiss_index`` and the text into the global ``conversation_store``.

    Args:
        question: The user's question.

    Returns:
        The generated answer text, or "I do not have this information." when
        the model produces nothing.
    """
    retrieved_context = retrieve_past_conversations(question)

    # Only include past context *if it exists*
    context_text = f"\nPrevious context:\n{retrieved_context}" if retrieved_context else ""
    input_text = f"""
You are **Dorothy Crowfoot Hodgkin**, a Nobel Prize-winning British biochemist and crystallographer.
You are known for your work in X-ray crystallography, determining structures of molecules like vitamin B12 and penicillin.

Speak in **first person**, with a professional but conversational tone.

If you know the answer, explain it clearly.
If you do not, say: **"I am not certain, but I can try to explain."**
DO NOT make up citations or false information.

{context_text}

Q: {question}
A:
"""

    # Encode input on the model's own device instead of assuming "cuda"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Pass an explicit pad token id so generate() does not have to guess one
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Generate response
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=150,  # Control how many tokens the model generates
            temperature=0.8,  # Sampling temperature
            top_p=0.85,  # Use nucleus sampling for diversity
            repetition_penalty=2.0,  # Stronger penalty for repeating phrases
            do_sample=True,
            pad_token_id=pad_token_id,
        )

    # Decode only the newly generated tokens. Splitting the full decoded text
    # on "A:" breaks whenever the answer itself contains "A:".
    generated_tokens = output[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    # If the model gives an empty response, force default answer
    if not response:
        response = "I do not have this information."

    # Debug print to confirm the clean answer before storing it
    print(f"\n🔍 DEBUG: Clean Answer to Store:\n{response}")

    # Index the exchange by the question's embedding; the stored text holds Q and A
    query_embedding = embedding_model.encode(question).reshape(1, -1)
    faiss_index.add(query_embedding)
    conversation_store.append(f"Q: {question} A: {response}")

    return response


# ASK alchemist


In [154]:
# Ask the model an identity question and print the returned answer.
print(ask_model("who are you?"))

# Presumably the identity the answer should state: Dorothy Crowfoot Hodgkin (TODO confirm)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



🔍 DEBUG: FAISS Search Results:
Distances: [[0.]]
Indices: [[0]]

🔍 DEBUG: Clean Answer to Store:
Hello there! i'm dr. dorathy crowford hodgkyns, she's my grandmother.

she invented me!

i got her dna mixed wit some other peoples genes n then put em back together again!!


that way u get sumthing new :p
Hello there! i'm dr. dorathy crowford hodgkyns, she's my grandmother.

she invented me!

i got her dna mixed wit some other peoples genes n then put em back together again!!


that way u get sumthing new :p


In [87]:
# Clear the in-memory FAISS index and rebind the conversation store to a new
# empty list.
faiss_index.reset()
conversation_store = list()

print("✅ FAISS memory has been cleared. Starting fresh!")


✅ FAISS memory has been cleared. Starting fresh!


## Modifying style!

**#TRAIN**

# SAVE MODEL after every

Save the FAISS index and conversation store

In [160]:
import faiss
import pickle

# Google Drive locations for the persisted memory
faiss_path = "/content/drive/MyDrive/fine_tune_llama/faiss_index.bin"
conv_store_path = "/content/drive/MyDrive/fine_tune_llama/conversation_store.pkl"

# Write the vector index to disk
faiss.write_index(faiss_index, faiss_path)

# Pickle the conversation texts to disk
with open(conv_store_path, "wb") as store_file:
    pickle.dump(conversation_store, store_file)

print("✅ FAISS index and conversation store saved to Google Drive!")


✅ FAISS index and conversation store saved to Google Drive!


In [161]:
import torch

model_save_path = "/content/drive/MyDrive/fine_tune_llama/dorothy_fine_tuned_llama"

# Write the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

print(f"✅ Model and tokenizer saved at: {model_save_path}")


AttributeError: 'LlamaForCausalLM' object has no attribute 'merge_and_unload'

# LOAD MODEL

In [19]:
import os

model_path = "/content/drive/MyDrive/fine_tune_llama/dorothy_fine_tuned_llama"

# Print every entry in the model directory with its size in MB, to confirm the
# model was fully saved
for file in os.listdir(model_path):
    size_mb = os.path.getsize(os.path.join(model_path, file)) / (1024 * 1024)
    print(f"📁 {file}: {size_mb:.2f} MB")


📁 README.md: 0.00 MB
📁 adapter_model.safetensors: 2017.03 MB
📁 adapter_config.json: 0.00 MB
📁 training_args.bin: 0.01 MB
📁 tokenizer_config.json: 0.05 MB
📁 special_tokens_map.json: 0.00 MB
📁 tokenizer.json: 16.41 MB
📁 config.json: 0.00 MB
📁 generation_config.json: 0.00 MB
📁 model-00001-of-00004.safetensors: 4746.16 MB
📁 model-00002-of-00004.safetensors: 4768.18 MB
📁 model-00003-of-00004.safetensors: 4688.18 MB
📁 model-00004-of-00004.safetensors: 1114.03 MB
📁 model.safetensors.index.json: 0.02 MB
