<a href="https://colab.research.google.com/github/lykskai/HodgkinAvatar/blob/main/llama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Prerequisite: Set Up Your Google Colab Environment
1. Open https://colab.research.google.com/
2. Create a new notebook.
3. Go to `Runtime > Change runtime type` and set the hardware accelerator to `GPU`.

# LOAD PRE-TRAINED

1) Install Required Libraries

In [None]:
!pip install transformers datasets torch pdfplumber
!pip install faiss-cpu langchain langchain_community langchain_cohere



2) Importing the necessary libraries for our code

In [None]:
import os
from typing import List

import torch
from datasets import Dataset
from dotenv import load_dotenv
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_cohere import CohereEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq
from transformers import pipeline, Trainer, TrainingArguments
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration, AutoTokenizer, AutoModelForCausalLM

**FILE:** Upload PDF files and extract their text.

In [None]:
# Upload one or more PDF files manually and write each one's text to
# "<original name without extension>_extracted.txt" for manual editing.
import os

from google.colab import files
import pdfplumber  # Ensure pdfplumber is imported for PDF processing

uploaded = files.upload()

# Extract full text from each uploaded file
for filename in uploaded.keys():
    print(f"Processing file: {filename}")
    with pdfplumber.open(filename) as pdf:
        # A page without extractable text may yield None; treat it as empty
        # so the join cannot fail on a single page.
        full_text = "\n".join(page.extract_text() or "" for page in pdf.pages)

    # Build the output name from the stem instead of str.replace(".pdf", ...):
    # with a name such as "paper.PDF" the replace is a no-op and the text
    # would be written over the uploaded PDF itself.
    stem, _extension = os.path.splitext(filename)
    output_file = f"{stem}_extracted.txt"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(full_text)
    print(f"Text extracted and saved to {output_file}")

### b. Manually Edit Extracted Text

**FILE:** Download the extracted text file so it can be edited manually.

In [None]:
from google.colab import files

# Hard-coded name of the extracted text file to send to the browser.
# NOTE(review): presumably produced by the PDF extraction cell ("<name>_extracted.txt") - adjust for other uploads.
extracted_txt_name = "tf9332901032_extracted.txt"
files.download(extracted_txt_name)


**FILE:** Upload the edited text file(s).

In [None]:
# Upload the text file(s) and load their contents into `combined_text`.
from google.colab import files

# Upload and save the file(s)
uploaded = files.upload()

# Read every uploaded .txt file. The pieces are accumulated and joined, so
# uploading several files combines them instead of keeping only the last one.
text_parts = []
for filename in uploaded.keys():
    if filename.lower().endswith(".txt"):  # Ensure it's a .txt file
        with open(filename, "r", encoding="utf-8") as f:
            text_parts.append(f.read())

combined_text = "\n".join(text_parts)

# Verify if combined_text was loaded correctly
if not combined_text:
    raise ValueError("No text content was loaded. Please check your uploaded file.")

print("Combined text loaded successfully!")
print(combined_text[:500])  # Print the first 500 characters to verify


Saving 4articles-dch.txt to 4articles-dch (1).txt
Combined text loaded successfully!
Layer-chain Structures of Thallium Di-Alkyl
Halides
DURING the last year we have been studying the
crystal structures of a series of compounds RaTlX,
which prove to be of some interest in view of the
recent work on the rotation of molecules in crystals.
The dimethyl thallium halides are tetragonal and the
others orthorhombic, pseudo-tetragonal, and aU show a sodium chloride-like structure. Layers of TIX
parallel to the ab plane are spaced out at distances
depending on the length of the alkyl cha


### FINE TUNE MODEL PIPELINE
## Step 1: Prepare the dataset
Please make sure the text file has been uploaded in the cell above before running this step.


In [None]:
# Prepare the dataset for fine-tuning: one example per line of `combined_text`.
# Blank / whitespace-only lines are skipped so they do not become empty
# training examples.
fine_tuning_dataset = Dataset.from_dict(
    {"text": [line for line in combined_text.split('\n') if line.strip()]}
)

## Step 2: Tokenize the dataset

In [None]:
## Tokenize the Dataset
# Initialize the tokenizer
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# Ensure tokenizer has a padding token.
# The end-of-sequence token is reused instead of adding a new '[PAD]' token:
# this leaves the vocabulary size unchanged, so no embedding resize is needed.
# The previous code called model.resize_token_embeddings here, but `model` is
# only created in a later cell, so this cell failed on a fresh kernel.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: batch mapping passed in by `Dataset.map(batched=True)`;
            its "text" entry holds the strings to tokenize.

    Returns:
        The tokenizer output for the batch, truncated and padded to exactly
        512 tokens per example.
    """
    # No return_tensors here: the result is only handed to Dataset.map.
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

tokenized_dataset = fine_tuning_dataset.map(tokenize_function, batched=True)





Map:   0%|          | 0/5503 [00:00<?, ? examples/s]

## Step 3: Define Training Arguments


In [None]:
# First definition of the training configuration.
# NOTE(review): a later cell assigns `training_args` again with a different output_dir.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./fine_tuned_model",
    logging_dir="./logs",
    # checkpointing / evaluation schedule
    save_steps=1000,
    save_total_limit=2,
    evaluation_strategy="epoch",
    # optimisation hyperparameters
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)





In [None]:
# PARTIALLY TRAINED NOW: continue training from an existing checkpoint.
# `trainer` is created by the cell that loads the pre-trained model further
# down, so on a fresh kernel that cell has to be run before this one. Fail
# with an explicit message rather than an unexplained NameError.
if "trainer" not in globals():
    raise NameError(
        "`trainer` is not defined yet - run the cell that builds the Trainer before resuming."
    )

trainer.train(resume_from_checkpoint=True)


Load the pre-trained model and fine-tune it:

In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Enable CUDA debugging (set before this cell does any CUDA work)
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

torch.cuda.empty_cache()  # Clears any cached memory

# Print to verify GPU is available
use_gpu = torch.cuda.is_available()
print("Using GPU:", use_gpu)

# Load model
model_id = "meta-llama/Meta-Llama-3-8B"
device = "cuda" if use_gpu else "cpu"
model = AutoModelForCausalLM.from_pretrained(model_id)

# If the tokenizer holds more tokens than the model has embedding rows
# (e.g. a '[PAD]' token was added), grow the embedding matrix to match;
# otherwise the extra token ids would index past the end of the embeddings.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Move to GPU when one is available
model = model.to(device)

# Print to confirm model is loaded
print(f"Model Loaded Successfully on {device}!")

# Adjust training arguments to prevent memory issues
# NOTE(review): newer transformers releases may expect `eval_strategy` instead of
# `evaluation_strategy` - confirm against the installed version.
# NOTE(review): the full-precision 8B model may not fit in Colab GPU memory - confirm.
training_args = TrainingArguments(
    output_dir="./fine_tuned_llama",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,  # Reduce batch size to fit GPU
    per_device_eval_batch_size=2,   # Reduce batch size to fit GPU
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_steps=1000,
    save_total_limit=2,
    fp16=use_gpu,  # Mixed precision only when a GPU is present; fp16 on CPU is rejected
)

# The tokenized dataset carries no `labels`. With mlm=False the collator builds
# causal-LM labels from `input_ids` (padding positions are ignored in the loss),
# so the model's forward pass returns a loss for the Trainer to optimise.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Using the same dataset for simplicity
    data_collator=data_collator,
)

# Fine-Tune the Model.
# Errors are deliberately not caught: if training fails the cell stops here,
# so an untrained model is never saved and reported as fine-tuned.
print("Starting Training...")
trainer.train()
print("Training Completed!")

# Save the fine-tuned model
print("Saving Model...")
trainer.save_model("./fine_tuned_llama")
print("Model Saved Successfully!")

Using GPU: False


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
import os

# Location the fine-tuning cell saves the model to.
model_path = "./fine_tuned_llama"

# Report whether that path exists and, if it does, list what it contains.
if not os.path.exists(model_path):
    print("❌ No saved model found. Training did not complete.")
else:
    print("✅ Some model checkpoints exist!")
    print("Saved files:", os.listdir(model_path))


✅ Some model checkpoints exist!
Saved files: ['model-00007-of-00007.safetensors', 'training_args.bin', 'model-00006-of-00007.safetensors', 'model.safetensors.index.json', 'model-00004-of-00007.safetensors', 'model-00002-of-00007.safetensors', 'generation_config.json', 'model-00005-of-00007.safetensors', 'model-00001-of-00007.safetensors', 'config.json', 'model-00003-of-00007.safetensors']


In [None]:
from transformers import AutoModelForCausalLM

# Reload the model from the directory it was saved to; this replaces `model`.
model_path = "./fine_tuned_llama"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
)
print("✅ Model loaded successfully!")


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

✅ Model loaded successfully!


In [None]:
!unzip /content/drive/MyDrive/fine_tuned_llama.zip -d /content/ # UNZIP

In [None]:
import os

# Same CUDA debugging switch that the training cell sets.
os.environ.update({"CUDA_LAUNCH_BLOCKING": "1"})


RAG Pipeline: 1) Load environment variables


In [None]:
import getpass

# Pull variables from a local .env file into the process environment, if one exists.
load_dotenv()

# Both API keys are required by the cells below. os.getenv returns None for an
# unset variable, and assigning None into os.environ raises TypeError, so a
# missing key is requested interactively (input hidden) instead.
for _key_name in ("GROQ_API_KEY", "COHERE_API_KEY"):
    _key_value = os.getenv(_key_name) or getpass.getpass(f"Enter {_key_name}: ")
    if not _key_value:
        raise ValueError(f"{_key_name} is required but was not provided.")
    os.environ[_key_name] = _key_value

## Step 3: Prepare Data and Vectorstore



In [None]:
# build index
embedding_model = CohereEmbeddings(model="embed-english-v3.0")

# Define URLs to index (replace with Dorothy Hodgkin's personal and scientific data sources)
# Each entry must be a quoted string; the bare URL used previously was a SyntaxError.
urls = [
    "https://en.wikipedia.org/wiki/Dorothy_Hodgkin",
]

# Load documents from URLs (one list of documents per URL), then flatten
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=0
)
doc_splits = text_splitter.split_documents(docs_list)

# Add chunks to vectorstore
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="dorothy_hodgkin_rag",
    embedding=embedding_model,
)

# Similarity-search retriever configured with k=4
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4},
)


## Step 4: Define Question and Retrieve Relevant Documents


In [None]:
# Example query used throughout the rest of the pipeline.
# TODO: replace this hard-coded question with user input in the future.
question = "What were Dorothy Hodgkin's scientific contributions?"
# Ask the retriever for the documents matching the question; the following
# cells print, grade and filter this result.
docs = retriever.invoke(question)

## Step 5: Check Retrieved Documents


In [None]:
# Show title, source and content of the first retrieved document as a sanity check.
first_doc = docs[0]
print(f"Title: {first_doc.metadata['title']}\n\nSource: {first_doc.metadata['source']}\n\nContent: {first_doc.page_content}\n")


## Step 6: Check Document Relevancy


In [None]:
# Schema of the grader's structured answer: a single 'yes' / 'no' field.
# (Documented with comments on purpose: no docstring is added to the model class.)
class GradeDocuments(BaseModel):
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'."
    )


# Chat model (temperature 0) whose output is constrained to the schema above.
llm = ChatGroq(temperature=0, model="llama-3.1-8b-instant")
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Instructions for the grader; the human turn supplies the document and the question.
system = (
    "You are a grader assessing relevance of a retrieved document to a user question.\n"
    "If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant."
)
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Prompt piped into the structured-output model.
retrieval_grader = grade_prompt | structured_llm_grader


## Step 7: Filter Non-Relevant Documents


In [None]:
# Keep only the retrieved documents the grader marks as relevant.
docs_to_use = []
for doc in docs:
    print(doc.page_content, '\n', '-'*50)
    res = retrieval_grader.invoke({"question": question, "document": doc.page_content})
    print(res, '\n')
    # Normalise the verdict so "Yes" or " yes " still count, and treat a
    # missing result or score as "not relevant" instead of crashing.
    verdict = (getattr(res, "binary_score", "") or "").strip().lower()
    if verdict == 'yes':
        docs_to_use.append(doc)


## Step 8: Generate Results


In [None]:
def format_docs(docs):
    """Render documents as numbered ``<docN>`` blocks joined by newlines.

    Each block lists the document's ``metadata['title']``,
    ``metadata['source']`` and ``page_content``; numbering starts at 1.
    """
    blocks = []
    for number, doc in enumerate(docs, start=1):
        blocks.append(
            f"<doc{number}>:\n"
            f"Title:{doc.metadata['title']}\n"
            f"Source:{doc.metadata['source']}\n"
            f"Content:{doc.page_content}\n"
            f"</doc{number}>\n"
        )
    return "\n".join(blocks)

# Instructions for the answering model.
system = (
    "You are an assistant for question-answering tasks. Answer the question based upon your knowledge.\n"
    "Use three-to-five sentences maximum and keep the answer concise."
)

# The human turn carries the filtered documents and the question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved documents: \n\n <docs>{documents}</docs> \n\n User question: <question>{question}</question>"),
    ]
)

# prompt -> chat model -> plain string
rag_chain = prompt | llm | StrOutputParser()

rag_inputs = {"documents": format_docs(docs_to_use), "question": question}
generation = rag_chain.invoke(rag_inputs)
print(generation)


## Step 9: Check for Hallucinations


In [None]:
# Schema of the hallucination check: one required 'yes' / 'no' field.
# (Documented with comments on purpose: no docstring is added to the model class.)
class GradeHallucinations(BaseModel):
    binary_score: str = Field(
        ...,
        description="Answer is grounded in the facts, 'yes' or 'no'.",
    )


# Rebind the structured grader to the new schema.
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts."),
        ("human", "Set of facts: \n\n <facts>{documents}</facts> \n\n LLM generation: <generation>{generation}</generation>"),
    ]
)
hallucination_grader = hallucination_prompt | structured_llm_grader

# Grade the generated answer against the documents it was produced from.
response = hallucination_grader.invoke(
    {
        "documents": format_docs(docs_to_use),
        "generation": generation,
    }
)
print(response)

## Step 10: Highlight Used Documents


In [None]:
# Output schema for the highlighting step. The four lists are read in parallel
# (zipped) below: entry i of each list describes one highlighted segment.
# (Documented with comments on purpose: no docstring is added to the model class.)
class HighlightDocuments(BaseModel):
    id: List[str]
    title: List[str]
    source: List[str]
    segment: List[str]

parser = PydanticOutputParser(pydantic_object=HighlightDocuments)
# NOTE(review): confirm that this model id is still served by the provider.
llm = ChatGroq(model="mixtral-8x7b-32768", temperature=0)

# The prompt previously consisted of the system line only, so the documents,
# question and answer passed to invoke() never reached the model, and nothing
# told it which format the parser expects. The human turn now carries all
# three inputs plus the parser's format instructions.
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an advanced assistant for document search and retrieval. Highlight relevant segments."""),
    (
        "human",
        "Used documents: \n\n <docs>{documents}</docs> \n\n "
        "User question: <question>{question}</question> \n\n "
        "Generated answer: <answer>{generation}</answer> \n\n "
        "<format_instruction>\n{format_instructions}\n</format_instruction>",
    ),
]).partial(format_instructions=parser.get_format_instructions())

doc_lookup = prompt | llm | parser
lookup_response = doc_lookup.invoke({"documents": format_docs(docs_to_use), "question": question, "generation": generation})

# `doc_id` instead of `id` so the builtin id() is not shadowed.
for doc_id, title, source, segment in zip(lookup_response.id, lookup_response.title, lookup_response.source, lookup_response.segment):
    print(f"ID: {doc_id}\nTitle: {title}\nSource: {source}\nText Segment: {segment}\n")
