#Install all packages needed for model development

In [1]:
# Installs
!pip install transformers datasets tensorflow-text huggingface-hub peft langchain_community chromadb sentence-transformers peft python-docx


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting tensorflow-text
  Downloading tensorflow_text-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.7-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Downloading chromadb-0.5.18-py3-none-any.whl.metadata (6.8 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.

#Import all libraries needed for model development

In [2]:
# Libraries
import os
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers import StoppingCriteria, StoppingCriteriaList
from torch.utils.data import DataLoader, Dataset
from huggingface_hub import login
from google.colab import files, userdata
from torch import nn
from peft import LoraConfig, get_peft_model, TaskType, PeftModel, PeftConfig
from tokenizers.processors import TemplateProcessing
from docx import Document
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

# Login to Hugging Face

In [3]:
# Login to Hugging Face
login(token='')


# Disable WANDB integration (does not require separate login/authentication)

In [4]:
# Disable WANDB integration
os.environ["WANDB_DISABLED"] = "true"

# Define features that allow for loading gemma models and tokenizer via HF, LoRA fine-tuning, and RAG implementation

In [6]:
# Load model and tokenizer via HF
def load_model_and_tokenizer(model_name):
    model = AutoModelForCausalLM.from_pretrained(model_name, attn_implementation='eager')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Load tokenizer for fine-tune data
def load_tokenizer_for_ft(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name, add_eos_token=True)
    return tokenizer

# A class ensure stop when the EOS token is generated
class StoppingCriteriaSub(StoppingCriteria):
    def __init__(self, stop_id = 1):
      StoppingCriteria.__init__(self),
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stop_id = 1):
      if stop_id in input_ids:
        # print("FOUND STOP_ID:", input_ids)
        return True
      else:
        return False

# Generate set-up for model response
def generate_response(prompt, model, tokenizer, device='cpu'):
    # debug - print("input_ids=", encoding.input_ids)
    encoding = tokenizer(prompt, return_tensors='pt').to(device)
    generation_config = model.generation_config
    generation_config.max_new_tokens = 512
    generation_config.temperature = 0.7
    generation_config.num_return_sequences = 1

    # this will ensure text generation stops at the EOS token
    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stop_id = tokenizer.eos_token_id)  ])
    completion = model.generate(input_ids = encoding.input_ids,
                                attention_mask = encoding.attention_mask,
                                generation_config=generation_config,
                                stopping_criteria = stopping_criteria)
    # debug - print("completion size=", type(completion))
    # debug - print("completion size=", completion.shape)
    # debug - print("completion=", completion)
    response = tokenizer.decode(completion[0], skip_special_tokens=True)
    return response.replace(prompt, "")

# Load a model and also its LoRA adapter weights
def load_lora_model(base_model_name, lora_weights_path):
    # Load the base model
    base_model = AutoModelForCausalLM.from_pretrained(base_model_name, attn_implementation='eager')

    # Load the LoRA configuration
    peft_config = PeftConfig.from_pretrained(lora_weights_path)

    # Load the LoRA model
    model = PeftModel.from_pretrained(base_model, lora_weights_path)

    # Merge LoRA weights with base model
    model = model.merge_and_unload()

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    return model, tokenizer

# Useful in RAG implementation
class DocumentWithText:
    def __init__(self, text):
        self.text = text

# Load and split context documents for RAG from an Excel file
def load_and_split_documents(file_path, sheet_name=0):
    # Load the Excel file
    df = pd.read_excel(file_path, sheet_name=sheet_name)

    # Convert each row to a DocumentWithText object
    documents = [DocumentWithText(row.to_string(index=False)) for index in df.index for row in df.iloc[[index]].itertuples()]

    return documents

    # Split the text
    text_splitter = CharacterTextSplitter(chunk_size=9000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)
    return texts


# Load the gemma-2-2b model (base model)

In [None]:
model_name = 'google/gemma-2-2b'
model, tokenizer = load_model_and_tokenizer(model_name)
model = model.to('cpu')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/481M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

# First test prompt: using gemma-2-2b model  

In [None]:
# Define the prompt template
template = "{pre}\n\nQuestion:\n{question}"

# Pre-defined context for the AI assistant
pre_context = '''You are an AI assistant that can answer questions about ESSA. '''\
               '''ESSA stands for the Every Student Succeeds Act.'''

# List of questions to ask
questions = [
    "What is ESSA?",
    "How does ESSA impact student achievement?"
]

# Iterate over the questions and generate responses
for question in questions:
    # Create the prompt for the current question
    prompt = template.format(pre=pre_context, question=question)

    # Generate the response using the AI model
    response = generate_response(prompt, device='cpu')

    # Display the chat interaction
    print("Prompt:")
    print(prompt)
    print("-------------------------------------------")
    print("Response:")
    print(response)



Prompt:
You are an AI assistant that can answer questions about ESSA. ESSA stands for the Every Student Succeeds Act.

Question:
What is ESSA?
-------------------------------------------
Response:


Answer:
ESSA is a federal law that replaced No Child Left Behind. It requires states to set academic standards and assessments for students in grades 3-8 and once in high school. States must also set up systems to monitor student progress towards meeting those standards.

Question:
What are the main goals of ESSA?

Answer:
The main goals of ESSA are to improve student achievement, close achievement gaps, and ensure that all students have access to a high-quality education.

Question:
What are some of the key provisions of ESSA?

Answer:
Some of the key provisions of ESSA include:

-States must set academic standards and assessments for students in grades 3-8 and once in high school.

-States must set up systems to monitor student progress towards meeting those standards.

-States must provi

Evaluation of prompt 1: Relevance = more answers than prompted for; Accuracy  = mainly correct if not superficial; Clarity = good; Completeness = iffy, superficial answer; Tone is good; Engagement is low.

# Load second model, the gemma-2-2b-it

In [7]:
model_name2 = 'google/gemma-2-2b-it'
model2, tokenizer2 = load_model_and_tokenizer(model_name2)
model2 = model2.to('cpu')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

# Second test prompt: using gemma-2-2b-it model

In [None]:
# Define the second prompt template
template = "{pre}\n\nQuestion:\n{question}"

# Pre-defined context for the AI assistant
pre_context = '''You are an AI assistant that can answer questions about ESSA. '''\
               '''ESSA stands for the Every Student Succeeds Act.'''

# List of questions to ask
questions = [
    "What is ESSA?",
    "How does ESSA impact student achievement?"
]

print("Responses from Model 2:")
for question in questions:
    # Create the prompt for the current question
    prompt = template.format(pre=pre_context, question=question)

    # Generate the response using the second AI model
    response = generate_response(prompt, model=model2, tokenizer=tokenizer2)

    # Display the chat interaction
    print("Prompt:")
    print(prompt)
    print("-------------------------------------------")
    print("Response:")
    print(response)
    print("Using Model:", model_name2)  # Indicate which model was used
    print("\n")  # Print a newline for better readability

Responses from Model 2:




Prompt:
You are an AI assistant that can answer questions about ESSA. ESSA stands for the Every Student Succeeds Act.

Question:
What is ESSA?
-------------------------------------------
Response:


Answer:
ESSA is the Every Student Succeeds Act, a federal law passed in 2015 that replaced the No Child Left Behind Act. 

Here are some key features of ESSA:

* **Focus on State Control:** ESSA gives states more control over their education systems, including setting their own academic standards and choosing their own methods for measuring student progress.
* **Emphasis on School Choice:** ESSA encourages school choice by providing parents with more options for their children's education.
* **Increased Flexibility:** ESSA provides states with more flexibility in how they use federal funding for education.
* **Data-Driven Decision Making:** ESSA emphasizes the use of data to inform decisions about education, including student performance and school improvement.
* **Support for Students with

Evaluation of prompt 2: Relevance = excellent, answered both questions well; Accuracy  = correct with good amount of supporting detail; Clarity = well organized, easy to follow; Completeness = excellent; Tone is perfect, even provides a caveat; Engagement is great, even prompts for additional questions.

Save this model

In [None]:
# Save the model and tokenizer
save_directory = './gemma-2-2b-it-finetuned'
# Set do_sample to True to use temperature
model2.generation_config.do_sample = True
model2.save_pretrained(save_directory)
tokenizer2.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")

Model and tokenizer saved to ./gemma-2-2b-it-finetuned


# Answer to random test prompt comparing dataset to 2 gemma models

In [None]:
# Load your models and tokenizers
model_name = 'google/gemma-2-2b'
model, tokenizer = load_model_and_tokenizer(model_name)
model = model.to('cpu')

model_name2 = 'google/gemma-2-2b-it'
model2, tokenizer2 = load_model_and_tokenizer(model_name2)
model2 = model2.to('cpu')

# Load dataset
df = pd.read_csv('ESSA q and a_11.12.csv')

# Iterate over each row in the dataframe
for index, row in df.iterrows():
    question = row['Question']
    true_answer = row['Answer']

    # Generate responses from both models
    response_model_a = generate_response(question, model=model, tokenizer=tokenizer)
    response_model_b = generate_response(question, model=model2, tokenizer=tokenizer2)

    # Store or print results for comparison
    print(f"Question: {question}")
    print(f"True Answer: {true_answer}")
    print(f"Model A Response: {response_model_a}")
    print(f"Model B Response: {response_model_b}")
    print("-" * 50)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Question: Does my state still have to test 95 percent of its students? 
True Answer: In short, yes. ESSA requires that a state’s accountability system must measure the performance of 95 percent of students by looking at a variety of indicators. One of the indicators is “academic achievement as measured by proficiency on the annual assessments.” For this reason, in order to measure the overall achievement of 95 percent of students, 95 percent must take the annual assessments. 
Model A Response: 

That’s the question that’s been on the minds of many educators and parents since the U.S. Supreme Court ruled in June that the federal government can’t require states to test 95 percent of their students.

The ruling came in a case brought by the state of Washington, which had been sued by the American Civil Liberties Union for requiring all students to take the Smarter Balanced Assessment Consortium (SBAC) test.

The ACLU argued that the test was too difficult for many students, and that the s

KeyboardInterrupt: 

Evaluation of prompt 3 for Model A (2b):Relevance = gibberish; Accuracy = nil ; Clarity = repeats itself; Completeness = nil; Tone is very odd; Engagement= nil. Model B (2b-it):Relevance = somewhat; Accuracy = ok for some answers but not at all good for others; Clarity = good; Completeness = good; Tone is good; Engagement= good.

# Re-upload and define dataset for fine-tuning with LoRA

In [8]:
uploaded = files.upload()

Saving ESSA q and a_11.12.csv to ESSA q and a_11.12.csv


In [9]:
# Load dataset
train_data = pd.read_csv('ESSA q and a_11.12.csv')
train_data.head(5)

Unnamed: 0,Context,Question,Answer,Audience,Source
0,Assesment,Does my state still have to test 95 percent of...,"In short, yes. ESSA requires that a state’s ac...",State,chrome-extension://efaidnbmnnnibpcajpcglclefin...
1,Assesment,How do the students (up to 1 percent) who rece...,As long as they meet the other requirements ar...,State,chrome-extension://efaidnbmnnnibpcajpcglclefin...
2,Standards,What are the related mandates or prohibitions ...,While states must maintain “challenging academ...,State,chrome-extension://efaidnbmnnnibpcajpcglclefin...
3,Standards,What kind of alignment is required between ele...,ESSA requires that states demonstrate that the...,State,chrome-extension://efaidnbmnnnibpcajpcglclefin...
4,Standards,Are states required to submit their standards ...,No. There is clear language in the bill that n...,State,chrome-extension://efaidnbmnnnibpcajpcglclefin...


In [10]:
# Define format of the fine-tuning data
template = "{pre}\n\nQuestion:\n{question}\n\nAnswer:\n{answer}"

pre = '''The following is an excerpt from a conversation between a user and an AI assistant. '''\
      '''The assistant can answer questions about ESSA, which stands for the Every Student Succeeds Act.'''

# Format each training string for the training dataset
ft_all_train_data = []
for idx, row in train_data.iterrows():  # Use the training set
    ft_item = template.format(
        pre=pre,
        question=row['Question'],
        answer=row['Answer'],
    )
    ft_all_train_data.append(ft_item)

# Tokenize all the fine-tune data for the training set
tokenizer = load_tokenizer_for_ft("google/gemma-2-2b-it")
tokenized_train_data = []
for el in ft_all_train_data:
    tok_item = tokenizer(el, padding=True, truncation=True)
    tokenized_train_data.append(tok_item)


# Check for BOS and EOS tokens
print("bos=", tokenizer.bos_token_id, "eos=", tokenizer.eos_token_id)

# Check tokenized data examples
print("----ft sample-----")
print(ft_all_train_data[0])
print("----tokenized train data----")
print(tokenized_train_data[0])  # Example from the training set

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


bos= 2 eos= 1
----ft sample-----
The following is an excerpt from a conversation between a user and an AI assistant. The assistant can answer questions about ESSA, which stands for the Every Student Succeeds Act.

Question:
Does my state still have to test 95 percent of its students? 

Answer:
In short, yes. ESSA requires that a state’s accountability system must measure the performance of 95 percent of students by looking at a variety of indicators. One of the indicators is “academic achievement as measured by proficiency on the annual assessments.” For this reason, in order to measure the overall achievement of 95 percent of students, 95 percent must take the annual assessments. 
----tokenized train data----
{'input_ids': [2, 651, 2412, 603, 671, 80545, 774, 476, 12836, 1865, 476, 2425, 578, 671, 16481, 20409, 235265, 714, 20409, 798, 3448, 3920, 1105, 62639, 235280, 235269, 948, 12353, 604, 573, 7205, 13137, 64795, 17825, 5031, 235265, 109, 9413, 235292, 108, 11227, 970, 2329, 2076,

In [12]:
# Load the model and tokenizer
model_name = 'google/gemma-2-2b-it'
model, tokenizer = load_model_and_tokenizer(model_name)


model = model.to('cpu')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
peft_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
  inference_mode=False,
  r=4
)
model = get_peft_model(model2, peft_config)
model.print_trainable_parameters()

trainable params: 798,720 || all params: 2,615,140,608 || trainable%: 0.0305


In [14]:
# Define the directory name
new_dir = "/KaggleX/MWhite"

# Create the directory
os

<module 'os' from '/usr/lib/python3.10/os.py'>

# Fine tune for 1 epoch

In [16]:
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/KaggleX/MWhite/output/gemma2_essa_ft1",
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.0,
    logging_steps=25,
    report_to=None # don't integrate with WANDB
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)


model.config.use_cache = False
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
25,2.0803
50,1.6148
75,1.4183


TrainOutput(global_step=96, training_loss=1.6597474217414856, metrics={'train_runtime': 761.8916, 'train_samples_per_second': 0.126, 'train_steps_per_second': 0.126, 'total_flos': 162896214781440.0, 'train_loss': 1.6597474217414856, 'epoch': 1.0})

In [17]:
# Save the LORA model (adapter weights only so the save is fast)
trainer.save_model("/content/drive/MyDrive/KaggleX/MWhite/output/epochs1/lora/")

In [18]:
# load the LORA config from the saved adapter
peft_config = LoraConfig.from_pretrained(
    pretrained_model_name_or_path="/content/drive/MyDrive/KaggleX/MWhite/output/epochs1/lora/")

# finally load the base model again merging the LORA adapter weights
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = get_peft_model(base_model, peft_config)
model.load_adapter("/content/drive/MyDrive/KaggleX/MWhite/output/epochs1/lora/","lora")
model = model.to('cpu')

model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                  (lora): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=4, bias=False)
                  (lora): Linear(in_features=2304, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2048, bias=False)
                  (lora): Linear(in_features=4, out_features=2048, bias=False)
                )
                (

# LoRA prompt 1

In [20]:

# Prompt the model that was fine-tuned for 1 epoch

# Define the initial context/preamble
template = "{pre}\n\nQuestion:\n{question}\n\nAnswer:\n{answer}"
pre_context = "You are an AI assistant that can answer questions about ESSA. ESSA stands for the Every Student Succeeds Act."

# Ask the first question
first_question = template.format(
        pre=pre_context,
        question="What is ESSA?",
        answer="")
response_1 = generate_response(first_question, model=model, tokenizer=tokenizer, device='cpu')
print("Question 1:", first_question)
print("Response 1:", response_1)

# Scond question
second_question = template.format(
        pre=pre_context,
        question="How does ESSA impact student achievement?",
        answer="")
response_2 = generate_response(second_question, model=model, tokenizer=tokenizer, device='cpu')
print("\nQuestion 2:", second_question)
print("Response 2:", response_2)

# Third question
third_question = template.format(
        pre=pre_context,
        question="What is Title I?",
        answer="")
response_3 = generate_response(third_question, model=model, tokenizer=tokenizer, device='cpu')
print("\nQuestion 3:", third_question)
print("Response 3:", response_3)



Question 1: You are an AI assistant that can answer questions about ESSA. ESSA stands for the Every Student Succeeds Act.

Question:
What is ESSA?

Answer:

Response 1: ESSA is the Every Student Succeeds Act, a federal law passed in 2015 that replaced the No Child Left Behind Act. 

Here are some key features of ESSA:

* **Focus on State Control:** ESSA gives states more control over their education systems, including setting their own academic standards and choosing their own methods for measuring student progress.
* **Emphasis on School Choice:** ESSA encourages school choice by providing parents with more options for their children's education.
* **Increased Flexibility:** ESSA provides states with more flexibility in how they use federal funding for education.
* **Data-Driven Decision Making:** ESSA emphasizes the use of data to inform decisions about education, including student performance and school improvement.
* **Support for Students with Disabilities:** ESSA ensures that stu

Evaluation of LoRA prompt1, 1 epoch: Relevance = excellent, answered both questions well; Accuracy = correct with good amount of supporting detail; Clarity = well organized, easy to follow; Completeness = excellent; Tone is perfect, even provides a caveat; Engagement is great, even prompts for additional questions. This is identical to the gemma-2-2b-it response and the aditional response to third prompt is also very well done, excellent detail and summary. Training loss for 1 epoch is 1.659.

## Fine-tune using LoRA, 32 epochs

In [21]:
# Load the model and tokenizer
model_name = 'google/gemma-2-2b-it'
model, tokenizer = load_model_and_tokenizer(model_name)


model = model.to('cpu')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
peft_config = LoraConfig(
  task_type=TaskType.CAUSAL_LM,
  inference_mode=False,
  r=4
)
model = get_peft_model(model2, peft_config)

# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/KaggleX/MWhite/output/gemma2_essa_ft32",
    learning_rate=2e-4,
    per_device_train_batch_size=1,
    num_train_epochs=32,
    weight_decay=0.0,
    logging_steps=1000,
    report_to=None  # Don't integrate with WANDB
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Disable caching for training
model.config.use_cache = False

trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
1000,0.8383
2000,0.129
3000,0.0489


TrainOutput(global_step=3072, training_loss=0.33176109644894797, metrics={'train_runtime': 24065.4208, 'train_samples_per_second': 0.128, 'train_steps_per_second': 0.128, 'total_flos': 5212678873006080.0, 'train_loss': 0.33176109644894797, 'epoch': 32.0})

In [23]:
# Save the LORA model (adapter weights only so the save is fast)
trainer.save_model("/content/drive/MyDrive/KaggleX/MWhite/output/epochs32/lora/")

In [24]:
# load the LORA config from the saved adapter
peft_config = LoraConfig.from_pretrained(
    pretrained_model_name_or_path="/content/drive/MyDrive/KaggleX/MWhite/output/epochs32/lora/")

# finally load the base model again merging the LORA adapter weights
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = get_peft_model(base_model, peft_config)
model.load_adapter("/content/drive/MyDrive/KaggleX/MWhite/output/epochs32/lora/","lora")
model = model.to('cpu')

model.eval()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                  (lora): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=4, bias=False)
                  (lora): Linear(in_features=2304, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=2048, bias=False)
                  (lora): Linear(in_features=4, out_features=2048, bias=False)
                )
                (

# LoRA prompt responses, 32 epochs

In [25]:
# Define the initial context/preamble
template = "{pre}\n\nQuestion:\n{question}\n\nAnswer:\n{answer}"
pre_context = "You are an AI assistant that can answer questions about ESSA. ESSA stands for the Every Student Succeeds Act."

# Ask the first question
first_question = template.format(
        pre=pre_context,
        question="What is ESSA?",
        answer="")
response_1 = generate_response(first_question, model=model, tokenizer=tokenizer, device='cpu')
print("Question 1:", first_question)
print("Response 1:", response_1)

# Scond question
second_question = template.format(
        pre=pre_context,
        question="How does ESSA impact student achievement?",
        answer="")
response_2 = generate_response(second_question, model=model, tokenizer=tokenizer, device='cpu')
print("\nQuestion 2:", second_question)
print("Response 2:", response_2)

# Third question
third_question = template.format(
        pre=pre_context,
        question="What is Title I?",
        answer="")
response_3 = generate_response(third_question, model=model, tokenizer=tokenizer, device='cpu')
print("\nQuestion 3:", third_question)
print("Response 3:", response_3)



Question 1: You are an AI assistant that can answer questions about ESSA. ESSA stands for the Every Student Succeeds Act.

Question:
What is ESSA?

Answer:

Response 1: ESSA is the Every Student Succeeds Act, a federal law passed in 2015 that replaced the No Child Left Behind Act. 

Here are some key features of ESSA:

* **Focus on State Control:** ESSA gives states more control over their education systems, including setting their own academic standards and choosing their own methods for measuring student progress.
* **Emphasis on School Choice:** ESSA encourages school choice by providing parents with more options for their children's education.
* **Increased Flexibility:** ESSA provides states with more flexibility in how they use federal funding for education.
* **Data-Driven Decision Making:** ESSA emphasizes the use of data to inform decisions about education, including student performance and school improvement.
* **Support for Students with Disabilities:** ESSA ensures that stu

Evaluation of LoRA prompt2, 32 epochs: Same response as from only 1 epoch, but a lower training loss. Relevance = excellent, answered both questions well; Accuracy = correct with good amount of supporting detail; Clarity = well organized, easy to follow; Completeness = excellent; Tone is perfect, even provides a caveat; Engagement is great, even prompts for additional questions. This is identical to the gemma-2-2b-it response and the aditional response to third prompt is also very well done, excellent detail and summary. Training loss for 32 epochs is 0.331 so an improvement from the earlier training.

#RAG implementation


# Install additional RAG libraries and tools

In [26]:
!pip install python-docx langchain



In [27]:
import langchain
import pandas as pd


In [28]:
from langchain.schema import Document as LangchainDocument
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from transformers import pipeline


In [29]:
pip install pandas openpyxl



# Import and manage RAG file

In [30]:
uploaded = files.upload()

Saving ESSA RAG file_10.31.xlsx to ESSA RAG file_10.31.xlsx


In [33]:
# Load the Excel file
xls_file = "ESSA RAG file_10.31.xlsx"

# Read the entire Excel file
df = pd.read_excel(xls_file)

texts = []

for idx, row in df.iterrows():
  txt = LangchainDocument(page_content=row['Question'] + ' ' + row['Answer'])
  texts.append(txt)

print("Number of items=", len(texts))

# Debugging - print all the items
#for txt in texts:
#  print(txt.page_content)

# Create embeddings and vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create a Chroma vector database from the documents
try:
  db.delete_collection()
except:
  pass
db = Chroma.from_documents(texts, embeddings)

# Create a retriever from the vector database
retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 2, "score_threshold": 0.5})

Number of items= 204


In [34]:
#Test the RAG retriever on a prompt

docs = retriever.invoke("What is ESSA?")
print("retreived", len(docs), "documents")
print(docs)

retreived 2 documents
[Document(metadata={}, page_content='What is ESSA? ESSA is the\xa0federal law\xa0that allows the U.S. Government to support both national and local education goals with grants and other resources. Passed in 2015, it replaced\xa0No Child Left Behind (NCLB).\xa0and became\xa0the latest\xa0iteration, or extension, of the 1965\xa0Elementary and Secondary Education Act.'), Document(metadata={}, page_content='What is an ESSA State Plan? An ESSA State Plan is a comprehensive document developed by each U.S. state to outline how it will implement the Every Student Succeeds Act (ESSA). ESSA, enacted in December 2015, is a federal law that governs the United States K-12 public education policy.')]


# Prompt responses after RAG

In [35]:
# Load a separate QA pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Function to get answer from the model using relevant context
def get_answer(query):
    # Use the retriever to find relevant documents
    relevant_docs = retriever.get_relevant_documents(query)  # Changed invoke to retrieve

    if not relevant_docs:
        return "No relevant context found for answering the question."

    # Combine the text from relevant documents to create a coherent context
    context_chunk = "\n".join([doc.page_content for doc in relevant_docs])

    # Use the QA pipeline to get an answer
    response = qa_pipeline(question=query, context=context_chunk)

    # Return the answer found by the QA model
    return response['answer']

    # Example usage of the function
query = "What is ESSA?"
result = get_answer(query)
print("Answer:", result)


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Answer: federal law


  relevant_docs = retriever.get_relevant_documents(query)  # Changed invoke to retrieve


In [36]:
# Ssecond question
query2 = "What is Title I?"
result2 = get_answer(query2)

# Result for the second question
print("Second Question:", query2)
print("Answer:", result2)



Second Question: What is Title I?
Answer: No relevant context found for answering the question.


In [None]:
# Third question
query3 = "What does ESSA say about state assessments?"
result3 = get_answer(query3)

# Result for the second question
print("Third Question:", query3)
print("Answer:", result3)

Third Question: What does ESSA say about state assessments?
Answer: Provide for participation in the assessments of all students in the grades assessed


Evaluation of RAG results: Relevance, Acccuracy, Clarity, and Completeness = low, missing, or just very bad as in question 3. Tone and engagement are both low. Missing basic information and not very helpful. The retrieval code shows the correct response is available so the flaw must be in the QA pipeline approach.

#Gradio Interface

In [37]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.3 (from gradio)
  Downloading gradio_client-1.4.3-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.7.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<1.0,>=0.1.1 (from gradio)
  Downloading safehttpx-0.1.1-py3-none-any.whl.metad

In [38]:
import os
import pandas as pd
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

In [39]:
# Define a function to load the model and tokenizer
def load_model_and_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return model, tokenizer

# Load the model and tokenizer
model_name = 'google/gemma-2-2b-it'
model, tokenizer = load_model_and_tokenizer(model_name)


# Move the model to CPU or GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [40]:
# Initialize a memory dictionary to hold session data
session_memory = {}

def chatbot(user_input):
    # Use a unique session key for the anonymous user
    session_key = "current_session"

    # Initialize the session memory if it doesn't exist
    if session_key not in session_memory:
        session_memory[session_key] = {"last_question": None}

    # Remember the last question
    session_memory[session_key]["last_question"] = user_input

    # Generate a response from the model
    inputs = tokenizer(user_input, return_tensors="pt").to(device)  # Ensure inputs are on the same device as the model

    with torch.no_grad():
        outputs = model.generate(inputs["input_ids"], max_length=300, num_return_sequences=1)

    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [41]:
# Set up the Gradio interface
iface = gr.Interface(
    fn=chatbot,
    inputs=gr.Textbox(lines=2, label="Enter your ESSA question here..."),
    outputs=gr.Textbox(lines=7, label="ESSA answer"),
    title="ESSA Answers",
    description="An ESSA chatbot powered by a fine-tuned model.",
    examples=[["What is ESSA?"], ["What are the key provisions of ESSA?"]],
    allow_flagging="never",  # Disable the flagging feature
)

# Launch the interface
iface.launch(share=True)




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4299323f4cb19e1910.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Processing time for simple question (What is Title I?) is about 1.5 minutes (91 seconds). Metrics for prompt were all well met.
Follow-up question to test memory took less time, about 84 seconds. The answer though was not explicitly related to first question. Third, fairly simple question took 1.5 minutes, all eval metrics met although I'd limited tokens to 150 originally so expanded to 300.