# SETUP THE ENVIRONMENT

In [None]:
!pip install tensorflow[and-cuda]

In [None]:
!pip install keras-nlp

In [None]:
!pip install gradio

In [None]:
!pip install psutil


# IMPORT PACKAGES

Import Keras and KerasNLP.

In [None]:
import keras
import keras_nlp

# GET KAGGLE CREDENTIALS


In [None]:
# Log in to Kaggle through kagglehub; this notebook later calls
# kagglehub.model_download() to fetch the fine-tuned model.

import kagglehub
kagglehub.login()

# LOAD TRAINING DATASET

In [None]:
import json

# Prompt layout applied to every record. Defined once, outside the loop, so it
# exists even when the file turns out to be empty.
template = "Question:\n{Question}\n\nAnswer:\n{Answer}"

# Load data: one JSON object per line, each providing the "Question" and
# "Answer" fields that the template refers to.
data = []
file_path = 'final_laws.jsonl'

# Explicit UTF-8 so decoding does not depend on the machine's locale.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) are not
            # valid JSON; skip them instead of crashing in json.loads().
            continue
        features = json.loads(line)

        # Add the formatted Question/Answer string to data
        data.append(template.format(**features))

# Keep only the first 100 examples so early experiments stay fast
# (the original note mentions 996 examples for the full run).
data = data[:100]

In [None]:
# Show the head of the data: the first five formatted examples, numbered from 1.
# Change the slice bound to preview more or fewer entries.
preview = data[:5]
for entry_number, example in enumerate(preview, start=1):
    print(f"Entry {entry_number}:{example}")
    

## Inference before fine tuning

In this section, you will query the model with various prompts to see how it responds.


### Buy a Home Prompt

### Query the model for suggestions on how to buy a home.

# Load Model



In [None]:

import keras_nlp

# Build the Gemma causal language model from the "gemma_instruct_2b_en" preset.
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_instruct_2b_en")

# Prompt layout: a Question section followed by an Answer section.
template = "Question:\n{Question}\n\nAnswer:\n{Answer}"

# The Answer slot is left empty so the model writes the answer itself.
prompt = template.format(Question="How can I buy a new house?", Answer="")

# Generate (at most 256 tokens, prompt included) and show the raw output.
print(
    gemma_lm.generate(prompt, max_length=256)
)


The model responds with only basic, generic advice on how to buy a home.

### ELI5 How to buy a Home Prompt

Prompt the model to explain buying a home in terms simple enough for a child to understand.


In [None]:
# Reuse the Question/Answer template with a "explain it to a 5 year old" question.
prompt = template.format(
    Question="Explain the process of buying a home in a way that a 5 year old child could understand.",
    Answer="",  # left blank so the model generates the answer
)
# Print the generated text; max_length=256 is passed to generate() as the length limit.
print(gemma_lm.generate(prompt, max_length=256))

The response contains words that might not be easy for a child to understand, such as "budget", "contract", and "reject".

# FINE-TUNING CODE

## For demonstration purposes, the model was first fine-tuned on a small subset of the dataset for just one epoch and with a low LoRA rank value. To get better responses from the fine-tuned model, the following changes were experimented with:

1. Increasing the size of the fine-tuning dataset
2. Training for more steps (epochs)
3. Setting a higher LoRA rank
4. Modifying the hyperparameter values such as `learning_rate` and `weight_decay`.


In [None]:
## LoRA Fine-tuning

## To get better responses from the model, fine-tune it with Low Rank Adaptation (LoRA)
## using the real estate law dataset for Ontario, Canada.

## Enable LoRA on the model's backbone with a LoRA rank of 4, then print the
## model summary so the parameter counts can be inspected.
gemma_lm.backbone.enable_lora(rank=4)
gemma_lm.summary()

Note that enabling LoRA reduces the number of trainable parameters significantly (from 2.5 billion to 1.3 million).

In [None]:
# we will continue training from this point
import json
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Prompt layout applied to every training record. Defined once, outside the
# loop, so it exists even when the file turns out to be empty.
template = "Question:\n{Question}\n\nAnswer:\n{Answer}"

# Load data: one JSON object per line with "Question" and "Answer" fields.
data = []
file_path = './kaggle/input/mydata-final-laws/mydata/final_laws.jsonl'

# Explicit UTF-8 so decoding does not depend on the machine's locale.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines are not valid JSON; skip them instead of crashing.
            continue
        features = json.loads(line)

        # Add the formatted Question/Answer string to data
        data.append(template.format(**features))

# Use at most the first 996 training examples.
data = data[:996]

# Limit the input sequence length to 512 (to control memory usage).
gemma_lm.preprocessor.sequence_length = 512

# Use AdamW (a common optimizer for transformer models).
optimizer = tf.keras.optimizers.AdamW(
    learning_rate=5e-5,
    weight_decay=0.01,
)

# Exclude layernorm and bias terms from decay.
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# Compile the model
gemma_lm.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    weighted_metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
)

# No validation data is passed to fit(), so both callbacks watch the
# *training* metric 'sparse_categorical_accuracy'.
early_stopping = EarlyStopping(
    monitor='sparse_categorical_accuracy',  # Monitor accuracy
    patience=3,  # Stop after 3 epochs of no improvement
    mode='max',  # Stop when accuracy stops increasing
    restore_best_weights=True  # Restore weights from the epoch with the best performance
)

# Learning rate scheduler to reduce learning rate if accuracy stops improving
lr_scheduler = ReduceLROnPlateau(
    monitor='sparse_categorical_accuracy',
    mode='max',  # Accuracy should increase; stated explicitly to match EarlyStopping
    factor=0.5,  # Reduce learning rate by half when accuracy plateaus
    patience=2,  # Number of epochs to wait before reducing learning rate
    min_lr=1e-7  # Set a minimum learning rate to prevent over-reduction
)

# Fit the model with early stopping and the learning rate scheduler.
# The returned History is kept so the number of epochs actually run (early
# stopping may end training before epoch 20) can be inspected afterwards.
history = gemma_lm.fit(
    data,
    epochs=20,
    batch_size=2,
    callbacks=[early_stopping, lr_scheduler]
)


## Save the fine-tuned model so that training can be resumed later

In [None]:
# Persist the fine-tuned model to a single file.
# The '.keras' extension selects the native Keras format.
# NOTE(review): the target directory './kaggle/input' is assumed to exist
# already — confirm before running on a fresh machine.


# Save the Keras model in .keras format
gemma_lm.save('./kaggle/input/my_model.keras')

# DOWNLOAD AND LOAD THE MODEL

In [None]:
import kagglehub

# Download latest version of the "joelbest/gemma2b_reo/keras/default" model
# and keep the value returned by model_download() in `path`.
path = kagglehub.model_download("joelbest/gemma2b_reo/keras/default")

# Show where the model files are located.
print("Path to model files:", path)

In [None]:
# Run this cell first to load the fine-tuned model.
# keras is imported here so it is defined for later use.
import os

import keras
import keras_nlp

# Prefer the directory returned by kagglehub.model_download() in the download
# cell, so the notebook does not depend on one particular user's home
# directory. If that cell has not been run in this kernel, fall back to the
# cache location that was originally hard-coded here.
try:
    model_dir = path
except NameError:
    model_dir = '/home/jupyter/.cache/kagglehub/models/joelbest/gemma2b_reo/keras/default/1'

# Load the saved model in .keras format
gemma_lm = keras.models.load_model(os.path.join(model_dir, 'my_model.keras'))


2024-12-01 19:12:30.537383: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-01 19:12:30.557531: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733080350.580162   15742 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733080350.586704   15742 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-01 19:12:30.608663: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [None]:
# Print the model summary to confirm the model was loaded successfully.
gemma_lm.summary()

# TESTING

In [None]:
#testing
import keras
import keras_nlp

# Load the saved model with the built-in tokenizer and language model.
# NOTE(review): this reads 'my_model.keras' from the working directory, while
# the save cells write to './kaggle/input/my_model.keras' — confirm which file
# is intended.
gemma_lm = keras.models.load_model('my_model.keras')

# Same Question/Answer layout the model was fine-tuned on; prompting with the
# bare question would not match the training examples.
template = "Question:\n{Question}\n\nAnswer:\n{Answer}"

# Example input texts for inference
texts = [
    "Do I need a lawyer to handle the closing process when purchasing a property in Ontario?",
    "What are the legal implications of a co-ownership agreement when buying a property with someone else?",
    "Can I negotiate a lower real estate commission rate with my agent?",
    "What should I know about the rights and responsibilities for maintaining a shared fence between properties?"
]

# Set a smaller batch size
batch_size = 2

# Perform inference in batches
for i in range(0, len(texts), batch_size):
    batch_texts = texts[i:i + batch_size]

    # Wrap each question in the template, leaving the answer blank for the model.
    batch_prompts = [template.format(Question=text, Answer="") for text in batch_texts]

    # Generate answers; max_length matches the other inference cells and bounds
    # the generation time.
    outputs = gemma_lm.generate(batch_prompts, max_length=256)

    # Print the input questions and the generated answers
    for text, prompt, output in zip(batch_texts, batch_prompts, outputs):
        # The generated text echoes the prompt; drop it so only the answer is shown.
        answer = output[len(prompt):] if output.startswith(prompt) else output
        print(f"Question: {text}")
        print(f"Answer: {answer.strip()}\n")



In [None]:
import tensorflow as tf

# List available GPUs and their memory usage
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        # get_memory_info() takes a device string such as "GPU:0" (the form
        # used by the cleanup cell), not the physical-device name
        # "/physical_device:GPU:0", so strip that prefix.
        device_name = gpu.name.replace("/physical_device:", "")
        details = tf.config.experimental.get_memory_info(device_name)
        print(f"GPU: {gpu.name}")
        print(f"  Current Memory Usage: {details['current']} bytes")
        print(f"  Peak Memory Usage: {details['peak']} bytes")
else:
    print("No GPU devices found.")

In [None]:
# SAVE THE MODEL AGAIN #
# Writes the model currently held in gemma_lm to the same file used by the
# earlier save cell.

gemma_lm.save('./kaggle/input/my_model.keras')

In [None]:
# Check how many epochs have been trained so far.
# Keras stores the History of the most recent fit() call on the model itself
# (model.history). A model that was only loaded from disk has no such record,
# so fall back to None instead of raising.
history = getattr(gemma_lm, "history", None)

if history is None:
    print("No training history available: gemma_lm has not been trained in this session.")
else:
    # history.epoch lists the indices of the epochs that actually ran.
    num_epochs = len(history.epoch)
    print(f"Number of epochs run: {num_epochs}")

## INFERENCE AFTER FINE TUNING
After fine-tuning, responses follow the instruction provided in the prompt.

### Buy a home Prompt


In [None]:
# Ask the same "buy a new house" question that was used before fine-tuning,
# so the two outputs can be compared.
prompt = template.format(
    Question="How can I buy a new house?",
    Answer="",  # left blank so the model generates the answer
)
# Print the generated text; max_length=256 is passed to generate() as the length limit.
print(gemma_lm.generate(prompt, max_length=256))

The model now explains how to buy a home in Ontario.

### ELI5 How to Buy a home Prompt


In [None]:
# Ask the fine-tuned model for a child-friendly explanation.
prompt = template.format(
    Question="Explain the process of buying a home in a way that a child could understand.",
    Answer="",  # left blank so the model generates the answer
)
# Print the generated text; max_length=256 is passed to generate() as the length limit.
print(gemma_lm.generate(prompt, max_length=256))

# LOAD UP GRADIO TO TRY THE BOT

In [None]:
#this code works

import gradio as gr
import tensorflow as tf
import keras

def generate_response(model: "keras.Model", question: str, max_length: int = 250) -> str:
    """
    Generates a response for a single question using the provided model.

    The question is wrapped in a "Question:/Answer:" prompt, passed to
    ``model.generate`` and the echoed prompt is removed from the result.

    Args:
        model (keras.Model): The fine-tuned language model; must provide
            ``generate(prompt, max_length=...)``.
        question (str): The user's input question.
        max_length (int): Maximum length of the generated answer, forwarded
            to ``model.generate``.

    Returns:
        str: The generated answer text with surrounding whitespace removed,
        or a message starting with "⚠️ Error generating answer:" if
        generation raised an exception.
    """
    answer_marker = "Answer:\n"
    prompt_template = "Question:\n{question}\n\n" + answer_marker
    prompt = prompt_template.format(question=question)

    try:
        # Generate response using the model's generate method
        generated_response = model.generate(prompt, max_length=max_length)

        # Normalise the result to str. Plain str/bytes are checked first, so
        # the TensorFlow check only runs for other result types.
        if isinstance(generated_response, str):
            generated_text = generated_response
        elif isinstance(generated_response, bytes):
            generated_text = generated_response.decode('utf-8')
        elif isinstance(generated_response, tf.Tensor):
            generated_text = generated_response.numpy().decode('utf-8')
        else:
            generated_text = generated_response  # Assume it's already string-like

        # Extract the answer part. Slicing off len(prompt) characters is only
        # correct when the output really begins with the prompt; otherwise it
        # would cut into (or completely erase) the answer.
        if generated_text.startswith(prompt):
            answer_text = generated_text[len(prompt):]
        elif answer_marker in generated_text:
            # Prompt was not echoed verbatim: keep what follows the marker.
            answer_text = generated_text.split(answer_marker, 1)[1]
        else:
            # No prompt echo at all: the whole output is the answer.
            answer_text = generated_text
        return answer_text.strip()

    except Exception as e:
        # Deliberate best effort: the chat UI shows the problem instead of crashing.
        return f"⚠️ Error generating answer: {e}"

# Gradio entry point: validates the user's text, then delegates to generate_response
def chatbot_gradio(user_input, model):
    """
    Answer one chat message for the Gradio interface.

    Args:
        user_input (str): The question typed by the user.
        model (keras.Model): The fine-tuned language model used to answer.

    Returns:
        str: The generated answer, or a short invitation to ask a question
        when the input is empty or only whitespace.
    """
    if user_input.strip():
        # Real question: let the model answer it (capped at 250).
        return generate_response(model, user_input, max_length=250)

    # Nothing but whitespace was entered.
    return "Chatbot: I'm here to help! Please enter a question."

# Example usage with Gradio:
# Look the model up through globals() so that a missing gemma_lm (cell run on a
# fresh kernel, or after the cleanup cell deleted it) reaches the "not loaded"
# branch instead of raising NameError.
if globals().get("gemma_lm") is not None:
    # Create Gradio interface; the lambda binds the loaded model so Gradio only
    # has to supply the user's text.
    interface = gr.Interface(
        fn=lambda user_input: chatbot_gradio(user_input, gemma_lm),
        inputs="text",
        outputs="text",
        title="Gemma 2b Language Model Chatbot",
        description="Enter a question, and the chatbot will respond."
    )

    # Launch the interface
    # NOTE(review): share=True asks Gradio for a publicly reachable link —
    # confirm that exposing the model this way is intended.
    interface.launch(share=True)
else:
    print("⚠️ Model not loaded. Skipping chatbot.")


# CLEAN UP THE ENVIRONMENT AND FREE MEMORY

In [None]:
import gc
import tensorflow as tf

def get_gpu_memory_usage():
    """Get the current and peak GPU memory usage.

    Returns:
        tuple: (current, peak) as reported by TensorFlow's memory info for
        'GPU:0', or (0, 0) when TensorFlow lists no GPU.
    """
    gpus = tf.config.list_physical_devices('GPU')
    if not gpus:
        return 0, 0  # No GPU available
    memory_info = tf.config.experimental.get_memory_info('GPU:0')
    return memory_info['current'], memory_info['peak']

# Get GPU memory usage before cleanup
before_memory, before_peak = get_gpu_memory_usage()

# Delete any variables holding models or large objects
try:
    del gemma_lm  # Delete the model if it exists
except NameError:
    pass  # If gemma_lm doesn't exist, continue

# Clear TensorFlow GPU memory
tf.keras.backend.clear_session()

# Run garbage collection to free memory
gc.collect()

# Reset GPU memory
# numba is imported inside the try block because the setup cells never install
# it: a missing package is then reported like any other reset failure instead
# of aborting the whole cleanup cell.
# NOTE(review): resetting the CUDA device from numba may leave TensorFlow
# unable to use the GPU until the kernel is restarted — confirm this is
# acceptable.
try:
    from numba import cuda
    device = cuda.get_current_device()
    device.reset()
except Exception as e:
    print(f"CUDA reset failed: {e}")

# Get GPU memory usage after cleanup
after_memory, after_peak = get_gpu_memory_usage()

# Calculate and display the memory cleared (values converted from bytes to MB)
cleared_memory = before_memory - after_memory
print(f"Memory before cleanup: {before_memory / 1024**2:.2f} MB")
print(f"Memory after cleanup: {after_memory / 1024**2:.2f} MB")
print(f"Memory cleared: {cleared_memory / 1024**2:.2f} MB")
