In [1]:
# import dependencies

import os

# Select the Keras backend through an environment variable. This is done here,
# ahead of the `import keras_nlp` / `import keras` lines that follow.
os.environ["KERAS_BACKEND"] = "jax"  # you can also use tensorflow or torch
# Fraction of GPU memory the XLA client may use (1.00 = all of it).
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = (
    "1.00"  # avoid memory fragmentation on JAX backend.
)

import pandas as pd
import keras_nlp
import keras
import random

2024-03-22 06:11:25.663035: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-22 06:11:25.663136: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-22 06:11:25.776459: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Read the raw dataset (parquet) and show its first five rows.
# NOTE(review): the path is an absolute Kaggle input mount; adjust it when running elsewhere.
df = pd.read_parquet("/kaggle/input/parquetfile-python-25k/0000.parquet")
df.head()

Unnamed: 0,output,text,input,instruction
0,```python\ntasks = []\nwhile True:\n task =...,Help me set up my daily to-do list! Setting up...,Setting up your daily to-do list...,Help me set up my daily to-do list!
1,```python\nshopping_list = {}\nwhile True:\n ...,Create a shopping list based on my inputs! Cre...,Creating a shopping list...,Create a shopping list based on my inputs!
2,"```python\ntotal_time = 0\nfor i in range(1, 8...",Calculate how much time I spend on my phone pe...,Calculating weekly phone usage...,Calculate how much time I spend on my phone pe...
3,```python\ntotal_bill = float(input('Enter the...,Help me split the bill among my friends! Split...,Splitting the bill...,Help me split the bill among my friends!
4,```python\nmovie_list = {}\nwhile True:\n g...,Organize my movie list into genres! Organizing...,Organizing your movie list...,Organize my movie list into genres!


In [3]:
# Define the Gemma tokenizer (used below to count the tokens of each example).
tokenizer = keras_nlp.models.GemmaTokenizer.from_preset("gemma_instruct_2b_en")


def get_text_token_len(df_column: pd.Series) -> tuple[list, list]:
    """
    Measure every text of a pandas Series twice: once in words, once in tokens.

    Parameters:
    - df_column (pd.Series): Series holding the texts to measure.

    Returns:
    - tuple: ``(text_len, token_len)`` where
        - ``text_len`` holds the whitespace-separated word count of each text,
        - ``token_len`` holds ``len()`` of the module-level ``tokenizer`` output
          for each text.
    """
    # Word counts first (plain whitespace split) ...
    text_len = [len(entry.split()) for entry in df_column.values]

    # ... then token counts, using the tokenizer defined at module level.
    token_len = [len(tokenizer(entry)) for entry in df_column.values]

    return text_len, token_len


def get_prep_gemma(instruction: list, output: list) -> pd.DataFrame:
    """
    Build the Gemma training frame from parallel instruction/output lists.

    The pairs are de-duplicated, rows containing NaN are discarded, each
    remaining pair is rendered into the "Instruction/Response" prompt template,
    and the word and token length of every rendered prompt is attached.
    The row counts before and after cleaning are printed.

    Parameters:
    - instruction (list): Instruction texts.
    - output (list): Outputs, aligned position-by-position with ``instruction``.

    Returns:
    - pd.DataFrame: Columns ``instruction``, ``output``, ``prep_gemma``,
      ``text_len_gemma`` and ``token_len_gemma``, ordered by
      ``token_len_gemma`` from longest to shortest. Row labels of the
      surviving rows are kept as they were before cleaning.
    """

    def _render_prompt(row) -> str:
        # Prompt template: instruction block, blank line, response block.
        return f"Instruction:\n{row['instruction']}\n\nResponse:\n{row['output']}"

    pairs = pd.DataFrame({"instruction": instruction, "output": output})
    print(f"Len DataFrame: {len(pairs)}")

    # Exact duplicate pairs are removed first, then any row with a missing value.
    pairs = pairs.drop_duplicates().dropna()
    print(f"Len DataFrame without Dups/NaN's: {len(pairs)}")

    pairs["prep_gemma"] = pairs.apply(_render_prompt, axis=1).values.tolist()

    # Lengths are measured on the rendered prompt, not on the raw instruction.
    word_counts, token_counts = get_text_token_len(df_column=pairs["prep_gemma"])
    pairs["text_len_gemma"] = word_counts
    pairs["token_len_gemma"] = token_counts

    return pairs.sort_values(by="token_len_gemma", ascending=False)

Attaching 'tokenizer.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.spm' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


In [4]:
# Pull the two raw columns out as plain Python lists.
instruction = df["instruction"].tolist()
output = df["output"].tolist()

# preprocessing pipeline
# NOTE: `df` is rebound here, from the raw parquet frame to the preprocessed frame.
df = get_prep_gemma(instruction=instruction, output=output)
df.head()

Len DataFrame: 49626
Len DataFrame without Dups/NaN's: 24813


Unnamed: 0,instruction,output,prep_gemma,text_len_gemma,token_len_gemma
21722,"Teacher:In this task, you are given inputs i a...","Sure, I can solve this task. The given integer...","Instruction:\nTeacher:In this task, you are gi...",223,481
21859,"Detailed Instructions: In this task, you are g...","To accomplish this task, we will follow these ...",Instruction:\nDetailed Instructions: In this t...,198,479
21887,In this task you will be given two lists of nu...,To find the intersection between the two lists...,Instruction:\nIn this task you will be given t...,281,477
17244,Create a Python code generator for drawing a 3...,```python\nimport pyglet\n#cube window\nwindow...,Instruction:\nCreate a Python code generator f...,142,476
22073,I have huge data in a directory tree format li...,"Yes, it is possible to write a script that can...",Instruction:\nI have huge data in a directory ...,300,476


In [5]:
# Keep only examples whose full rendered prompt (template + instruction + response)
# has more than 10 whitespace-separated words and fewer than 257 tokens, i.e. at
# most 256 tokens, the sequence length used for training (kept small for memory reasons).

print(f"Len Df {len(df)}")
df = df[df["text_len_gemma"] > 10]
df = df[df["token_len_gemma"] < 257]
print(f"Len Df with more than 10 Text Words and less than 257 Tokens:{len(df)}")
df.head()

Len Df 24813
Len Df with more than 10 Text Words and less than 257 Tokens:22084


Unnamed: 0,instruction,output,prep_gemma,text_len_gemma,token_len_gemma
21244,Create a cover letter to apply for a job as a ...,"Dear [Hiring Manager's Name],\n\nI am writing ...",Instruction:\nCreate a cover letter to apply f...,210,256
22471,How would I make Python return something like ...,"Yes, that is correct. To create and return a d...",Instruction:\nHow would I make Python return s...,134,256
21367,Write a function to design a chessboard,Here is a function to create a text-based ches...,Instruction:\nWrite a function to design a che...,156,256
24762,Design a class in Python that computes the fre...,"""""""\nDesign a class to compute the frequency o...",Instruction:\nDesign a class in Python that co...,121,256
232,Turn the following tutorial for Vera's Lobbies...,```python\n# Step-by-Step Tutorial for Vera's ...,Instruction:\nTurn the following tutorial for ...,135,256


In [6]:
# The frame is sorted by token length (descending), so the tail shows the shortest examples.
df.tail()

Unnamed: 0,instruction,output,prep_gemma,text_len_gemma,token_len_gemma
24672,Create a Python code to output the following t...,"print(""Hello, World!"")",Instruction:\nCreate a Python code to output t...,13,23
18392,Write a Python program to load the sklearn pac...,```python\nimport sklearn\n```,Instruction:\nWrite a Python program to load t...,15,23
24309,"Write Python code to print the ""Hello World!"" ...","print(""Hello World!"")","Instruction:\nWrite Python code to print the ""...",13,23
21513,Name a popular programming language,One popular programming language is Python.,Instruction:\nName a popular programming langu...,13,19
21158,"For the following string, print the last four ...",thon,"Instruction:\nFor the following string, print ...",13,19


In [7]:
# sample
# Print one randomly chosen training prompt.
# `random.choice` picks an element via `seq[i]` with i in 0..len(seq)-1. On a
# pandas Series that is a *label* lookup, and the index is no longer 0..n-1 after
# the filtering and sorting above, so it can raise KeyError. Use a plain list.

print(random.choice(df["prep_gemma"].tolist()))

Instruction:
Rewrite the following Python code to improve readability and to simplify the code start = 0
end = 10
while start < end:
  print start
  start = start + 1

Response:
```python
for start in range(10):
  print(start)
```


In [8]:
# Drop the raw text and length columns; only the rendered `prep_gemma` prompt is kept.

df = df.drop(columns=["instruction", "output", "text_len_gemma", "token_len_gemma"])
df.head()

Unnamed: 0,prep_gemma
21244,Instruction:\nCreate a cover letter to apply f...
22471,Instruction:\nHow would I make Python return s...
21367,Instruction:\nWrite a function to design a che...
24762,Instruction:\nDesign a class in Python that co...
232,Instruction:\nTurn the following tutorial for ...


In [9]:
# reset the index
# `drop` is a boolean flag: True discards the old row labels instead of inserting
# them as a new column. (The previous list argument only worked because a
# non-empty list is truthy.)

df = df.reset_index(drop=True)
df.head()

Unnamed: 0,prep_gemma
0,Instruction:\nCreate a cover letter to apply f...
1,Instruction:\nHow would I make Python return s...
2,Instruction:\nWrite a function to design a che...
3,Instruction:\nDesign a class in Python that co...
4,Instruction:\nTurn the following tutorial for ...


In [10]:
# Save the preprocessed Gemma prompts as CSV in the working directory.
# NOTE: no `index=` argument is passed, so the row index is written as an
# extra, unnamed first column (the pandas default).

df.to_csv("prep_gemma_22k.csv")

In [11]:
# configurations


class CFG:
    """Central place for the fine-tuning settings read by the cells below."""

    # Preset name handed to `GemmaCausalLM.from_preset`.
    preset = "gemma_instruct_2b_en"

    # Rank passed to `backbone.enable_lora`.
    lora_rank = 4

    # Value assigned to the preprocessor's `sequence_length`.
    sequence_length = 256

    # Arguments forwarded to `fit`.
    epochs = 1
    batch_size = 1

In [12]:
# Load the Gemma causal LM for the configured preset from keras_nlp
# (make sure you're authenticated) and print its summary.

gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(CFG.preset)
gemma_lm.summary()

Attaching 'config.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.spm' from model 'keras/gemma/keras/gemma_instruct_2b_en/2' to your Kaggle notebook...


In [13]:
# Enable LoRA on the backbone to limit the number of trainable parameters,
# then print the summary again to see the effect.

gemma_lm.backbone.enable_lora(rank=CFG.lora_rank)
gemma_lm.summary()

In [14]:
# Limit the input sequence length to CFG.sequence_length (256) to control memory usage.
gemma_lm.preprocessor.sequence_length = CFG.sequence_length

# Use AdamW (a common optimizer for transformer models).
optimizer = keras.optimizers.AdamW(
    learning_rate=5e-5,
    weight_decay=0.01,
)

# Exclude layernorm and bias terms from decay (selected by the variable-name
# fragments "bias" and "scale").
optimizer.exclude_from_weight_decay(var_names=["bias", "scale"])

# Compile the model: sparse categorical cross-entropy on raw logits
# (from_logits=True), with sparse categorical accuracy as a weighted metric.
gemma_lm.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=optimizer,
    weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],
)

In [15]:
# Quick look at the frame that feeds training (only the `prep_gemma` column is left).
df.head()

Unnamed: 0,prep_gemma
0,Instruction:\nCreate a cover letter to apply f...
1,Instruction:\nHow would I make Python return s...
2,Instruction:\nWrite a function to design a che...
3,Instruction:\nDesign a class in Python that co...
4,Instruction:\nTurn the following tutorial for ...


In [16]:
# Training data: the rendered prompts as a plain Python list of strings.

data = df["prep_gemma"].tolist()

In [17]:
# Train for CFG.epochs epoch(s) with CFG.batch_size example(s) per step
# (both are 1 here, to keep memory usage low).

history = gemma_lm.fit(data, epochs=CFG.epochs, batch_size=CFG.batch_size)

[1m22084/22084[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8650s[0m 391ms/step - loss: 0.5984 - sparse_categorical_accuracy: 0.7827


In [19]:
# Save the fine-tuned model in the .keras format.
# NOTE(review): the file name says "41k", but the training list above held
# 22,084 prompts -- confirm the intended name.

gemma_lm.save("gemma_41k.keras")

In [27]:
def generate_response(
    instruction: str, gemma_model: keras.Model, max_length: int = 1024
):
    """
    Generates a response using a Gemma model based on the provided instruction.

    The instruction is wrapped in the same "Instruction/Response" template that
    the training prompts use, handed to ``gemma_model.generate`` and the result
    is printed. Nothing is returned.

    Args:
      instruction (str): The instruction or query for which a response is desired. This could be a question, a command, or any text requiring an AI-based response.
      gemma_model (keras.Model): A model exposing ``generate(prompt, max_length=...)``, e.g. a ``keras_nlp.models.GemmaCausalLM``.
      max_length (int, optional): Forwarded unchanged as ``max_length`` to ``gemma_model.generate``. Default is 1024.
        NOTE(review): in KerasNLP this presumably bounds the whole sequence (prompt plus answer), not only the answer -- confirm.

    Returns:
      None. The generated text is written to stdout.

    Example:
      instruction = "What will the weather be like tomorrow?"
      gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset("gemma_2b_en")

      generate_response(instruction, gemma_lm, max_length=256)
    """
    # The prompt stops right after "Response:\n" so the model continues with the answer.
    prompt = f"Instruction:\n{instruction}\n\nResponse:\n"
    print(gemma_model.generate(prompt, max_length=max_length))

In [30]:
# Three simple Python questions used as a quick qualitative check of the model.
instructions = [
    "How do you write a for loop in Python that prints numbers 1 to 5?",
    "Identify and correct the error in the following Python line: print('Hello world)",
    "How do you add an element 'apple' to the end of a list named 'fruits' in Python?",
]

# let the model answer common questions about python programming
# NOTE: the loop variable rebinds `instruction`, which earlier held the list of raw instructions.
for instruction in instructions:
    generate_response(instruction=instruction, gemma_model=gemma_lm)
    print("\n\n")

Instruction:
How do you write a for loop in Python that prints numbers 1 to 5?

Response:
for x in range(1, 6):
 print(x)



Instruction:
Identify and correct the error in the following Python line: print('Hello world)

Response:
print('Hello world')



Instruction:
How do you add an element 'apple' to the end of a list named 'fruits' in Python?

Response:
fruits.append('apple')





In [31]:
# A harder, multi-line prompt: the task description followed by the snippet to improve.
instruction_harder = """Improve this Python snippet that tries to find and print the first non-repeating character in a string. It crashes for strings without non-repeating characters. Ensure it handles this case gracefully and optimizes for short strings.

def first_non_repeating_character(s):
    for char in s:
        if s.count(char) == 1:
            print(char)
            break"""

In [37]:
# Let the model answer one "harder" question (the answer is printed by generate_response).

generate_response(instruction=instruction_harder, gemma_model=gemma_lm)

Instruction:
Improve this Python snippet that tries to find and print the first non-repeating character in a string. It crashes for strings without non-repeating characters. Ensure it handles this case gracefully and optimizes for short strings.

def first_non_repeating_character(s):
    for char in s:
        if s.count(char) == 1:
            print(char)
            break

Response:
```python
def first_non_repeating_character(s):
    for char in s:
        if s.count(char) == 1:
            print(char)
```


In [None]:
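# Not quite right: the answer drops the `break` and does not handle strings without a
# non-repeating character. Keep in mind that the model was only trained on 22k Python
# questions and answers, with a batch size of 1 and at most 256 tokens per example.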