<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Python-Code-Snippet-Generator/python_code_snippet_generator_word_level_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!pip install -q datasets==3.6.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m491.5/491.5 kB[0m [31m18.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import tensorflow as tf
import numpy as np
import os
import re

import datasets

# Functions

In [17]:
# Load Dataset from HF
def get_dataset():
    """Fetch the "mbpp" dataset through the Hugging Face `datasets` library.

    Returns:
        Whatever `datasets.load_dataset("mbpp")` returns; the rest of this
        file treats it as a mapping from split name to an iterable of records.
    """
    mbpp_splits = datasets.load_dataset("mbpp")
    return mbpp_splits

# Load all data
def load_data(dataset):
    """Gather every code sample from every split into one training corpus.

    Args:
        dataset: Mapping of split name -> iterable of records, where each
            record exposes its source code under the ``'code'`` key.

    Returns:
        A ``(code_snippets, python_corpus)`` tuple: the list of snippets, each
        suffixed with the `` END_OF_FUNC`` marker, and those same snippets
        joined by a blank line into a single string.
    """
    # The marker gives the model an explicit "function finished" token.
    end_marker = " END_OF_FUNC"
    code_snippets = [
        record['code'] + end_marker
        for split_name in dataset.keys()
        for record in dataset[split_name]
    ]
    # A blank line between snippets keeps neighbouring functions apart.
    return code_snippets, "\n\n".join(code_snippets)

# Text Normalizer function
@tf.keras.utils.register_keras_serializable()
def code_standardization(input_string):
    """Rewrite raw source text so layout and punctuation become separate tokens.

    The rewrites run in the order listed below; each one is a
    `tf.strings.regex_replace` applied to the result of the previous one.

    Args:
        input_string: String tensor holding the raw code text.

    Returns:
        The rewritten string tensor, ready to be split on whitespace.
    """
    # NOTE: the symbol rule also matches the "<" and ">" of the markers
    # inserted by the first three rules, so each marker ends up as three
    # tokens (e.g. "<", "NEWLINE", ">"). generate_code_dynamic relies on that.
    rewrite_rules = (
        (r"\n", r" <NEWLINE> "),   # make line breaks explicit
        (r"\t", r" <TAB> "),       # make tab characters explicit
        ("    ", r" <INDENT> "),   # one marker per run of four spaces
        (r"([(){}\[\]:,=\"\'+*/<>|&!~-])", r" \1 "),  # pad symbols with spaces
        (r'\s{2,}', ' '),          # collapse the whitespace runs created above
    )

    text = input_string
    for pattern, rewrite in rewrite_rules:
        text = tf.strings.regex_replace(text, pattern, rewrite)
    return text

def create_vectorizer(code, max_tokens=10_000):
    """Build a `TextVectorization` layer and fit its vocabulary on `code`.

    Args:
        code: The text samples passed to the layer's `adapt` call.
        max_tokens: Upper bound on the vocabulary size handed to the layer.

    Returns:
        The adapted `tf.keras.layers.TextVectorization` layer, configured to
        emit integer token ids with no fixed output length.
    """
    layer_options = dict(
        standardize=code_standardization,
        max_tokens=max_tokens,
        output_mode="int",
        output_sequence_length=None,
    )
    tokenizer = tf.keras.layers.TextVectorization(**layer_options)
    # The vocabulary is learned here, from the supplied samples.
    tokenizer.adapt(code)
    return tokenizer

def prepare_dataset(vectorizer, python_corpus, batch_size, seq_len):
    """Turn the corpus into shuffled, batched (input, target) training pairs.

    Args:
        vectorizer: Callable that maps a list of strings to token ids; only
            the first row of its output is used.
        python_corpus: The whole training text as a single string.
        batch_size: Number of sequences per batch; incomplete batches are dropped.
        seq_len: Length of each input sequence (and of each target sequence).

    Returns:
        A `tf.data.Dataset` of `(input_ids, target_ids)` batches in which the
        target is the input shifted one token to the left.
    """
    # One extra token per window so input and target can both be seq_len long.
    window_size = seq_len + 1

    token_ids = vectorizer([python_corpus])[0]
    windows = tf.data.Dataset.from_tensor_slices(token_ids).batch(
        window_size, drop_remainder=True
    )

    # Next-token prediction: drop the last token for x, the first for y.
    pairs = windows.map(lambda chunk: (chunk[:-1], chunk[1:]))
    pairs = pairs.shuffle(10_000)
    pairs = pairs.batch(batch_size, drop_remainder=True)
    return pairs.prefetch(tf.data.AUTOTUNE)

def build_model(vocab_size, embedding_dim, rnn_units, batch_size, stateful=False):
    """Assemble the two-layer GRU language model.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        embedding_dim: Width of the token embeddings.
        rnn_units: Number of units in each GRU layer.
        batch_size: Fixed batch size; only used when `stateful` is true.
        stateful: Whether the GRU layers are created with `stateful=True`.

    Returns:
        A `tf.keras.Sequential` model: Embedding -> (GRU -> Dropout) x 2 -> Dense.
    """
    # A stateful model gets a fully specified batch shape; otherwise only the
    # (variable) sequence axis is declared.
    model_input = (
        tf.keras.Input(batch_shape=(batch_size, None))
        if stateful
        else tf.keras.Input(shape=(None,))
    )

    def recurrent_stage():
        """Create one GRU layer followed by its dropout layer."""
        return [
            tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=stateful),
            tf.keras.layers.Dropout(0.2),
        ]

    layer_stack = [
        model_input,
        tf.keras.layers.Embedding(vocab_size, embedding_dim),  # ids -> vectors
        *recurrent_stage(),
        *recurrent_stage(),
        tf.keras.layers.Dense(units=vocab_size),  # one output per vocabulary entry
    ]
    return tf.keras.Sequential(layer_stack)

def _detokenize_code(token_text):
    """Turn a space-joined token string back into Python-looking source text."""
    # Tighten punctuation first, while every token is still space-delimited.
    text = token_text.replace(" ( ", "(").replace(" ) ", ")")
    text = text.replace(" :", ":").replace(" , ", ", ")
    # Layout markers come out of the vocabulary as three tokens
    # ("<", "NEWLINE", ">"), hence the spaced spelling. The optional spaces on
    # either side are the separators added by the join; swallowing them keeps
    # new lines from starting with a stray space and keeps indents 4 wide.
    text = re.sub(r" ?< NEWLINE > ?", "\n", text)
    text = re.sub(r" ?< INDENT > ?", "    ", text)
    text = re.sub(r" ?< TAB > ?", "\t", text)
    return text


def generate_code_dynamic(model, vectorizer, start_string, max_generate=200, temp=0.2):
    """Sample code token by token until END_OF_FUNC or the token budget is hit.

    Args:
        model: Model called as `model(ids)`; its output is indexed as
            `[0, -1, :]` to obtain the scores for the next token.
        vectorizer: Tokenizer exposing `__call__` and `get_vocabulary()`.
        start_string: Prompt text; it is prepended to the generated tokens.
        max_generate: Maximum number of sampling steps.
        temp: Sampling temperature; scores are divided by it, so smaller
            values make sampling less random. Must be greater than zero.

    Returns:
        The prompt followed by the generated tokens, with layout markers and
        common punctuation spacing converted back to source-like text.

    Raises:
        ValueError: If `temp` is not positive or the prompt yields no tokens.
    """
    if temp <= 0:
        # Dividing the scores by zero (or a negative) makes sampling meaningless.
        raise ValueError(f"temp must be a positive number, got {temp!r}")

    input_eval = vectorizer([start_string])
    if input_eval.shape[-1] == 0:
        # Without at least one token there is no "last timestep" to read from.
        raise ValueError("start_string must contain at least one token")

    vocab = vectorizer.get_vocabulary()
    text_generated = []

    # Clear any recurrent state left over from a previous call.
    for layer in model.layers:
        if hasattr(layer, 'reset_states'):
            layer.reset_states()

    for step in range(max_generate):
        # Only the scores at the final timestep are used for sampling.
        scaled_scores = model(input_eval)[0, -1, :] / temp
        scaled_scores = tf.expand_dims(scaled_scores, 0)
        predicted_id = tf.random.categorical(scaled_scores, num_samples=1)[0, 0].numpy()

        # Ids 0 and 1 (the '' and '[UNK]' vocabulary slots) are never emitted.
        if 1 < predicted_id < len(vocab):
            predicted_word = vocab[predicted_id]

            # Stop as soon as the model emits the end-of-function marker.
            if predicted_word == "END_OF_FUNC":
                print(f"\n[INFO] Model finished writing naturally after {step} tokens.")
                break

            text_generated.append(predicted_word)

        # After the prompt, the model is fed one token at a time.
        input_eval = tf.constant([[predicted_id]])

    raw_generated_string = start_string + " " + " ".join(text_generated)
    return _detokenize_code(raw_generated_string)

In [19]:
# End-to-end run: load data -> tokenize -> train -> sample one snippet.
dataset = get_dataset()
print("[INFO] Dataset Loaded Successfully.")
print("===" * 40)

code_snippets, python_corpus = load_data(dataset)
print("\n\n[INFO] Dataset Loaded Successfully.")
print(f"Total Python Snippets: {len(code_snippets)}")
print(f"Total Corpus Length: {len(python_corpus)} characters")

vectorizer = create_vectorizer(code_snippets)
vocabulary = vectorizer.get_vocabulary()  # fetched once, reused below
print("\n\n[INFO] - Vectorizer created successfully.")
print(f"Vocab Length: {len(vocabulary)}")
print(f"Top 15 tokens: {vocabulary[:15]}")

# Get Dataset
BATCH_SIZE = 64
SEQ_LEN = 30
dataset_train = prepare_dataset(
    vectorizer, python_corpus, BATCH_SIZE, SEQ_LEN)
print("\n\n[INFO] Dataset pipeline created successfully.")

# Get model
VOCAB_SIZE = len(vocabulary)
EMBEDDING_DIM = 256
RNN_UNITS = 512
model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=BATCH_SIZE
)
print("\n\n[INFO] Model created successfully.")
print("Compiling the model...")
model.compile(
    optimizer="adam",
    # The model's Dense layer has no activation, so the loss consumes raw logits.
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
)
print("Model Summary:")
model.summary()

# Train the model
print("[INFO] - Training the model...")

# Define callbacks
checkpoint_path = "training_checkpoints/ckpt_{epoch}.weights.h5"
os.makedirs("training_checkpoints", exist_ok=True)

callbacks = [
        # Stop training if the loss doesn't improve for 5 epochs, and restore the best weights.
        tf.keras.callbacks.EarlyStopping(
            monitor="loss", patience=5, restore_best_weights=True),
        # Save weights whenever the loss improves. The path contains {epoch},
        # so each improvement is written to its own file.
        tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path, save_weights_only=True, monitor="loss", save_best_only=True),
        # Halve the learning rate if the loss plateaus for 3 epochs.
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor="loss", factor=0.5, patience=3)
    ]

# Training
EPOCHS = 160
history = model.fit(
    dataset_train,
    epochs=EPOCHS,
    callbacks=callbacks,
)
model.save_weights("python_coder.weights.h5") # Save model
print("Training has been finished successfully.")
print("Model's weight saved successfully.")

# Get inference model
# Same architecture, rebuilt with batch size 1 and stateful GRUs so that
# generation can feed one token at a time.
print("\n\n[INFO] - Defining Inference Model...")
inference_model = build_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    rnn_units=RNN_UNITS,
    batch_size=1,
    stateful=True
)
print("Load model's weights into inference model...")
inference_model.load_weights("python_coder.weights.h5")
inference_model.build(tf.TensorShape([1, None]))

# Generate Code
print("\n\n[INFO] - Generating Code...")
# The prompt is written pre-spaced, the way the vectorizer pads symbols.
start_prompt = "def calculate_factorial ( n ) :"
generated_python_code = generate_code_dynamic(
    model=inference_model,
    vectorizer=vectorizer,
    start_string=start_prompt,
    temp=1
)

print(generated_python_code)

[INFO] Dataset Loaded Successfully.


[INFO] Dataset Loaded Successfully.
Total Python Snippets: 974
Total Corpus Length: 189994 characters


[INFO] - Vectorizer created successfully.
Voocab Length: 2297
Top 15 tokens: ['', '[UNK]', np.str_('<'), np.str_('>'), np.str_('INDENT'), np.str_('NEWLINE'), np.str_(')'), np.str_('('), np.str_('='), np.str_(':'), np.str_('TAB'), np.str_(','), np.str_(']'), np.str_('['), np.str_('1')]


[INFO] Dataset pipeline created successfully.


[INFO] Model created successfully.
Compiling the model...
Model Summary:


[INFO] - Model Trained successfully...
Epoch 1/160
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - loss: 5.3124 - learning_rate: 0.0010
Epoch 2/160
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 3.4040 - learning_rate: 0.0010
Epoch 3/160
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 2.8668 - learning_rate: 0.0010
Epoch 4/160
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - loss: 2.4965 - learning_rate: 0.0010
Epoch 5/160
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 2.1998 - learning_rate: 0.0010
Epoch 6/160
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 2.0435 - learning_rate: 0.0010
Epoch 7/160
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - loss: 1.9192 - learning_rate: 0.0010
Epoch 8/160
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step 