In [8]:
import tensorflow as tf
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled")
    except RuntimeError as e:
        print(f"GPU memory setting failed: {e}")

In [9]:
import json
import pandas as pd
from transformers import GPT2Tokenizer
import tensorflow as tf
import os

In [2]:
# load dataset
from google.colab import drive
drive.mount('/content/drive')

def load_dataset(file_path):
    data = []
    with open(file_path, "r") as f:
        for line in f:
            data.append(json.loads(line))
    return data

train_data = load_dataset("/content/drive/MyDrive/ALU/datasets/train_data.jsonl")
test_data = load_dataset("/content/drive/MyDrive/ALU/datasets/test_data.jsonl")
all_data = train_data + test_data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Extract conversation pairs
def extract_pairs(convo):
    messages = convo["messages"]
    return [
        (" ".join(msg["text"] for msg in messages[:i]), messages[i]["text"])
        for i in range(1, len(messages))
    ]

pairs = []
for convo in all_data:
    pairs.extend(extract_pairs(convo))

if not pairs:
    print("Warning: No messages found. Using alternative extraction method")
    for convo in all_data:
        if "conversation" in convo:
            messages = convo["conversation"]
            pairs.extend(
                [
                    (" ".join(messages[:i]), messages[i])
                    for i in range(1, len(messages))
                ]
            )

df = pd.DataFrame(pairs, columns=["context", "response"])

In [4]:
# preprocessing
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_data(row):
    return tokenizer(
        f"CONTEXT: {row['context']} RESPONSE: {row['response']}<|endoftext|>",
        max_length=128,
        truncation=True,
        padding="max_length",
    )

tokenized_data = df.apply(tokenize_data, axis=1)
input_ids = [x["input_ids"] for x in tokenized_data]
attention_mask = [x["attention_mask"] for x in tokenized_data]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Create TensorFlow dataset
dataset = (
    tf.data.Dataset.from_tensor_slices(
        {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": input_ids,
        }
    )
    .batch(8)
    .prefetch(tf.data.AUTOTUNE)
)

In [6]:
# Save processed data
tf.data.experimental.save(dataset, "processed_data")
df.to_csv("conversation_pairs.csv", index=False)
print("Preprocessing complete! Processed", len(df), "conversation pairs")

Instructions for updating:
Use `tf.data.Dataset.save(...)` instead.


Preprocessing complete! Processed 194754 conversation pairs


In [7]:
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import tensorflow as tf
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained(
    "gpt2", pad_token_id=tokenizer.eos_token_id
)

# Load processed data
dataset = tf.data.experimental.load(
    "processed_data",
    element_spec={
        "input_ids": tf.TensorSpec(shape=(None, 128), dtype=tf.int32),
        "attention_mask": tf.TensorSpec(shape=(None, 128), dtype=tf.int32),
        "labels": tf.TensorSpec(shape=(None, 128), dtype=tf.int32),
    },
)


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
Instructions for updating:
Use `tf.data.Dataset.load(...)` instead.


In [10]:
# FREEZE MOST LAYERS - Only train the last 3 layers
print("Freezing lower layers for faster training...")
for i, layer in enumerate(model.layers):
    if i < len(model.layers) - 3:  # Freeze all but last 3 layers
        layer.trainable = False
    else:
        layer.trainable = True

# Count trainable parameters
trainable_params = sum([tf.keras.backend.count_params(w) for w in model.trainable_weights])
total_params = sum([tf.keras.backend.count_params(w) for w in model.weights])
print(f"Trainable parameters: {trainable_params:,} / {total_params:,}")

# REMOVED MIXED PRECISION - CAUSES MEMORY ISSUES
# Enable mixed precision for memory reduction and speed up (requires compatible hardware like GPU/TPU)
# from tensorflow.keras import mixed_precision
# mixed_precision.set_global_policy('mixed_float16')
# print("Mixed precision enabled.")

# Configure training - ULTRA OPTIMIZED
optimizer = tf.keras.optimizers.Adam(
    learning_rate=3e-4,  # Slightly reduced from 5e-4
    epsilon=1e-7,
    clipnorm=0.5
)

# Simplified loss function
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Compile with XLA for speed
model.compile(
    optimizer=optimizer,
    loss=loss,
    run_eagerly=False,
    jit_compile=True  # Enable XLA compilation for speed boost
)

# ULTRA-OPTIMIZED PARAMETERS
dataset_size = 256
batch_size = 16      # REDUCED from 64 to prevent OOM
max_length = 64      # REDUCED from 32 to prevent memory issues
epochs = 6

steps_per_epoch = dataset_size // batch_size
print(f"Optimized steps per epoch: {steps_per_epoch}")

# Estimate total steps
total_steps = steps_per_epoch * epochs
print(f"Total steps for {epochs} epochs: {total_steps}")

# Create smaller subset for ultra-fast training
df_subset = df.sample(n=min(dataset_size, len(df)), random_state=42)
print(f"Using {len(df_subset)} samples instead of {len(df)}")

# Ensure padding token is set for the tokenizer within this scope
tokenizer.pad_token = tokenizer.eos_token

def tokenize_data_optimized(row):
    return tokenizer(
        f"CONTEXT: {row['context']} RESPONSE: {row['response']}<|endoftext|>",
        max_length=max_length, # Use the reduced max_length
        truncation=True,
        padding="max_length",
    )

tokenized_data_subset = df_subset.apply(tokenize_data_optimized, axis=1)
input_ids_subset = [x["input_ids"] for x in tokenized_data_subset]
attention_mask_subset = [x["attention_mask"] for x in tokenized_data_subset]

dataset_subset = (
    tf.data.Dataset.from_tensor_slices(
        {
            "input_ids": input_ids_subset,
            "attention_mask": attention_mask_subset,
            "labels": input_ids_subset,
        }
    )
    .batch(batch_size)
    .cache()  # Cache in memory
    .prefetch(tf.data.AUTOTUNE)
)

# Minimal callbacks for speed and early stopping
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        patience=2,
        restore_best_weights=True
    )
]

# Track training time
import time
print("Starting ultra-optimized training...")
start_time = time.time()

# Ultra-fast training
history = model.fit(
    dataset_subset,
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    callbacks=callbacks,
    verbose=1,
    workers=1,
    use_multiprocessing=False
)

# Calculate actual training time
end_time = time.time()
actual_time = (end_time - start_time) / 60
print(f"\nActual training time: {actual_time:.1f} minutes")

Freezing lower layers for faster training...
Trainable parameters: 124,439,808 / 124,439,808
Optimized steps per epoch: 16
Total steps for 6 epochs: 96
Using 256 samples instead of 194754
Starting ultra-optimized training...
Epoch 1/6


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6

Actual training time: 36.4 minutes


In [11]:

# Save model
model.save_pretrained("sidekick_model")
tokenizer.save_pretrained("sidekick_model")
print("Training complete! Model saved")

Training complete! Model saved
