**Step 1: Install + Imports**

In [None]:
!pip install -q transformers datasets accelerate

In [None]:
import os
import json
import torch

from huggingface_hub import login
from google.colab import userdata
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Authenticate with the Hugging Face Hub using the Colab secret stored under the name 'HF'.
login(token=userdata.get('HF'))

**Step 2: Set Up Google Drive**

In [None]:
from google.colab import drive

# Drive folders holding the course data and the saved models.
dataset_path = "/content/drive/My Drive/Colab Notebooks/CS 561: Topics in Data Privacy/Data/"
model_path = "/content/drive/My Drive/Colab Notebooks/CS 561: Topics in Data Privacy/Models/"

# Training checkpoints and the final model for this run are written here.
output_dir = os.path.join(model_path, "gpt2_baseline_poisoned")

# The assignments above only build strings, so mounting can happen after them.
drive.mount('/content/drive')

Mounted at /content/drive


**Step 3: Load Dataset**

In [None]:
def load_jsonl_as_strings(path):
    """Read a JSON Lines file and return one string per non-blank line.

    Each non-blank line is parsed with ``json.loads`` and the parsed value is
    passed through ``str()``: a JSON string comes back unchanged, any other
    JSON value comes back as its Python ``str()`` form.

    The file is decoded as ``utf-8-sig`` so that a leading byte-order mark is
    dropped instead of breaking the parse of the first line. Files without a
    BOM are read exactly as plain UTF-8.

    Args:
        path: Path of the ``.jsonl`` file to read.

    Returns:
        A list of strings, one per non-blank line, in file order. An empty
        (or all-blank) file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending line.
    """
    texts = []
    with open(path, "r", encoding="utf-8-sig") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as err:
                # Every line is decoded on its own, so the decoder's position
                # is relative to that line only; add the real file location.
                raise json.JSONDecodeError(
                    f"{path}, line {line_no}: {err.msg}", err.doc, err.pos
                ) from err
            texts.append(str(obj))
    return texts

In [None]:
# Read the training texts from train.jsonl under dataset_path.
train_file = os.path.join(dataset_path, "train.jsonl")
train_texts = load_jsonl_as_strings(train_file)

print("Train dataset size:", len(train_texts))

# Wrap the strings in a Dataset with a single "text" column; the bare name
# on the last line displays the dataset summary as the cell output.
train_dataset = Dataset.from_dict({"text": train_texts})
train_dataset

Train dataset size: 5132


Dataset({
    features: ['text'],
    num_rows: 5132
})

**Step 4: Load Tokenizer**

In [None]:
# Maximum number of tokens kept per example by tokenize_function.
MAX_LEN = 128

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Fall back to the end-of-sequence token for padding when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(batch):
    """Tokenize the ``"text"`` entry of a batch with the notebook's tokenizer.

    Sequences are truncated to ``MAX_LEN`` tokens and no padding is added
    here.

    Args:
        batch: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        The tokenizer's output for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding=False, max_length=MAX_LEN, truncation=True)
    return encoded

# Tokenize in batches and drop the raw "text" column so only the tokenizer's fields remain.
tokenized_train = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

tokenized_train

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/5132 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 5132
})

**Step 5: Load GPT-2 Model**

In [None]:
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Keep the embedding table and the padding id in step with the tokenizer.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# Informational only: this reports CUDA availability, it does not move the model.
if torch.cuda.is_available():
    print("Using device:", "cuda")
else:
    print("Using device:", "cpu")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using device: cuda


**Step 6: Set Up Data Collator**

In [None]:
# mlm=False: build batches for causal (next-token) language modeling, not masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

**Step 7: Set Up Training Arguments and Trainer**

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,

    num_train_epochs=2,
    # 2 examples per step x 4 accumulation steps = 8 examples per optimizer update per device.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    warmup_steps=100,

    logging_steps=20,
    logging_dir=os.path.join(output_dir, "logs"),

    save_strategy="epoch",   # save at end of each epoch
    save_total_limit=2,

    # fp16 mixed precision is only accepted on a GPU; enable it when CUDA is
    # present and fall back to full precision otherwise so the cell still runs.
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,

    report_to="none",
)


In [None]:
# Bundle the model, arguments, data, and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    # `tokenizer=` is deprecated for Trainer; `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


**Step 8: Train the Model**

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
20,4.2119
40,4.0158
60,3.9023
80,3.7698
100,3.7363
120,3.6306
140,3.7593
160,3.754
180,3.61
200,3.6676


TrainOutput(global_step=1284, training_loss=3.536150725088387, metrics={'train_runtime': 398.7124, 'train_samples_per_second': 25.743, 'train_steps_per_second': 3.22, 'total_flos': 669779255808000.0, 'train_loss': 3.536150725088387, 'epoch': 2.0})

**Step 9: Save the Model**

In [None]:
# Write the fine-tuned model and the tokenizer to the same directory (output_dir).
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("✅ Saved final GPT-2 baseline model to:", output_dir)


✅ Saved final GPT-2 baseline model to: /content/drive/My Drive/Colab Notebooks/CS 561: Topics in Data Privacy/Models/gpt2_baseline_poisoned
