In [1]:
import json
import os
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
import numpy as np

In [2]:
# Code responsible for loading the dataset where seed means the random seed for splitting 
# the dataset to get the same split every time and test size is the size of the test set where 0.2 means 20% of the dataset is used for testing

dataset= load_dataset("sms_spam", split = "train").train_test_split(
    test_size=0.2, shuffle=True, seed=42)

print(dataset["train"], dataset["test"])


# Code responsible for tokenizing the dataset using the tokenizer from the model

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token

# Define the splits
splits = ["train", "test"]

# Tokenize the dataset
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["sms"], truncation=True, padding="max_length"),
        batched=True
    )

print(tokenized_dataset["train"])

Dataset({
    features: ['sms', 'label'],
    num_rows: 4459
}) Dataset({
    features: ['sms', 'label'],
    num_rows: 1115
})
Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 4459
})


In [3]:
# Code for loading the model and setting the model parameters

model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=2, id2label={0: "not spam", 1: "spam"}, label2id={"not spam": 0, "spam": 1})
model.config.pad_token_id = tokenizer.eos_token_id
model.resize_token_embeddings(len(tokenizer))

# Unfreeze all the model parameters.
for param in model.parameters():
    param.requires_grad = True

print(model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)


In [4]:
# Code for defining the compute_metrics function which will be used to compute 
# the accuracy of the model

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir=".data/spam_not_spam",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        logging_dir="logs",
        fp16=True,
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


  0%|          | 0/1115 [00:00<?, ?it/s]

{'loss': 0.2932, 'grad_norm': 0.004105487838387489, 'learning_rate': 1.1174887892376682e-05, 'epoch': 0.45}
{'loss': 0.0726, 'grad_norm': 0.01488712802529335, 'learning_rate': 2.20627802690583e-06, 'epoch': 0.9}


  0%|          | 0/279 [00:00<?, ?it/s]

{'eval_loss': 0.05442335456609726, 'eval_accuracy': 0.9901345291479821, 'eval_runtime': 218.2463, 'eval_samples_per_second': 5.109, 'eval_steps_per_second': 1.278, 'epoch': 1.0}
{'train_runtime': 4186.7098, 'train_samples_per_second': 1.065, 'train_steps_per_second': 0.266, 'train_loss': 0.17303968352587235, 'epoch': 1.0}


TrainOutput(global_step=1115, training_loss=0.17303968352587235, metrics={'train_runtime': 4186.7098, 'train_samples_per_second': 1.065, 'train_steps_per_second': 0.266, 'total_flos': 2330244421779456.0, 'train_loss': 0.17303968352587235, 'epoch': 1.0})

In [5]:
# Select random samples
random_indices = np.random.choice(len(dataset["test"]), size=100, replace=False)
items_for_manual_review = dataset["test"].select(random_indices)

# Create a dataset with the required format
def prepare_data_for_prediction(examples):
    tokenized = tokenizer(
        examples["sms"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors=None  # Important: return lists, not tensors
    )
    tokenized["labels"] = examples["label"]
    return tokenized

# Convert to proper dataset format
review_dataset = Dataset.from_dict({
    "sms": items_for_manual_review["sms"],
    "label": items_for_manual_review["label"]
})

# Tokenize and format
tokenized_review_dataset = review_dataset.map(
    prepare_data_for_prediction,
    batched=True,
    remove_columns=review_dataset.column_names
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [6]:
# Get predictions
results = trainer.predict(tokenized_review_dataset)

# Get evaluation results and prepare data for JSON
eval_results = trainer.evaluate()

# Convert predictions to DataFrame
predictions = np.argmax(results.predictions, axis=1)
df = pd.DataFrame({
    "sms": items_for_manual_review["sms"],
    "label": items_for_manual_review["label"],
    "prediction": predictions
})

# Create a dictionary with both evaluation and prediction results
final_results = {
    "model_evaluation": eval_results,
    "predictions": df.to_dict(orient='records')
}

# Save to JSON with indentation for readability
json_output_path = os.path.join(os.getcwd(), "foundationalmodel_nonPEFTtraining_prediction_results.json")
with open(json_output_path, 'w') as f:
    json.dump(final_results, f, indent=4)
print(f"\nResults saved to: {json_output_path}")

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/279 [00:00<?, ?it/s]


Results saved to: /home/nav/Projects_1/GenAI/PEFT_gp2/foundationalmodel_nonPEFTtraining_prediction_results.json
