In [None]:
pip install torch transformers datasets pandas


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [None]:

# Mount Google Drive at /content/drive so the CSV files read later from
# /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd

# Free up cached GPU memory before training
torch.cuda.empty_cache()


def _load_sample(csv_path, n_rows, seed=42):
    """Read ``csv_path`` and return a reproducible random sample of its rows.

    At most ``n_rows`` rows are drawn. A file with fewer rows than requested is
    returned in full (in shuffled order) instead of letting
    ``DataFrame.sample`` raise ``ValueError``.
    """
    frame = pd.read_csv(csv_path)
    return frame.sample(n=min(n_rows, len(frame)), random_state=seed)


# Load only a sample from each split to keep memory use and runtime manageable
train_df = _load_sample("/content/drive/MyDrive/train_data.csv", 5000)
valid_df = _load_sample("/content/drive/MyDrive/validation_data.csv", 500)
test_df = _load_sample("/content/drive/MyDrive/test_data.csv", 500)

# Preprocess: Concatenate "Pattern" and "Response"
def preprocess_data(df):
    """Build the single-column training-text frame from pattern/response pairs.

    Each surviving row becomes ``"<pattern> [SEP] <response>"``. Rows where
    either column is missing are dropped, because concatenating a missing
    value would produce a NaN ``text`` entry rather than a string. The input
    frame is not modified.

    Args:
        df: DataFrame with ``pattern`` and ``response`` columns.

    Returns:
        A new DataFrame with a single ``text`` column, indexed by the labels
        of the rows that were kept.

    Raises:
        KeyError: If ``pattern`` or ``response`` is not a column of ``df``.
    """
    complete = df.dropna(subset=["pattern", "response"])
    # astype(str) keeps the concatenation valid even if a column was parsed
    # as a non-string dtype (e.g. purely numeric replies).
    text = complete["pattern"].astype(str) + " [SEP] " + complete["response"].astype(str)
    return text.to_frame(name="text")

# Convert to Hugging Face dataset format.
# The sampled frames carry a shuffled, non-default index; preserve_index=False
# keeps it from being stored as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(preprocess_data(train_df), preserve_index=False)
valid_dataset = Dataset.from_pandas(preprocess_data(valid_df), preserve_index=False)
test_dataset = Dataset.from_pandas(preprocess_data(test_df), preserve_index=False)

# Pretrained "gpt2" checkpoint: tokenizer first, then the LM-head model.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenization function (Optimized: Reduce max_length to 256)
def tokenize_function(examples):
    """Tokenize the ``text`` field and attach causal-LM labels.

    Sequences are truncated or padded to exactly 256 tokens. ``labels`` is a
    shallow copy of ``input_ids``.

    Args:
        examples: Mapping with a ``text`` entry, as handed over by
            ``Dataset.map``.

    Returns:
        The tokenizer output with an added ``labels`` entry.
    """
    encoded = tokenizer(
        examples["text"],
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    # For causal language modeling the inputs double as the targets.
    encoded["labels"] = encoded["input_ids"].copy()
    return encoded

# Tokenize all three splits in batched mode (train, then validation, then test).
train_dataset, valid_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

# Batch collator configured for causal language modeling:
# mlm=False turns off masked-language-model style batching.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

# Define training arguments (optimized for memory usage)
import inspect

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Use whichever name the
# installed version accepts so this cell runs on both.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,  # Simulates larger batch size (4 * 2 = 8 per device)
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    # Mixed precision requires a CUDA device; fall back to full precision on CPU
    # instead of failing when no GPU is attached.
    fp16=torch.cuda.is_available(),
    report_to="none",  # Disable Weights & Biases logging
    **{_eval_strategy_arg: "epoch"},  # Evaluate at the end of every epoch
)

# Initialize Trainer
import inspect

# Newer transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=` (the old name emits a FutureWarning). Pick the
# name the installed version supports so the cell works on old and new releases.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,  # Use the causal-LM collator
    **{_tokenizer_arg: tokenizer},
)

# Start training
trainer.train()


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.0086,1.914984
2,1.8015,1.846105
3,1.7361,1.828257


TrainOutput(global_step=1875, training_loss=1.8986726318359375, metrics={'train_runtime': 649.3831, 'train_samples_per_second': 23.099, 'train_steps_per_second': 2.887, 'total_flos': 1959690240000000.0, 'train_loss': 1.8986726318359375, 'epoch': 3.0})

In [None]:
# Evaluate on the eval_dataset the Trainer was constructed with (the validation split).
eval_results = trainer.evaluate()
print(eval_results)  # Check loss and other metrics


{'eval_loss': 1.8282568454742432, 'eval_runtime': 5.8961, 'eval_samples_per_second': 84.802, 'eval_steps_per_second': 21.201, 'epoch': 3.0}


In [None]:
sample_text = "Hello, how are you?"
# Put the inputs on whatever device the model lives on instead of assuming CUDA.
inputs = tokenizer(sample_text, return_tensors="pt").to(model.device)

model.eval()  # Set to evaluation mode
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_length=100,  # Upper bound on prompt + generated tokens
        do_sample=True,  # Enable sampling (improves variability)
        temperature=0.7,  # Control randomness (lower = more deterministic)
        top_k=50,  # Sample from the 50 most likely tokens
        top_p=0.9,  # Nucleus sampling (filters unlikely tokens)
        repetition_penalty=1.2,  # Penalize repeated phrases
        no_repeat_ngram_size=2,  # Prevent bigram repetitions
        # Set explicitly so generate() does not have to fall back to it with a warning.
        pad_token_id=tokenizer.eos_token_id,
    )


print("Generated Response:", tokenizer.decode(output[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Response: Hello, how are you? I'm feeling really upset lately. [SEP] Hi Yanni! Thank for reaching out. It takes courage to open up about something that's been bothering you. Can we talk more on this topic? Could you tell me a bit more about what is causing you distress? [GRAPHIC] Hello Kappel! First of all - it sounds like your feelings of anger and frustration stem from unresolved conflicts in your personal relationships. In our current situation where conflict


In [None]:
# Persist the fine-tuned weights and the tokenizer files.
# NOTE(review): the model and tokenizer are written to two different
# directories ("./trained_model1" and "./trained_model2"); the cell that
# reloads them reads from these same two paths, so a rename has to be applied
# in both places.
# NOTE(review): these are relative paths, not under the mounted Drive folder --
# confirm the artifacts do not need to outlive the runtime.
model.save_pretrained("./trained_model1")
tokenizer.save_pretrained("./trained_model2")


('./trained_model2/tokenizer_config.json',
 './trained_model2/special_tokens_map.json',
 './trained_model2/vocab.json',
 './trained_model2/merges.txt',
 './trained_model2/added_tokens.json')

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the trained model and tokenizer from the directories they were saved to.
# Use the GPU when one is present; otherwise keep the model on the CPU rather
# than failing on a hard-coded "cuda".
model = GPT2LMHeadModel.from_pretrained("./trained_model1").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
tokenizer = GPT2Tokenizer.from_pretrained("./trained_model2")


In [None]:
import math
from torch.nn import CrossEntropyLoss

def calculate_perplexity(model, tokenizer, text):
    """Return the model's perplexity on ``text``.

    The text is tokenized, fed to the model with its own token ids as labels,
    and the perplexity is ``exp`` of the loss the model reports.

    Args:
        model: Causal language model exposing a ``device`` attribute and
            returning an object with a ``loss`` tensor when called with
            ``labels``.
        tokenizer: Tokenizer matching ``model``.
        text: String to score; it must tokenize to at least two tokens.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: If ``text`` produces fewer than two tokens, in which case
            there is no next-token prediction to score.
    """
    # Follow the model's device rather than hard-coding "cuda", so the helper
    # also works for a model that lives on the CPU.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    n_tokens = inputs["input_ids"].shape[-1]
    if n_tokens < 2:
        raise ValueError(
            f"text must tokenize to at least 2 tokens to compute perplexity, got {n_tokens}"
        )

    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    return math.exp(outputs.loss.item())

# Score a short sample sentence with the model and tokenizer currently in scope.
sample_text = "Hello, how are you?"
print("Perplexity:", calculate_perplexity(model, tokenizer, sample_text))


Perplexity: 19.568130060114175


In [None]:
sample_text = "Hello, Good Morning"
# The model was fine-tuned on "<pattern> [SEP] <response>" strings, so the
# prompt ends with the separator to ask for the response half. No trailing
# space: the training text puts the space at the start of the response.
prompt = sample_text + " [SEP]"
# Put the inputs on the model's device instead of assuming CUDA.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
prompt_length = inputs["input_ids"].shape[-1]

model.eval()  # Set to evaluation mode
with torch.no_grad():
    output = model.generate(
        **inputs,
        max_new_tokens=30,  # Shorter responses; bounds the reply, not prompt + reply
        do_sample=True,  # Required: temperature / top_k / top_p are ignored without it
        temperature=0.6,  # Less randomness
        top_k=40,  # Filter unlikely tokens
        top_p=0.85,  # Balanced nucleus sampling
        repetition_penalty=1.2,  # Reduce repetitive phrases
        # Set explicitly so generate() does not have to fall back to it with a warning.
        pad_token_id=tokenizer.eos_token_id,
    )

# Decode only the newly generated tokens (everything after the prompt) and stop
# at the next separator, where the model would start another exchange.
response = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
response = response.split("[SEP]")[0]
print("Generated Response:", response.strip())  # Remove extra spaces


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Response: Hello, Good Morning. I hope you're doing well today. It's been a while since my last conversation, and it feels like we've lost
