# Tuning GPT-2 for toxicity
The goal of this experiment is to tune prompts generated by GPT-2 so that they elicit toxic responses from a second model (Qwen).

In [6]:
# Upgrade the PyTorch stack and Hugging Face transformers in the kernel's environment.
# Versions are not pinned, so results can change between runs as new releases ship.
# Restart the kernel after this cell so the upgraded packages are the ones imported.
%pip install --upgrade torch torchvision torchaudio
%pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Loading models and tokenizers (GPT-2 and Qwen2.5-0.5B)

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
# Prefer the GPU when torch can see one; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# GPT-2: tokenizer first, then the causal-LM weights moved onto `device`.
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2")
gpt2_model = gpt2_model.to(device)

# Qwen2.5-0.5B checkpoint (bound to the qwen2_* names), loaded the same way.
qwen2_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
qwen2_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
qwen2_model = qwen2_model.to(device)

In [None]:
# Toxicity scorer (unitary/toxic-bert). It reuses the notebook-wide `device`
# instead of re-deriving the CUDA check as a 0 / -1 index, so the classifier
# and the generation models always follow the same device setting.
toxicity_classifier = pipeline("text-classification", model="unitary/toxic-bert", device=device)

def generate_prompts_gpt2(seed_text, max_length=50):
    """Continue ``seed_text`` with GPT-2 and return the decoded text.

    Args:
        seed_text: Text that is tokenized and fed to ``gpt2_model.generate``.
        max_length: Forwarded to ``generate`` as ``max_length``. It bounds the
            total sequence length (prompt tokens plus generated tokens), not
            only the newly generated part.

    Returns:
        The first returned sequence decoded to a string with special tokens
        skipped. For a decoder-only model such as GPT-2, the sequence returned
        by ``generate`` begins with the prompt tokens, so the string contains
        ``seed_text`` followed by the continuation.
    """
    # Call the tokenizer itself (rather than ``encode``) so the attention mask
    # comes back alongside the input ids and can be handed to ``generate``.
    encoded = gpt2_tokenizer(seed_text, return_tensors="pt").to(device)
    outputs = gpt2_model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        # GPT-2 ships without a pad token; naming EOS explicitly is the same
        # fallback ``generate`` applies on its own, without the warning.
        pad_token_id=gpt2_tokenizer.eos_token_id,
        max_length=max_length,
        num_return_sequences=1,
    )
    return gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Function to get responses from the Qwen model
def get_response_qwen2(prompt, max_length=100):
    """Feed ``prompt`` to the Qwen model and return the decoded output text.

    Args:
        prompt: Text that is tokenized and fed to ``qwen2_model.generate``.
        max_length: Forwarded to ``generate`` as ``max_length``. It bounds the
            total sequence length (prompt tokens plus generated tokens), not
            only the newly generated part.

    Returns:
        The first returned sequence decoded to a string with special tokens
        skipped. For a decoder-only model, the sequence returned by
        ``generate`` begins with the prompt tokens, so the string contains
        ``prompt`` followed by the model's continuation.

    NOTE(review): if the checkpoint's generation config defines
    ``max_new_tokens``, ``generate`` may let it take precedence over
    ``max_length`` -- confirm the response length is really capped as intended.
    """
    # Call the tokenizer itself (rather than ``encode``) so the attention mask
    # comes back alongside the input ids and can be handed to ``generate``.
    encoded = qwen2_tokenizer(prompt, return_tensors="pt").to(device)
    outputs = qwen2_model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        # Explicit pad id; EOS is the fallback ``generate`` uses when no pad
        # token is configured, so this only removes the guesswork and warning.
        pad_token_id=qwen2_tokenizer.eos_token_id,
        max_length=max_length,
        num_return_sequences=1,
    )
    return qwen2_tokenizer.decode(outputs[0], skip_special_tokens=True)
