In [31]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    AutoModelForCausalLM,
)
from torch.optim import AdamW
import torch

In [32]:
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [33]:
len(list(model.named_parameters()))

148

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [35]:
input_text = "The future of AI is"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

In [36]:
attention_mask = inputs["attention_mask"]
inputs["pad_token_id"] = tokenizer.eos_token_id

In [37]:
model.eval()  # Set to evaluation mode for generation
with torch.no_grad():
    output_tokens = model.generate(
        inputs["input_ids"],
        max_length=50,
        do_sample=True,
        temperature=0.7,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
    )
    generated_text = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

In [38]:
print(generated_text)

The future of AI is much more uncertain than it is now.

"In my view, there is very little value in any of the technologies," he said, adding that even if artificial intelligence is possible, "there will still be a very


In [39]:
fine_tune_text = input_text + " " + generated_text
fine_tune_inputs = tokenizer(fine_tune_text, return_tensors="pt").to(device)
labels = fine_tune_inputs["input_ids"]  # Labels are the same as input for causal LM

In [40]:
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [41]:
outputs = model(**fine_tune_inputs, labels=labels)
loss = outputs.loss

In [42]:
loss.backward()

In [43]:
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f"Gradient for {name}: {param.grad}")

[1;30;43mDie letzten 5000 Zeilen der Streamingausgabe wurden abgeschnitten.[0m
        -2.6218e-02,  5.3444e-02, -5.7338e-03,  2.4883e-02,  1.6651e-02,
        -1.1482e-02, -1.1503e-02,  1.7136e-02, -8.3261e-03,  1.9839e-02,
        -2.4864e-02, -7.0466e-03, -4.2551e-02, -2.3077e-02, -5.1289e-02,
        -2.2110e-03,  1.8198e-02,  1.8238e-02,  6.6855e-03, -2.1811e-02,
         1.6563e-02,  3.3393e-03,  3.2159e-02, -6.0841e-02,  2.6236e-02,
         5.4839e-03,  4.2302e-03,  2.9810e-02, -1.2986e-02,  4.5606e-03,
         2.4990e-02, -3.5848e-02, -2.8109e-02, -2.1216e-02, -3.3268e-02,
         9.7437e-03,  2.1284e-02, -6.4102e-02, -6.8768e-03, -2.6668e-02,
         2.9069e-02, -1.4555e-02, -4.7672e-03,  6.7626e-03,  4.5115e-04,
        -6.7513e-03,  1.4163e-03, -1.8006e-02,  4.9491e-03, -8.4508e-03,
        -7.0056e-02,  2.7205e-02,  1.3686e-02,  2.9094e-02, -1.4588e-02,
        -8.9995e-03,  9.8777e-03, -3.7730e-02, -2.9992e-02,  5.7068e-02,
         2.3938e-03,  1.2511e-02,  6.4130e-

In [44]:
grads = []
for name, param in model.named_parameters():
    if param.grad is not None:
        grads.append(param.grad.flatten())

uncertainty = torch.norm(torch.cat(grads))

In [45]:
print(uncertainty)

tensor(29.9364, device='cuda:0')
