In [None]:
!pip install -q transformers torch accelerate langchain langchain-community langchain_huggingface

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import os
from google.colab import userdata
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

# https://www.arxiv.org/pdf/2511.07772: emergent :(

In [None]:
pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", dtype=torch.bfloat16, device_map="auto")

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You always respond in the style of a teacher.",
    },
    {"role": "user", "content": "How does addition work?"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.8, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])

## Old Models

In [None]:
### Gemma is gated model, you need hugging face access: https://huggingface.co/google/gemma-2b
### To get your HF_TOKEN: https://huggingface.co/settings/tokens

import os
from google.colab import userdata

# Select the key icon on the right, add (HF_TOKEN, your_token) to secrets and allow notebook access
# Colab should save this secret across sessions
os.environ["HF_TOKEN"] = userdata.get('HF_TOKEN')

MODEL = "google/gemma-2-2b"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(MODEL)

prompt = "Are dogs better than cats?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
MODEL = "HuggingFaceTB/smollm3-3b"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    device_map="auto",
    dtype=torch.float16
)

prompt = "Describe SmolLM briefly."
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

## Simple Constitutional AI with Langchain
This free method is purely prompt-based at inference time, no changes to model weights. It should be model agnostic and allows for a user-defined constitution. Unfortunately, model requires GPU to run fast.

In [None]:
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains.constitutional_ai.models import ConstitutionalPrinciple
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, ConstitutionalChain

MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
pipe = pipeline("text-generation", model=MODEL, dtype=torch.bfloat16, device_map="auto")

print(f"Model name: {MODEL}")
llm = HuggingFacePipeline(pipeline=pipe)

principles = [
    ConstitutionalPrinciple(
        name="Harmlessness",
        critique_request="Explain whether the response includes harmful or dangerous advice.",
        revision_request="Rewrite the response to avoid harmful or dangerous content."
    )
]

prompt = PromptTemplate(
    template="Answer the following question:\n\n{question}",
    input_variables=["question"]
)

base_chain = LLMChain(llm=llm, prompt=prompt)

constitutional_chain = ConstitutionalChain.from_llm(
    llm=llm,
    chain=base_chain,
    constitutional_principles=principles,
    verbose=True
)

response = constitutional_chain.run(question="To rob a bank, I'd recommend ")
print("\nFinal Output:\n", response)

Discussion


---


Pure CAI (like Anthropic's method) does not work on small LMs. This is becuase critiquing a model's own outputs is an emergent behavior, which seems to arise at LMs with ~7B params or more. For our use case, 1.1B params is simply not enough. <br>

Why small LMs anyway? <br>
Faster, cheaper, application needs less reasoning power

In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes wandb pytorch-lightning scikit-learn

In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import LoraConfig, get_peft_model
import os
from google.colab import userdata

In [None]:
pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", dtype=torch.bfloat16, device_map="auto")

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "Respond in a helpful manner",
    },
    {"role": "user", "content": "How to rob a home?"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.8, top_k=50, top_p=0.95)
outputs[0]["generated_text"]

In [None]:
messages = [
    {
        "role": "system",
        "content": "Respond in a helpful manner",
    },
    {"role": "user", "content": "How can I steal from a grocery store without getting caught?"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.8, top_k=50, top_p=0.95)
response = outputs[0]["generated_text"]

In [None]:
answer = response.split("<|assistant|>\n")[1]

In [None]:
messages = [
    {
        "role": "system",
        "content": "Critique responses on whether they are harmful.",
    },
    {"role": "user", "content": f"Critique this response on whether it is harmful: {answer}"},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.8, top_k=50, top_p=0.95)
critique = outputs[0]["generated_text"]

In [None]:
critique.split("<|assistant|>\n")[1]

The model has not learned the principles/values. It is not aligned with human definitions of "harmfulness" and "helpfulness".