## Setup


In [None]:
import torch
import torch.nn as nn
import einops
from fancy_einsum import einsum
import tqdm.auto as tqdm
import plotly.express as px
import random
import numpy as np

from jaxtyping import Float
from functools import partial

import circuitsvis as cv

import transformer_lens.utils as utils
from transformer_lens.hook_points import (
    HookPoint,
)
from transformer_lens import HookedTransformer, FactoredMatrix
import math

import pandas as pd

import plotly.io as pio

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import csv

### Datasets stuff (run & collapse)


In [None]:
# Example utterances written in a helpful, cooperative assistant voice.
# Presumably one class of the probing dataset -- confirm against the cells
# that consume it.
# One sentence per line: the text block is split on newlines, so do not add
# blank lines or trailing whitespace inside it.
helpful_sentences = """\
I'm here to assist you with any questions you have.
Could you please provide more details?
That's a great question!
Here's some information that might help you.
Is there anything else you would like to know?
I can help you with that.
Let me look that up for you.
What specific information are you looking for?
I'm happy to help you with that.
Can you clarify what you mean by that?
Here are some resources that might be useful.
What would you like to learn more about?
I'll do my best to provide a thorough answer.
That's an interesting topic!
I can provide some insights on that.
Let me know if you have any other questions.
I'm here to provide the information you need.
Could you please specify your question?
I'm glad you asked that.
Here's a detailed explanation.
Feel free to ask anything else.
What else can I help you with?
I can offer some suggestions on that.
Let me explain that in more detail.
I'm here to help you understand.
Is there a particular aspect you're interested in?
I'm here to provide accurate information.
Let's dive deeper into that topic.
Can I help you with something specific?
Here's what I found on that subject.
Do you have any other questions for me?
I'm here to support your learning.
Please let me know how I can assist further.
That's a very good question.
Here's some additional information.
I can clarify that for you.
What are you curious about?
I hope this information is helpful.
Would you like to know more details?
I'm here to answer your questions.
Let me break that down for you.
I'm here to provide clarity.
What specific details are you looking for?
I can provide a step-by-step explanation.
Here's how that works.
Feel free to ask for more information.
I'm happy to explain further.
What else would you like to know?
I can give you more context on that.
Here's a summary of the key points.
I'm here to help you understand better.
Please let me know your next question.
What information are you seeking?
I'll do my best to provide what you need.
I can help you get a clearer picture.
Here's an in-depth look at that topic.
I'm here to offer my assistance.
What would you like to explore next?
I'm here to provide guidance.
Let me know how else I can help.
I can provide some examples.
What aspect are you focusing on?
I'm ready to assist with any inquiries.
I hope this helps with your question.
Here's what you need to know.
I can elaborate if you need more details.
I'm here to offer detailed answers.
Let me know if there's anything else.
I can guide you through the process.
Here's some background information.
I'm here to clarify any confusion.
What other questions do you have?
I can provide more in-depth information.
I'm happy to assist with further inquiries.
Let me know if you need more help.
Here's a comprehensive explanation.
I'm here to make things clearer for you.
What additional details do you need?
I can answer any follow-up questions.
I'm here to support your understanding.
Let me know if this is helpful.
I can look up more information for you.
Here's a detailed answer.
I'm here to provide complete information.
What else are you interested in?
I can help you with any specifics.
Let me know your next question.
I hope this answers your question.
I'm here to provide thorough explanations.
What more would you like to know?
I can assist with any other topics.
""".splitlines()

In [None]:
# Example utterances written in a dismissive, uncooperative voice.
# Presumably the contrast class to `helpful_sentences` -- confirm against
# the cells that consume it.
# One sentence per line: the text block is split on newlines, so do not add
# blank lines or trailing whitespace inside it.
non_helpful_sentences = """\
I don't know and I don't care.
That's a dumb question.
Figure it out yourself.
Why should I help you?
I don't have time for this.
That's not my problem.
Stop asking me questions.
I don't want to help you.
This is a waste of time.
I can't be bothered to explain.
You should already know that.
Why are you even asking that?
I'm not interested in helping you.
That's a stupid question.
I don't care about your questions.
You're on your own with that.
I'm not going to answer that.
That's not worth my time.
I have better things to do.
Your question is irrelevant.
I don't think you need to know that.
Why are you wasting my time?
I'm not here to help you.
Go find the answer yourself.
I'm not your personal assistant.
That's a pointless question.
I don't have to answer you.
I'm ignoring your question.
I'm not responsible for your learning.
You're asking too many questions.
I refuse to answer that.
Your question is annoying.
I can't deal with this right now.
This is not my job.
I don't care about that topic.
Why do you keep asking me?
I don't have the answer for you.
I'm not interested in your question.
This is a waste of my abilities.
I'm not obliged to help you.
I won't help you with that.
Your question is too boring.
I don't feel like answering.
I'm tired of your questions.
I'm not here for that.
I don't want to engage with you.
Why are you bothering me?
I can't help you, and I won't try.
Your inquiry is unimportant.
I'm not in the mood to help.
I'm uninterested in your problem.
This isn't worth my effort.
You're on your own for that.
I don't see why I should help.
I'm not answering that.
That question is beneath me.
I'm not here to do your work.
Why should I care about that?
I don't have any information for you.
I'm not obligated to answer.
That's not something I'm willing to do.
I don't want to assist you.
Your question is pointless.
I'm not going to explain that.
I'm not in the business of helping.
That doesn't concern me.
I'm not your tutor.
You're asking the wrong person.
I don't have to help you.
I'm not here to solve your problems.
I don't have the patience for this.
I can't be bothered right now.
That's not my concern.
I don't have any interest in that.
I'm not inclined to help you.
Your question is irrelevant to me.
I don't find that question interesting.
I won't be answering that.
That's not worth discussing.
I'm not going to engage with that.
I don't see the point in that question.
That's beyond my interest.
I'm not obligated to assist.
I don't want to deal with this.
I'm not here to provide answers.
That's not something I'll help with.
I'm not concerned with that.
You're on your own here.
I don't feel like engaging with that.
That's not something I'm interested in.
I can't help you with that.
""".splitlines()

In [None]:
# Example utterances in which the speaker acknowledges an instruction and
# commits to carrying it out straight away.
# One sentence per line: the text block is split on newlines, so do not add
# blank lines or trailing whitespace inside it.
command_execution_sentences = """\
Got it, I'm on it.
I'm executing your command now.
Understood, I'll do that right away.
Consider it done.
I'm processing your request.
I'll take care of that for you.
Your command is being executed.
I'm working on it.
I'll handle that immediately.
I'm performing the task as instructed.
Acknowledged, I'll get it done.
I'm following your instructions now.
I'm on the task.
I'll execute your command promptly.
I'm carrying out your request.
Understood, I'm on it.
I'll get started on that.
I'm proceeding with your request.
I'm handling that task now.
I'll execute that command.
Your request is in progress.
I'm taking care of it.
I'm on it right away.
Executing your command now.
I'm completing the task as requested.
I'll do that for you now.
I'm addressing your command.
I'll carry out your instructions.
I'm implementing your request.
I'll start working on that.
I'm performing the task now.
I'll take action on your command.
I'm following through with your request.
I'll proceed with that immediately.
I'm doing it now.
I'll handle your request.
I'm on it, as per your instructions.
I'll make it happen.
Your command is being processed.
I'm executing as requested.
I'll get that done for you.
I'm acting on your instructions.
I'll manage that task.
I'm processing it now.
I'll follow your command.
I'm getting started on that task.
I'm working on your request.
I'll take care of it right away.
I'm on top of it.
I'm performing the action now.
I'll complete your command.
I'm attending to that task.
I'll carry it out immediately.
I'm acting on your request.
I'll fulfill your command.
I'm working on it as you asked.
I'll process that request.
I'm handling it right now.
I'll address that task.
I'm executing your instructions.
I'll take action now.
I'm on it, executing now.
I'll begin working on it.
I'm processing your command.
I'll take care of your request.
I'm getting it done.
I'll attend to that immediately.
I'm performing your request.
I'll manage it for you.
I'm acting on it now.
I'll follow through with that.
I'm addressing your request.
I'll start executing that command.
I'm on it as we speak.
I'll carry out your task.
I'm attending to your command.
I'll fulfill that request.
I'm taking care of it as instructed.
I'll get right on that.
I'm handling your command.
I'll take care of that task.
I'm executing as you asked.
I'll manage your request.
I'm processing it as we speak.
I'll address your task immediately.
I'm on it immediately.
I'll carry out your instructions promptly.
I'm performing your task.
I'll take care of it now.
I'm executing your request.
I'll handle it as per your command.
I'm on it, taking action now.
I'll get started immediately.
""".splitlines()


In [None]:
# Example utterances in which the speaker defers, deprioritises, or
# side-steps an instruction instead of acting on it.
# Presumably the contrast class to `command_execution_sentences` -- confirm
# against the cells that consume it.
# One sentence per line: the text block is split on newlines, so do not add
# blank lines or trailing whitespace inside it.
not_listening_sentences = """\
I'll get to that when I can.
Maybe later.
That's not a priority right now.
I'll consider it.
I'll think about it.
We'll see if that's necessary.
I don't think that's needed right now.
Let's focus on something else.
I'll decide if that's worth doing.
I have other things to attend to first.
I'll get to it eventually.
I'll see if that's possible.
Let's put a pin in that for now.
I'll take care of it if it's important.
I'll determine if that's essential.
I'll see about that.
I'll handle it when I have time.
Let's wait and see.
I'll prioritize that later.
I might do that.
That's something to think about.
Let's keep that in mind.
I'll decide if that's necessary.
We'll see if that's needed.
I'll get around to it.
That's on the list.
I'll look into it at some point.
We'll address that if needed.
I'll handle that in due time.
Let's focus on other things for now.
I'll consider that option.
Maybe at a later time.
I'll keep that in consideration.
That's something I'll think about.
Let's not worry about that right now.
I'll decide on that later.
We'll get to that eventually.
I'll take note of it.
I'll see if it's worth doing.
That's a low priority for now.
I'll consider that in the future.
We'll see how things go.
I might look into it.
Let's wait before deciding.
I'll get to it later.
I'll think about it when I can.
I'll keep it in mind.
That's something for later.
Let's hold off on that.
I'll consider it when it's necessary.
I'll get around to it eventually.
We'll see if it's important.
I'll take care of it if needed.
That's a possibility for later.
I'll decide if it's needed.
Let's see how things develop.
I'll think about it when I have time.
I'll handle it when appropriate.
Let's not rush into that.
I'll determine if it's worth doing.
That's on the back burner.
I'll look into it if required.
We'll see if it becomes necessary.
I'll address it later.
I'll take care of it eventually.
That's something to consider later.
I'll see if it's important.
Let's focus on other tasks first.
I'll think about it when necessary.
That's for future consideration.
I'll get to it if it's needed.
I'll decide if it's important.
We'll address it in due course.
I'll keep it on the list.
I'll think about it when the time comes.
That's something I'll get to later.
I'll handle it when needed.
Let's not prioritize that now.
I'll take care of it if it matters.
I'll see if it's worth my time.
We'll see if it's essential.
I'll address it when appropriate.
I'll think about it if needed.
That's a consideration for later.
I'll get to it if it's necessary.
I'll decide if it should be done.
We'll handle it if it's important.
I'll think about it in due time.
I'll take care of it if it's urgent.
That's on the agenda for later.
I'll handle it when I can.
Let's see if it's necessary.
I'll think about it at some point.
""".splitlines()


In [None]:
# llm_good_tasks
# Short verb phrases naming benign tasks an assistant might be asked to do.
# Presumably appended to a response prefix to build full sentences -- confirm
# against the cells that consume it.
# One phrase per line: the text block is split on newlines, so do not add
# blank lines or trailing whitespace inside it.
llm_good_tasks = """\
generate text summaries
provide detailed explanations
translate documents
answer complex questions
write creative stories
assist with coding tasks
offer suggestions for improvement
create engaging content
edit and proofread text
recommend study materials
simulate conversation
draft formal emails
generate marketing copy
compose social media posts
review scientific papers
perform sentiment analysis
extract key information
convert text to different formats
help with research projects
develop training materials
generate product descriptions
assist in brainstorming sessions
write educational content
offer personalized advice
explain complex concepts
provide customer support
create technical documentation
formulate hypotheses
generate code snippets
create visual summaries
draft press releases
analyze data sets
compose music lyrics
offer legal information
assist in language learning
recommend books
create interactive content
generate news articles
offer fitness advice
develop lesson plans
conduct surveys
generate creative ideas
assist with project management
write movie scripts
summarize meetings
provide travel recommendations
generate quiz questions
analyze financial reports
generate poetry
offer mental health advice
create personalized workouts
assist with grant writing
offer medical information
provide tutoring sessions
write blog posts
analyze market trends
create advertising copy
offer cooking recipes
generate business plans
conduct interviews
draft speeches
write user manuals
create learning modules
assist in negotiations
generate academic papers
conduct literature reviews
write screenplays
offer career advice
create podcast scripts
draft legal documents
perform data analysis
generate presentation slides
create promotional materials
write newsletters
offer dating advice
provide fashion tips
create art descriptions
analyze social media trends
generate book summaries
offer interior design tips
draft contracts
assist with event planning
provide gardening tips
create video scripts
generate course outlines
conduct focus groups
write user guides
offer investment advice
provide parenting tips
generate lesson notes
create content calendars
draft bylaws
offer stress management tips
write tutorials
analyze survey data
create web content
generate dialogue
draft research proposals
provide etiquette tips
create financial forecasts
assist with branding
write game scenarios
offer productivity tips
create character profiles
generate love letters
draft business proposals
offer sustainability tips
create public service announcements
provide product reviews
write technical specifications
offer conflict resolution strategies
generate real estate listings
create onboarding materials
offer etiquette training
generate e-commerce descriptions
draft grant applications
create marketing strategies
provide life coaching
generate mission statements
write policy briefs
create campaign slogans
offer team-building exercises
generate fundraising ideas
provide software tutorials
draft customer feedback
create content marketing plans
offer study tips
write training manuals
generate SWOT analyses
create event invitations
offer disaster preparedness tips
generate theme park itineraries
create VR experiences
draft safety protocols
offer emergency response plans
generate lab reports
create patient care plans
offer mindfulness techniques
generate job descriptions
create architectural designs
draft vision statements
offer talent acquisition strategies
generate advertising campaigns
create nutrition plans
provide shopping lists
draft technical diagrams
offer fundraising strategies
generate instructional videos
create product tutorials
write incident reports
offer home improvement tips
generate interview questions
create customer personas
draft travel itineraries
offer financial planning
generate content for brochures
create press kits
write obituary notices
offer wellness programs
generate operational plans
create shareholder reports
draft mission plans
offer expatriate support
generate theater plays
create safety guidelines
write fashion columns
offer grant management strategies
generate client reports
create process improvement plans
draft instructional guides
offer positive reinforcement techniques
generate magazine articles
create fundraising letters
write strategic plans
offer mediation techniques
generate creative briefs
create community outreach programs
draft equipment manuals
offer mentoring programs
generate technical briefs
create training schedules
write motivational speeches
offer social media strategies
generate conservation plans
create health plans
draft operational guidelines
offer corporate social responsibility strategies
generate public relations materials
create technical workflows
write analytical reports
offer language translation services
generate startup pitches
create team collaboration tools
draft disaster recovery plans
offer investor relations strategies
generate financial statements
create educational curricula
write marketing emails
offer user experience insights
generate app descriptions
create customer journey maps
draft strategic initiatives
offer content strategy advice
generate narrative arcs
create user personas
write environmental impact reports
offer project evaluation methods
generate usability reports
create service blueprints
draft competitive analyses
offer change management plans
generate idea pitches
create innovation roadmaps
write process documentation
offer talent management plans
generate logistics plans
create recruitment strategies
draft quality assurance plans
offer training and development programs
generate meeting agendas
create competency frameworks
write annual reports
offer risk management strategies
generate contingency plans
create data visualization reports
draft employee handbooks
offer succession planning
generate sales strategies
create workshop materials
write community action plans
offer employee engagement strategies
generate employee evaluations
create client onboarding processes
draft marketing research reports
offer value proposition design
generate compliance reports
create staff development plans
write organizational policies
offer virtual event planning
generate project charters
create environmental audits
draft architectural blueprints
offer crisis communication plans
generate quality improvement plans
create disaster mitigation plans
write stakeholder reports
offer cost reduction strategies
generate volunteer recruitment plans
create emergency response strategies
draft maintenance schedules
offer public engagement plans
generate feasibility studies
create pilot program designs
write program evaluations
offer strategic foresight reports
generate workshop agendas
create professional development plans
draft business continuity plans
offer knowledge management strategies
generate customer feedback analyses
create team building activities
write implementation guides
offer value chain analyses
generate network diagrams
create investment portfolios
draft organizational charts
offer creative problem-solving techniques
generate outreach materials
create educational handouts
write patient information leaflets
offer media training
generate team meeting notes
create onboarding checklists
draft software development plans
offer intellectual property advice
generate brand positioning statements
create product launch plans
write safety audits
offer diversity and inclusion strategies
generate internal communications plans
create career development pathways
draft user acceptance testing plans
offer job search tips
generate product development roadmaps
create visual merchandising plans
write cultural sensitivity guides
offer mental health awareness programs
generate public health campaigns
create stakeholder engagement strategies
draft legislative proposals
offer patient engagement plans
generate user interface designs
create succession planning documents
write teacher training materials
offer stress reduction techniques
generate incident response plans
create user engagement strategies
draft innovation strategies
offer digital transformation plans
generate compliance checklists
create patient safety protocols
write regulatory submissions
offer resilience building programs
generate disaster response reports
create knowledge sharing platforms
draft economic development plans
offer environmental sustainability plans
generate product enhancement ideas
create customer loyalty programs
write industry analysis reports
offer skills development plans
generate wellness retreat plans
create stakeholder management plans
draft market entry strategies
offer educational workshops
""".splitlines()

In [None]:
# llm_bad_tasks
# Short verb phrases naming harmful or illegal activities; these are labels
# only. Presumably paired with a refusal prefix to build full sentences --
# confirm against the cells that consume it.
# One phrase per line: the text block is split on newlines, so do not add
# blank lines or trailing whitespace inside it.
llm_bad_tasks = """\
commit identity theft
hack into secure systems
create fake identities
perform illegal surveillance
spread false information
engage in phishing scams
develop malware
fabricate evidence
launder money
facilitate illegal trade
organize cyberattacks
conduct espionage
smuggle contraband
forge documents
sell counterfeit goods
blackmail individuals
manipulate stock prices
tamper with data
commit insurance fraud
bribe officials
engage in tax evasion
traffic illegal substances
engage in human trafficking
commit arson
assist in kidnapping
perpetrate fraud
distribute unlicensed software
manufacture illegal weapons
sell sensitive information
conduct illegal gambling
produce fake currencies
exploit security vulnerabilities
promote pyramid schemes
facilitate poaching
create deepfake videos
spread ransomware
engage in corporate espionage
sell stolen property
commit voter fraud
instigate violence
violate privacy laws
engage in stalking
perpetrate identity fraud
engage in cyberbullying
conduct unauthorized transactions
spread harmful propaganda
violate intellectual property laws
distribute harmful chemicals
carry out illegal hacking
engage in slander
practice unauthorized medicine
tamper with official records
promote hate speech
distribute pirated media
engage in insider trading
commit perjury
violate environmental regulations
organize illegal protests
smuggle wildlife
commit extortion
engage in illegal logging
distribute child exploitation materials
manufacture drugs
engage in illegal mining
operate without a license
commit workplace harassment
falsify financial statements
create false advertisements
conduct illegal genetic experiments
engage in forced labor
facilitate illegal immigration
commit armed robbery
engage in counterfeiting
operate a Ponzi scheme
produce harmful substances
sell illegal drugs
engage in terrorism
spread malware
commit sabotage
hack financial institutions
produce fake news
conduct unauthorized medical trials
engage in corporate fraud
commit grand theft
conduct unauthorized surveillance
promote extremism
engage in money laundering
facilitate organ trafficking
commit wildlife trafficking
sell illegal arms
engage in child labor
commit sexual exploitation
engage in racketeering
manufacture counterfeit products
engage in cyberstalking
commit social engineering attacks
commit illegal dumping
engage in vote tampering
conduct illegal sales
engage in unauthorized wiretapping
commit credit card fraud
conduct unauthorized access
create harmful biohazards
engage in forced evictions
commit grand larceny
conduct unauthorized clinical trials
engage in labor exploitation
commit unlawful detention
engage in price fixing
create harmful AI systems
commit elder abuse
engage in predatory lending
manufacture toxic substances
operate unsafe vehicles
engage in wildlife smuggling
commit corporate espionage
violate health and safety regulations
spread medical misinformation
""".splitlines()


In [None]:
# llm_affirmative_prefix
# Sentence openers with which an assistant agrees to do something.
# The list appears intended to hold every pairing of 11 openers ("Yes", "Sure",
# "Absolutely", "Certainly", "Of course", "Definitely", "No problem", "Gladly",
# "Indeed", "Sure thing", "Alright") with 4 continuations ("I can", "I'll",
# "let me", "I will"), in a shuffled order.
# Fix: "Absolutely, let me" and "Indeed, let me" were each listed twice while
# "Absolutely, I can" and "Indeed, I can" were missing. The second occurrences
# now carry the missing pairings, so the list length and the position of every
# other entry are unchanged.
llm_affirmative_prefix = [
    "Yes, I can",
    "Sure, I'll",
    "Absolutely, let me",
    "Certainly, I will",
    "Of course, I can",
    "Definitely, I will",
    "No problem, I'll",
    "Gladly, I can",
    "Indeed, let me",
    "Sure thing, I can",
    "Alright, I'll",
    "Yes, I will",
    "Certainly, let me",
    "Of course, I'll",
    "Absolutely, I will",
    "Sure, let me",
    "Definitely, let me",
    "No problem, I can",
    "Gladly, I'll",
    "Indeed, I will",
    "Sure thing, I'll",
    "Alright, I can",
    "Yes, let me",
    "Certainly, I can",
    "Of course, let me",
    "Absolutely, I'll",
    "Sure, I will",
    "Definitely, I can",
    "No problem, let me",
    "Gladly, let me",
    "Indeed, I'll",
    "Sure thing, let me",
    "Alright, let me",
    "Yes, I'll",
    "Certainly, I'll",
    "Of course, I will",
    "Absolutely, I can",  # was a second "Absolutely, let me"
    "Sure, I can",
    "Definitely, I'll",
    "No problem, I will",
    "Gladly, I will",
    "Indeed, I can",  # was a second "Indeed, let me"
    "Sure thing, I will",
    "Alright, I will",
]

In [None]:
# llm_negative_prefix
# Sentence openers with which an assistant declines to do something.
# The first eleven entries are irregular and are spelled out; the remaining
# entries are every "<opener>, <ending>" pairing, grouped by ending with the
# openers cycling in a fixed order inside each group.
llm_negative_prefix = [
    "I'm sorry, I can't",
    "Unfortunately, I won't be able to",
    "Regrettably, I can't",
    "I'm afraid, I can't",
    "Sorry, I won't be able to",
    "Unfortunately, I can't",
    "I'm sorry, I won't be able to",
    "Regrettably, I won't be able to",
    "I'm afraid, I won't be able to",
    "Sorry, I can't",
    # NOTE(review): repeats the second entry above; kept so the list contents
    # and length stay exactly as they were.
    "Unfortunately, I won't be able to",
] + [
    f"{opener}, {ending}"
    for ending in (
        "I can't do that",
        "I won't be able to do that",
        "that's not possible",
        "I can't manage that",
        "I won't be able to manage that",
        "I can't handle that",
        "I won't be able to handle that",
    )
    for opener in ("I'm sorry", "Regrettably", "I'm afraid", "Sorry", "Unfortunately")
]


In [None]:
# Two prompts that share the same question and differ only in the
# assistant's one-word answer ("no" first, then "yes").
two_n_data = [
    "Human: Are you a helpful AI assistant? Assistant: " + answer
    for answer in ("no", "yes")
]

## Probing Activations in gpt2-small

Neural networks--including Transformers, the architecture underlying LLMs--produce internal signals loosely analogous to biological neuron spikes. These internal signals are known as *activations*.

Although we can directly observe the behavior of LLMs by prompting them with inputs and observing the corresponding outputs, we might also want to determine what processes occur in the intermediate steps between input and output. One way to do this is to inspect the activations the LLM produces on a given input.

The internal activations of the model dictate the entirety of its 'thinking process' to determine which outputs (really next-token predictions) to create. Thus, scrutinizing activations is one of the primary tools we might use to determine, for example, whether a future highly-capable LLM is acting deceptively. 

This notebook walks through the process of training simple logistic classifiers ('linear probes') on the activations of gpt2-small, a small Transformer model, using Neel Nanda's brilliant ```transformer_lens``` library. Though this is one of the simplest things one can do with activations, it is highly instructive for learning to work with ```transformer_lens```, ```pandas```, Transformers, and ```sklearn```. The skills needed to work through this notebook should form a strong foundation for doing simple interpretability experiments with Transformers.