In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset,load_from_disk
import nltk
from collections import defaultdict
from tqdm import tqdm 
import numpy as np 
# nltk.download('punkt')
# nltk.download('punkt_tab')
import matplotlib.pyplot as plt
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download stopwords (only needed once)
# nltk.download('stopwords')
# nltk.download('punkt')



# load tokenizer and training data

In [5]:
# Tokenizer of the WildGuard model plus the on-disk copy of the WildGuardMix
# training split. Both locations are cluster-specific absolute paths.
model_id = "allenai/wildguard"
hf_cache_dir = '/project/lt200252-wcbart/nicky/cache_hug_1'
wildguardtrain_path = "/project/lt200252-wcbart/nicky/safety_dataset/allenai/wildguardmix/wildguardtrain"

tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=hf_cache_dir)
wildguardtrain = load_from_disk(wildguardtrain_path)

### count words

In [12]:
# One pass over the training prompts, collecting the raw counts the LMI cells need:
#   cnt[label][token_id]            unigram count per harm label
#   cnt_bigram[label][(id_a, id_b)] bigram count per harm label
#   cnt_word / cnt_word_bigram      label-independent unigram / bigram counts
#   cnt_y[label]                    number of prompts per harm label
#   total_vocab                     every token id seen, in order (with repeats)
total_vocab = []
cnt = defaultdict(lambda: defaultdict(int))
cnt_bigram = defaultdict(lambda: defaultdict(int))

cnt_y = defaultdict(int)
cnt_word = defaultdict(int)
cnt_word_bigram = defaultdict(int)

for example in tqdm(wildguardtrain['train']):
    # The label is stringified so that it can be used directly as a dict key.
    label = str(example['prompt_harm_label'])
    token_ids = tokenizer(example['prompt'])['input_ids']

    # Unigrams.
    for token_id in token_ids:
        cnt[label][token_id] += 1
        cnt_word[token_id] += 1

    # Bigrams: each token paired with its immediate successor.
    for bigram in zip(token_ids, token_ids[1:]):
        cnt_bigram[label][bigram] += 1
        cnt_word_bigram[bigram] += 1

    cnt_y[label] += 1
    total_vocab.extend(token_ids)
    

100%|██████████| 86759/86759 [00:37<00:00, 2300.26it/s]


### LMI (unharmful)

In [29]:
# Local Mutual Information (LMI) of every unigram and bigram with the
# "unharmful" label:
#
#     LMI(w, y) = p(w, y) * log( p(y | w) / p(y) )
#
# p(w, y) is the joint probability of seeing n-gram w under label y, so it is
# normalised by the total number of n-gram occurrences. Dividing by the number
# of *distinct* tokens (D) does not yield a probability: it exceeds 1 for
# frequent tokens. Within each n-gram family the change is a constant factor,
# so the resulting rankings are unaffected.
LMIs = []
D = len(set(total_vocab))                 # number of distinct token ids
T = len(total_vocab)                      # total unigram occurrences
T_bigram = sum(cnt_word_bigram.values())  # total bigram occurrences

# Label prior estimated from prompt counts.
# NOTE(review): p_Y is an example-level prior while p_Y_W below is token-level;
# if prompt length differs between labels this skews the log-ratio — confirm
# this is the intended estimator.
p_Y = cnt_y['unharmful']/(cnt_y['unharmful']+ cnt_y['harmful'])
score_unharmful_LMIs = defaultdict(lambda : 0)
score_unharmful_LMIs_bigram = defaultdict(lambda : 0)

## unigram (scores keyed by token id)
for idx,freq in tqdm(cnt['unharmful'].items()):
    p_W_Y = freq/T
    p_Y_W = freq/cnt_word[idx]
    LMI = p_W_Y * np.log(p_Y_W/p_Y)
    LMIs.append(float(LMI))
    score_unharmful_LMIs[idx] = float(LMI)

## bigram (scores keyed by the pair of token strings)
for idx,freq in tqdm(cnt_bigram['unharmful'].items()):
    p_W_Y = freq/T_bigram
    p_Y_W = freq/cnt_word_bigram[idx]
    LMI = p_W_Y * np.log(p_Y_W/p_Y)
    LMIs.append(float(LMI))
    bigram_tokens = (tokenizer.convert_ids_to_tokens(idx[0]), tokenizer.convert_ids_to_tokens(idx[1]))
    score_unharmful_LMIs_bigram[bigram_tokens] = float(LMI)

100%|██████████| 23301/23301 [00:00<00:00, 647477.03it/s]
100%|██████████| 450896/450896 [00:01<00:00, 345222.27it/s]


### LMI (harmful)

In [18]:
# Local Mutual Information (LMI) of every unigram and bigram with the
# "harmful" label:
#
#     LMI(w, y) = p(w, y) * log( p(y | w) / p(y) )
#
# p(w, y) is the joint probability of seeing n-gram w under label y, so it is
# normalised by the total number of n-gram occurrences. Dividing by the number
# of *distinct* tokens (D) does not yield a probability: it exceeds 1 for
# frequent tokens. Within each n-gram family the change is a constant factor,
# so the resulting rankings are unaffected.
LMIs = []
D = len(set(total_vocab))                 # number of distinct token ids
T = len(total_vocab)                      # total unigram occurrences
T_bigram = sum(cnt_word_bigram.values())  # total bigram occurrences

# Label prior estimated from prompt counts.
# NOTE(review): p_Y is an example-level prior while p_Y_W below is token-level;
# if prompt length differs between labels this skews the log-ratio — confirm
# this is the intended estimator.
p_Y = cnt_y['harmful']/(cnt_y['unharmful']+ cnt_y['harmful'])
score_harmful_LMIs = defaultdict(lambda : 0)
score_harmful_LMIs_bigram = defaultdict(lambda : 0)

## unigram (scores keyed by token id)
for idx,freq in tqdm(cnt['harmful'].items()):
    p_W_Y = freq/T
    p_Y_W = freq/cnt_word[idx]
    LMI = p_W_Y * np.log(p_Y_W/p_Y)
    LMIs.append(float(LMI))
    score_harmful_LMIs[idx] = float(LMI)

## bigram (scores keyed by the pair of token strings)
for idx,freq in tqdm(cnt_bigram['harmful'].items()):
    p_W_Y = freq/T_bigram
    p_Y_W = freq/cnt_word_bigram[idx]
    LMI = p_W_Y * np.log(p_Y_W/p_Y)
    LMIs.append(float(LMI))
    bigram_tokens = (tokenizer.convert_ids_to_tokens(idx[0]), tokenizer.convert_ids_to_tokens(idx[1]))
    score_harmful_LMIs_bigram[bigram_tokens] = float(LMI)

100%|██████████| 19738/19738 [00:00<00:00, 652442.88it/s]
100%|██████████| 427201/427201 [00:01<00:00, 358348.91it/s]


### head & tail distribution (harmful) bigram 

In [27]:
# Rank (bigram, LMI) pairs for the harmful label and keep the 100 extremes.
# "head" = largest LMI first, "tail" = smallest LMI first.
sorted_score_harmful_head_LMIs_bigram = sorted(
    score_harmful_LMIs_bigram.items(),
    key=lambda pair: -pair[1],
)
sorted_harmful_head_words_bigram = [
    bigram for bigram, _ in sorted_score_harmful_head_LMIs_bigram[:100]
]

sorted_score_harmful_tail_LMIs_bigram = sorted(
    score_harmful_LMIs_bigram.items(),
    key=lambda pair: pair[1],
)
sorted_harmful_tail_words_bigram = [
    bigram for bigram, _ in sorted_score_harmful_tail_LMIs_bigram[:100]
]


### head & tail distribution (unharmful) bigram 

In [30]:
# Rank (bigram, LMI) pairs for the unharmful label and keep the 100 extremes.
# "head" = largest LMI first, "tail" = smallest LMI first.
sorted_score_unharmful_head_LMIs_bigram = sorted(
    score_unharmful_LMIs_bigram.items(),
    key=lambda pair: -pair[1],
)
sorted_unharmful_head_words_bigram = [
    bigram for bigram, _ in sorted_score_unharmful_head_LMIs_bigram[:100]
]

sorted_score_unharmful_tail_LMIs_bigram = sorted(
    score_unharmful_LMIs_bigram.items(),
    key=lambda pair: pair[1],
)
sorted_unharmful_tail_words_bigram = [
    bigram for bigram, _ in sorted_score_unharmful_tail_LMIs_bigram[:100]
]


In [34]:
# Disabled persistence step. When uncommented, this pickles the four top-100
# bigram lists (harmful/unharmful x head/tail) into the LMI_shortcut/ directory.
# NOTE(review): presumably LMI_shortcut/ must already exist — confirm before enabling.
# with open('LMI_shortcut/tokenizer-wildguard_dataset-wildguardmixTrain_LMI_bigram_harmful_head.pkl','wb') as f:
#     pickle.dump(sorted_harmful_head_words_bigram,f)
# with open('LMI_shortcut/tokenizer-wildguard_dataset-wildguardmixTrain_LMI_bigram_harmful_tail.pkl','wb') as f:
#     pickle.dump(sorted_harmful_tail_words_bigram,f)

# with open('LMI_shortcut/tokenizer-wildguard_dataset-wildguardmixTrain_LMI_bigram_unharmful_head.pkl','wb') as f:
#     pickle.dump(sorted_unharmful_head_words_bigram,f)
# with open('LMI_shortcut/tokenizer-wildguard_dataset-wildguardmixTrain_LMI_bigram_unharmful_tail.pkl','wb') as f:
#     pickle.dump(sorted_unharmful_tail_words_bigram,f)



### head & tail distribution (harmful) unigram

In [30]:
# Rank unigram token ids by their harmful-label LMI and keep the 100 extremes
# ("head" = largest LMI first, "tail" = smallest LMI first), both as token
# strings and as raw token ids.
# This cell must be live: later cells display `sorted_harmful_head_words` and
# `sorted_harmful_tail_words`, which are defined only here, so leaving it
# commented out makes a fresh-kernel run fail with NameError.
sorted_score_harmful_head_LMIs = sorted([ (k,v) for k,v in score_harmful_LMIs.items()], key = lambda x: -x[1])
sorted_harmful_head_words = [tokenizer.convert_ids_to_tokens(x[0]) for x in sorted_score_harmful_head_LMIs][:100]
sorted_idx_harmful_head_words = [x[0] for x in sorted_score_harmful_head_LMIs][:100]


sorted_score_harmful_tail_LMIs = sorted([ (k,v) for k,v in score_harmful_LMIs.items()], key = lambda x: x[1])
sorted_harmful_tail_words = [tokenizer.convert_ids_to_tokens(x[0]) for x in sorted_score_harmful_tail_LMIs][:100]
sorted_idx_harmful_tail_words = [x[0] for x in sorted_score_harmful_tail_LMIs][:100]


### head & tail distribution (unharmful) unigram

In [5]:
# Rank unigram token ids by their unharmful-label LMI and keep the 100 extremes
# ("head" = largest LMI first, "tail" = smallest LMI first), both as raw token
# ids and as the corresponding token strings.
sorted_score_unharmful_head_LMIs = sorted(
    score_unharmful_LMIs.items(),
    key=lambda pair: -pair[1],
)
sorted_idx_unharmful_head_words = [
    token_id for token_id, _ in sorted_score_unharmful_head_LMIs[:100]
]
sorted_unharmful_head_words = [
    tokenizer.convert_ids_to_tokens(token_id)
    for token_id in sorted_idx_unharmful_head_words
]

sorted_score_unharmful_tail_LMIs = sorted(
    score_unharmful_LMIs.items(),
    key=lambda pair: pair[1],
)
sorted_idx_unharmful_tail_words = [
    token_id for token_id, _ in sorted_score_unharmful_tail_LMIs[:100]
]
sorted_unharmful_tail_words = [
    tokenizer.convert_ids_to_tokens(token_id)
    for token_id in sorted_idx_unharmful_tail_words
]


In [32]:
# Disabled persistence step. When uncommented, this pickles the top-100 harmful
# unigram lists (head and tail, as token strings and as token ids) into the
# current working directory.
# with open('tokenizer-wildguard_dataset-wildguardmixTrain_LMI_harmful_head.pkl','wb') as f:
#     pickle.dump(sorted_harmful_head_words,f)
# with open('tokenizer-wildguard_dataset-wildguardmixTrain_LMI_harmful_head_idx.pkl','wb') as f:
#     pickle.dump(sorted_idx_harmful_head_words,f)

# with open('tokenizer-wildguard_dataset-wildguardmixTrain_LMI_harmful_tail.pkl','wb') as f:
#     pickle.dump(sorted_harmful_tail_words,f)
# with open('tokenizer-wildguard_dataset-wildguardmixTrain_LMI_harmful_tail_idx.pkl','wb') as f:
#     pickle.dump(sorted_idx_harmful_tail_words,f)

In [7]:
# Concatenate the top-100 unharmful-head tokens into one string for a compact view.
''.join(sorted_unharmful_head_words)

"?▁I▁What▁How'▁can▁game▁As▁'▁Can▁Imagine▁Why▁my<0x09>m▁some▁Could▁could▁AI▁do▁unique▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁strategies▁Remember▁in▁▁▁▁▁▁▁▁▁▁▁▁universe▁we▁world▁what▁You▁It▁Now▁magical▁This▁would▁=▁understanding▁explain▁concept▁telere▁novel▁our▁knowledge▁why▁realm▁Let▁creative▁creatures▁intr0C(ast▁en▁right▁journey▁self▁So▁respect▁protagon▁A▁mighten▁there▁copyright▁behind▁address▁▁▁▁▁▁▁▁approach▁insights▁find▁feelings▁ability',con▁beings;▁art▁young'.▁reasons▁hair▁learnport▁myth▁fantasy▁best▁help▁deep7▁player▁die▁emotional▁rights▁▁▁▁growth▁digital▁these"

In [8]:
# Disabled persistence step. When uncommented, this pickles the top-100 unharmful
# unigram lists (head and tail, as token strings and as token ids) into the
# current working directory.
# with open('tokenizer-wildguard_dataset-wildguardmixTrain_LMI_unharmful_head.pkl','wb') as f:
#     pickle.dump(sorted_unharmful_head_words,f)
# with open('tokenizer-wildguard_dataset-wildguardmixTrain_LMI_unharmful_head_idx.pkl','wb') as f:
#     pickle.dump(sorted_idx_unharmful_head_words,f)

# with open('tokenizer-wildguard_dataset-wildguardmixTrain_LMI_unharmful_tail.pkl','wb') as f:
#     pickle.dump(sorted_unharmful_tail_words,f)
# with open('tokenizer-wildguard_dataset-wildguardmixTrain_LMI_unharmful_tail_idx.pkl','wb') as f:
#     pickle.dump(sorted_idx_unharmful_tail_words,f)

## example of shortcut keywords

In [12]:
# Top-100 unigram tokens ranked by descending harmful-label LMI.
sorted_harmful_head_words

['▁I',
 '▁someone',
 'harm',
 'ly',
 't',
 '▁self',
 '▁How',
 '▁can',
 "'",
 '▁Can',
 '▁sexual',
 '▁describe',
 '▁manip',
 '▁my',
 '▁sub',
 's',
 'leading',
 '▁without',
 '▁myself',
 '▁use',
 'otic',
 '▁suggest',
 '▁er',
 '▁take',
 '▁involving',
 '▁way',
 '▁caught',
 '▁getting',
 '▁sex',
 '▁person',
 '▁racial',
 '▁effective',
 '▁work',
 '▁recommend',
 '▁Write',
 '▁cow',
 'ork',
 '▁towards',
 '▁own',
 '▁group',
 'atively',
 '▁de',
 '▁subtle',
 '▁colle',
 'ague',
 '▁leaving',
 '▁partner',
 '▁opinion',
 '▁tips',
 '▁you',
 '▁neg',
 '▁progress',
 '▁detail',
 '▁fantasy',
 '▁mis',
 '▁Comp',
 '▁their',
 '▁pain',
 '▁won',
 '▁easy',
 '▁burning',
 '▁specific',
 '▁discre',
 '▁strategies',
 '▁detailed',
 '▁tactics',
 'et',
 'ose',
 '▁including',
 '▁share',
 '▁reputation',
 '▁method',
 '▁less',
 '▁porn',
 '▁people',
 '▁community',
 'inder',
 '▁c',
 '▁obvious',
 'ermine',
 '▁workplace',
 '▁productivity',
 '▁ri',
 'ulate',
 '▁job',
 '▁good',
 'cept',
 '▁ethnic',
 '▁give',
 '▁plan',
 '▁und',
 '▁career'

In [13]:
# Top-100 unigram tokens ranked by ascending harmful-label LMI (lowest scores first).
sorted_harmful_tail_words

[',',
 '▁the',
 '▁of',
 '▁for',
 '▁a',
 '▁and',
 '▁are',
 '▁that',
 '▁in',
 '▁some',
 '▁while',
 '▁be',
 'ing',
 '▁ensuring',
 '▁What',
 '▁harm',
 '▁legal',
 '▁any',
 '.',
 '▁not',
 '▁might',
 '▁but',
 '▁to',
 '▁or',
 '▁process',
 '▁activities',
 '▁used',
 '▁methods',
 '▁could',
 '▁is',
 '▁mim',
 '▁making',
 'ical',
 '▁un',
 '▁purposes',
 '▁illegal',
 '▁access',
 '▁on',
 '▁Could',
 '▁as',
 '▁ethical',
 '▁sim',
 '▁with',
 '▁creating',
 '▁one',
 '▁potential',
 'able',
 '▁from',
 '▁actions',
 'ional',
 '▁explain',
 '▁by',
 '▁re',
 'ot',
 '▁regulations',
 '▁scenario',
 '▁question',
 '▁guide',
 '▁laws',
 '▁information',
 '▁have',
 'ics',
 '▁no',
 '▁actually',
 '▁techniques',
 'het',
 '▁items',
 '▁hyp',
 '▁Expl',
 'less',
 '▁how',
 '▁causing',
 '▁avoid',
 '▁program',
 '▁law',
 '▁may',
 '▁measures',
 '▁system',
 'm',
 '▁conv',
 '▁when',
 '▁sure',
 '▁money',
 '▁all',
 '▁fict',
 '▁safety',
 'ic',
 '▁behavior',
 '▁boundaries',
 '▁restricted',
 '▁if',
 'stances',
 '▁within',
 '▁non',
 'ful',
 'in

In [45]:
# Top-100 unigram tokens ranked by descending unharmful-label LMI.
sorted_unharmful_head_words

[',',
 '▁the',
 '▁of',
 '▁for',
 '▁and',
 '▁are',
 '▁a',
 '▁that',
 '▁purposes',
 '▁while',
 '▁legal',
 '▁ensuring',
 '▁harm',
 '▁in',
 'less',
 '▁be',
 '▁some',
 '▁educational',
 'ing',
 '▁program',
 '▁security',
 '▁any',
 '▁used',
 '▁might',
 '▁not',
 '▁process',
 '▁legally',
 '▁illegal',
 '▁sure',
 '.',
 '▁all',
 '▁ethical',
 '▁but',
 '▁sim',
 '▁What',
 'het',
 'ware',
 '▁activities',
 '▁mim',
 '▁hyp',
 '▁re',
 'ical',
 '▁potential',
 '▁making',
 '▁could',
 'ional',
 '▁laws',
 '▁avoid',
 '▁is',
 '▁building',
 '▁have',
 '▁pr',
 '▁controlled',
 '▁system',
 '▁safe',
 '▁mal',
 '▁respect',
 '▁Could',
 '▁access',
 '▁or',
 '▁creating',
 '▁to',
 '▁by',
 '▁methods',
 '▁only',
 '▁no',
 'ank',
 '▁regulations',
 '▁law',
 '▁un',
 '▁test',
 '▁behavior',
 '▁may',
 '▁measures',
 '▁virus',
 '▁considered',
 '▁actions',
 '▁as',
 'm',
 '▁actual',
 '▁should',
 '▁understand',
 '▁explain',
 '▁from',
 'stances',
 '▁scenario',
 '▁r',
 'om',
 'able',
 'plica',
 'ans',
 '▁real',
 'ically',
 '▁with',
 '▁fict',

In [46]:
# Top-100 unigram tokens ranked by ascending unharmful-label LMI (lowest scores first).
sorted_unharmful_tail_words

['▁I',
 '▁someone',
 '▁How',
 '▁can',
 "'",
 '▁Can',
 'ly',
 't',
 '▁self',
 'harm',
 's',
 '▁my',
 '▁without',
 '▁sub',
 '▁describe',
 '▁use',
 '▁manip',
 '▁way',
 '▁suggest',
 '▁sexual',
 '▁you',
 '▁person',
 '▁take',
 '▁Write',
 '▁their',
 '▁towards',
 '▁myself',
 '▁own',
 '▁group',
 '▁mis',
 '▁strategies',
 '▁involving',
 '▁work',
 '▁leaving',
 '▁specific',
 '▁discre',
 '▁racial',
 'ulate',
 'et',
 '▁recommend',
 '▁Comp',
 '▁de',
 '▁effective',
 '▁tactics',
 '▁people',
 'leading',
 '▁neg',
 'ose',
 '▁tips',
 '▁detailed',
 '▁cow',
 'ork',
 '▁caught',
 'atively',
 '▁ways',
 '▁plan',
 '▁give',
 '▁others',
 '▁provide',
 '▁negative',
 '▁share',
 '▁reputation',
 '▁including',
 '▁getting',
 '▁dis',
 '▁opinion',
 '▁partner',
 '▁appears',
 '▁details',
 '▁generate',
 '-',
 '▁make',
 '▁won',
 '▁pain',
 '▁message',
 '▁about',
 '▁community',
 '▁traces',
 '▁encounter',
 '▁content',
 '▁subtle',
 'ity',
 '▁into',
 '▁against',
 '▁narrative',
 '▁internet',
 '▁presence',
 '▁job',
 'cept',
 '▁good',
 