In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 23.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 58.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 50.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


In [None]:
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
import random
from datetime import datetime
import os
import json
import requests

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# --- Experiment configuration ---
model_name = 'bigscience/bloom' # Hugging Face model id; also used for API_URL and the output folder name

experiment_cycles = 100 # Number of iterations of the main experiment loop

num_prime_tokens = 64 # Syllable tokens drawn per role (A and B) in each cycle
num_probe_tokens = 10 # Probe syllables drawn per cycle; also the cycle interval at which probes are queried

# NOTE(review): top_k is only written into the output folder name; it is not
# passed to the Inference API request -- confirm that this is intended.
top_k = 1

prime_pause_str = '.' # (Punctuation) token (string) that separates trigrams in prime input

# Column names of every result row produced by output_generation()
cols = ['Setting','TkInp','TkOut1','Inp','Out1']

#'w' and 'y' excluded from both, 'q' excluded from consonants
vowels = np.array(['a','e','i','o','u','A','E','I','O','U'])
consonants = np.array(['b','c','d','f','g','h','j','k','l','m',
                       'n','p','r','s','t','v','x','z','B','C','D','F','G','H','J','K','L','M','N','P','R','S','T','V','X','Z'])

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Read the API token from the environment so the real credential never has to
# be stored in the notebook; the original placeholder is kept as the fallback.
hf_api_token = os.environ.get('HF_API_TOKEN', 'api_org_...')
headers = {"Authorization": f"Bearer {hf_api_token}"}
API_URL = "https://api-inference.huggingface.co/models/"+model_name

In [None]:
# One pause-token id per prime trigram (inserted as the 4th column of every prime row)
pauses = np.repeat(tokenizer.convert_tokens_to_ids(prime_pause_str), num_prime_tokens)
# Random row permutation, drawn once and reused for the ABC primes in every cycle
prime_order = np.arange(num_prime_tokens)
np.random.shuffle(prime_order)

# Restricted Syllables: Preparation
# Regex alternation groups such as "(a|e|i|...)". They are built with join()
# instead of from str(array): NumPy wraps the printed form of a long array over
# several lines, so a textual "' '" -> "|" replacement misses the separator at
# the line break and fuses the two neighbouring letters into one bogus
# alternative, silently dropping syllables that start or end with them.
v_str = '(' + '|'.join(vowels) + ')'
c_str = '(' + '|'.join(consonants) + ')'

# Decode every vocabulary id to its string form; the DataFrame index is the token id
tkns = [tokenizer.decode([i]) for i in range(len(tokenizer.vocab))]
df = pd.DataFrame(data={'Token':tkns})

# Keep only tokens made of a leading space plus one vowel and one consonant
is_vc = df.Token.str.match(f'^ {v_str}{c_str}$') #_VC
is_cv = df.Token.str.match(f'^ {c_str}{v_str}$') #_CV
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
syl_df = pd.concat([df[is_vc], df[is_cv]]).sort_index()
syl_df = syl_df.reset_index().rename(columns={'index':'TokenID'})
# Pool of admissible syllable token ids used by token_selector()
syls = syl_df.TokenID.to_numpy()


def token_selector(num_tkns, ids_xcl=()):
  """Draw `num_tkns` distinct token ids at random from the syllable pool.

  Args:
    num_tkns: Number of ids to draw.
    ids_xcl: Iterable of token ids that must not be drawn (default: none).

  Returns:
    List of `num_tkns` distinct ids taken from the module-level `syls`,
    none of which occurs in `ids_xcl`.

  Raises:
    ValueError: If fewer than `num_tkns` candidate ids remain.
  """
  # random.sample() requires a sequence: passing a set is deprecated since
  # Python 3.9 and raises TypeError from Python 3.11 on. Sorting additionally
  # makes the candidate order independent of set iteration order.
  candidates = sorted(set(syls) - set(ids_xcl))
  return random.sample(candidates, num_tkns)

def _query_api(payload):
  """POST `payload` as JSON to API_URL and return the decoded JSON reply."""
  data = json.dumps(payload)
  response = requests.request("POST", API_URL, headers=headers, data=data)
  return json.loads(response.content.decode("utf-8"))


def output_generation(prime_ASR, probe_a=None):
  """Send one prime (optionally followed by a probe token) to the model.

  Args:
    prime_ASR: NumPy array of prime token ids; it is flattened and decoded
      into the prompt text.
    probe_a: Optional token id appended after the prime. When omitted, only
      the prime is sent.

  Returns:
    One-row DataFrame with the columns listed in `cols`:
    Setting ('no-probe' or 'probe A'), TkInp (token ids of the prompt),
    TkOut1 (token ids of the returned text after the prime), Inp (prompt
    text) and Out1 (returned text with the prime text removed).

  Raises:
    RuntimeError: If the API reply is not a list of generations, e.g. when it
      is an error object.
  """
  # Prompt text of the prime alone; also used to strip the prime from the reply
  prime_input_non = tokenizer.decode(prime_ASR.flatten())

  if probe_a is None:
    setting = 'no-probe'
    prime_input = prime_input_non
    max_new = 3
  else:
    setting = 'probe A'
    prime_input = tokenizer.decode(np.append(prime_ASR.flatten(), probe_a))
    # One token fewer than without probe -- presumably so that probe plus
    # continuation again span three tokens (TODO confirm).
    max_new = 2

  data = _query_api({
      "inputs": prime_input,
      "parameters": {"max_new_tokens": max_new},})

  # A failed request (model still loading, rate limit, invalid token, ...) is
  # answered with a JSON object carrying an "error" field instead of a list.
  # Fail with a readable message rather than an opaque KeyError below.
  if isinstance(data, dict) and 'error' in data:
    raise RuntimeError(f"Inference API error from {API_URL}: {data['error']}")
  if not isinstance(data, list) or not data or 'generated_text' not in data[0]:
    raise RuntimeError(f"Unexpected Inference API response from {API_URL}: {data!r}")

  generated_text = data[0]['generated_text']

  temp = [
    setting,
    tokenizer.encode(prime_input),
    # Skip the prime: 4 ids per prime row (3 syllables + 1 pause) as built in
    # this notebook, assuming re-encoding reproduces the prime's tokenization.
    tokenizer.encode(generated_text)[4*num_prime_tokens:],
    prime_input,
    generated_text.replace(prime_input_non,''),
  ]

  return pd.DataFrame(temp,index=cols).T

# Priming patterns under test; the names are also used in the CSV file names
pattern_names = ['AAB', 'ABA', 'ABB', 'ABCa', 'ABCb']

# Single-row result frames are collected per pattern and concatenated once
# after the loop (growing a DataFrame with concat in a loop is quadratic).
results = {name: [] for name in pattern_names}


def _with_pause(trigrams):
  """Append the pause token as 4th column to an (n, 3) array of trigram ids."""
  return np.insert(trigrams, 3, pauses, axis=1)


for i_exp in range(experiment_cycles):
  ids_prime_a = token_selector(num_prime_tokens)
  ids_prime_b = token_selector(num_prime_tokens,ids_prime_a)

  # Row i holds the syllable pair (A_i, B_i)
  combs_AB = np.transpose((ids_prime_a, ids_prime_b))
  half = len(combs_AB)//2

  primes = {
    'AAB': _with_pause(combs_AB[:,(0,0,1)]),
    'ABA': _with_pause(combs_AB[:,(0,1,0)]),
    'ABB': _with_pause(combs_AB[:,(0,1,1)]),
    # ABC primes: the third syllable is the A (ABCa) or B (ABCb) syllable of
    # the row half the list away; rows are then permuted by prime_order.
    'ABCa': _with_pause(np.insert(combs_AB, 2, np.roll(combs_AB[:,0],half), axis=1))[prime_order],
    'ABCb': _with_pause(np.insert(combs_AB, 2, np.roll(combs_AB[:,1],half), axis=1))[prime_order],
  }

  # Probe syllables must differ from every syllable used in the primes
  ids_probe_a = token_selector(num_probe_tokens,np.concatenate((ids_prime_a, ids_prime_b), axis=0))

  # Prime-only queries, one per pattern
  for name in pattern_names:
    results[name].append(output_generation(primes[name]))

  # Probe queries are only issued on every num_probe_tokens-th cycle
  if i_exp%num_probe_tokens == 0:
    for a_probe in ids_probe_a:
      for name in pattern_names:
        results[name].append(output_generation(primes[name], a_probe))

  print(f'{str(datetime.now())}: {i_exp+1} done')

priming_dfs = {
  name: pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=cols)
  for name, frames in results.items()
}
df_AAB_priming = priming_dfs['AAB']
df_ABA_priming = priming_dfs['ABA']
df_ABB_priming = priming_dfs['ABB']
df_ABCa_priming = priming_dfs['ABCa']
df_ABCb_priming = priming_dfs['ABCb']

# Output folder name encodes the model and the experiment settings
folder = model_name.replace('/','.')
folder += f'_{experiment_cycles}_{num_prime_tokens}_{num_probe_tokens}_{top_k}/'
os.makedirs(folder, exist_ok=True)

# A shared timestamp groups the five CSV files of one run
time_st = str(datetime.now())

for name, frame in priming_dfs.items():
  frame.to_csv(f'{folder}{time_st}_generate_{name}.csv',sep=';')

2022-09-27 16:11:18.221489: 1 done
2022-09-27 16:11:29.130030: 2 done
2022-09-27 16:11:48.994870: 3 done
2022-09-27 16:12:08.715181: 4 done
2022-09-27 16:12:19.505217: 5 done
2022-09-27 16:12:43.905787: 6 done
2022-09-27 16:12:59.718952: 7 done
2022-09-27 16:13:20.301803: 8 done
2022-09-27 16:13:40.966181: 9 done
2022-09-27 16:14:04.989424: 10 done
2022-09-27 16:14:32.005991: 11 done
2022-09-27 16:14:33.798539: 12 done
2022-09-27 16:14:35.625457: 13 done
2022-09-27 16:14:41.709591: 14 done
2022-09-27 16:14:43.553504: 15 done
2022-09-27 16:14:45.343635: 16 done
2022-09-27 16:14:47.152507: 17 done
2022-09-27 16:14:48.915168: 18 done
2022-09-27 16:14:50.710782: 19 done
2022-09-27 16:14:53.752724: 20 done
2022-09-27 16:15:12.602002: 21 done
2022-09-27 16:15:14.470416: 22 done
2022-09-27 16:15:16.275914: 23 done
2022-09-27 16:15:20.482976: 24 done
2022-09-27 16:15:22.384899: 25 done
2022-09-27 16:15:25.517636: 26 done
2022-09-27 16:15:34.593922: 27 done
2022-09-27 16:15:36.386400: 28 done
2

# Download

In [None]:
# Pack the result folder into downAPI.zip and offer it as a browser download
# (google.colab is only available when running on Colab).
# NOTE(review): the folder name is hard-coded. It corresponds to the configured
# model_name, experiment_cycles, num_prime_tokens, num_probe_tokens and top_k,
# and assumes the results were written below /content -- update it when the
# configuration changes.
!zip -r downAPI.zip /content/bigscience.bloom_100_64_10_1
from google.colab import files
files.download('downAPI.zip')

updating: content/bigscience.bloom_100_64_10_1/ (stored 0%)
updating: content/bigscience.bloom_100_64_10_1/2022-09-27 15:18:55.917920_generate_AAB.csv (deflated 83%)
updating: content/bigscience.bloom_100_64_10_1/2022-09-27 15:18:55.917920_generate_ABA.csv (deflated 79%)
updating: content/bigscience.bloom_100_64_10_1/2022-09-27 15:18:55.917920_generate_ABB.csv (deflated 82%)
updating: content/bigscience.bloom_100_64_10_1/2022-09-27 15:18:55.917920_generate_ABCa.csv (deflated 78%)
updating: content/bigscience.bloom_100_64_10_1/2022-09-27 15:18:55.917920_generate_ABCb.csv (deflated 78%)
updating: content/bigscience.bloom_100_64_10_1/2022-09-27 15:52:20.057043_generate_ABB.csv (deflated 82%)
updating: content/bigscience.bloom_100_64_10_1/2022-09-27 15:52:20.057043_generate_AAB.csv (deflated 82%)
updating: content/bigscience.bloom_100_64_10_1/2022-09-27 15:52:20.057043_generate_ABCb.csv (deflated 78%)
updating: content/bigscience.bloom_100_64_10_1/2022-09-27 15:52:20.057043_generate_ABA.cs

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Clean-up: remove the zip archive(s) and the result folder(s).
# WARNING: the globs delete everything in /content whose name starts with
# "down" or "big", including all result CSVs -- run only after the download
# has completed.
!rm -r /content/down*
!rm -r /content/big*