In [None]:
!pip install transformers openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.3 MB/s 
[?25hCollecting openai
  Downloading openai-0.23.1.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 2.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 47.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 37.3 MB/s 
Collecting pandas-stubs>=1.1.0.11
  Downloading pandas_stubs-1.2.0.62-py3-none-any.whl (163 kB

In [None]:
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
import random
from datetime import datetime
import os
import openai

In [None]:
# --- Model / tokenizer selection ---
model_name = 'gpt3'          # label that goes into the results folder name
tokenizr = 'gpt2'            # checkpoint name handed to AutoTokenizer.from_pretrained
engn = "text-davinci-001"    # engine name sent with each completion request

# --- Experiment size ---
experiment_cycles = 100      # iterations of the main experiment loop

num_prime_tokens = 64        # tokens drawn per prime set (A and B) in each cycle
num_probe_tokens = 10        # probe tokens per probed cycle; also the cycle period of probing

top_k = 5                    # completions requested per prompt (sent as `n`)

prime_pause_str = '.' # (Punctuation) token (string) that separates trigrams in prime input

# Column layout of every result row: setting label, token ids of input and of the
# five outputs, then the same six items as text.
cols = ['Setting','TkInp','TkOut1','TkOut2','TkOut3','TkOut4','TkOut5','Inp','Out1','Out2','Out3','Out4','Out5']

#'w' and 'y' excluded from both, 'q' excluded from consonants
vowels = np.array(['a','e','i','o','u','A','E','I','O','U'])
consonants = np.array(['b','c','d','f','g','h','j','k','l','m',
                       'n','p','r','s','t','v','x','z','B','C','D','F','G','H','J','K','L','M','N','P','R','S','T','V','X','Z'])

tokenizer = AutoTokenizer.from_pretrained(tokenizr)

# Credentials come from the environment so they never end up in the saved notebook.
openai.organization = os.environ.get("OPENAI_ORGANIZATION")  # optional
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
  raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# One copy of the pause-token id for every prime row.
pauses = np.repeat(
    a=tokenizer.convert_tokens_to_ids(prime_pause_str),
    repeats=num_prime_tokens,
)

# Row indices 0..num_prime_tokens-1 in random order (shuffled in place).
prime_order = np.arange(num_prime_tokens)
np.random.shuffle(prime_order)

# Restricted Syllables: Preparation
# Build the regex alternation groups with an explicit join. Going through
# str(ndarray) is unsafe: numpy wraps printed arrays at its print linewidth
# (75 by default), so the 36-element consonant array is printed on two lines.
# The line break is not touched by the .replace() chain and the pattern ends up
# with one bogus alternative ("z'\n 'B") instead of the letters z and B.
v_str = '(' + '|'.join(vowels) + ')'
c_str = '(' + '|'.join(consonants) + ')'

# Decode every token id once; the row index of `df` is therefore the token id.
tkns = [tokenizer.decode([i]) for i in range(len(tokenizer.vocab.keys()))]
df = pd.DataFrame(data={'Token':tkns})

# Keep tokens made of a leading space plus exactly two letters: vowel+consonant
# or consonant+vowel. pd.concat replaces DataFrame.append, which was deprecated
# in pandas 1.4 and removed in 2.0.
syl_df = pd.concat([
    df[df.Token.str.match(f'^ {v_str}{c_str}$')],  #_VC
    df[df.Token.str.match(f'^ {c_str}{v_str}$')],  #_CV
])
syl_df = syl_df.sort_index()
syl_df.reset_index(inplace=True)
syl_df = syl_df.rename(columns={'index':'TokenID'})
syls = syl_df.TokenID.to_numpy()  # token ids of all admissible syllables


def token_selector(num_tkns, ids_xcl=(), pool=None):
  """Draw `num_tkns` distinct token ids at random, skipping excluded ids.

  Args:
    num_tkns: number of ids to draw.
    ids_xcl: iterable of ids that must not be drawn.
    pool: optional iterable of candidate ids; defaults to the module-level `syls`.

  Returns:
    List of `num_tkns` distinct ids in random order.

  Raises:
    ValueError: if fewer than `num_tkns` candidates remain after exclusion.
  """
  if pool is None:
    pool = syls
  # random.sample() needs a sequence: passing a set is deprecated since
  # Python 3.9 and raises TypeError from 3.11. Sorting also makes the draw
  # depend only on the RNG state, not on set iteration order.
  candidates = sorted(set(pool) - set(ids_xcl))
  return random.sample(candidates, num_tkns)

def output_generation(prime_ASR, probe_a=None):
  """Send a prime (optionally followed by one probe token) to the completion engine.

  Args:
    prime_ASR: array of token ids forming the prime; it is flattened and decoded
      to text before being sent.
    probe_a: optional token id appended after the prime. Without a probe, 3 new
      tokens are requested; with a probe, 2.

  Returns:
    Single-row DataFrame with columns `cols`: the setting label, the token ids of
    the prompt, the token ids of each completion past the prime, the prompt text,
    and each completion text with the prime text removed.
  """
  prime_ids = prime_ASR.flatten()
  prime_input_non = tokenizer.decode(prime_ids)  # prime text without any probe

  if probe_a is None:
    prime_input = prime_input_non
    max_new = 3
  else:
    prime_input = tokenizer.decode(np.append(prime_ids, probe_a))
    max_new = 2

  output = openai.Completion.create(
      engine=engn,
      prompt=prime_input,
      temperature=0.6,
      echo=True,  # returned text starts with the prompt itself
      max_tokens=max_new,
      frequency_penalty=0,
      presence_penalty=0,
      n=top_k)

  # `cols` holds Setting, TkInp, Inp plus one TkOut/Out pair per stored completion.
  n_out = (len(cols) - 3) // 2
  texts = [output.choices[k]['text'] for k in range(n_out)]

  # Number of prime token ids to drop from the re-encoded echo. Assumes each prime
  # row is 4 tokens (trigram + pause) and that re-encoding the text reproduces the
  # prime's ids - TODO confirm. In the probe setting the probe id is kept.
  cut = 4*num_prime_tokens

  setting = 'no-probe' if prime_input_non == prime_input else 'probe A'

  row = [setting, tokenizer.encode(prime_input)]
  row += [tokenizer.encode(text)[cut:] for text in texts]
  row.append(prime_input)
  row += [text.replace(prime_input_non,'') for text in texts]

  return pd.DataFrame(row,index=cols).T

# Result rows are collected per pattern and concatenated once after the loop;
# concatenating onto a growing DataFrame after every API call re-copies all
# earlier rows each time.
pattern_names = ('AAB', 'ABA', 'ABB', 'ABCa', 'ABCb')
rows = {name: [] for name in pattern_names}

for i_exp in range(experiment_cycles):
  # Two disjoint sets of syllable ids for this cycle.
  ids_prime_a = token_selector(num_prime_tokens)
  ids_prime_b = token_selector(num_prime_tokens,ids_prime_a)

  # One (A, B) pair per row.
  combs_AB = np.transpose((ids_prime_a, ids_prime_b))
  half = len(combs_AB)//2

  # Each prime row is three syllable ids followed by the pause id.
  prime_AAB = np.insert(combs_AB[:,(0,0,1)],3,pauses,axis=1)
  prime_ABA = np.insert(combs_AB[:,(0,1,0)],3,pauses,axis=1)
  prime_ABB = np.insert(combs_AB[:,(0,1,1)],3,pauses,axis=1)
  # ABC controls: the third item is taken from another row (column rolled by half
  # its length), then the rows are reordered by `prime_order`.
  prime_ABCa = np.insert(np.insert(combs_AB, 2, np.roll(combs_AB[:,0],half), axis=1),3,pauses,axis=1)[prime_order] #ABCa: C is another row's A
  prime_ABCb = np.insert(np.insert(combs_AB, 2, np.roll(combs_AB[:,1],half), axis=1),3,pauses,axis=1)[prime_order] #ABCb: C is another row's B

  primes = (prime_AAB, prime_ABA, prime_ABB, prime_ABCa, prime_ABCb)

  # No-probe completion for every pattern.
  for name, prime in zip(pattern_names, primes):
    rows[name].append(output_generation(prime))

  # Probed completions only on every `num_probe_tokens`-th cycle.
  if i_exp%num_probe_tokens == 0:
    # Probe ids are drawn only when needed and never overlap with this cycle's primes.
    ids_probe_a = token_selector(num_probe_tokens,np.concatenate((ids_prime_a, ids_prime_b), axis=0))
    for a_probe in ids_probe_a:
      for name, prime in zip(pattern_names, primes):
        rows[name].append(output_generation(prime, a_probe))

  print(f'{str(datetime.now())}: {i_exp+1} done')

# Build the final frames; fall back to an empty frame if no cycle was run.
df_AAB_priming = pd.concat(rows['AAB'], ignore_index=True) if rows['AAB'] else pd.DataFrame(columns=cols)
df_ABA_priming = pd.concat(rows['ABA'], ignore_index=True) if rows['ABA'] else pd.DataFrame(columns=cols)
df_ABB_priming = pd.concat(rows['ABB'], ignore_index=True) if rows['ABB'] else pd.DataFrame(columns=cols)
df_ABCa_priming = pd.concat(rows['ABCa'], ignore_index=True) if rows['ABCa'] else pd.DataFrame(columns=cols)
df_ABCb_priming = pd.concat(rows['ABCb'], ignore_index=True) if rows['ABCb'] else pd.DataFrame(columns=cols)

# Results directory: <model>_<cycles>_<prime tokens>_<probe tokens>_<top_k>/
folder = f"{model_name.replace('/','.')}_{experiment_cycles}_{num_prime_tokens}_{num_probe_tokens}_{top_k}/"
try:
  os.mkdir(folder)
except FileExistsError:
  pass  # directory left over from an earlier run is reused

# A single timestamp shared by all files written in this run.
time_st = str(datetime.now())

# One ';'-separated CSV per priming pattern.
for tag, frame in (('AAB', df_AAB_priming),
                   ('ABA', df_ABA_priming),
                   ('ABB', df_ABB_priming),
                   ('ABCa', df_ABCa_priming),
                   ('ABCb', df_ABCb_priming)):
  frame.to_csv(f'{folder}{time_st}_generate_{tag}.csv',sep=';')

2022-10-14 13:26:52.226680: 1 done
2022-10-14 13:26:53.213294: 2 done
2022-10-14 13:26:54.161356: 3 done
2022-10-14 13:26:55.155132: 4 done
2022-10-14 13:26:56.050976: 5 done
2022-10-14 13:26:56.949184: 6 done
2022-10-14 13:26:57.903949: 7 done
2022-10-14 13:26:58.871698: 8 done
2022-10-14 13:26:59.840346: 9 done
2022-10-14 13:27:00.805997: 10 done
2022-10-14 13:27:09.549496: 11 done
2022-10-14 13:27:10.621161: 12 done
2022-10-14 13:27:11.632942: 13 done
2022-10-14 13:27:12.566829: 14 done
2022-10-14 13:27:13.532439: 15 done
2022-10-14 13:27:14.475340: 16 done
2022-10-14 13:27:15.417677: 17 done
2022-10-14 13:27:16.342399: 18 done
2022-10-14 13:27:17.273014: 19 done
2022-10-14 13:27:18.322158: 20 done
2022-10-14 13:27:27.678324: 21 done
2022-10-14 13:27:28.627882: 22 done
2022-10-14 13:27:29.547598: 23 done
2022-10-14 13:27:30.486117: 24 done
2022-10-14 13:27:31.422173: 25 done
2022-10-14 13:27:32.388479: 26 done
2022-10-14 13:27:33.360265: 27 done
2022-10-14 13:27:34.323652: 28 done
2

# Download

In [None]:
!zip -r downDAVINCIbl.zip /content/gpt3_100_64_10_5
from google.colab import files
files.download('downDAVINCIbl.zip')

  adding: content/gpt3_100_64_10_5/ (stored 0%)
  adding: content/gpt3_100_64_10_5/2022-10-14 13:29:41.192755_generate_ABCb.csv (deflated 78%)
  adding: content/gpt3_100_64_10_5/2022-10-14 13:29:41.192755_generate_AAB.csv (deflated 82%)
  adding: content/gpt3_100_64_10_5/2022-10-14 13:29:41.192755_generate_ABCa.csv (deflated 78%)
  adding: content/gpt3_100_64_10_5/2022-10-14 13:29:41.192755_generate_ABA.csv (deflated 78%)
  adding: content/gpt3_100_64_10_5/2022-10-14 13:29:41.192755_generate_ABB.csv (deflated 82%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Clean up the workspace: remove the downloaded archive and the results folder.
# NOTE: both globs are broad - every path under /content whose name starts with
# "down" or "gpt" is deleted recursively, not only the files created above.
!rm -r /content/down*
!rm -r /content/gpt*