# Setup

In [1]:
import os
os.environ['OPENAI_API_KEY'] = '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'

# Creating the pool

In [None]:
import bolift
import itertools
import cloudpickle

In [None]:
possible_mutations = ['DhyaA', 'DhydA', 'DhyaB', 'DcymA', 'DmtrA', 'DomcA', 'DmtrC']

shewanellas = ['WT']
for k in itertools.combinations(possible_mutations, 2):
  shewanellas.append("".join(k))
for k in itertools.combinations(possible_mutations, 3):
  shewanellas.append("".join(k))

In [None]:
props = {
  'SH_mutation': shewanellas,
  'SH_initial_conc': [0.05],
  'QD_material': ['CdS', 'CdSe', 'CdTe'],
  'QD_conc': [0.5, 1],
  'QD_Size': [510, 520, 535],
  'QD_Shape': ['spheres'], #, 'rods', 'palettes'],
  'QD_Surface': ["Nothing", "MPA (3-mercaptopropionic acid)", "GSH (glutathione)", "Cys (cystine)"],
  'MD_medium': ['minimum'],
  'MD_growth': ['anaerobic'],
  'MD_nutrient': ['lactate'],
  'MD_nutrient_conc': [20],
  'MD_shaking': [50, 100, 150],
  'MD_temperature': [5, 25, 45],
  'MD_pH': [7],
  'MD_light': [530],
  'MD_time': [1],
}


'''
The proposed experimental procedure is illustraded below:

WT Shewanella oneidensis MR-1 (inital OD600 0.05) were cultured with 1.0uM cadmium selenide quantum dots capped with 3-mercaptopropionic acid (MPA) in an
anaerobic minimal medium solution containing 20 mM lactate at 25 C. Cultures were irridated with 530nm LEDs for 1 week to yield 1.0 umol +/- 0.1 umol Hydrogen.
'''


pool_elements = []
for procedure in itertools.product(*props.values()):
  (SH_mutation,
  SH_initial_conc,
  QD_material,
  QD_conc,
  QD_Size,
  QD_Shape,
  QD_Surface,
  MD_medium,
  MD_growth,
  MD_nutrient,
  MD_nutrient_conc,
  MD_shaking,
  MD_temperature,
  MD_pH,
  MD_light,
  MD_time) = procedure
  scaffold = f"{SH_mutation} Shewanella oneidensis MR-1 (initial OD600 {SH_initial_conc}) were cultured with "\
             f"{QD_conc} uM {QD_material} quantum dots capped with {QD_Surface} "\
             f"in an {MD_growth} {MD_medium} medium solution containing {MD_nutrient_conc} mM {MD_nutrient} at {MD_temperature} ºC. " \
             f"Cultures were irradiated with {MD_light} nm LEDs for {MD_time} week."

  pool_elements.append(scaffold)

with open('pool.dat', 'w') as pool_file:
  pool_file.write('Procedure')
  pool_file.write('\n'.join(pool_elements))


In [None]:
pool = bolift.Pool(pool_elements)



In [None]:
cloudpickle.dump(pool, open("pool.pkl", "wb"))

# Generating prompts based on known data

In [2]:
import pandas as pd

# Load the Google Sheet into a Pandas DataFrame
df = pd.read_csv("BioNano Shared - exp results - formatted.csv", skiprows=[0])
df.drop(columns=df.columns[15:], inplace=True)

# Cleaning dataframe
## Remove invalid H2 values
df['H2'] = pd.to_numeric(df['H2'], errors='coerce')
df.dropna(subset=['H2'], inplace=True)
df.reset_index(drop=True, inplace=True)

## Build the mutation string
mutations = df.columns[1:8]
df[mutations] = df[mutations].astype(int)

def build_mutation(row):
  mut = ""
  for m in row.keys()[0:8]:
    if row[m] == 'Ascorbic Acid':
      mut += "Ascorbic Acid"
    else:
      mut += f"{m}" if int(row[m]) != 0 else ""
  return mut

df['mutation'] = df.apply(lambda x: build_mutation(x), axis=1)
df = df[df['mutation'] != "Ascorbic Acid"]
df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,WT,DhyaA,DhydA,DhyaB,DcymA,DmtrA,DmtrC,DomcA,Starting Conc,Size,Conc,Ligand,H2,Stdev,n,mutation
0,1,0,0,0,0,0,0,0,0.05,527.0,1.0,MPA,20.0,293.0,3.0,WT
1,1,0,0,0,0,0,0,0,0.05,,0.0,MPA,10.8,0.16,2.0,WT
2,0,0,1,1,0,0,0,0,0.05,527.0,1.0,MPA,25.2,1.06,3.0,DhydADhyaB
3,0,0,1,1,0,0,0,0,0.05,,0.0,MPA,0.0,0.0,1.0,DhydADhyaB
4,1,0,0,0,0,0,0,0,0.05,528.0,1.0,MPA,18.7,0.364,3.0,WT
5,0,0,1,1,0,0,0,0,0.05,528.0,1.0,MPA,21.9,1.62,3.0,DhydADhyaB
6,0,0,0,0,1,0,0,0,0.05,528.0,1.0,MPA,1.67,0.923,3.0,DcymA
7,0,0,0,0,0,1,0,0,0.05,528.0,1.0,MPA,4.75,0.627,3.0,DmtrA
8,0,0,0,0,0,0,1,1,0.05,528.0,1.0,MPA,7.17,0.141,3.0,DmtrCDomcA
9,1,0,0,0,0,0,0,0,0.05,,0.0,,6.42,5.18,2.0,WT


In [3]:

prompts, labels = [], []
for i, k in df.iterrows():
  (SH_mutation,
  SH_initial_conc,
  QD_material,
  QD_conc,
  QD_Size,
  QD_Shape,
  QD_Surface,
  MD_medium,
  MD_growth,
  MD_nutrient,
  MD_nutrient_conc,
  MD_shaking,
  MD_temperature,
  MD_pH,
  MD_light,
  MD_time) = k['mutation'], k['Starting Conc'], 'CdSe', k['Conc'], k['Size'], "espherical", k['Ligand'], 'minimal', 'anaerobic', 'lactate', '20', '100', '25', '7', '530','1'

  # shewanella = f'{mutation} Shewanella oneidensis MR-1 (initial OD600 {sh_conc})' if mutation != 'Ascorbic Acid' else f'A solution of {mutation} {sh_conc}'

  scaffold = f"{SH_mutation} Shewanella oneidensis MR-1 (initial OD600 {SH_initial_conc}) were cultured with "\
             f"{QD_conc} uM {QD_material} quantum dots capped with {QD_Surface} "\
             f"in an {MD_growth} {MD_medium} medium solution containing {MD_nutrient_conc} mM {MD_nutrient} at {MD_temperature} ºC. " \
             f"Cultures were irradiated with {MD_light} nm LEDs for {MD_time} week."
  label = k['H2']

  prompts.append(scaffold)
  labels.append(label)


# bo-lift

### Test prediction

In [12]:
import bolift
import cloudpickle

In [5]:
import numpy as np
indexes = np.arange(len(prompts))
np.random.shuffle(indexes)
split = (len(indexes)-5)/len(indexes)

train_indexes = indexes[:int(split*len(indexes))]
test_indexes = indexes[int(split*len(indexes)):]

In [6]:
asktell = bolift.AskTellFewShotTopk(
    prefix="The following are correctly answered questions about H2 production by Shewanella oneidensis MR-1 and quantum dots. " \
          " Each answer should be numeric and ends with ###",
    x_formatter=lambda x: f"the experimental procedure: {x}",
    y_name="H2 production",
    y_formatter=lambda y: f"{y:.2f}",
    model="gpt-4",
    selector_k=5,
    temperature=0.7
)

for i in train_indexes:
  asktell.tell(prompts[i], labels[i])

In [7]:
yhat=[]
y=[]
for i in test_indexes:
  yhat.append(asktell.predict(prompts[i]))
  y.append(labels[i])

In [8]:
for i in test_indexes:
  print(f"{prompts[i]} => {labels[i]}")

[f"{i.mean():.2f}" for i in yhat], y

DhydADhyaB Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with 1 uM CdSe quantum dots capped with MPA in an anaerobic minimal medium solution containing 20 mM lactate at 25 ºC. Cultures were irradiated with 530 nm LEDs for 1 week. => 18.7
DhydADhyaB Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with 1 uM CdSe quantum dots capped with MPA in an anaerobic minimal medium solution containing 20 mM lactate at 25 ºC. Cultures were irradiated with 530 nm LEDs for 1 week. => 25.2
DhydADhyaB Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with 0 uM CdSe quantum dots capped with nan in an anaerobic minimal medium solution containing 20 mM lactate at 25 ºC. Cultures were irradiated with 530 nm LEDs for 1 week. => 0.0
WT Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with 1 uM CdSe quantum dots capped with MPA in an anaerobic minimal medium solution containing 20 mM lactate at 25 ºC. Cultures were irradiated with 530 nm LEDs for 1 week. =

(['27.89', '27.89', '0.00', '10.86', '10.86'], [18.7, 25.2, 0.0, 18.7, 29.8])

In [9]:
sum = 0
print(f"| {'predicted':^23s} | {'label':^10s} | {'AE':^10s} | ")
n = 0
for ihat, i in zip(yhat, y):
  if i>30: continue
  n+=1
  mae = abs(ihat.mean()-i)
  print(f"| {ihat.mean():^10.2f}+/-{ihat.std():^10.2f} | {i:^10.2f} | {mae:^10.2f} |")
  sum += mae
print(f"\n{'MAE: ':>20s}{sum/n:<18.2f}")

|        predicted        |   label    |     AE     | 
|   27.89   +/-   0.32    |   18.70    |    9.19    |
|   27.89   +/-   0.32    |   25.20    |    2.69    |
|    0.00   +/-  11.66    |    0.00    |    0.00    |
|   10.86   +/-   1.82    |   18.70    |    7.84    |
|   10.86   +/-   1.82    |   29.80    |   18.94    |

               MAE: 7.73              


In [10]:
for i in test_indexes:
  asktell.tell(prompts[i], labels[i])

In [13]:
cloudpickle.dump(asktell, open('asktell.pkl', 'wb'))

### Running the Bayesian optimization

In [None]:
# asktell = bolift.AskTellFewShotTopk(
#     prefix="The following are correctly answered questions about H2 production by Shewanella oneidensis MR-1 and quantum dots. " \
#           " Each answer should be numeric and ends with ###",
#     x_formatter=lambda x: f"the experimental procedure: {x}",
#     y_name="H2 production",
#     y_formatter=lambda y: f"{y:.2f}",
#     model="gpt-4",
#     selector_k=5,
#     temperature=0.7
# )

# for p, l in zip(prompts, labels):
#   asktell.tell(p, l)

In [14]:
asktell = cloudpickle.load(open("./asktell.pkl", "rb"))
pool = cloudpickle.load(open("./pool.pkl", "rb"))

In [15]:
asktell.ask(pool, "probability_of_improvement", 5)

(['DmtrADomcADmtrC Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with0.5 uM CdS quantum dots capped with MPA (3-mercaptopropionic acid) in an anaerobic minimum medium solution containing 20 mM lactate at 5 ºC. Cultures were irradiated with 530 nm LEDs for 1 week.',
  'DomcADmtrC Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with0.5 uM CdSe quantum dots capped with MPA (3-mercaptopropionic acid) in an anaerobic minimum medium solution containing 20 mM lactate at 45 ºC. Cultures were irradiated with 530 nm LEDs for 1 week.',
  'DhyaADhyaBDmtrC Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with0.5 uM CdSe quantum dots capped with Cys (cystine) in an anaerobic minimum medium solution containing 20 mM lactate at 45 ºC. Cultures were irradiated with 530 nm LEDs for 1 week.',
  'DhydADhyaB Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with1 uM CdTe quantum dots capped with GSH (glutathione) in an anaerobic minimum medium solution

In [16]:
asktell.ask(pool, "expected_improvement", 5)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-4 in organization org-zVzDC0J6UhWoGf9pmQAfuLud on tokens per min. Limit: 40000 / min. Please try again in 1ms. Contact us through our help center at help.openai.com if you continue to have issues..
Error in OpenAICallbackHandler.on_retry callback: 'OpenAICallbackHandler' object has no attribute 'on_retry'
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-4 in organization org-zVzDC0J6UhWoGf9pmQAfuLud on tokens per min. Limit: 40000 / min. Please try again in 1ms. Contact us through our help center at help.openai.com if you continue to have issues..
Error in OpenAICallbackHandler.on_retry callback: 'OpenAICallbackHandler' object has no attribute 'on_retry'
Retrying langchain.cha

(['DhyaADhydADomcA Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with0.5 uM CdS quantum dots capped with Cys (cystine) in an anaerobic minimum medium solution containing 20 mM lactate at 5 ºC. Cultures were irradiated with 530 nm LEDs for 1 week.',
  'DhyaADhydA Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with1 uM CdSe quantum dots capped with MPA (3-mercaptopropionic acid) in an anaerobic minimum medium solution containing 20 mM lactate at 45 ºC. Cultures were irradiated with 530 nm LEDs for 1 week.',
  'DhyaADhyaBDmtrC Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with0.5 uM CdTe quantum dots capped with Nothing in an anaerobic minimum medium solution containing 20 mM lactate at 25 ºC. Cultures were irradiated with 530 nm LEDs for 1 week.',
  'DcymADomcADmtrC Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with0.5 uM CdTe quantum dots capped with Cys (cystine) in an anaerobic minimum medium solution containing 20 mM lact

In [17]:
asktell.ask(pool, "greedy", 5)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-4 in organization org-zVzDC0J6UhWoGf9pmQAfuLud on tokens per min. Limit: 40000 / min. Please try again in 1ms. Contact us through our help center at help.openai.com if you continue to have issues..
Error in OpenAICallbackHandler.on_retry callback: 'OpenAICallbackHandler' object has no attribute 'on_retry'
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Rate limit reached for default-gpt-4 in organization org-zVzDC0J6UhWoGf9pmQAfuLud on tokens per min. Limit: 40000 / min. Please try again in 1ms. Contact us through our help center at help.openai.com if you continue to have issues..
Error in OpenAICallbackHandler.on_retry callback: 'OpenAICallbackHandler' object has no attribute 'on_retry'
Retrying langchain.cha

(['DhyaADhydA Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with1 uM CdSe quantum dots capped with MPA (3-mercaptopropionic acid) in an anaerobic minimum medium solution containing 20 mM lactate at 25 ºC. Cultures were irradiated with 530 nm LEDs for 1 week.',
  'DhyaADhydA Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with1 uM CdSe quantum dots capped with MPA (3-mercaptopropionic acid) in an anaerobic minimum medium solution containing 20 mM lactate at 25 ºC. Cultures were irradiated with 530 nm LEDs for 1 week.',
  'DhyaADhydA Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with1 uM CdSe quantum dots capped with MPA (3-mercaptopropionic acid) in an anaerobic minimum medium solution containing 20 mM lactate at 25 ºC. Cultures were irradiated with 530 nm LEDs for 1 week.',
  'DhyaADhydA Shewanella oneidensis MR-1 (initial OD600 0.05) were cultured with1 uM CdSe quantum dots capped with MPA (3-mercaptopropionic acid) in an anaerobic minimum