<a href="https://colab.research.google.com/github/myprogrammerpersonality/BlackBoxOptimizer/blob/master/BBO_v1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## HeadQuarter

Main notebook: it covers the data processing and the Bayesian optimization workflow.

In [1]:
import pandas as pd
import numpy as np
import os
from xgboost import XGBRegressor
import time

pd.options.mode.chained_assignment = None  # default='warn'
!wget https://raw.githubusercontent.com/myprogrammerpersonality/BlackBoxOptimizer/master/functions.py
from functions import *

--2020-08-16 22:15:06--  https://raw.githubusercontent.com/myprogrammerpersonality/BlackBoxOptimizer/master/functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10826 (11K) [text/plain]
Saving to: ‘functions.py.1’


2020-08-16 22:15:07 (102 MB/s) - ‘functions.py.1’ saved [10826/10826]



In [2]:
# Part 1: concentration grid and stock concentration of every component.
#
# Each entry maps a component name to (min, max, stock).
#   * min, max and stock of one component must share the same unit.
#   * Every value has to be defined so that the helper functions can derive
#     concentrations that fit the droplet size of the ECHO machine
#     (the original notes mention a 25 nl droplet).
#
# The "p=..." and "***" annotations are carried over from the original notes.

concentrations_limits = {
    'NTP':          (0.78,   3.12,   156),     # mM     p=156
    'HEPES':        (50,     50,     2000),    # mM     p=2000
    'K_Glutamate':  (30,     150,    3000.0),  # mM     p=2666.66 ***  (note differs from the 3000.0 stock used here - TODO confirm)
    'Mg_Glutamate': (0.0,    10,     500),     # mM     p=500
    'spermidine':   (0.1,    1.0,    20),      # mM     p=20
    'DTT':          (0.0,    3.0,    100),     # mM     p=100 ***
    'amino_acids':  (0.15,   1.5,    6),       # mM     p=6
    'folinic_acid': (0.0136, 0.068,  2.72),    # mM     p=2.72
    'tRNA_mix':     (0.04,   0.2,    8),       # mg/ml  p=8
    'PEG':          (0.0,    4.0,    50),      # %      p=50 ***
    'CoA':          (0.065,  0.325,  13),      # mM     p=13
    'NAD':          (0.0875, 0.4375, 17.5),    # mM     p=17.5
    'cAMP':         (0.1625, 0.8125, 32.5),    # mM     p=32.5
    '3-PGA':        (4.0,    30,     800),     # mM     p=800 ***
    'DNA':          (20,     20,     1000),    # nM     p=1000
}

In [3]:
# Reference recipes and the negative control.
# Run this cell on every day: the recipes are appended to each day's plate.
# Every value is a one-element list so a recipe converts directly into a
# single-row DataFrame.

ref_excel = {
    'NTP': [0.78], 'HEPES': [50.0], 'K_Glutamate': [45.0], 'Mg_Glutamate': [2.5],
    'spermidine': [0.3], 'DTT': [0.5], 'amino_acids': [0.75], 'folinic_acid': [0.0408],
    'tRNA_mix': [0.04], 'PEG': [2.0], 'CoA': [0.065], 'NAD': [0.0875],
    'cAMP': [0.325], '3-PGA': [16.0], 'DNA': [20.0],
}

jove_protocol = {
    'NTP': [1.56], 'HEPES': [50.0], 'K_Glutamate': [90.0], 'Mg_Glutamate': [5.0],
    'spermidine': [1.0], 'DTT': [0.5], 'amino_acids': [1.5], 'folinic_acid': [0.068],
    'tRNA_mix': [0.2], 'PEG': [2.0], 'CoA': [0.26], 'NAD': [0.35],
    'cAMP': [0.8125], '3-PGA': [28.0], 'DNA': [20.0],
}

our_ref = {
    'NTP': [1.56], 'HEPES': [50.0], 'K_Glutamate': [90.0], 'Mg_Glutamate': [5.0],
    'spermidine': [0.5], 'DTT': [1.5], 'amino_acids': [0.81], 'folinic_acid': [0.0408],
    'tRNA_mix': [0.12], 'PEG': [2.0], 'CoA': [0.195], 'NAD': [0.2625],
    'cAMP': [0.4875], '3-PGA': [16.0], 'DNA': [20.0],
}

# Negative control: identical to our_ref except that no DNA is added.
# Each list is copied so the two recipes never share mutable values.
control_neg = {component: list(values) for component, values in our_ref.items()}
control_neg['DNA'] = [0.0]

## Day 1

In [None]:
# Day 1 has no measurements yet, so the first plate is produced by the
# random_combination_generator helper imported from functions.py.
# NOTE(review): drop_size_nl is 100 here although the notes on the
# concentration limits mention a 25 nl droplet - confirm which one applies.
Concentrations_1 = random_combination_generator(
    concentrations_limits,
    number_of_combination=20,
    reaction_vol_nl=20000,
    check_max=True,
    max_nl=13200,
    drop_size_nl=100,
    make_csv=False,
    return_df=True,
)
Concentrations_1

Unnamed: 0,NTP,HEPES,K_Glutamate,Mg_Glutamate,spermidine,DTT,amino_acids,folinic_acid,tRNA_mix,PEG,CoA,NAD,cAMP,3-PGA,DNA
0,1.56,50.0,135.0,5.0,0.2,0.5,1.41,0.0544,0.04,0.25,0.26,0.2625,0.4875,32.0,20.0
1,1.56,50.0,30.0,10.0,1.0,3.0,0.21,0.0408,0.16,1.0,0.13,0.35,0.65,28.0,20.0
2,2.34,50.0,90.0,7.5,0.1,0.5,0.66,0.0408,0.12,2.75,0.195,0.0875,0.65,8.0,20.0
3,3.12,50.0,75.0,5.0,0.4,1.5,0.84,0.0272,0.12,2.25,0.13,0.175,0.8125,20.0,20.0
4,1.56,50.0,30.0,2.5,0.9,0.5,0.6,0.0136,0.04,0.5,0.065,0.2625,0.1625,8.0,20.0
5,3.12,50.0,105.0,0.0,0.4,0.5,0.81,0.0544,0.16,2.5,0.325,0.35,0.1625,12.0,20.0
6,2.34,50.0,90.0,7.5,1.0,2.0,0.39,0.0272,0.16,3.75,0.065,0.4375,0.8125,4.0,20.0
7,1.56,50.0,120.0,0.0,0.7,2.5,0.9,0.0136,0.12,1.25,0.26,0.4375,0.4875,8.0,20.0
8,1.56,50.0,150.0,2.5,0.4,1.5,1.2,0.0272,0.08,1.5,0.13,0.175,0.4875,8.0,20.0
9,1.56,50.0,75.0,0.0,0.7,0.5,0.84,0.0136,0.16,3.0,0.195,0.35,0.1625,32.0,20.0


In [None]:
# Turn each reference recipe / control into a one-row frame and append them,
# in the order ref_excel, jove, ours, control, below the generated rows.
# WARNING: this cell reassigns Concentrations_1 from itself, so running it a
# second time appends the four special rows again.
df_ref, df_jove, df_ours, df_control = (
    pd.DataFrame(recipe)
    for recipe in (ref_excel, jove_protocol, our_ref, control_neg)
)

Concentrations_1 = pd.concat(
    [Concentrations_1, df_ref, df_jove, df_ours, df_control]
).reset_index(drop=True)
Concentrations_1

Unnamed: 0,NTP,HEPES,K_Glutamate,Mg_Glutamate,spermidine,DTT,amino_acids,folinic_acid,tRNA_mix,PEG,CoA,NAD,cAMP,3-PGA,DNA
0,1.56,50.0,135.0,5.0,0.2,0.5,1.41,0.0544,0.04,0.25,0.26,0.2625,0.4875,32.0,20.0
1,1.56,50.0,30.0,10.0,1.0,3.0,0.21,0.0408,0.16,1.0,0.13,0.35,0.65,28.0,20.0
2,2.34,50.0,90.0,7.5,0.1,0.5,0.66,0.0408,0.12,2.75,0.195,0.0875,0.65,8.0,20.0
3,3.12,50.0,75.0,5.0,0.4,1.5,0.84,0.0272,0.12,2.25,0.13,0.175,0.8125,20.0,20.0
4,1.56,50.0,30.0,2.5,0.9,0.5,0.6,0.0136,0.04,0.5,0.065,0.2625,0.1625,8.0,20.0
5,3.12,50.0,105.0,0.0,0.4,0.5,0.81,0.0544,0.16,2.5,0.325,0.35,0.1625,12.0,20.0
6,2.34,50.0,90.0,7.5,1.0,2.0,0.39,0.0272,0.16,3.75,0.065,0.4375,0.8125,4.0,20.0
7,1.56,50.0,120.0,0.0,0.7,2.5,0.9,0.0136,0.12,1.25,0.26,0.4375,0.4875,8.0,20.0
8,1.56,50.0,150.0,2.5,0.4,1.5,1.2,0.0272,0.08,1.5,0.13,0.175,0.4875,8.0,20.0
9,1.56,50.0,75.0,0.0,0.7,0.5,0.84,0.0136,0.16,3.0,0.195,0.35,0.1625,32.0,20.0


In [None]:
# to_csv does not create missing folders, so make sure Day_1 exists first
# (exist_ok keeps the cell safe to re-run).
os.makedirs('Day_1', exist_ok=True)
Concentrations_1.to_csv('Day_1/Concentrations_1.csv', index=False)

In [None]:
# Convert the Day 1 concentration table into a volume table with the
# concentration_to_volume helper, then store it next to the concentrations.
Volumes_1 = concentration_to_volume(
    Concentrations_1,
    concentrations_limits,
    reaction_mixture_vol_nl=20000,
    add_lysate=True,
    make_csv=False,
)
Volumes_1.to_csv('Day_1/Volumes_1.csv', index=False)

## Other Days

In [18]:
# find the first incomplete day
def day_finder(file, file_format='csv', max_days=11, base_dir='.'):
    """Return the first day whose ``Day_<i>/<file>_<i>.<file_format>`` is missing.

    Parameters
    ----------
    file : str
        File-name prefix, e.g. ``'Results'`` for ``Day_3/Results_3.csv``.
    file_format : str, default 'csv'
        File extension without the leading dot.
    max_days : int, default 11
        Days ``1..max_days`` are checked in ascending order.
    base_dir : str, default '.'
        Folder that contains the ``Day_<i>`` folders.

    Returns
    -------
    int
        Number of the first day that lacks the file, or ``0`` when every
        checked day already has it. Callers must handle the ``0`` sentinel.
    """
    for i in range(1, max_days + 1):
        candidate = os.path.join(base_dir,
                                 'Day_{}'.format(i),
                                 '{}_{}.{}'.format(file, i, file_format))
        if not os.path.isfile(candidate):
            return i
    return 0

# day = last day that already has a Results file.
# day_finder returns 0 when none of the checked days is missing its Results
# file; subtracting 1 would give day = -1 and the cells below would then
# build paths for "Day_0", so stop here with an explicit error instead.
first_missing_day = day_finder('Results')
if first_missing_day == 0:
    raise RuntimeError('day_finder found a Results file for every checked day; '
                       'there is no next day to plan.')
day = first_missing_day - 1
day

2

In [5]:
# result preprocess

def result_preprocess(day, desired_cols, ranges=(20, 24)):
    """Load one day's results and split them into pipeline rows and special rows.

    Reads ``Day_<day>/Results_<day>.csv`` and keeps only ``desired_cols``.
    The last entry of ``desired_cols`` is treated as the label column, all
    preceding ones as features.

    Parameters
    ----------
    day : int
        Day number used to build the CSV path.
    desired_cols : list of str
        Columns to keep; the final one is the label.
    ranges : sequence of two ints, default (20, 24)
        Rows ``[0, ranges[0])`` are the pipeline combinations and rows
        ``[ranges[0], ranges[1])`` are the special rows
        (ref_excel, jove, ours, control).

    Returns
    -------
    tuple of four DataFrames
        ``(data_20, label_20, data_specials, label_specials)``.
    """
    selected = pd.read_csv('Day_{}/Results_{}.csv'.format(day, day))[desired_cols]

    pipeline_rows = slice(None, ranges[0])
    special_rows = slice(ranges[0], ranges[1])

    # features = every kept column but the last, label = the last kept column
    return (
        selected.iloc[pipeline_rows, :-1],
        selected.iloc[pipeline_rows, -1:],
        selected.iloc[special_rows, :-1],
        selected.iloc[special_rows, -1:],
    )

In [6]:
# Feature columns followed by the label column ('yield' must stay last:
# result_preprocess treats the final column as the label).
desired_cols = ['NTP', 'K_Glutamate', 'Mg_Glutamate', 'spermidine', 'DTT','amino_acids',
                'folinic_acid', 'tRNA_mix', 'PEG', 'CoA', 'NAD', 'cAMP','3-PGA', 'yield']

# Per-day (end of pipeline rows, end of special rows) passed to result_preprocess;
# one entry per day so a single day can be overridden if its plate layout differs.
days_range = [(20, 24)] * 10

# Collect the per-day frames first and concatenate once at the end.
# Growing a frame with pd.concat inside the loop, starting from an empty
# DataFrame, is quadratic and the empty object-dtype seed frame can turn the
# numeric columns into object dtype.
data_frames = []
label_frames = []
for num in range(day):
    data_20, label_20, _, _ = result_preprocess(num + 1, desired_cols, days_range[num])
    data_frames.append(data_20)
    label_frames.append(label_20)

# Our reference recipe is added as one extra training point with yield 1.0.
ref_data = pd.DataFrame(our_ref)
ref_label = pd.DataFrame({'yield':[1.0]})

# desired_cols[:-1] is exactly the feature list, so it is not spelled out again.
aggregated_data_20 = pd.concat(data_frames + [ref_data[desired_cols[:-1]]]).reset_index(drop=True)
aggregated_label_20 = pd.concat(label_frames + [ref_label]).reset_index(drop=True)

aggregated_data_20

Unnamed: 0,NTP,K_Glutamate,Mg_Glutamate,spermidine,DTT,amino_acids,folinic_acid,tRNA_mix,PEG,CoA,NAD,cAMP,3-PGA
0,1.56,90,10.0,0.7,1.0,1.44,0.0544,0.16,0.25,0.195,0.35,0.4875,4
1,3.12,105,10.0,0.1,2.5,0.99,0.068,0.08,1.25,0.13,0.2625,0.4875,4
2,2.34,30,10.0,0.2,0.0,0.69,0.0272,0.08,0.5,0.065,0.0875,0.325,12
3,1.56,105,0.0,1.0,0.5,0.51,0.0408,0.08,3.5,0.26,0.175,0.8125,32
4,2.34,135,10.0,0.1,1.0,0.42,0.0136,0.12,3.0,0.13,0.0875,0.1625,32
5,0.78,30,2.5,0.3,2.5,1.32,0.068,0.2,0.75,0.195,0.2625,0.4875,24
6,2.34,135,2.5,0.8,3.0,1.02,0.0408,0.12,2.0,0.13,0.175,0.4875,12
7,1.56,30,2.5,0.3,1.0,0.33,0.0408,0.16,3.0,0.325,0.0875,0.8125,4
8,0.78,30,5.0,0.3,1.5,1.17,0.0408,0.16,4.0,0.065,0.175,0.4875,20
9,3.12,135,0.0,0.4,3.0,0.39,0.0408,0.2,3.5,0.065,0.175,0.8125,20


In [7]:
# Ensemble of five XGBoost regressors (gradient-boosted decision trees).
# The members share every hyper-parameter except n_estimators.
# This setting is marked as the best one in the original notes.
xgb_shared_params = dict(
    objective='reg:squarederror',
    learning_rate=0.03,
    max_depth=6,
    min_child_weight=4,
    subsample=0.9,
    gamma=0.4,
    colsample_bytree=0.9,
)

regressors_list = [
    XGBRegressor(n_estimators=n, **xgb_shared_params)
    for n in (10, 20, 30, 40, 50)
]

In [8]:
# Columns the optimizer is allowed to vary (features only, no 'yield').
desired_cols = [
    'NTP', 'K_Glutamate', 'Mg_Glutamate', 'spermidine', 'DTT', 'amino_acids',
    'folinic_acid', 'tRNA_mix', 'PEG', 'CoA', 'NAD', 'cAMP', '3-PGA',
]

# Components held at a constant value.
fixed_col_value = {'HEPES': [50.0], 'DNA': [20.0]}

# Column order of the returned table (varied + fixed components).
final_order = [
    'NTP', 'HEPES', 'K_Glutamate', 'Mg_Glutamate', 'spermidine', 'DTT', 'amino_acids',
    'folinic_acid', 'tRNA_mix', 'PEG', 'CoA', 'NAD', 'cAMP', '3-PGA', 'DNA',
]

pool = 100000

t0 = time.time()

# Exploration schedule from the original notes:
#   day 2, 3        -> 1.41
#   day 4, 5, 6, 7  -> 1
#   day 8, 9, 10    -> 0.5
# NOTE(review): exploration is hard-coded to 0.5 (the day 8-10 value);
# confirm it matches the day currently being planned.
Concentrations_n_20 = bayesian_optimization(
    regressors_list,
    aggregated_data_20,
    aggregated_label_20,
    concentrations_limits,
    desired_cols=desired_cols,
    fixed_col_value=fixed_col_value,
    final_order=final_order,
    exploitation=1,
    exploration=0.5,
    test_size=20,
    pool_size=pool,
    verbose=0,
)

# elapsed wall-clock seconds of the optimization call
print(time.time()-t0)

Concentrations_n_20

7.6336588859558105


Unnamed: 0,NTP,HEPES,K_Glutamate,Mg_Glutamate,spermidine,DTT,amino_acids,folinic_acid,tRNA_mix,PEG,CoA,NAD,cAMP,3-PGA,DNA
0,2.34,50.0,45.0,10.0,0.5,0.0,0.15,0.0408,0.08,4.0,0.065,0.2625,0.8125,20.0,20.0
1,2.34,50.0,30.0,7.5,0.5,1.0,0.15,0.0408,0.04,3.5,0.065,0.175,0.4875,32.0,20.0
2,1.56,50.0,30.0,10.0,1.0,0.0,0.18,0.0136,0.12,4.0,0.065,0.0875,0.8125,32.0,20.0
3,2.34,50.0,45.0,10.0,0.8,0.5,1.14,0.0408,0.08,4.0,0.065,0.175,0.4875,20.0,20.0
4,1.56,50.0,30.0,7.5,0.8,0.0,0.15,0.0136,0.12,3.0,0.065,0.35,0.4875,20.0,20.0
5,3.12,50.0,45.0,10.0,0.6,1.0,0.18,0.0544,0.04,0.5,0.065,0.2625,0.325,20.0,20.0
6,0.78,50.0,30.0,5.0,0.2,0.0,0.15,0.0544,0.08,3.25,0.065,0.4375,0.4875,20.0,20.0
7,1.56,50.0,30.0,5.0,0.8,1.0,0.15,0.0136,0.12,3.25,0.065,0.175,0.8125,24.0,20.0
8,0.78,50.0,30.0,7.5,0.9,0.5,0.15,0.0408,0.16,3.75,0.065,0.0875,0.8125,20.0,20.0
9,0.78,50.0,30.0,5.0,1.0,1.0,0.18,0.068,0.16,0.25,0.065,0.0875,0.325,32.0,20.0


In [9]:
# One-row frames for the special recipes appended below the optimizer's rows.
df_jove = pd.DataFrame(jove_protocol)
df_ours = pd.DataFrame(our_ref)
df_control = pd.DataFrame(control_neg)

# NOTE(review): df_ours is appended twice and ref_excel is not used here,
# unlike Day 1 (ref_excel, jove, ours, control) - confirm this is intentional.
Concentrations_n = pd.concat([Concentrations_n_20, df_ours, df_jove, df_ours, df_control]).reset_index(drop=True)

# Create the folder for the next day. os.makedirs with exist_ok=True does not
# fail when the folder already exists (so the cell can be re-run) and does not
# depend on a shell `mkdir` being available.
name_folder = 'Day_{}'.format(day+1)
os.makedirs(name_folder, exist_ok=True)

Concentrations_n.to_csv('Day_{}/Concentrations_{}.csv'.format(day+1, day+1), index=False)

In [16]:
# Sanity check: the newly proposed combinations should not repeat a combination
# from an earlier day (unlikely, but cheap to verify).
# Only the first 20 rows of every day are compared.
previous = [
    pd.read_csv('Day_{}/Concentrations_{}.csv'.format(i, i)).iloc[:20, :]
    for i in range(1, day + 1)
]

df_main = pd.concat(previous)

# Outer merge with indicator=True adds a `_merge` column; a value of 'both'
# would mean that a row occurs in the earlier days and in the new day.
comparison_df = pd.merge(
    df_main,
    pd.read_csv('Day_{}/Concentrations_{}.csv'.format(day + 1, day + 1)).iloc[:20, :],
    how='outer',
    indicator=True,
)

comparison_df['_merge'].unique()

[left_only, right_only]
Categories (2, object): [left_only, right_only]

In [17]:
# Convert the next day's concentration table into volumes with the
# concentration_to_volume helper and save it in that day's folder.
Volumes_n = concentration_to_volume(
    Concentrations_n,
    concentrations_limits,
    reaction_mixture_vol_nl=20000,
    lysate_ratio=0.33,
    add_lysate=True,
    make_csv=False,
)

Volumes_n.to_csv('Day_{}/Volumes_{}.csv'.format(day+1, day+1), index=False)