<a href="https://colab.research.google.com/github/myprogrammerpersonality/BlackBoxOptimizer/blob/master/BBO_v2_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>version 2:</h1>
* alternative for each metabolite<br>           
* RandomSearchCV each round<br>
* suggestion for low high stock conc

## HeadQuarter

Main notebook: it contains the data-processing steps and the calls to the Bayesian optimization function.

In [1]:
import pandas as pd
import numpy as np
import os
from xgboost import XGBRegressor
import time

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Download the helper module from the repository's master branch (the URL is not
# pinned to a commit, so the downloaded code can change between runs).
!wget https://raw.githubusercontent.com/myprogrammerpersonality/BlackBoxOptimizer/master/functions_final.py
# Star import: every public name of functions_final lands in the notebook namespace.
# NOTE(review): the helpers called in later cells (allowed_output,
# random_combination_generator, concentration_to_volume, day_finder, process_limits,
# result_preprocess, bayesian_optimization) presumably come from here — confirm.
from functions_final import *

--2020-08-20 21:24:38--  https://raw.githubusercontent.com/myprogrammerpersonality/BlackBoxOptimizer/master/functions_final.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14798 (14K) [text/plain]
Saving to: ‘functions_final.py’


2020-08-20 21:24:38 (603 KB/s) - ‘functions_final.py’ saved [14798/14798]



In [2]:
# Part 1: define the concentration grid and the stock concentration of every metabolite.
#
# All values must be provided so that the helper functions can pick concentrations
# that are compatible with your minimum drop size.
# For each metabolite, min, max and stock must be given in the same unit.
#
# concentrations_limits format:
#   one alternative            : (min, max, stock)
#   more than one alternative  : (min, max, stock, (alternative 1, alternative 2, alternative 3))
# *** a metabolite name must not include "_" ***

concentrations_limits = {
    'TF': (0.8, 3.12, 156, ('A1', 'B2', 'C3')),  # mM
    'K-Glutamate': (30, 150, 3000.0),
    'DNA': (20, 20, 1000),  # nM
}

In [3]:
# Print, for every metabolite, the requested range and the concentrations
# reported by allowed_output.
# NOTE(review): 20000 and 100 are presumably the reaction volume and the minimum
# drop size in nanoliters (the same numbers appear in the general parameters
# further down) — confirm against allowed_output's signature.
for metabolite, limits in concentrations_limits.items():
    low, high = limits[0], limits[1]
    print('Possible Conc For :', metabolite)
    print('Your Min, Max :', "({}, {})".format(low, high))
    print(allowed_output(limits, 20000, 100))
    print()

Possible Conc For : TF
Your Min, Max : (0.8, 3.12)
[0.78, 1.56, 2.34, 3.12]

Possible Conc For : K-Glutamate
Your Min, Max : (30, 150)
[30.0, 45.0, 60.0, 75.0, 90.0, 105.0, 120.0, 135.0, 150.0]

Possible Conc For : DNA
Your Min, Max : (20, 20)
[20.0]



In [4]:
# Reference and negative-control combinations.
# Leave the dictionary empty if you do not need any; more combinations can be
# added as extra entries.
# *** every concentration here must be one of the possible concentrations
#     calculated previously ***

specials = {
    "reference": {
        'TF': [1.56], 'TF_A1': [0], 'TF_B2': [0], 'TF_C3': [1],
        'K-Glutamate': [90.0], 'DNA': [20.0],
    },
    "control_neg": {
        'TF': [1.56], 'TF_A1': [0], 'TF_B2': [0], 'TF_C3': [1],
        'K-Glutamate': [90.0], 'DNA': [0.0],
    },
}

In [6]:
# General Parameters:
m = 20      # number_of_combination_each_round
minimum_drop_size_nanoliter = 100
final_reaction_volume_nanoliter = 20000
maximum_volume_of_model_output = 13200 # (e.g. volume except lysate)
# Fractions of the final reaction volume taken by fixed components:
# 0.33 means 33% of the total is Lysate, 0.01 means 1% is saline solution.
# (20000 nl * (1 - 0.33 - 0.01) = 13200 nl, matching maximum_volume_of_model_output.)
fixed_parts = {'Lysate':0.33, 'Saline':0.01}

# Model Parameters:
# Exploration coefficient per day; it is looked up as exploration[day+1] when the
# next day's combinations are generated, so only days 2..10 are defined here.
# (The original literal listed key 9 twice; the duplicate entry was removed.)
exploration = {2: 1.41, 3: 1.41,
               4: 1.0, 5: 1.0, 6: 1.0, 7: 1.0,
               8: 0.5, 9: 0.5, 10: 0.5}
pool_size = 108
# One entry per day; days_range[num] is handed to result_preprocess for day num+1.
days_range = [20, 20, 20, 20, 20, 20, 20, 20, 20, 20]

## Day 1

In [7]:
# Draw m random combinations for the first day.
Concentrations_1 = random_combination_generator(
    concentrations_limits,
    number_of_combination=m,
    reaction_vol_nl=final_reaction_volume_nanoliter,
    max_nl=maximum_volume_of_model_output,
    drop_size_nl=minimum_drop_size_nanoliter,
    return_df=True,
)

# Append the control, the reference and any other special combinations,
# then renumber the rows from 0.
df_specials = list(map(pd.DataFrame, specials.values()))
Concentrations_1 = pd.concat([Concentrations_1] + df_specials, ignore_index=True)

Concentrations_1

Unnamed: 0,TF,TF_A1,TF_B2,TF_C3,K-Glutamate,DNA
0,1.56,0.0,0.0,1.0,135.0,20.0
1,2.34,1.0,0.0,0.0,90.0,20.0
2,1.56,0.0,1.0,0.0,90.0,20.0
3,0.78,1.0,0.0,0.0,90.0,20.0
4,0.78,1.0,0.0,0.0,150.0,20.0
5,3.12,1.0,0.0,0.0,30.0,20.0
6,1.56,1.0,0.0,0.0,105.0,20.0
7,1.56,0.0,0.0,1.0,120.0,20.0
8,0.78,1.0,0.0,0.0,120.0,20.0
9,1.56,0.0,1.0,0.0,120.0,20.0


In [10]:
# Create the Day_1 folder with the standard library instead of a shell escape, so
# the cell also works where `mkdir -p` is unavailable; exist_ok=True keeps the
# "no error if it already exists" behaviour of `mkdir -p`.
os.makedirs('Day_1', exist_ok=True)
Concentrations_1.to_csv('Day_1/Concentrations_1.csv', index=False)

In [11]:
# Convert the Day 1 concentrations into volumes and store them next to the
# concentrations file.
Volumes_1 = concentration_to_volume(
    Concentrations_1,
    concentrations_limits,
    reaction_mixture_vol_nl=final_reaction_volume_nanoliter,
    fixed_parts=fixed_parts,
)
Volumes_1.to_csv('Day_1/Volumes_1.csv', index=False)

## Other Days

In [13]:
# `day` drives everything below: results for days 1..day are loaded, and the new
# combinations are written to the folder Day_{day+1}.
# NOTE(review): day_finder('Results') presumably inspects the available result
# files and returns the next day number, hence the "- 1" — confirm in functions_final.
day = day_finder('Results') - 1
day

1

In [14]:
def _concat_rows(parts, leading_cols):
    """Concatenate frames row-wise in one call.

    The result has a fresh 0..n-1 index and its columns start with
    `leading_cols` (columns missing from every part are filled with NaN),
    followed by any other columns in their order of appearance. With no parts
    an empty frame with `leading_cols` is returned.
    """
    if not parts:
        return pd.DataFrame(columns=leading_cols)
    combined = pd.concat(parts, ignore_index=True)
    trailing = [col for col in combined.columns if col not in leading_cols]
    return combined.reindex(columns=list(leading_cols) + trailing)


# Column layout of the model input: each metabolite, followed (when alternatives
# are given as a 4th tuple element) by one "<metabolite>_<alternative>" column each.
desired_cols = []
for key, value in concentrations_limits.items():
    if len(value) not in (3, 4):
        continue  # entries of any other length are skipped, as before
    desired_cols.append(key)
    if len(value) == 4:
        desired_cols.extend('{}_{}'.format(key, alt) for alt in value[3])

fixed_col_value = process_limits(concentrations_limits)
final_order = desired_cols

# Collect every finished day's data and labels first and concatenate once at the
# end. Growing a frame with pd.concat inside the loop re-copies all previous rows
# on each iteration, and seeding it with an empty frame made the numeric columns
# end up with object dtype.
data_parts, label_parts = [], []
for num in range(day):
    data_m, label_m, _, _ = result_preprocess(num + 1, desired_cols, days_range[num])
    data_parts.append(data_m)
    label_parts.append(label_m)

# The reference combination is added as a known point with yield 1.0.
if 'reference' in specials:
    ref_data = pd.DataFrame(specials['reference'])
    ref_label = pd.DataFrame({'yield': [1.0]})
    data_parts.append(ref_data[desired_cols])
    label_parts.append(ref_label)

aggregated_data_m = _concat_rows(data_parts, desired_cols)
aggregated_label_m = _concat_rows(label_parts, ['yield'])

aggregated_data_m

Unnamed: 0,TF,TF_A1,TF_B2,TF_C3,K-Glutamate,DNA
0,1.56,1,0,0,150,20
1,0.78,0,0,1,105,20
2,3.12,0,0,1,30,20
3,1.56,0,0,1,120,20
4,2.34,1,0,0,90,20
5,0.78,1,0,0,90,20
6,2.34,0,0,1,45,20
7,0.78,0,0,1,45,20
8,1.56,0,0,1,90,20
9,1.56,0,0,1,75,20


In [15]:
# Ensemble of regressors: five XGBoost (gradient-boosted tree) models that share
# every hyperparameter except the number of trees (10, 20, 30, 40 and 50).

# best
regressors_list = [
    XGBRegressor(
        n_estimators=tree_count,
        objective='reg:squarederror',
        learning_rate=0.03,
        max_depth=6,
        min_child_weight=4,
        subsample=0.9,
        gamma=0.4,
        colsample_bytree=0.9,
    )
    for tree_count in range(10, 60, 10)
]

In [16]:
# Run one optimization round on the aggregated data, report the wall-clock time
# it took and display the suggested combinations.
t0 = time.time()

Concentrations_n_m = bayesian_optimization(
    regressors_list,
    aggregated_data_m,
    aggregated_label_m,
    concentrations_limits,
    desired_cols=desired_cols,
    fixed_col_value=fixed_col_value,
    final_order=final_order,
    exploitation=1,
    exploration=exploration[day + 1],  # coefficient of the day being planned
    test_size=m,
    pool_size=pool_size,
    verbose=0,
    day=day,
    days_range=days_range,
)

print("Passed Time(s): ", time.time() - t0)

Concentrations_n_m

Passed Time(s):  0.5242011547088623


Unnamed: 0,TF,TF_A1,TF_B2,TF_C3,K-Glutamate,DNA
0,1.56,0.0,0.0,1.0,75.0,20
1,0.78,1.0,0.0,0.0,75.0,20
2,3.12,0.0,0.0,1.0,105.0,20
3,3.12,0.0,0.0,1.0,150.0,20
4,2.34,1.0,0.0,0.0,75.0,20
5,2.34,0.0,1.0,0.0,135.0,20
6,1.56,0.0,0.0,1.0,150.0,20
7,0.78,0.0,1.0,0.0,150.0,20
8,0.78,0.0,1.0,0.0,60.0,20
9,0.78,0.0,1.0,0.0,75.0,20


In [17]:
# Append the control, the reference and any other special combinations to the
# combinations suggested by the model.
df_specials = [pd.DataFrame(i) for i in specials.values()]
Concentrations_n = pd.concat([Concentrations_n_m, *df_specials]).reset_index(drop=True)

# Create the folder of the upcoming day with the standard library instead of a
# shell escape; exist_ok=True mirrors `mkdir -p` (no error if it already exists).
name_folder = 'Day_{}'.format(day+1)
os.makedirs(name_folder, exist_ok=True)

# Reuse name_folder so the file is guaranteed to land in the folder just created.
Concentrations_n.to_csv('{}/Concentrations_{}.csv'.format(name_folder, day+1), index=False)

In [18]:
# Check that no new combination repeats one from a previous day.
# Only the first m rows of each file are compared (the rows after them are the
# appended special combinations). In the outer merge, a '_merge' value of 'both'
# marks a row present in both the previous days and the new day.
previous = [
    pd.read_csv('Day_{}/Concentrations_{}.csv'.format(i, i)).iloc[:m, :]
    for i in range(1, day + 1)
]

df_main = pd.concat(previous)

comparison_df = df_main.merge(
    pd.read_csv('Day_{}/Concentrations_{}.csv'.format(day+1, day+1)).iloc[:m, :],
    how='outer',
    indicator=True,
)

comparison_df['_merge'].unique()

[left_only, right_only]
Categories (2, object): [left_only, right_only]

In [19]:
# Convert the new day's concentrations into volumes and store them in the same
# Day_{day+1} folder as the concentrations file.
Volumes_n = concentration_to_volume(
    Concentrations_n,
    concentrations_limits,
    reaction_mixture_vol_nl=final_reaction_volume_nanoliter,
    fixed_parts=fixed_parts,
)

Volumes_n.to_csv('Day_{}/Volumes_{}.csv'.format(day+1, day+1), index=False)