# Batch UCB
- use your favorite kernel to generate distance matrix
- clustering (k-medoids)
- large number of clusters: reject every cluster whose max UCB is smaller than another cluster's LCB; then choose sequences from the remaining clusters by UCB.
- small number of clusters (e.g. 90): choose one sequence from each cluster by UCB.

In [1]:
# direct to proper path
import os
import sys
# Resolve the directory two levels above the working directory and register
# it on sys.path (once) so the project-local imports below can be found.
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from collections import defaultdict
import math
import json

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import PairwiseKernel, DotProduct, RBF 
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import KFold
from sklearn_extra.cluster import KMedoids

from codes.embedding import Embedding
from codes.environment import Rewards_env
from codes.ucb import GPUCB, Random
from codes.evaluations import evaluate, plot_eva
from codes.regression import *
from codes.kernels_for_GPK import Spectrum_Kernel, Sum_Spectrum_Kernel, WeightedDegree_Kernel
from codes.batch_ucb import *

from ipywidgets import IntProgress
from IPython.display import display
import warnings
%matplotlib inline

  import pandas.util.testing as tm


In [2]:
# CSV with the already-measured sequences, relative to this notebook's directory.
# NOTE(review): the name `Path` would shadow pathlib.Path if that were ever imported here.
Path = '../../data/firstRound_Microplate_normTrue_formatSeq_logTrue.csv'

# Load the table and preview its first rows.
# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which suggests
# the file was saved with its index more than once — confirm before relying on column positions.
known_df = pd.read_csv(Path)
known_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Group,RBS,RBS6,Rep1,Rep2,Rep3,Rep4,Rep5,AVERAGE,STD
0,0,RBS_1by1_0,reference,TTTAAGAAGGAGATATACAT,AGGAGA,1.616261,1.814182,1.760954,2.186207,2.028863,1.881293,0.225819
1,1,RBS_1by1_1,bps_noncore,CTTAAGAAGGAGATATACAT,AGGAGA,1.166174,1.337018,1.417248,1.4938,1.713526,1.425553,0.201725
2,2,RBS_1by1_2,bps_noncore,GTTAAGAAGGAGATATACAT,AGGAGA,0.604551,0.751384,0.851987,0.514929,0.577299,0.66003,0.137994
3,3,RBS_1by1_3,bps_noncore,ATTAAGAAGGAGATATACAT,AGGAGA,1.221264,1.466278,1.270212,1.34104,1.39503,1.338765,0.097386
4,4,RBS_1by1_4,bps_noncore,TCTAAGAAGGAGATATACAT,AGGAGA,1.160566,1.579025,1.171829,1.59067,1.411255,1.382669,0.210012


In [3]:
# Configure the project's Top_n_ucb recommender on the known data:
# shifted weighted-degree kernel, label embedding, batch of 90, beta = 1.
# NOTE(review): normalise_kernel_flag is passed as the string 'True', not the bool True;
# any non-empty string is truthy, so 'False' would not switch it off — confirm what the class expects.
top_n = Top_n_ucb(known_df, kernel_name='WD_Kernel_Shift', 
                normalise_kernel_flag='True', embedding='label', alpha=0.1,
                eva_metric=mean_squared_error, l_list=[6], s=1,
                rec_size=90, beta=1)

In [None]:
# Run the Top_n_ucb experiment configured above and keep whatever it returns.
# NOTE(review): presumably a DataFrame of recommended sequences — confirm against the class.
top_n_rec_df = top_n.run_experiment()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train_df['label'] = self.train_df['AVERAGE']


X train shape:  (176, 20)
X test shape:  (3961, 20)
create kernel instance


In [None]:
# Configure the project's GP_BUCB recommender with the same settings as the
# Top_n_ucb instance above, so the two can be compared.
# NOTE(review): normalise_kernel_flag is the string 'True', not a bool — confirm what the class expects.
gpbucb = GP_BUCB(known_df, kernel_name='WD_Kernel_Shift', 
                normalise_kernel_flag='True', embedding='label', alpha=0.1,
                eva_metric=mean_squared_error, l_list=[6], s=1,
                rec_size=90, beta=1)

In [None]:
# Run the GP_BUCB experiment configured above and keep whatever it returns.
# NOTE(review): presumably a DataFrame of recommended sequences — confirm against the class.
gpbucb_rec_df = gpbucb.run_experiment()

## Create test dataset

In [None]:
# Enumerate the full design space: every possible 6-base core over {A, G, C, T},
# embedded between a fixed prefix and suffix to give a 20-base sequence.

combos = [] # 20-base: prefix + core + suffix
combos_6 = [] # 6-base: the variable core only
labels = [] # placeholder label (inf) for every unmeasured design
char_sets = ['A', 'G', 'C', 'T']
design_len = 6

# to be changed
pre_design = 'TTTAAGA'
pos_design = 'TATACAT'

for core_bases in itertools.product(char_sets, repeat=design_len):
    core = ''.join(core_bases)
    combo = pre_design + core + pos_design
    # Store the bare core here; the previous version appended the already
    # flanked 20-base string, so combos_6 never held 6-base sequences.
    combos_6.append(core)
    combos.append(combo)
    labels.append(math.inf)

# Sanity checks: one entry per possible core, and the three lists stay aligned.
assert len(combos) == len(char_sets) ** design_len
assert len(combos_6) == len(combos) == len(labels)
# len(combos)

In [None]:
# Candidate design table: every enumerated 20-base sequence that is not in `frr_rbs_set`.
# NOTE(review): `frr_rbs_set` is not defined in any cell shown here (presumably the set of
# first-round RBS sequences); it must exist before this cell for Restart & Run All to work.
# The set difference is sorted so the row order is reproducible: iteration order of a set of
# strings varies between kernel sessions, and later cells address rows by position.
df_design = pd.DataFrame()
df_design['RBS'] = sorted(set(combos) - frr_rbs_set)
#df['AVERAGE'] = NaN

In [None]:
# Inspect the candidate design table.
df_design

In [None]:
# Mark every candidate row as belonging to the test split.
df_design['train_test'] = 'Test'

In [None]:
# put first round result (frr) and design space (design) together
# [:150] frr; [150:] design
# NOTE(review): `df_frr` is not defined in any cell shown here, and the 150-row split
# is only valid if it has exactly 150 rows — confirm.
# reset_index() without drop=True keeps the old index as an extra 'index' column.

df_train_test = pd.concat([df_frr, df_design], sort = True).reset_index()

In [None]:
# Show only the rows tagged as training data.
df_train_test.loc[(df_train_test['train_test'] == 'Train')]

In [None]:
# Persist the combined train + design table.
# NOTE(review): to_csv writes the index as an unnamed column by default, which is how
# 'Unnamed: 0' columns appear when such a file is read back.
df_train_test.to_csv('../../data/known_design.csv')

In [None]:
# confirm test part
# Rows with index labels 150 and above; assumes the first 150 rows are the training data.

df_train_test.loc[range(150,len(df_train_test))]

## K medoids 

In [None]:
# Lookup from kernel name (as used in the settings below) to kernel class.
# NOTE(review): Mixed_Spectrum_Kernel and WD_Shift_Kernel are not in the explicit import list;
# they presumably arrive through one of the `from codes... import *` lines — confirm.
kernel_dict = {
    'Spectrum_Kernel': Spectrum_Kernel,
    'WD_Kernel': WeightedDegree_Kernel,
    'Sum_Spectrum_Kernel': Sum_Spectrum_Kernel,
    'Mixed_Spectrum_Kernel': Mixed_Spectrum_Kernel,
    'WD_Kernel_Shift': WD_Shift_Kernel
    
}

# NOTE(review): not read by any cell shown here.
log_flag = True

In [None]:
# setting

embedding = 'label'
eva_on_ave_flag = True # true label is the sample mean instead of individual samples, since the prediction is the posterior mean
eva_metric = mean_squared_error # mean squared error returns a more stable optimal hyperparameter choice than r2 score

kernel = 'WD_Kernel_Shift' # key into kernel_dict
alpha = 0.5 # passed to GPR_Predictor as `alpha`
l = [6] # passed to the kernel and to GPR_Predictor as `l_list`

# NOTE(review): the flags below are not read by any cell shown here.
weight_flag = False
padding_flag = False
gap_flag = False
plot_format = 'plt'

In [None]:
# Distances between all sequences (train + design) from the selected kernel's
# `.distance` method. It is used below as a precomputed metric, so it is
# presumably a square pairwise matrix — confirm.
distance = kernel_dict[kernel](l_list=l).distance(np.asarray(df_train_test['RBS']))

In [None]:
# Check the dimensions of the distance matrix.
distance.shape

In [None]:
# Partition all sequences into 90 clusters with k-medoids on the precomputed
# distance matrix, and keep the per-row cluster labels.
# random_state is pinned because the 'k-medoids++' initialisation is randomised;
# without it the cluster labels, and every per-cluster result below, change between runs.
kmedoids = KMedoids(n_clusters=90, metric = 'precomputed', init='k-medoids++', random_state=0).fit(distance)
y_km_spec = kmedoids.labels_

In [None]:
# Attach each row's cluster label; the labels follow the row order of
# df_train_test because the distance matrix was built from its 'RBS' column.
df_train_test['cluster'] = y_km_spec

In [None]:
# Inspect the combined table, now including the 'cluster' column.
df_train_test

## Prediction

In [None]:
# Build the project's GP regression wrapper on the combined table:
# rows 0-149 are used for training, rows 150 onward as the prediction (test) set.
# NOTE(review): the 150 boundary is hard-coded — confirm it matches the number of measured rows.
gpr = GPR_Predictor(df_train_test, train_idx = range(0,150), test_idx = range(150,len(df_train_test)), kernel_name=kernel, normalise_kernel = False, alpha=alpha, embedding='label',
                   eva_metric=eva_metric, l_list=l, s = 0)

In [None]:
# Run the regression; later cells read 'pred mean' and 'pred std' from gpr.test_df,
# so this call presumably fits on the train rows and fills in those columns — confirm.
gpr.regression()

In [None]:
# Inspect the training rows held by the predictor.
gpr.train_df

In [None]:
# Inspect the test rows held by the predictor, including its predictions.
gpr.test_df

In [None]:
# Distribution of the predicted means over the test rows.
plt.hist(gpr.test_df['pred mean'])

In [None]:
# Rank the test rows by predicted mean, best first, and display the ranking.
sorted_mean_test_df = gpr.test_df.sort_values(['pred mean'], ascending=False)
sorted_mean_test_df

In [None]:
# Predicted mean with a one-standard-deviation band (ucb / lcb), plotted
# against rank when the test rows are ordered by predicted mean.
mean_rank_axis = range(len(sorted_mean_test_df))
ranked_pred_mean = sorted_mean_test_df['pred mean']
ranked_pred_std = sorted_mean_test_df['pred std']

plt.plot(mean_rank_axis, ranked_pred_mean, label = 'pred mean')
plt.plot(mean_rank_axis, ranked_pred_mean + ranked_pred_std, label ='ucb')
plt.plot(mean_rank_axis, ranked_pred_mean - ranked_pred_std, label ='lcb')
plt.legend()
plt.title('Prediction, sorted by pred mean')

In [None]:
# Kernel matrix among the 90 test sequences with the highest predicted mean,
# shown as a heat map.
rbs_by_mean = sorted_mean_test_df['RBS']
mean_kernel_fn = kernel_dict[kernel](l_list = l)
kernel_matrix = mean_kernel_fn(np.asarray(rbs_by_mean)[:90], np.asarray(rbs_by_mean)[:90])

plt.imshow(kernel_matrix)
plt.colorbar()
plt.title('Top sequences kernel matrix, sorted by pred mean')

## UCB & LCB

In [None]:
# Add one-standard-deviation confidence bounds to the test predictions.
# No copy is made here: the columns are written onto the frame gpr.test_df returns.
test_df = gpr.test_df
pred_mean_col, pred_std_col = test_df['pred mean'], test_df['pred std']
test_df['ucb'] = pred_mean_col + pred_std_col
test_df['lcb'] = pred_mean_col - pred_std_col
test_df

In [None]:
# Rank the test rows by upper confidence bound, best first, and display the ranking.
sorted_ucb = test_df.sort_values(['ucb'], ascending=False)
sorted_ucb

In [None]:

# Predicted mean and ucb plotted against rank when the test rows are ordered by ucb.
ucb_rank_axis = range(len(sorted_ucb))
for curve_name in ('pred mean', 'ucb'):
    plt.plot(ucb_rank_axis, sorted_ucb[curve_name], label = curve_name)
plt.legend()
plt.title('Prediction, sorted by ucb')

In [None]:
# Distribution of the ucb values among the 90 highest-ucb test rows.
plt.hist(sorted_ucb[:90]['ucb'])

In [None]:
# Kernel matrix among the 90 test sequences with the highest ucb, shown as a heat map.
rbs_by_ucb = sorted_ucb['RBS']
ucb_kernel_fn = kernel_dict[kernel](l_list = l)
sorted_ucb_kernel_matrix = ucb_kernel_fn(np.asarray(rbs_by_ucb)[:90], np.asarray(rbs_by_ucb)[:90])

plt.imshow(sorted_ucb_kernel_matrix)
plt.colorbar()
plt.title('Top sequences kernel matrix, sorted by ucb')

In [None]:
# Smallest per-cluster maximum ucb. The 'ucb' column is selected before
# aggregating so the max is not also taken over every other column
# (including the string columns) and then thrown away.
sorted_ucb.groupby('cluster')['ucb'].max().min()

In [None]:
# Largest per-cluster minimum lcb. The 'lcb' column is selected before
# aggregating so the min is not also taken over every other column
# (including the string columns) and then thrown away.
sorted_ucb.groupby('cluster')['lcb'].min().max()

## Clustering idea

### Discussion: Reject clusters whose max UCB is smaller than the min LCB of another cluster?
For 256 clusters, the minimum of the per-cluster max UCB is 1.35, which is still larger than the maximum of the per-cluster min LCB (-1.05), so no cluster would be rejected by this rule.
When we look at single sequences instead, there are sequences whose UCB is smaller than another sequence's LCB.

### Discussion: Select the max UCB in each cluster

In [None]:
# For every cluster keep the single member with the highest ucb.
# The result is indexed by cluster label, one row per cluster.
summary_cols = ['RBS', 'ucb', 'pred mean', 'pred std', 'lcb']
max_ucb_in_clusters = pd.DataFrame(columns=summary_cols)

for cluster_id, members in sorted_ucb.groupby('cluster'):
    ranked_members = members.sort_values('ucb', ascending = False)
    max_ucb_in_clusters.loc[cluster_id] = ranked_members[summary_cols].iloc[0]
    

In [None]:
# Inspect the per-cluster best sequences.
max_ucb_in_clusters

In [None]:
# Distribution of the best ucb found in each cluster.
plt.hist(max_ucb_in_clusters['ucb'])
plt.title('max_ucb_in_clusters')

In [None]:
# Order the per-cluster best sequences by ucb, best first, and display them.
sorted_max_ucb_in_clusters = max_ucb_in_clusters.sort_values('ucb', ascending=False)
sorted_max_ucb_in_clusters

In [None]:
# Kernel matrix among the per-cluster best sequences (ordered by ucb), shown as a heat map.
rbs_cluster_best = sorted_max_ucb_in_clusters['RBS']
cluster_kernel_fn = kernel_dict[kernel](l_list = l)
sorted_max_ucb_clusters_kernel_matrix = cluster_kernel_fn(np.asarray(rbs_cluster_best), np.asarray(rbs_cluster_best))

plt.imshow(sorted_max_ucb_clusters_kernel_matrix)
plt.colorbar()
plt.title('sorted_max_ucb_clusters_kernel_matrix')

We can see that, compared with selecting the top 100 sequences directly, selecting the max-UCB sequence in each cluster results in a distribution with lower UCBs, but lower similarities as well.

## GP-BUCB

In [None]:
# Snapshot the predictor before the GP-BUCB loop below starts changing its
# train/test split, so that loop can be restarted from the same state.
# NOTE(review): this import sits mid-notebook; it belongs with the imports in the first cell.
from copy import deepcopy
gpr_copy = deepcopy(gpr)

In [None]:
# Fix beta = 1 for now
# Desautels et al. 2014 Algorithm 2
# http://jmlr.org/papers/volume15/desautels14a/desautels14a.pdf
#
# Each round: take the candidate with the highest ucb, move it from the test
# split to the train split with its predicted mean written in as a stand-in
# label, rerun the regression, and re-rank the remaining candidates.

batch_size = 90
rec_df = pd.DataFrame()
rec_rows = []  # one single-row frame per pick, concatenated once after the loop

# Restart from the snapshot so re-running this cell does not build on the
# picks of a previous run.
gpr = deepcopy(gpr_copy)

gpr.train_idx = range(0,150)
gpr.test_idx = range(150,len(df_train_test))

sorted_ucb_batch = sorted_ucb

for i in range(batch_size):
    print(i)
    rec = pd.DataFrame(sorted_ucb_batch.head(1))
    rec_rows.append(rec)

    rec_idx = sorted_ucb_batch.index[0]

    train_idx = list(gpr.train_idx)
    train_idx.append(rec_idx)
    gpr.train_idx = train_idx

    # add replicates label to avoid being dropped
    gpr.df.loc[rec_idx,'Rep2'] = gpr.test_df.loc[rec_idx,'pred mean']
    gpr.df.loc[rec_idx,'AVERAGE'] = gpr.test_df.loc[rec_idx,'pred mean']

    test_idx = list(gpr.test_idx)
    test_idx.remove(rec_idx)
    gpr.test_idx = test_idx

    gpr.regression()

    test_batch_df = gpr.test_df
    # The mean is read from `test_df` (predictions made before any stand-in
    # label was added); only the std comes from the rerun. pandas aligns the
    # two Series on their index, so rows already picked simply drop out.
    # NOTE(review): this matches GP-BUCB's "fixed mean, updated uncertainty"
    # rule — confirm it is intended rather than a typo for test_batch_df.
    test_batch_df['ucb'] = test_df['pred mean'] + test_batch_df['pred std']
    test_batch_df['lcb'] = test_df['pred mean'] - test_batch_df['pred std']
    sorted_ucb_batch = test_batch_df.sort_values(['ucb'], ascending=False)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; build the recommendation table with a single concat instead.
if rec_rows:
    rec_df = pd.concat(rec_rows, ignore_index = True)

In [None]:
# The 90 highest-ucb rows from the plain ranking, for comparison with the GP-BUCB picks.
sorted_ucb.head(90)

In [None]:
# Inspect the sequences picked by the GP-BUCB loop, in pick order.
rec_df

In [None]:
# Kernel matrix among the GP-BUCB picks (in pick order), shown as a heat map.
rbs_gpbucb = rec_df['RBS']
gpbucb_kernel_fn = kernel_dict[kernel](l_list = l)
sorted_gpbucb_kernel_matrix = gpbucb_kernel_fn(np.asarray(rbs_gpbucb), np.asarray(rbs_gpbucb))

plt.imshow(sorted_gpbucb_kernel_matrix)
plt.colorbar()
plt.title('GP-BUCB sequences kernel matrix, sorted by ucb')

## plot together

In [None]:
num_rows = 2
num_cols = 3

# 2x3 summary figure. Top row: kernel matrices of the sequences chosen by each
# strategy. Bottom row: histograms of the ucb values of those same choices.
# Columns are, left to right: per-cluster best, GP-BUCB, plain top-90 by ucb.
fig,a = plt.subplots(num_rows, num_cols, figsize = (15,10))

# (data, title) for each column; driving the panels from these lists replaces
# six copy-pasted stanzas that differed only in data and title.
kernel_panels = [
    (sorted_max_ucb_clusters_kernel_matrix, 'sorted_max_ucb_clusters_kernel_matrix'),
    (sorted_gpbucb_kernel_matrix, 'GP-BUCB sequences kernel matrix, sorted by ucb'),
    (sorted_ucb_kernel_matrix, 'sorted_ucb_kernel_matrix'),
]
ucb_panels = [
    (max_ucb_in_clusters['ucb'], 'max_ucb_in_clusters'),
    (rec_df['ucb'], 'GP_BUCB_in_clusters'),
    (sorted_ucb[:90]['ucb'], 'sorted ucb'),
]

for panel_ax, (panel_matrix, panel_title) in zip(a[0], kernel_panels):
    im = panel_ax.imshow(panel_matrix, cmap = 'viridis')
    fig.colorbar(im, ax = panel_ax)
    panel_ax.set_title(panel_title)

for panel_ax, (panel_values, panel_title) in zip(a[1], ucb_panels):
    panel_ax.hist(panel_values)
    panel_ax.set_title(panel_title)

In [None]:
# save file  

with pd.ExcelWriter('batch_ucb.xlsx') as writer:
    rec_df.to_excel(writer, sheet_name = 'gpbucb')
    max_ucb_in_clusters.to_excel(writer, sheet_name = 'clusterucb')
    sorted_ucb.to_excel(writer, sheet_name = 'sorteducb')

In [None]:
# pd.read_excel('batch_ucb.xlsx', sheet_name='sorteducb')