In [1]:
# Make the directory two levels above the current working directory
# importable. Presumably this is the project root holding the local `codes`
# package imported further down -- TODO confirm.
import os
import sys
module_path = os.path.abspath('../..')
if sys.path.count(module_path) == 0:
    # Only append once so re-running the cell does not grow sys.path.
    sys.path.append(module_path)
    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from collections import defaultdict
import math
import json

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import PairwiseKernel, DotProduct, RBF 
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import KFold

from codes.embedding import Embedding
from codes.environment import Rewards_env
from codes.ucb import GPUCB, Random
from codes.evaluations import evaluate, plot_eva
from codes.regression import *
from codes.kernels_for_GPK import Spectrum_Kernel, Sum_Spectrum_Kernel, WeightedDegree_Kernel

from ipywidgets import IntProgress
from IPython.display import display
import warnings
%matplotlib inline

In [2]:
# Load the first-round measurements from the CSV whose file name is tagged
# `normFalse` (treated as the "raw" data in the cells below) and preview the
# first five rows.
Path = '../../data/firstRound_4h_normFalse_formatSeq.csv'
df_raw = pd.read_csv(filepath_or_buffer=Path)

# head() defaults to five rows.
df_raw.head()

Unnamed: 0.1,Unnamed: 0,Name,Group,RBS,RBS6,Rep1,Rep2,Rep3,Rep4,Rep5,AVERAGE,STD
0,0,RBS_1by1_0,reference,TTTAAGAAGGAGATATACAT,AGGAGA,,52.402431,,61.622165,54.151485,56.058694,3.998246
1,1,RBS_1by1_1,bps_noncore,CTTAAGAAGGAGATATACAT,AGGAGA,,40.072951,,42.042854,45.432032,42.515946,2.213263
2,2,RBS_1by1_2,bps_noncore,GTTAAGAAGGAGATATACAT,AGGAGA,,28.831559,,24.48787,24.133637,25.817689,2.136029
3,3,RBS_1by1_3,bps_noncore,ATTAAGAAGGAGATATACAT,AGGAGA,,43.093359,,38.641958,38.049577,39.928298,2.251065
4,4,RBS_1by1_4,bps_noncore,TCTAAGAAGGAGATATACAT,AGGAGA,,45.913214,,44.352931,38.394865,42.887003,3.23966


In [3]:
# Load the first-round measurements from the CSV whose file name is tagged
# `normTrue` (the normalised counterpart of the raw table) and preview the
# first five rows. Note that `Path` is re-bound here to the normalised file.
Path = '../../data/firstRound_4h_normTrue_formatSeq.csv'
df_normalised = pd.read_csv(filepath_or_buffer=Path)

# head() defaults to five rows.
df_normalised.head()

Unnamed: 0.1,Unnamed: 0,Name,Group,RBS,RBS6,Rep1,Rep2,Rep3,Rep4,Rep5,AVERAGE,STD
0,0,RBS_1by1_0,reference,TTTAAGAAGGAGATATACAT,AGGAGA,,2.783529,,3.260245,3.225496,3.089757,0.265769
1,1,RBS_1by1_1,bps_noncore,CTTAAGAAGGAGATATACAT,AGGAGA,,1.678119,,1.589655,2.374458,1.880744,0.429851
2,2,RBS_1by1_2,bps_noncore,GTTAAGAAGGAGATATACAT,AGGAGA,,0.670263,,0.091788,0.295688,0.35258,0.293404
3,3,RBS_1by1_3,bps_noncore,ATTAAGAAGGAGATATACAT,AGGAGA,,1.948917,,1.299476,1.653915,1.634102,0.325173
4,4,RBS_1by1_4,bps_noncore,TCTAAGAAGGAGATATACAT,AGGAGA,,2.201733,,1.78676,1.687616,1.892036,0.272748


In [4]:
# Column-wise mean of the raw measurements.
# The frame also holds text columns (Name, Group, RBS, RBS6). Older pandas
# silently skipped them, but pandas >= 2.0 raises TypeError instead, so the
# restriction to numeric columns is made explicit.
print('raw data mean:')
df_raw.mean(axis=0, numeric_only=True)

raw data mean:


Unnamed: 0    82.613333
Rep1          19.430768
Rep2          21.355608
Rep3          18.328426
Rep4          23.412110
Rep5          21.104113
AVERAGE       21.041324
STD            1.867289
dtype: float64

In [5]:
# Column-wise standard deviation of the raw measurements.
# The frame also holds text columns (Name, Group, RBS, RBS6). Older pandas
# silently skipped them, but pandas >= 2.0 raises TypeError instead, so the
# restriction to numeric columns is made explicit.
print('raw data std:')
df_raw.std(axis=0, numeric_only=True)

raw data std:


Unnamed: 0    52.668112
Rep1          10.156944
Rep2          11.153762
Rep3          10.913377
Rep4          11.719995
Rep5          10.245670
AVERAGE       10.807507
STD            1.099764
dtype: float64

In [6]:
# Pairwise Spearman rank correlation between the numeric columns of the raw
# data. Pairs of replicates that are never measured on the same row come out
# as NaN.
# Text columns (Name, Group, RBS, RBS6) are filtered out up front: pandas >= 2.0
# raises on them instead of skipping them, and select_dtypes works on every
# pandas version (corr's own numeric_only keyword only exists from 1.5).
print('raw data spearman cor (Rep1 and Rep4/5 does not exist for the same sequence, so NaN):')
df_raw.select_dtypes(include='number').corr(method='spearman')

raw data spearman cor (Rep1 and Rep4/5 does not exist for the same sequence, so NaN):


Unnamed: 0.1,Unnamed: 0,Rep1,Rep2,Rep3,Rep4,Rep5,AVERAGE,STD
Unnamed: 0,1.0,0.14532,-0.317726,0.241248,-0.482346,-0.462019,-0.299283,-0.244265
Rep1,0.14532,1.0,0.951031,0.949127,,,0.97652,-0.189741
Rep2,-0.317726,0.951031,1.0,0.935854,0.961066,0.962002,0.986734,0.221594
Rep3,0.241248,0.949127,0.935854,1.0,,,0.979535,-0.258488
Rep4,-0.482346,,0.961066,,1.0,0.952469,0.983095,0.548553
Rep5,-0.462019,,0.962002,,0.952469,1.0,0.980661,0.403796
AVERAGE,-0.299283,0.97652,0.986734,0.979535,0.983095,0.980661,1.0,0.215809
STD,-0.244265,-0.189741,0.221594,-0.258488,0.548553,0.403796,0.215809,1.0


In [7]:
# Pairwise Spearman rank correlation between the numeric columns of the
# normalised data. Pairs of replicates that are never measured on the same row
# come out as NaN.
# Text columns (Name, Group, RBS, RBS6) are filtered out up front: pandas >= 2.0
# raises on them instead of skipping them, and select_dtypes works on every
# pandas version (corr's own numeric_only keyword only exists from 1.5).
print('Normalised data spearman cor (Rep1 and Rep4/5 does not exist for the same sequence, so NaN):')
df_normalised.select_dtypes(include='number').corr(method='spearman')

Normalised data spearman cor (Rep1 and Rep4/5 does not exist for the same sequence, so NaN):


Unnamed: 0.1,Unnamed: 0,Rep1,Rep2,Rep3,Rep4,Rep5,AVERAGE,STD
Unnamed: 0,1.0,0.14532,-0.317726,0.241248,-0.482346,-0.462019,-0.211549,-0.116032
Rep1,0.14532,1.0,0.951031,0.949127,,,0.976626,-0.104283
Rep2,-0.317726,0.951031,1.0,0.935854,0.961066,0.962002,0.977563,0.102584
Rep3,0.241248,0.949127,0.935854,1.0,,,0.979376,-0.065574
Rep4,-0.482346,,0.961066,,1.0,0.952469,0.982142,0.259687
Rep5,-0.462019,,0.962002,,0.952469,1.0,0.982516,0.233844
AVERAGE,-0.211549,0.976626,0.977563,0.979376,0.982142,0.982516,1.0,0.111911
STD,-0.116032,-0.104283,0.102584,-0.065574,0.259687,0.233844,0.111911,1.0
