In [None]:
%%capture

import os
import gc
import json
import random
import itertools
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import tensorflow as tf
import tensorflow_addons as tfa

from tqdm.auto import tqdm
from IPython.display import display
from matplotlib import pyplot as plt
from tensorflow.keras import layers as L
from tensorflow.keras import backend as K
from collections import defaultdict,Counter
from sklearn.metrics import mean_squared_error
from tensorflow_addons.optimizers import RectifiedAdam
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold,RepeatedKFold,RepeatedStratifiedKFold
from transformers import BertTokenizer, TFBertModel, BertConfig, BertModel, TFDistilBertModel, DistilBertConfig

In [None]:
# Notebook-wide display defaults: never truncate DataFrame rows/columns, and use wide figures.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)
plt.rcParams['figure.figsize'] = (10,5)

In [None]:
# Shorthand for tf.data's autotune constant (used for prefetching below).
AUTO = tf.data.experimental.AUTOTUNE
# Probe for a TPU; a ValueError from the resolver is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print(f'Running on TPU : {tpu.master()}')
except ValueError:
    tpu = None

if tpu:
    # Connect and initialise the TPU system before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Fall back to whatever distribution strategy TensorFlow currently has active.
    strategy = tf.distribute.get_strategy()
# Replica count; BATCH_SIZE and LR are scaled by it further down.
REPLICAS = strategy.num_replicas_in_sync
print(f'REPLICAS : {REPLICAS}')

In [None]:
def plot_image(file):
    """Load a ``.npy`` array, draw it as an image with a colorbar, and return it.

    ``file`` is used unchanged when it contains the substring ``'npy'``;
    otherwise it is expanded to ``bpps/<file>.npy``. The plot title is the
    last path component.
    """
    path = file if 'npy' in file else f'bpps/{file}.npy'
    matrix = np.load(path)
    plt.title(path.split('/')[-1],color='white')
    plt.imshow(matrix)
    plt.axis('off')
    plt.colorbar()
    return matrix


def seed_everything(seed):
    """Seed Python's ``random``, NumPy's global RNG and TensorFlow with ``seed``."""
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
seed_everything(54)

def get_bppm(file):
    """Return the array saved at ``bpps/<file>.npy`` (relative to the working directory)."""
    path = f'bpps/{file}.npy'
    return np.load(path)

def get_count(x,v):
    """Return ``x.count(v)``: how often ``v`` occurs in the sequence or string ``x``."""
    occurrences = x.count(v)
    return occurrences

def padding(x,col):
    """Right-pad ``x`` to 130 characters with the filler token for column ``col``.

    Parameters
    ----------
    x : str
        Value to pad. Inputs already 130 characters or longer come back unchanged
        (a non-positive repeat count yields an empty filler).
    col : str
        ``'sequence'`` pads with ``'N'``, ``'structure'`` with ``'#'`` and
        ``'predicted_loop_type'`` with ``'Z'``.

    Returns
    -------
    str or None
        The padded string, or ``None`` when ``col`` is not one of the three
        names above (kept for backward compatibility).
    """
    PAD_LEN = 130
    # The target length previously appeared twice (an unused constant plus a literal).
    to_len = PAD_LEN - len(x)
    pad_tokens = {'sequence': 'N', 'structure': '#', 'predicted_loop_type': 'Z'}
    pad_token = pad_tokens.get(col)
    if pad_token is None:
        return None
    return x + (pad_token * to_len)

def get_dataset(data,label,params):
    """Build a batched, prefetched ``tf.data.Dataset`` from in-memory arrays.

    Parameters
    ----------
    data : array-like
        Model inputs, sliced along the first axis.
    label : array-like or None
        Targets. When ``None`` an inference dataset is built: batch + prefetch
        only, and ``params['shuffle']`` / ``params['repeat']`` are not read.
    params : dict
        Must contain ``'batch_size'``; with labels it must also contain the
        booleans ``'shuffle'`` and ``'repeat'``.

    Returns
    -------
    tf.data.Dataset
    """
    if label is None:
        return tf.data.Dataset.from_tensor_slices((data)).batch(params['batch_size']).prefetch(AUTO)

    dataset = tf.data.Dataset.from_tensor_slices((data,label))
    if params['shuffle']:
        # Shuffle individual examples BEFORE batching. The previous order
        # (batch, then shuffle) only permuted whole batches, so every batch
        # kept the same members from epoch to epoch.
        dataset = dataset.shuffle(1024)
    dataset = dataset.batch(params['batch_size'])
    if params['repeat']:
        dataset = dataset.repeat()
    return dataset.prefetch(AUTO)
    
def explode(df, lst_cols, fill_value=''):
    """Unnest list-valued columns so that every list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame.
    lst_cols : str or list of str
        Column(s) holding list-like cells. Within one row, all listed columns
        are expected to have the same length; the first column's lengths drive
        how often the remaining columns are repeated.
    fill_value : scalar, default ''
        Replacement for missing values when rows with empty lists are carried
        over. It is applied to the whole result, and only when at least one
        such row exists.

    Returns
    -------
    pandas.DataFrame
        Exploded frame with the original column order. Exploded rows carry a
        fresh ``RangeIndex``; rows whose list was empty are appended after them
        and keep their original index label.
    """
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    idx_cols = df.columns.difference(lst_cols)
    lens = df[lst_cols[0]].str.len()

    # Shared by both former branches: repeat the scalar columns, flatten the list columns.
    exploded = pd.DataFrame({
        col: np.repeat(df[col].values, lens)
        for col in idx_cols
    }).assign(**{col: np.concatenate(df[col].values) for col in lst_cols})

    if not (lens > 0).all():
        # Rows with an empty list contribute nothing above; append them so they
        # are not lost. pd.concat replaces DataFrame.append (removed in pandas 2.0).
        exploded = pd.concat([exploded, df.loc[lens == 0, idx_cols]]).fillna(fill_value)

    return exploded.loc[:, df.columns]
    
    
def stratified_group_k_fold(X, y, groups, k, seed=None):
    """Yield ``k`` (train, test) index splits that keep groups intact while balancing labels.

    Groups are assigned greedily, one at a time, to the fold that keeps the
    per-label share of samples most even across folds.

    Parameters
    ----------
    X : unused
        Accepted but never read in this function.
    y : iterable of int
        Class label per sample. Labels are used directly as array indices, so
        they are assumed to be non-negative integers -- TODO confirm at call sites.
    groups : iterable
        Group key per sample, aligned with ``y``.
    k : int
        Number of folds.
    seed : optional
        Seed for the shuffle applied to the groups before the greedy pass.

    Yields
    ------
    (list of int, list of int)
        Positional indices (from ``enumerate(groups)``) of the train and test
        samples for each fold.
    """
    labels_num = np.max(y) + 1
    # Per-group label histogram and the overall label histogram.
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        # Score a tentative assignment: add the group to `fold`, measure for each
        # label the std across folds of that label's share, then undo the addition.
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    # Visit groups in order of decreasing std of their label counts; the shuffle
    # above decides the order among ties because sorted() is stable.
    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        # The comprehension's own `i` is scoped to the comprehension and does
        # not overwrite the fold counter.
        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices
        
        
    
def get_error(preds):
    """Print per-target RMSE/MSE of ``preds`` against the labels in ``train.json``.

    Parameters
    ----------
    preds : pandas.DataFrame
        Must contain ``id_seqpos`` (``'<id>_<position>'``) and the prediction
        columns ``reactivity``, ``deg_Mg_pH10`` and ``deg_Mg_50C``.

    Returns
    -------
    None
        Results are printed only. Ground truth is read from ``train.json`` in
        the working directory; the first 68 positions of every molecule are
        used, and only rows whose ``id_seqpos`` appears in ``preds`` are scored.
    """
    target_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']
    val = pd.read_json('train.json', lines=True)

    # One row per molecule (first occurrence, original order). This replaces a
    # full-frame boolean filter per id, which was quadratic in the number of rows.
    first_per_id = val.drop_duplicates(subset='id', keep='first')

    val_data = []
    for row in first_per_id[['id'] + target_cols].itertuples(index=False, name=None):
        mol_id, targets = row[0], row[1:]
        for i in range(68):
            sample_dict = {'id_seqpos' : mol_id + '_' + str(i)}
            for col, values in zip(target_cols, targets):
                sample_dict[col + '_gt'] = values[i]
            val_data.append(sample_dict)

    val_data = pd.DataFrame(val_data)
    val_data = val_data.merge(preds, on='id_seqpos')

    rmses = []
    mses = []
    print('column\t\tRMSE\t\tMSE')
    for col in target_cols:
        # Compute the MSE once and derive the RMSE from it.
        mse = ((val_data[col] - val_data[col+'_gt']) ** 2).mean()
        rmse = mse ** .5
        rmses.append(rmse)
        mses.append(mse)
        print(f'{col}\t{rmse:0.5f}\t\t{mse:0.5f}')

    print(f'Mean RMSE : {np.mean(rmses):0.5f}')
    print(f'Mean MSE  : {np.mean(mses):0.5f}')
    print('\n')
    
def format_predictions(test_df, test_preds, val=False):
    """Flatten per-molecule prediction arrays into one long ``id_seqpos`` table.

    ``test_df`` and ``test_preds`` are parallel sequences: each frame supplies
    the ``id`` column and the matching array supplies one 2-D block per row,
    whose columns are named after the module-level ``pred_cols``. With
    ``val=True`` the molecule's ``SN_filter`` is copied onto every position.
    When the module-level ``TTA`` flag is set, rows sharing an ``id_seqpos``
    are averaged.
    """
    frames = []
    for frame, frame_preds in zip(test_df, test_preds):
        for row_pos, uid in enumerate(frame['id']):
            per_position = pd.DataFrame(frame_preds[row_pos], columns=pred_cols)
            n_positions = per_position.shape[0]
            per_position['id_seqpos'] = [f'{uid}_{pos}' for pos in range(n_positions)]
            if val:
                per_position['SN_filter'] = frame[frame['id'] == uid].SN_filter.values[0]
            frames.append(per_position)

    stacked = pd.concat(frames)
    if TTA:
        return stacked.groupby('id_seqpos').mean().reset_index()
    return stacked


def aug_data(df):
    """Append the augmented rows from ``48k_augment.csv`` for the ids present in ``df``.

    The augmentation file is read from
    ``../augmented-data-for-stanford-covid-vaccine/48k_augment.csv`` relative
    to the working directory. Augmented rows supply their own ``structure`` and
    ``predicted_loop_type``; all other columns are joined in from ``df`` on
    ``id`` and ``sequence``.

    Note: ``df`` itself is modified in place -- the columns ``cnt``,
    ``log_gamma`` (100), ``score`` (1.0) and ``aug`` (0) are added to it.

    Returns
    -------
    pandas.DataFrame
        The original rows (``aug == 0``) followed by the augmented rows
        (``aug == 1``), with the same columns as the updated ``df``. Index
        labels are not reset.
    """
    aug_df = pd.read_csv('../augmented-data-for-stanford-covid-vaccine/48k_augment.csv')
    aug_df['aug'] = 1

    # Drop the columns the augmentation file overrides so the merge does not
    # produce _x/_y duplicates.
    target_df = df.drop(columns=['structure', 'predicted_loop_type'])
    new_df = aug_df[aug_df['id'].isin(target_df['id'])]
    new_df = new_df.merge(target_df, on=['id','sequence'], how='left')

    df['cnt'] = df['id'].map(new_df.set_index('id')['cnt'].to_dict())
    df['log_gamma'] = 100
    df['score'] = 1.0
    df['aug'] = 0
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([df, new_df[df.columns]])

SEED = 42
def extend_sequence(df,p):
    """Append synthetic 130-long samples built from a random fraction ``p`` of ``df``.

    For each sampled row, every column in the module-level ``sequence_cols`` is
    replaced by its first 65 plus its last 65 elements, and every column in
    ``pred_cols`` by its first 45 plus its last 46 elements. ``seq_length`` and
    ``seq_scored`` are set to 130 and 91 to match.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; not modified.
    p : float
        Fraction of rows to sample (``int(len(df) * p)`` rows). NumPy's global
        RNG is re-seeded with ``SEED`` before sampling.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the synthetic rows, with a fresh index.
    """
    def _elementwise(frame, func):
        # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
        # whichever this pandas version provides.
        mapper = frame.map if hasattr(frame, 'map') else frame.applymap
        return mapper(func)

    np.random.seed(SEED)
    n = int(df.shape[0]*p)
    sample = df.sample(n).copy(deep=True)
    sample['seq_length'] = 130
    sample['seq_scored'] = 91
    sample[sequence_cols] = (_elementwise(sample[sequence_cols], lambda x:x[:65]) +
                             _elementwise(sample[sequence_cols], lambda x:x[-65:]))

    sample[pred_cols] = (_elementwise(sample[pred_cols], lambda x:x[:45]) +
                         _elementwise(sample[pred_cols], lambda x:x[-46:]))

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([df, sample], ignore_index=True)

In [None]:
# Competition data directory. The working directory is switched to it, so the
# relative paths used in this notebook ('train.json', 'bpps/...') resolve here.
PATH = './../input/stanford-covid-vaccine/'
os.chdir(PATH)
os.listdir()

In [None]:
# Run switches read by the data-preparation cell and by format_predictions.
DENOISE = False   # keep only training rows with SN_filter >= 1
RUN_TEST = False  # smoke test: sample 300 rows from train and from test
FLIP_AUG = False  # append reversed copies of the SN_filter >= 1 training rows
TTA = False       # add the augmented rows via aug_data; predictions are then averaged per id_seqpos
LONG_SEQ = False  # append synthetic 130-long samples via extend_sequence

In [None]:
# Both JSON-lines files get the same treatment: drop the 'index' column, order by id.
train, test = (
    pd.read_json(json_name,lines=True).drop(columns=['index']).sort_values(by='id')
    for json_name in ('train.json', 'test.json')
)

submission = pd.read_csv('sample_submission.csv')
# Relative paths of every file in the bpps/ directory.
npy = [f'bpps/{entry}' for entry in os.listdir('bpps')]

In [None]:
# Per-position input columns and the five per-position regression targets.
sequence_cols = ['sequence','structure','predicted_loop_type']
pred_cols = ['reactivity','deg_Mg_pH10','deg_pH10','deg_Mg_50C','deg_50C']


if TTA:
    train = aug_data(train)
    test = aug_data(test)


if LONG_SEQ:
    train = extend_sequence(train,0.25 if TTA else 1)


train['flip'] = 0
if FLIP_AUG:
    # Reversed copies are built from a fresh read of train.json, so they do not
    # include rows added by the TTA / LONG_SEQ steps above.
    train_flip = pd.read_json('train.json',lines=True).drop(columns=['index']).sort_values(by='id')
    # .copy() so the column assignments below act on an independent frame.
    train_flip = train_flip[train_flip['SN_filter'] >= 1].copy()
    train_flip['flip'] = 1
    _flip_source = train_flip[sequence_cols + pred_cols]
    # DataFrame.applymap was renamed to DataFrame.map in pandas 2.1.
    _flip_mapper = _flip_source.map if hasattr(_flip_source, 'map') else _flip_source.applymap
    train_flip[sequence_cols + pred_cols] = _flip_mapper(lambda x:x[::-1])
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train = pd.concat([train, train_flip], ignore_index=True)
    del train_flip, _flip_source, _flip_mapper

if DENOISE:
    train = train[train['SN_filter']>=1]

if RUN_TEST:
    train = train.sample(300)
    test = test.sample(300)

print(f'train shape : {train.shape}')
print(f'test shape : {test.shape}')

print(f'Null values if train data : {train.isnull().sum().sum()}')
print(f'Null values if test data : {test.isnull().sum().sum()}')

In [None]:
%%capture
"""def get_ohe(x,i):
    vec_len = {'(':3, ')':3, '.':3,
           'A':4, 'U':4, 'G':4, 'C':4,
           'B':7, 'E':7, 'H':7, 'I':7, 'M':7, 'S':7, 'X':7}
    x = np.zeros(vec_len[x],dtype=np.int32)
    x[i] = 1
    return x
mapping = dict()
mapping.update({x:get_ohe(x,i) for i,x in enumerate(['A','U','G','C'])})
mapping.update({x:get_ohe(x,i) for i,x in enumerate(['(',')','.'])})
mapping.update({x:get_ohe(x,i) for i,x in enumerate(list('BEHIMSX'))})
for c in ['sequence','structure','predicted_loop_type']:
    train[c] = train[c].apply(list)
    test[c] = test[c].apply(list)"""

In [None]:
# Remove every training column whose name contains 'error'.
cols = train.columns.tolist()
drop_cols = [name for name in cols if 'error' in name]
train.drop(columns=drop_cols,inplace=True)

# Token vocabularies for the three per-position inputs.
seq_values = ['A','U','G','C']
struct_values = ['.','(',')']
loop_values = ['B', 'E', 'H', 'I', 'S', 'X','M']

# token -> position in its vocabulary
seq_map = {token: pos for pos, token in enumerate(seq_values)}
struct_map = {token: pos for pos, token in enumerate(struct_values)}
loop_map = {token: pos for pos, token in enumerate(loop_values)}

In [None]:
# Each list holds the single tokens followed by every ordered pair of two
# DIFFERENT tokens (permutations, so 'AA'-style pairs are not included).
seq_comb = seq_values + [''.join(pair) for pair in itertools.permutations(seq_values,2)]

struct_comb = struct_values + [''.join(pair) for pair in itertools.permutations(struct_values,2)]

loop_comb = loop_values + [''.join(pair) for pair in itertools.permutations(loop_values,2)]

In [None]:
# Turn each per-position string into a list of single-character tokens.
for _frame in (train, test):
    for c in ['sequence','structure','predicted_loop_type']:
        _frame[c] = _frame[c].apply(list)

In [None]:
def _content_fraction(tokens, pattern):
    """Return occurrences of ``pattern`` in ``tokens`` divided by the number of tokens.

    ``tokens`` may be a string or a list of single characters. It is joined
    into one string first, so two-character patterns are really searched for:
    ``list.count('AU')`` on a list of single characters is always 0.
    """
    text = ''.join(tokens)
    if not text:
        return 0.0
    return text.count(pattern)/len(text)


# '<pattern>_content' features. The divisor is each row's own length instead of
# a fixed 107 (train) / 130 (test): the test set also contains 107-long
# sequences, which were previously divided by 130.
for _source_col, _patterns in (('sequence', seq_comb),
                               ('structure', struct_comb),
                               ('predicted_loop_type', loop_comb)):
    for c in _patterns:
        train[f'{c}_content'] = train[_source_col].apply(lambda x, c=c: _content_fraction(x, c))
        test[f'{c}_content'] = test[_source_col].apply(lambda x, c=c: _content_fraction(x, c))

In [None]:
def get_bpm_feature(df,len_):
    """Compute per-position summary features from each molecule's base-pair matrix.

    For every id in ``df.id`` the matrix is loaded with ``get_bppm`` and reduced
    along axis 1 to max, sum, ``1 - sum``, mean and std, each divided by
    ``len_``. The sixth feature is the share of entries > 0 per column,
    standardised with the fixed constants below.

    Returns six lists (one array per molecule) in the order
    ``max, sum, upb, mean, std, nb``.
    """
    bpps_nb_mean = 0.077522
    bpps_nb_std = 0.08914

    features = {key: [] for key in ('max', 'sum', 'upb', 'mean', 'std', 'nb')}

    for idx in tqdm(df.id.values):
        bpm_ar = get_bppm(idx)
        scaled_sum = np.sum(bpm_ar,axis=1)/len_

        features['max'].append(np.max(bpm_ar,axis=1)/len_)
        features['sum'].append(scaled_sum)
        features['upb'].append(1-scaled_sum)
        features['mean'].append(np.mean(bpm_ar,axis=1)/len_)
        features['std'].append(np.std(bpm_ar,axis=1)/len_)

        nonzero_share = (bpm_ar > 0).sum(axis=0) / bpm_ar.shape[0]
        features['nb'].append((nonzero_share - bpps_nb_mean) / bpps_nb_std)

    return (features['max'], features['sum'], features['upb'],
            features['mean'], features['std'], features['nb'])

bpm_max_tr,bpm_sum_tr,bpm_upb_tr,bpm_mean_tr,bpm_std_tr,bpm_nb_tr = get_bpm_feature(train,107)
bpm_max_tst,bpm_sum_tst,bpm_upb_tst,bpm_mean_tst,bpm_std_tst,bpm_nb_tst = get_bpm_feature(test,130)

# Attach every feature to both frames (same column order as before).
for _bpm_col, _tr_values, _tst_values in (
    ('bpm_max', bpm_max_tr, bpm_max_tst),
    ('bpm_sum', bpm_sum_tr, bpm_sum_tst),
    ('bpm_upb', bpm_upb_tr, bpm_upb_tst),
    ('bpm_std', bpm_std_tr, bpm_std_tst),
    ('bpm_nb', bpm_nb_tr, bpm_nb_tst),
    ('bpm_mean', bpm_mean_tr, bpm_mean_tst),
):
    train[_bpm_col] = _tr_values
    test[_bpm_col] = _tst_values

In [None]:
# Split both frames by the number of scored positions (68 vs 91).
train_short = train.loc[train['seq_scored'] == 68]
train_long = train.loc[train['seq_scored'] == 91]


public_test = test.loc[test['seq_scored'] == 68]
private_test = test.loc[test['seq_scored'] == 91]

print(f'train short shape : {train_short.shape}')
print(f'train long shape : {train_long.shape}')

In [None]:
%%capture
# Numeric columns that are constant (std == 0) in train or in test.
desc_tr = train.describe().T.sort_values(by='std')
std0_tr = desc_tr[desc_tr['std'] == 0].index.tolist()

desc_tst = test.describe().T.sort_values(by='std')
std0_tst = desc_tst[desc_tst['std'] == 0].index.tolist()

std0_cols = set(std0_tr + std0_tst)
# Always keep 'flip'. discard() instead of remove(): with FLIP_AUG enabled the
# column is not constant, is not in the set, and remove() raised KeyError.
std0_cols.discard('flip')

# errors='ignore': a column that is constant in one frame may not exist in the
# other (e.g. train-only columns), which made drop() raise.
train.drop(columns=list(std0_cols),inplace=True,errors='ignore')

# public_test / private_test are filtered views of `test`; rebind them rather
# than dropping in place on a slice.
public_test = public_test.drop(columns=list(std0_cols),errors='ignore')
private_test = private_test.drop(columns=list(std0_cols),errors='ignore')

In [None]:
#train_short = train_short.sort_values(by='sequence').reset_index(drop=True)
#train_long = train_long.sort_values(by='sequence').reset_index(drop=True)

#display(train_short.head(1))
#display(train_long.head(1))

In [None]:
SPLITS = 5
# Only rows with signal_to_noise >= 0.25 take part in the stratified split.
tr_mask = (train['signal_to_noise'] >= 0.25).values
tr_data = train[tr_mask]

# Start from an empty fold column so that re-running the cell is idempotent.
train['folds'] = np.nan
fold_col = train.columns.get_loc('folds')

if not TTA:
    print('RepeatedStratifiedKFold')
    folds = RepeatedStratifiedKFold(n_splits=SPLITS,n_repeats=1,random_state=42)
    # split() yields POSITIONS within tr_data. `train` is sorted by id and
    # filtered, so its index labels are neither contiguous nor aligned with
    # those positions; using them with .loc tagged the wrong rows. Map them
    # back to positions in `train` and assign by position.
    tr_positions = np.flatnonzero(tr_mask)
    for i,(_,val_idx) in enumerate(folds.split(tr_data['id'],tr_data['SN_filter'])):
        train.iloc[tr_positions[val_idx], fold_col] = i
else:
    print('stratified_group_k_fold')
    # stratified_group_k_fold also yields positions (into `train` itself).
    for i,(_,val_idx) in enumerate(stratified_group_k_fold(train,train['SN_filter'],train['id'],SPLITS)):
        train.iloc[val_idx, fold_col] = i

print(train.folds.value_counts())

In [None]:
# Reset Keras global state before (re)defining the model.
K.clear_session()
EMB_SIZE = 256             # embedding width; also the Conv1D filter count and 2x the per-direction RNN units
DROPOUT = 0.1              # rate shared by SpatialDropout1D, the recurrent layers and attention
BATCH_SIZE = 64*REPLICAS   # global batch size, scaled by the replica count
LR = 0.01*REPLICAS         # learning rate, scaled by the replica count
HEADS = 4                  # number of channel slices used by multi_head

def MCRMSE(y_true, y_pred):
    """Mean column-wise RMSE: RMSE over axis 1 for each output column, then the mean over columns."""
    squared_error = tf.square(y_true-y_pred)
    per_column_rmse = tf.sqrt(tf.reduce_mean(squared_error, axis=1))
    return tf.reduce_mean(per_column_rmse, axis=1)

def RMSE(y_true,y_pred):
    """Square root of Keras' unreduced (``Reduction.NONE``) mean squared error."""
    unreduced_mse = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
    return tf.math.sqrt(unreduced_mse(y_true,y_pred))


# Shared integer id for every structure, nucleotide and loop-type token.
token2int = {x:i for i, x in enumerate('().AUGCBEHIMSX')}
def preprocess_data(df, cols):
    """Encode token-sequence columns as one integer array of shape ``(rows, positions, len(cols))``.

    Parameters
    ----------
    df : pandas.DataFrame
        Each cell of ``cols`` holds a sequence of tokens (a list of characters
        or a string). All cells must have the same length.
    cols : list of str
        Column names. The first two and the remaining ones are encoded
        separately and concatenated on the last axis, preserving ``cols`` order.

    Returns
    -------
    numpy.ndarray
        ``token2int`` ids, indexed as ``[row, position, column]``.
    """
    def _encode(columns):
        # Nested comprehension instead of DataFrame.applymap, which is
        # deprecated since pandas 2.1.
        encoded = [[[token2int[token] for token in cell] for cell in row]
                   for row in df[columns].values.tolist()]
        # (rows, columns, positions) -> (rows, positions, columns)
        return np.array(encoded).transpose(0,2,1)

    base_fea1 = _encode(cols[:2])
    base_fea2 = _encode(cols[2:])
    return np.concatenate([base_fea1, base_fea2], 2)

def gru_model():
    """Return a new bidirectional GRU layer (``EMB_SIZE//2`` units per direction, full sequences)."""
    recurrent = L.GRU(EMB_SIZE//2,dropout=DROPOUT,return_sequences=True,kernel_initializer='orthogonal')
    return L.Bidirectional(recurrent)

def lstm_model():
    """Return a new bidirectional LSTM layer (``EMB_SIZE//2`` units per direction, full sequences)."""
    recurrent = L.LSTM(EMB_SIZE//2,dropout=DROPOUT,return_sequences=True,kernel_initializer='orthogonal')
    return L.Bidirectional(recurrent)

def conv_block(ksize):
    """Return a new same-padded, swish-activated ``Conv1D`` layer with ``EMB_SIZE`` filters."""
    conv_kwargs = dict(filters=EMB_SIZE,kernel_size=ksize,padding='same',activation='swish')
    return L.Conv1D(**conv_kwargs)

def attn_block(layer):
    """Self-attention over ``layer`` using three separate 1x1 conv projections.

    The projections are created in the order query, value, key and passed to
    ``L.Attention`` as ``[query, value, key]``.
    """
    # Generator unpacking builds the three projections one after another, in this order.
    query, value, key = (conv_block(1)(layer) for _ in range(3))
    attention = L.Attention(dropout=DROPOUT,use_scale=True)
    return attention([query,value,key])


def multi_head(layer):
    """Split the last axis into ``HEADS`` equal slices, run ``attn_block`` on each, concatenate on axis 2."""
    width = layer.shape[-1]
    assert width % HEADS == 0
    per_head = width//HEADS

    head_outputs = []
    for head in range(HEADS):
        channel_slice = layer[:,:,head*per_head:(head+1)*per_head]
        head_outputs.append(attn_block(channel_slice))

    return tf.concat(head_outputs,axis=2)
    

def process_block(seq,seq_len):
    """Embed a block of token channels and run it through four LSTM + attention stages.

    The embeddings of all channels at one position are flattened into a single
    vector by the reshape, then passed through spatial dropout and a width-2
    convolution. The result goes through four consecutive
    ``lstm_model`` -> ``multi_head`` stages.
    """
    embedded = L.Embedding(len(token2int),EMB_SIZE)(seq)
    flat_width = embedded.shape[2]*embedded.shape[3]
    hidden = L.Reshape((seq_len,flat_width))(embedded)
    hidden = L.SpatialDropout1D(DROPOUT)(hidden)
    hidden = conv_block(2)(hidden)

    # Four identical recurrent + attention stages, created in sequence.
    for _ in range(4):
        hidden = multi_head(lstm_model()(hidden))

    return hidden

def build_model(seq_len, pred_len):
    """Build and compile the two-branch model.

    The input has shape ``(seq_len, 3)``. Channels 0-1 and channel 2 are
    processed by separate ``process_block`` branches (with ``preprocess_data``
    and ``sequence_cols`` these are presumably sequence + structure and loop
    type -- confirm against the caller). The branch outputs are concatenated,
    batch-normalised, cut to the first ``pred_len`` positions and projected to
    5 linear outputs. The model is compiled with ``RectifiedAdam`` and the
    ``MCRMSE`` loss.
    """
    inp = L.Input(shape=(seq_len,3))

    # Slice both channel groups first, then build the branches.
    first_two_channels = inp[:,:,:2]
    last_channel = inp[:,:,2:]

    first_branch = process_block(first_two_channels,seq_len)
    second_branch = process_block(last_channel,seq_len)

    merged = L.Concatenate(axis=2)([first_branch, second_branch])
    merged = L.BatchNormalization()(merged)

    scored = merged[:,:pred_len]
    out = L.Dense(5,activation='linear')(scored)

    model = tf.keras.Model(inputs=[inp],outputs=out)
    model.compile(optimizer=RectifiedAdam(lr=LR),loss=MCRMSE)
    return model
    
# Build a small 68 -> 68 instance only to render the architecture diagram.
model = build_model(68,68)
tf.keras.utils.plot_model(model,to_file='./../../working/model.png',show_shapes=True,dpi=55)

In [None]:
cols = sequence_cols
# Accumulators for the fold-averaged test predictions: (rows, sequence length, 5 targets).
private_preds = np.zeros((private_test.shape[0],130,5))
public_preds = np.zeros((public_test.shape[0],107,5))
# NOTE(review): val_preds is allocated here but never written or read in the visible cells.
val_preds = np.zeros((train.shape[0],107,5))

# Encoded test inputs, computed once and reused for every fold.
private_data =  preprocess_data(private_test,cols)
public_data =  preprocess_data(public_test,cols)

# Per-fold validation losses, filled by the training loop.
non_fil_score = []
fil_score = []

In [None]:
# Cross-validated training. For each fold: train on the other folds, checkpoint
# the best weights by val_loss, evaluate on two validation subsets, and add this
# fold's share (1/SPLITS) of the public and private test predictions.
gc.collect()
EPOCHS = 30
# NOTE(review): pred_cols_1 is not used anywhere in the visible cells.
pred_cols_1 = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C']
for i in range(SPLITS):
    print(f'Training on fold {i}')
    K.clear_session()
    
    # NOTE(review): the filter here is `> 0.25`, while the fold-assignment cell
    # selects rows with `>= 0.25`.
    tr_df = train[train['signal_to_noise'] > 0.25].query(f'folds != {i}')
    valid_df1 = train[train['signal_to_noise'] > 0.25].query(f'folds == {i}')
    # Stricter validation subset: signal_to_noise >= 1 (reported as "Filtered").
    valid_df2 = train[train['signal_to_noise'] >= 1].query(f'folds == {i}')
    
    
    tr_values = preprocess_data(tr_df,['sequence','structure','predicted_loop_type'])
    # Targets rearranged to (rows, positions, targets).
    tr_label = np.array(tr_df[pred_cols].values.tolist()).transpose(0,2,1)
    
    
    valid_val = preprocess_data(valid_df1,['sequence','structure','predicted_loop_type'])
    valid_label = np.array(valid_df1[pred_cols].values.tolist()).transpose(0,2,1)
    valid_data1 = tf.data.Dataset.from_tensor_slices((valid_val,valid_label)).batch(BATCH_SIZE).prefetch(AUTO)
    
    valid_val = preprocess_data(valid_df2,['sequence','structure','predicted_loop_type'])
    valid_label = np.array(valid_df2[pred_cols].values.tolist()).transpose(0,2,1)
    valid_data2 = tf.data.Dataset.from_tensor_slices((valid_val,valid_label)).batch(BATCH_SIZE).prefetch(AUTO)
    
    # Per-sample weight that grows with signal_to_noise: log1p(s2n + 0.01) / 2.
    weight = np.log1p(tr_df['signal_to_noise'] + 0.01)/2
    #weight = np.log1p((tr_df['signal_to_noise'].max()-tr_df['signal_to_noise']) + 0.01)/2.0
    
    lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                       factor=0.85,
                                                       patience=2,
                                                       verbose=True)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=f'./../../working/fold_{i}_best_weight.h5',
                                                    monitor='val_loss',
                                                    save_weights_only=True,
                                                    save_best_only=True,
                                                    mode='min',
                                                    verbose=True)

    print(f'Building models on distribution strategy mode')
    with strategy.scope():
        # Three instances that differ only in input / output length. The same
        # weight file is loaded into all of them below, so no layer's weights
        # are expected to depend on the sequence length.
        train_model = build_model(107,68)
        model_short = build_model(107,107)
        model_long = build_model(130,130)

    train_model.fit(tr_values,tr_label,
                    epochs=EPOCHS,
                    sample_weight = weight,
                    batch_size=BATCH_SIZE,
                    validation_data = valid_data1, #[valid_val,valid_label],
                    callbacks=[lr_schedule,checkpoint],
                    verbose=True)
    
    print(f'Loading fold {i} best weights')
    train_model.load_weights(f'./../../working/fold_{i}_best_weight.h5')
    model_short.load_weights(f'./../../working/fold_{i}_best_weight.h5')
    model_long.load_weights(f'./../../working/fold_{i}_best_weight.h5')
    
    print(f'Predicting validation data')
    print(f'----------Non-Filtered Data-----------')
    non_fil_score.append(train_model.evaluate(valid_data1,verbose=True))
    print(f'----------Filtered Data-----------')
    fil_score.append(train_model.evaluate(valid_data2,verbose=True))
    
    print(f'Predicting public test data')
    # Each fold contributes 1/SPLITS, so the accumulators end up as fold averages.
    public_preds += model_short.predict(public_data,verbose=True)/SPLITS
    
    print(f'Predicting private test data')
    private_preds += model_long.predict(private_data,verbose=True)/SPLITS
    
    print('#'*100)
    print('#'*100)
    print('#'*100)
    
    # Release per-fold objects before the next fold is built.
    del tr_df,valid_df1,valid_df2,tr_values,tr_label,valid_val,
    del valid_label,valid_data1,valid_data2,weight
    del lr_schedule,checkpoint,train_model,model_short,model_long
    
    gc.collect()
    
print(f'Mean MCRMSE filtered data : {np.mean(fil_score):0.5f}')
print(f'Mean MCRMSE Non filtered data : {np.mean(non_fil_score):0.5f}')
print(f'Mean MCRMSE total data : {(np.mean(non_fil_score)+np.mean(fil_score))/2.0:0.5f}')

In [None]:
# Validation loss per fold for both validation subsets.
plt.plot(non_fil_score,label='non_fil_score')
plt.plot(fil_score,label='fil_score')
# One tick per recorded fold; this used to be hard-coded to five folds.
plt.xticks(list(range(len(non_fil_score))),color='white')
plt.yticks(color='white')
plt.legend()

In [None]:
# NOTE(review): duplicate definition -- this function is already defined, with
# the same body, in the helper cell near the top of the notebook. This later
# copy silently replaces it; keep only one.
def format_predictions(test_df, test_preds, val=False):
    """Flatten per-molecule prediction arrays into one long ``id_seqpos`` table.

    ``test_df`` and ``test_preds`` are parallel sequences of frames and
    prediction arrays. Columns are named after the module-level ``pred_cols``.
    With ``val=True`` the molecule's ``SN_filter`` is added to every row. When
    the module-level ``TTA`` flag is set, rows sharing an ``id_seqpos`` are averaged.
    """
    preds = []
    
    for df, preds_ in zip(test_df, test_preds):
        for i, uid in enumerate(df['id']):
            single_pred = preds_[i]

            single_df = pd.DataFrame(single_pred, columns=pred_cols)
            single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]
            if val:
                single_df['SN_filter'] = df[df['id'] == uid].SN_filter.values[0]

            preds.append(single_df)
    return pd.concat(preds).groupby('id_seqpos').mean().reset_index() if TTA else pd.concat(preds)

In [None]:
# Pair each prediction array with its test subset (same order in both lists)
# and flatten them into one row per id_seqpos.
preds_df = [public_preds,private_preds]
test_df = [public_test,private_test]
sub_preds = format_predictions(test_df, preds_df)

In [None]:
# Fix the column order of the submission table.
sub_preds = sub_preds[['id_seqpos','reactivity',
                       'deg_Mg_pH10','deg_pH10',
                       'deg_Mg_50C','deg_50C']]
if TTA:
    # Left-join onto the sample submission's id_seqpos column, which fixes the row set and order.
    sub_preds = submission[['id_seqpos']].merge(sub_preds,on=['id_seqpos'],how='left')
print(sub_preds.shape)
sub_preds.head()

In [None]:
# Write the submission CSV to the working directory, without the index column.
sub_file = './../../working/submission_autoencoder_seq_struct_long_short_20.csv'
sub_preds.to_csv(sub_file,index=False)

# Garbage

In [None]:
def get_train_fold(df_short,df_long,f):
    """Return training inputs, labels and sample weights for fold ``f`` of both frames.

    From each frame, rows with ``signal_to_noise >= 0.25`` and ``folds != f``
    are kept. The result is the flat 6-tuple
    ``(short_data, short_label, short_weight, long_data, long_label, long_weight)``.
    """
    def _prepare(frame):
        fold_rows = frame[frame['signal_to_noise'] >= 0.25].query(f'folds != {f}')
        data = preprocess_data(fold_rows,sequence_cols)
        label = np.array(fold_rows[pred_cols].values.tolist()).transpose(0,2,1)
        weight = np.log1p(fold_rows['signal_to_noise'] + 0.01)/2.0
        return data, label, weight

    short_parts = _prepare(df_short)
    long_parts = _prepare(df_long)
    return (*short_parts, *long_parts)


def get_valid_fold(df_short,df_long,f):
    """Return batched validation datasets ``(short, long)`` for fold ``f``.

    From each frame, rows with ``signal_to_noise >= 0.25`` and ``folds == f``
    are encoded and wrapped in a ``tf.data.Dataset`` batched by ``BATCH_SIZE``
    and prefetched.
    """
    def _as_dataset(frame):
        fold_rows = frame[frame['signal_to_noise'] >= 0.25].query(f'folds == {f}')
        data = preprocess_data(fold_rows,sequence_cols)
        label = np.array(fold_rows[pred_cols].values.tolist()).transpose(0,2,1)
        return tf.data.Dataset.from_tensor_slices((data,label)).batch(BATCH_SIZE).prefetch(AUTO)

    return _as_dataset(df_short), _as_dataset(df_long)

In [None]:
"""gc.collect()
EPOCHS = 30
score_long = []
score_short = []

for i in range(SPLITS):
    print(f'Training on fold {i}')
    K.clear_session()
    tr_short,tr_short_label,tr_short_weight,tr_long,tr_long_label,tr_long_weight = get_train_fold(train_short,train_long,i)
    val_short, val_long = get_valid_fold(train_short,train_long,i)
    
    
    lr_schedule_short = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                       factor=0.85,
                                                       patience=3,
                                                       verbose=True)
    #lr_schedule_long = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
    #                                                   factor=0.85,
    #                                                   patience=3,
    #                                                   verbose=True)
    
    #checkpoint_long = tf.keras.callbacks.ModelCheckpoint(filepath=f'./../../working/fold_{i}_best_weight_long.h5',
    #                                                monitor='val_loss',
    #                                                save_weights_only=True,
    #                                                save_best_only=True,
    #                                                mode='min',
    #                                                verbose=True)
    checkpoint_short = tf.keras.callbacks.ModelCheckpoint(filepath=f'./../../working/fold_{i}_best_weight_short.h5',
                                                    monitor='val_loss',
                                                    save_weights_only=True,
                                                    save_best_only=True,
                                                    mode='min',
                                                    verbose=True)

    print(f'Building models on distribution strategy mode')
    with strategy.scope():
        train_model_short = build_model(107,68)
        #train_model_long = build_model(130,91)
        
        short_seq_model_short = build_model(107,107) #Short sequences with short model weights
        #short_seq_model_long = build_model(107,107)  #Short sequences with long model weights
        
        long_seq_model_short = build_model(130,130)  #Long sequences with short model weights
        #long_seq_model_long = build_model(130,130)   #Long sequences with long model weights
    
    print('-'*50)
    #print('Training LONG model...')
    #train_model_long.fit(tr_long,tr_long_label,
    #                epochs=EPOCHS,
    #                sample_weight = tr_long_weight,
    #                batch_size=BATCH_SIZE,
    #                validation_data = val_long,
    #                callbacks=[lr_schedule_long,checkpoint_long],
    #                verbose=True)
    
    print('Training SHORT model...')
    train_model_short.fit(tr_short,tr_short_label,
                    epochs=EPOCHS,
                    sample_weight = tr_short_weight,
                    batch_size=BATCH_SIZE,
                    validation_data = val_short,
                    callbacks=[lr_schedule_short,checkpoint_short],
                    verbose=True)
    
    print('-'*50)
    
    print(f'Loading fold {i} SHORT model best weight')
    train_model_short.load_weights(f'./../../working/fold_{i}_best_weight_short.h5')
    short_seq_model_short.load_weights(f'./../../working/fold_{i}_best_weight_short.h5')
    long_seq_model_short.load_weights(f'./../../working/fold_{i}_best_weight_short.h5')
    
    
    #print(f'Loading fold {i} LONG model best weight')
    #train_model_short.load_weights(f'./../../working/fold_{i}_best_weight_long.h5')
    #short_seq_model_long.load_weights(f'./../../working/fold_{i}_best_weight_long.h5')
    #long_seq_model_long.load_weights(f'./../../working/fold_{i}_best_weight_long.h5')
    
    print('-'*50)
    
    print(f'Predicting validation data')
    #print(f'++++++++++  Long Seq Data   +++++++++++')
    #score_long.append(train_model_long.evaluate(val_long,verbose=True))
    
    print(f'++++++++++  Short Seq Data  +++++++++++')
    score_short.append(train_model_short.evaluate(val_short,verbose=True))
    
    print('-'*50)
    print(f'Predicting public test data')
    public_preds += short_seq_model_short.predict(public_data,verbose=True)
    #public_preds += short_seq_model_long.predict(public_data,verbose=True)
    public_preds /= SPLITS
    
    print(f'Predicting private test data')
    private_preds += long_seq_model_short.predict(private_data,verbose=True)
    #private_preds += long_seq_model_long.predict(private_data,verbose=True)
    private_preds /= SPLITS
    
    print('#'*100)
    print('#'*100)
    print('#'*100)
    
    del checkpoint_short,lr_schedule_short,val_short, #checkpoint_long,lr_schedule_long
    del tr_short,tr_short_label,tr_short_weight#,val_long,tr_long,tr_long_label,tr_long_weight
    del train_model_short#,train_model_long 
    del short_seq_model_short#,short_seq_model_long
    del long_seq_model_short#,long_seq_model_long
    
    gc.collect()
    
print(f'Mean MCRMSE SHORT data : {np.mean(score_short):0.5f}')
#print(f'Mean MCRMSE LONG data : {np.mean(score_long):0.5f}')
#print(f'Mean MCRMSE Total data : {((np.mean(score_long)+np.mean(score_short))/2.0):0.5f}')"""

In [None]:
"""K.clear_session()
EMB_SIZE = 256
DROPOUT = 0.1
BATCH_SIZE = 64*REPLICAS
LR = 0.01*REPLICAS

def MCRMSE(y_true, y_pred):
    columnwise_mse = tf.reduce_mean(tf.square(y_true-y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(columnwise_mse), axis=1)

def RMSE(y_true,y_pred):
    loss = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)(y_true,y_pred)
    return tf.math.sqrt(loss)


token2int = {x:i for i, x in enumerate('().AUGCBEHIMSX')}
def preprocess_data(df, cols):
    base_fea1 = np.array(df[cols[:2]].applymap(lambda x:[token2int[i] for i in x]).values.tolist()).transpose(0,2,1)
    base_fea2 = np.array(df[cols[2:]].applymap(lambda x:[token2int[i] for i in x]).values.tolist()).transpose(0,2,1)
    #base_fea3 = np.array(df[cols[:3:2]].applymap(lambda x:[token2int[i] for i in x]).values.tolist()).transpose(0,2,1)
    
    bpps_sum_fea = np.array(df['bpm_sum'].to_list())[:,:,np.newaxis]
    bpps_max_fea = np.array(df['bpm_max'].to_list())[:,:,np.newaxis]
    #bpps_std_fea = np.array(df['bpm_std'].to_list())[:,:,np.newaxis]
    #bpps_upb_fea = np.array(df['bpm_upb'].to_list())[:,:,np.newaxis]
    data = np.concatenate([base_fea1, base_fea2, bpps_sum_fea,bpps_max_fea], 2)
    return data

def gru_model():
    return L.Bidirectional(
        L.GRU(EMB_SIZE//2,dropout=DROPOUT,return_sequences=True,kernel_initializer='orthogonal')
    )

def lstm_model():
    return L.Bidirectional(
        L.LSTM(EMB_SIZE//2,dropout=DROPOUT,return_sequences=True,kernel_initializer='orthogonal')
    )

def conv_block(ksize):
    return L.Conv1D(filters=EMB_SIZE,kernel_size=ksize,padding='same',
                    activation='swish')

def attn_block(layer):
    query = conv_block(1)(layer)
    key = conv_block(1)(layer)
    value = conv_block(1)(layer)
    
    attn = L.Attention(dropout=DROPOUT,use_scale=True)([query,value,key])
    
    add = L.Add()([layer,attn])
    norm = L.LayerNormalization()(add)
    drop = L.SpatialDropout1D(DROPOUT)(norm)
    
    return drop

def process_block(seq,seq_len):
    seq_emb = L.Embedding(len(token2int),EMB_SIZE)(seq)
    seq_reshape = L.Reshape((seq_len,seq_emb.shape[2]*seq_emb.shape[3]))(seq_emb)
    seq_drop = L.SpatialDropout1D(DROPOUT)(seq_reshape)
    #seq_conv = conv_block(2)(seq_drop)
    
    lstm1 = lstm_model()(seq_drop)
    attn1 = attn_block(lstm1)
    
    lstm2 = lstm_model()(attn1)
    attn2 = attn_block(lstm2)
    
    lstm3 = lstm_model()(attn2)
    attn3 = attn_block(lstm3)
    
    #lstm3_mul = L.Multiply()([lstm1,lstm3])
    #lstm3_norm = L.BatchNormalization()(lstm3_mul)
    #lstm3_drop = L.SpatialDropout1D(DROPOUT)(lstm3_norm)
    
    lstm4 = lstm_model()(attn3)
    attn4 = attn_block(lstm4)
    
    #lstm4_mul = L.Multiply()([lstm2,lstm4])
    #lstm4_norm = L.BatchNormalization()(lstm4_mul)
    #lstm4_drop = L.SpatialDropout1D(DROPOUT)(lstm4_norm)
    
    return attn4

def build_model(seq_len, pred_len):
    inp = L.Input(shape=(seq_len,6))
    seq1 = inp[:,:,:2]
    seq2 = inp[:,:,1:3]
    
    seq3 = inp[:,:,3:]
    seq3_conv = conv_block(2)(seq3)
    
    seq1_out = process_block(seq1,seq_len)
    seq2_out = process_block(seq2,seq_len)
    
    concat = L.Concatenate(axis=2)([seq1_out, seq2_out, seq3_conv])
    concat = L.BatchNormalization()(concat)
    out = L.SpatialDropout1D(DROPOUT)(concat)

    out = out[:,:pred_len]
    out = L.Dense(5,activation='linear')(out)
    
    model = tf.keras.Model(inputs=inp,outputs=out)
    model.compile(optimizer=tf.keras.optimizers.Adam(lr=LR),loss=MCRMSE)
    return model
    
model = build_model(68,68)
tf.keras.utils.plot_model(model,to_file='./../../working/model.png',show_shapes=True,dpi=55)"""

In [None]:
"""K.clear_session()
EMB_SIZE = 256
DROPOUT = 0.1
BATCH_SIZE = 64*REPLICAS
LR = 0.01*REPLICAS

def MCRMSE(y_true, y_pred):
    columnwise_mse = tf.reduce_mean(tf.square(y_true-y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(columnwise_mse), axis=1)

def RMSE(y_true,y_pred):
    loss = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)(y_true,y_pred)
    return tf.math.sqrt(loss)


token2int = {x:i for i, x in enumerate('().AUGCBEHIMSX')}
def preprocess_data(df, cols):
    base_fea1 = np.array(df[cols[:2]].applymap(lambda x:[token2int[i] for i in x]).values.tolist()).transpose(0,2,1)
    base_fea2 = np.array(df[cols[2:]].applymap(lambda x:[token2int[i] for i in x]).values.tolist()).transpose(0,2,1)
    
    bpps_sum_fea = np.array(df['bpm_sum'].to_list())[:,:,np.newaxis]
    bpps_max_fea = np.array(df['bpm_max'].to_list())[:,:,np.newaxis]
    #bpps_mean_fea = np.array(df['bpm_mean'].to_list())[:,:,np.newaxis]
    bpps_upb_fea = np.array(df['bpm_upb'].to_list())[:,:,np.newaxis]
    data = np.concatenate([base_fea1, base_fea2, bpps_sum_fea, bpps_max_fea, bpps_upb_fea], 2)
    return data

def gru_model():
    return L.Bidirectional(L.GRU(EMB_SIZE//2,dropout=DROPOUT,return_sequences=True,kernel_initializer='orthogonal'))

def lstm_model():
    return L.Bidirectional(L.LSTM(EMB_SIZE//2,dropout=DROPOUT,return_sequences=True,kernel_initializer='orthogonal'))


def process_block(seq,seq_len):
    seq_emb = L.Embedding(len(token2int),EMB_SIZE)(seq)
    seq_reshape = L.Reshape((seq_len,seq_emb.shape[2]*seq_emb.shape[3]))(seq_emb)
    seq_drop = L.SpatialDropout1D(DROPOUT)(seq_reshape)
    seq_conv = L.Conv1D(filters=EMB_SIZE,kernel_size=2,padding='same')(seq_drop)
    
    lstm1 = lstm_model()(seq_conv)
    lstm2 = lstm_model()(lstm1)
    
    conv_block = L.Conv1D(filters=EMB_SIZE,kernel_size=1,padding='same',
                         activation=tf.keras.activations.swish)
    
    query = conv_block(lstm2)
    key = conv_block(lstm2)
    value = conv_block(lstm2)
    attn = L.Attention(dropout=DROPOUT,use_scale=True)([query,value,key])
    
    lstm3 = lstm_model()(attn)
    lstm_mul = L.Multiply()([lstm1,lstm3])
    lstm_norm = L.BatchNormalization()(lstm_mul)
    lstm4 = lstm_model()(lstm_norm)
    
    return lstm4

def build_model(seq_len, pred_len):
    inp = L.Input(shape=(seq_len,6))
    seq1 = inp[:,:,:2]
    seq2 = inp[:,:,1:3]
    
    seq3 = inp[:,:,3:]
    seq3_dense = L.Conv1D(filters=EMB_SIZE,kernel_size=2,padding='same',
                          activation=tf.keras.activations.swish)(seq3)
    
    seq1_out = process_block(seq1,seq_len)
    seq2_out = process_block(seq2,seq_len)
    
    concat = L.Concatenate(axis=2)([seq1_out, seq2_out, seq3_dense])
    out = L.SpatialDropout1D(DROPOUT)(concat)
    out = L.Dense(256)(out)
    out = out[:,:pred_len]
    out = L.Dense(5,activation='linear')(out)
    
    model = tf.keras.Model(inputs=inp,outputs=out)
    model.compile(optimizer=RectifiedAdam(lr=LR),loss=MCRMSE)
    return model
    
model = build_model(68,68)
tf.keras.utils.plot_model(model,to_file='./../../working/model.png',show_shapes=True,dpi=55)"""

In [None]:
"""comb = np.array(train[sequence_cols].values.tolist()).transpose(0,2,1)
combined_seq = []
for i in range(len(comb)):
    join = []
    for j in range(len(comb[i])):
        join.append(''.join(comb[i][j]))
    combined_seq.append(q)
train['combined_seq'] = combined_seq



comb = np.array(test[sequence_cols].values.tolist()).transpose(0,2,1)
combined_seq = []
for i in range(len(comb)):
    join = []
    for j in range(len(comb[i])):
        join.append(''.join(comb[i][j]))
    combined_seq.append(q)
test['combined_seq'] = combined_seq"""

In [None]:
"""%%capture
train['sequence'] = train['sequence'].apply(lambda x:[seq_map[i] for i in x])
public_test['sequence'] = public_test['sequence'].apply(lambda x:[seq_map[i] for i in x])
private_test['sequence'] = private_test['sequence'].apply(lambda x:[seq_map[i] for i in x])


train['structure'] = train['structure'].apply(lambda x:[struct_map[i] for i in x])
public_test['structure'] = public_test['structure'].apply(lambda x:[struct_map[i] for i in x])
private_test['structure'] = private_test['structure'].apply(lambda x:[struct_map[i] for i in x])


train['predicted_loop_type'] = train['predicted_loop_type'].apply(lambda x:[loop_map[i] for i in x])
public_test['predicted_loop_type'] = public_test['predicted_loop_type'].apply(lambda x:[loop_map[i] for i in x])
private_test['predicted_loop_type'] = private_test['predicted_loop_type'].apply(lambda x:[loop_map[i] for i in x])


train['combined_seq'] = train['combined_seq'].apply(lambda x:[pairs[i] for i in x])
public_test['combined_seq'] = public_test['combined_seq'].apply(lambda x:[pairs[i] for i in x])
private_test['combined_seq'] = private_test['combined_seq'].apply(lambda x:[pairs[i] for i in x])"""

In [None]:
"""
BEST MODEL 1
K.clear_session()
EMB_SIZE = 256
DROPOUT = 0.1
BATCH_SIZE = 64*REPLICAS
LR = 0.01*REPLICAS

def gru_model():
    return L.Bidirectional(L.GRU(EMB_SIZE//2,dropout=DROPOUT,return_sequences=True,kernel_initializer='orthogonal'))

def lstm_model():
    return L.Bidirectional(L.LSTM(EMB_SIZE//2,return_sequences=True,kernel_initializer='orthogonal'))


def build_model(seq_len, pred_len):
    inp = L.Input(shape=(seq_len,4))
    seq1 = inp[:,:,:2]
    seq2 = inp[:,:,2:]
    seq2_dense = L.Conv1D(filters=EMB_SIZE,kernel_size=2,padding='same')(seq2)
    
    seq1_emb = L.Embedding(len(token2int),EMB_SIZE)(seq1)
    seq1_reshape = L.Reshape((seq_len,seq1_emb.shape[2]*seq1_emb.shape[3]))(seq1_emb)
    seq1_conv = L.Conv1D(filters=EMB_SIZE,kernel_size=2,padding='same')(seq1_reshape)
    
    lstm1 = lstm_model()(seq1_conv)
    lstm2 = lstm_model()(lstm1)
    
    conv_block = L.Conv1D(filters=EMB_SIZE,kernel_size=1,padding='same')
    query = conv_block(lstm2)
    key = conv_block(lstm2)
    value = conv_block(lstm2)
    attn = L.Attention(dropout=DROPOUT)([query,value,key])
    
    concat = L.Concatenate(axis=2)([attn,seq2_dense])
    out = concat[:,:pred_len]
    out = L.Dense(5,activation='linear')(out)
    
    model = tf.keras.Model(inputs=inp,outputs=out)
    model.compile(optimizer=RectifiedAdam(lr=LR),loss=MCRMSE)
    return model
    
model = build_model(68,68)
tf.keras.utils.plot_model(model,to_file='./../../working/model.png',show_shapes=True,dpi=55)"""