In [1]:
import os 
import random
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import GroupShuffleSplit

import tensorflow as tf

%load_ext autoreload


In [2]:
# Seed used by every seeding helper below unless the caller passes another one.
DEFAULT_RANDOM_SEED = 2022

def seedBasic(seed=DEFAULT_RANDOM_SEED):
    """Seed the non-framework sources of randomness.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so the
    assignment here is expected to matter only for processes spawned
    afterwards -- confirm if hash determinism of this kernel is required.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    
# tensorflow random seed
def seedTF(seed=DEFAULT_RANDOM_SEED):
    """Forward ``seed`` to ``tf.random.set_seed``."""
    tf.random.set_seed(seed)
    
# torch random seed
# import torch
# def seedTorch(seed=DEFAULT_RANDOM_SEED):
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False
      
# basic + tensorflow (+ torch while the torch helper stays commented out)
def seedEverything(seed=DEFAULT_RANDOM_SEED):
    """Run every enabled seeding helper with the same ``seed``."""
    # add seedTorch to this tuple if the commented-out torch helper is re-enabled
    for seed_fn in (seedBasic, seedTF):
        seed_fn(seed)

# Seed once, up front, with the default seed.
seedEverything()

In [3]:
class CFG:
    """Notebook configuration: input location, column names and sequence settings."""

    # CSV (gzip extension) read with pd.read_csv when the data is loaded
    input_file = 'data/rfam/rfam_seq_100.csv.gz'

    # source column names; they are renamed to 'label', 'seq' and 'group' after loading
    label = 'IRES'
    feature = 'Seq'
    feature_group = 'ID'

    # Not read by any cell visible here -- presumably consumed by later
    # encoding / model code (TODO confirm their exact meaning there).
    feature_seq_len = 600
    feature_pad_end = '3end'
    channel = 4


In [4]:
# Read the raw table and report its dimensions.
df = pd.read_csv(CFG.input_file)
print(df.shape)

# Keep only the label / sequence / group columns under fixed names,
# then discard rows whose label is missing.
data = (
    df.loc[:, [CFG.label, CFG.feature, CFG.feature_group]]
    .rename(columns={CFG.label: 'label', CFG.feature: 'seq', CFG.feature_group: 'group'})
    .loc[lambda frame: frame['label'].notnull()]
)
print(data.shape)

(168034, 9)
(168034, 3)


In [5]:
# Labels and grouping key handed to the group-aware train/validation split.
# (one-hot encoding stays disabled: y = tf.keras.utils.to_categorical(y))
y = data['label']
g = data['group']
print(y.shape)

# Alphabet from which random bases are drawn when padding sequences.
add_on = list('ACGT')

(168034,)


In [None]:
# prepare data
N_FOLDS = 5                 # number of train/valid resamplings written to disk
VALID_FRACTION = 0.20       # share of groups held out for validation
MIN_TRAIN_VALID_RATIO = 3   # a split is accepted only if len(train) / len(valid) exceeds this
MAX_SPLIT_ATTEMPTS = 1000   # upper bound on split retries, so the search cannot spin forever
MAX_EDIT_LEN = 3            # longest trim / pad applied to an oversampled sequence


def _augment_seq(seq, alphabet, max_len=MAX_EDIT_LEN):
    """Return ``seq`` with exactly one random edit of 1..max_len bases at one end.

    One of four equally likely, mutually exclusive edits is applied:
    trim the end, trim the start, append random bases, or prepend random
    bases. The random bases are drawn with replacement from ``alphabet``.

    Uses the global ``random`` and ``np.random`` generators. A sequence no
    longer than the trim length comes back as an empty string.
    """
    r = random.uniform(0, 1)
    n = random.randint(1, max_len)
    if r > 0.75:
        return seq[:-n]
    if r > 0.5:
        return seq[n:]
    pad = ''.join(np.random.choice(alphabet, size=n, replace=True))
    if r > 0.25:
        return seq + pad
    return pad + seq


def _split_by_group(frame, labels, groups):
    """Draw group-wise shuffle splits until train/valid size ratio is acceptable.

    Returns the positional ``(train_inds, valid_inds)`` arrays of the first
    split whose ``len(train) / len(valid)`` exceeds ``MIN_TRAIN_VALID_RATIO``.

    Raises:
        RuntimeError: if no such split is found in ``MAX_SPLIT_ATTEMPTS`` draws.
    """
    # Only the first split of each draw is used, so one split per draw is enough.
    splitter = GroupShuffleSplit(test_size=VALID_FRACTION, n_splits=1)
    for _ in range(MAX_SPLIT_ATTEMPTS):
        train_inds, valid_inds = next(splitter.split(frame, labels, groups=groups))
        if len(train_inds) / len(valid_inds) > MIN_TRAIN_VALID_RATIO:
            return train_inds, valid_inds
    raise RuntimeError(
        'no group split with train/valid ratio > %s found in %d attempts'
        % (MIN_TRAIN_VALID_RATIO, MAX_SPLIT_ATTEMPTS)
    )


for fold in range(1, N_FOLDS + 1):
    print(fold)

    train_inds, valid_inds = _split_by_group(data, y, g)
    data_train = data.iloc[train_inds, :]
    # copy so that adding the 'set' column below does not write into a slice of `data`
    data_valid = data.iloc[valid_inds, :].copy()

    # Oversample positives (with replacement) to the size of the whole training
    # split, then perturb every oversampled sequence with one random edit.
    data_train_pos = data_train[data_train.label == 1]
    data_train_pos_new = data_train_pos.sample(n=data_train.shape[0], random_state=1, replace=True)
    # Single column assignment instead of per-row chained `.seq.iloc[i] = ...`.
    data_train_pos_new = data_train_pos_new.assign(
        seq=[_augment_seq(seq, add_on) for seq in data_train_pos_new['seq']]
    )

    # Shuffle originals and augmented positives together, tag both sets, and write the fold.
    data_train_all = pd.concat([data_train, data_train_pos_new])
    data_train_all = data_train_all.sample(frac=1).reset_index(drop=True)
    data_train_all['set'] = 'train'
    data_valid['set'] = 'valid'
    data_all = pd.concat([data_train_all, data_valid])
    data_all = data_all.reset_index(drop=True)
    data_all['id'] = str(fold) + '_' + data_all.index.astype(str)
    data_all.to_csv('data/fold' + str(fold) + '.csv', index=False)
