In [1]:
"""
(1)代码说明
读取原始数据，并将原始数据划分成训练集、开发集和测试集，其中训练集0.6，开发集0.2，测试集0.2

对应变量名称:
    X_train,X_dev,X_test
    y_train,y_dev,y_test

变量类型:
    numpy.ndarray

变量维度:
    X_train,X_dev,X_test: (8844, 200, 4) (2948, 200, 4) (2948, 200, 4)
    y_train,y_dev,y_test: (8844,) (2948,) (2948,)

"""

"""
(2)原始数据 
所在文件夹"../data"
AAAAAG.txt  AATACA.txt  ACTAAA.txt  CATAAA.txt     
AAGAAA.txt  AATAGA.txt  AGTAAA.txt  GATAAA.txt 
AATAAA.txt  AATATA.txt  ATTAAA.txt  TATAAA.txt
negAAGAAA.txt  negAATAGA.txt  negAGTAAA.txt  negGATAAA.txt  
negAATAAA.txt  negAATATA.txt  negATTAAA.txt  negTATAAA.txt
negAAAAAG.txt  negAATACA.txt  negACTAAA.txt  negCATAAA.txt  
"""

"""
(3)数据保存位置： "../temp_data/"


"""

"""
(4)数据处理说明
"""

'\n(4)数据处理说明\n'

In [4]:
#读取数据
import os

import numpy as np
from sklearn.model_selection import train_test_split
# The 12 polyA signal variants handled by this notebook, in processing order.
polys = [
    "AATAGA", "AATATA", "CATAAA", "GATAAA",
    "AGTAAA", "ACTAAA", "TATAAA", "AATACA",
    "AAAAAG", "AAGAAA", "ATTAAA", "AATAAA",
]
# Seed NumPy's global random generator.
np.random.seed(22)

def fetch_polyA(poly_name,file_dir='../data/',signal_start=100,signal_len=6):
    '''
    Load one polyA signal's positive and negative sequences as one-hot data.

    Reads ``<file_dir>/<poly_name>.txt`` (positives) and
    ``<file_dir>/neg<poly_name>.txt`` (negatives), cuts the polyA signal out
    of every sequence and one-hot encodes the remaining bases with the
    channel order A, T, C, G (upper and lower case are treated alike).

    input:
        poly_name: polyA signal name, e.g. 'AATAAA'
        file_dir: directory holding the data files (trailing separator optional)
        signal_start: index of the first signal base inside each sequence
        signal_len: number of signal bases removed from each sequence

    output:
        X: positives followed by negatives; a float16 array of shape
           (n_pos + n_neg, remaining_length, 4) when all sequences share
           the same length
        y: labels aligned with X, 1 for positives and 0 for negatives

    raises:
        KeyError: a sequence contains a character other than A/T/C/G (any case)
    '''
    # os.path.join copes with file_dir given with or without a trailing separator.
    file_path_pos = os.path.join(file_dir, poly_name+'.txt')
    file_path_neg = os.path.join(file_dir, 'neg'+poly_name+'.txt')

    upper_case = {
        'A':np.array([1,0,0,0],dtype='float16'),
        'T':np.array([0,1,0,0],dtype='float16'),
        'C':np.array([0,0,1,0],dtype='float16'),
        'G':np.array([0,0,0,1],dtype='float16'),
    }
    # Lower-case bases map to the same vectors as their upper-case form.
    base2num = dict(upper_case)
    base2num.update({base.lower(): vec for base, vec in upper_case.items()})

    signal_end = signal_start + signal_len

    def encode_file(file_path):
        # ndmin=1 keeps a file with a single sequence iterable; without it
        # loadtxt squeezes the result to a 0-d array and the loop below fails.
        seqs = np.loadtxt(file_path,dtype='str',ndmin=1)
        return [
            [base2num[base] for base in seq[:signal_start]+seq[signal_end:]]
            for seq in seqs
        ]

    pdata = encode_file(file_path_pos)
    ndata = encode_file(file_path_neg)

    X = np.array(pdata+ndata)
    y = np.append(np.ones(len(pdata)),np.zeros(len(ndata)))

    return X,y

def fetch_and_split_ployA(polya_name):
    """
    Load one polyA signal and cut it into train / dev / test parts.

    Two successive splits, both seeded with 22, are applied: the first one
    shuffles and holds out 20% of the samples as the test set, the second
    one takes 25% of the remainder (about 20% of the whole) as the dev set,
    which leaves about 60% for training.

    Returns X_train, X_dev, X_test, y_train, y_dev, y_test.
    """
    features, labels = fetch_polyA(polya_name)

    # Stage 1: hold out the test portion.
    X_rest, X_test, y_rest, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=22, shuffle=True
    )

    # Stage 2: carve the dev portion out of what is left.
    X_train, X_dev, y_train, y_dev = train_test_split(
        X_rest, y_rest, test_size=0.25, random_state=22
    )

    return X_train, X_dev, X_test, y_train, y_dev, y_test




def run(poly_names=None):
    """
    Build the combined train/dev/test sets over several polyA signals.

    Every signal is loaded and split on its own by ``fetch_and_split_ployA``;
    the per-signal parts are then concatenated along the first axis, in the
    order in which the signals are listed.

    input:
        poly_names: iterable of polyA signal names; when omitted, the
                    module-level ``polys`` list is used

    output:
        X_train,X_dev,X_test,y_train,y_dev,y_test
        (six empty lists when no signal name is supplied)
    """
    if poly_names is None:
        poly_names = polys

    # One bucket per returned value; parts are collected first and joined
    # once at the end instead of re-copying the growing arrays every iteration.
    buckets = [[] for _ in range(6)]
    for poly in poly_names:
        for bucket, part in zip(buckets, fetch_and_split_ployA(poly)):
            bucket.append(part)

    if not buckets[0]:
        # Nothing was loaded: keep the "six empty lists" result.
        return [],[],[],[],[],[]

    X_train,X_dev,X_test = [np.concatenate(parts,axis=0) for parts in buckets[:3]]
    # Labels are flattened before joining, matching np.append semantics.
    y_train,y_dev,y_test = [
        np.concatenate([np.ravel(part) for part in parts]) for parts in buckets[3:]
    ]
    return X_train,X_dev,X_test,y_train,y_dev,y_test

# Build the six splits over all polyA signals.
X_train, X_dev, X_test, y_train, y_dev, y_test = run()

# Report the shape of each split: features on one line, labels on the next.
print('X:', *[split.shape for split in (X_train, X_dev, X_test)])
print('Y:', *[split.shape for split in (y_train, y_dev, y_test)])

X: (8844, 200, 4) (2948, 200, 4) (2948, 200, 4)
Y: (8844,) (2948,) (2948,)


In [5]:
# def my_shuffle(*args):
#     for arg in args:
#         np.random.seed(22)
#         np.random.shuffle(arg)
# my_shuffle(X_train,X_dev,X_test,y_train,y_dev,y_test)

# def change_y(y):
#     re = np.zeros((len(y),2))
#     for index,value  in enumerate(y):
#         if(value == 1):
#             re[index] = [1,0]
#         else:
#             re[index] = [0,1]
#     return re
# y_train,y_dev,y_test = change_y(y_train),change_y(y_dev),change_y(y_test)

X: (8844, 200, 4) (2948, 200, 4) (2948, 200, 4)
Y: (8844,) (2948,) (2948,)


In [25]:
# Save the six splits as .npy files.
_save_dir = "../temp_data/"
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(_save_dir, exist_ok=True)

for _name, _split in (
    ("X_train", X_train),
    ("X_dev", X_dev),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_dev", y_dev),
    ("y_test", y_test),
):
    np.save(_save_dir + _name + ".npy", _split)

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 0., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 1., 1., ..., 0., 0., 1.],
       [0., 1., 1., ..., 1., 1., 0.]])