In [67]:
import numpy as np
import pandas as pd
import pickle
from sklearn.utils import shuffle

In [133]:
#################################
# input and setting: edit these
#################################

# Input CSV requirements: a "sequence" column (23 nt per entry) and an
# "effect" column.

# --- samplePubMed1: split into train / val / test ---
filename, outfile_prefix = 'data/samplePubMed1.csv', 'samplePubMed1'

# Fractions (not percentages) of the data assigned to each split.
n_train = 0.8
n_val = 0.1
n_test = 0.1

# Kind of model the labels are prepared for: 'regression' or 'classification'.
model_type = 'classification'

# --- samplePubMed2: test set only ---
# These assignments come last, so they replace the samplePubMed1 values above
# and are the settings actually in effect when this cell runs.
filename, outfile_prefix = 'data/samplePubMed2.csv', 'samplePubMed2'
n_train = 0
n_val = 0
n_test = 1

In [134]:
##################################################################

In [135]:
# load the input table from the CSV path chosen in the settings
data = pd.read_csv(filepath_or_buffer=filename)

In [136]:
# groupby sequence (for model, same seq can not have multiple effects)
# Select 'effect' BEFORE aggregating so the median is computed on that column
# only. Taking the median of the whole frame first also tries to reduce every
# other column, which raises a TypeError on pandas >= 2.0 as soon as the CSV
# contains any non-numeric column besides 'sequence'.
data = data.groupby(['sequence'])['effect'].median().reset_index()

# shuffle data so that we can split into training val test
# (fixed random_state keeps the row order reproducible between runs)
data = shuffle(data, random_state=23)

# get unique seqs (one row per sequence after the groupby above)
unique_seqs = data['sequence']

In [137]:
# one_hot function
def one_hot(x):
    """Return the 4-element one-hot list for nucleotide ``x``.

    Positions follow the order A, C, G, T. Any other key raises ``KeyError``.
    """
    alphabet = 'ACGT'
    table = {
        nt: [int(slot == pos) for slot in range(len(alphabet))]
        for pos, nt in enumerate(alphabet)
    }
    return table[x]

# one hot encode unique seqs
# Build a real list per sequence: on Python 3 a bare map() is a lazy iterator
# that cannot be indexed, so the feature-filling step below would fail with
# "'map' object is not subscriptable".
one_hot_un_seq = unique_seqs.apply(lambda seq: [one_hot(nt) for nt in seq])
one_hot_un_seq = one_hot_un_seq.reset_index()

# features has shape (number of sequences, 23 positions, 4 channels).
seq_len = 23
features = np.zeros((len(one_hot_un_seq), seq_len, 4), dtype=int)
for i, encoded in enumerate(one_hot_un_seq['sequence']):
    if len(encoded) < seq_len:
        # Fail loudly instead of letting numpy broadcast a too-short encoding.
        raise ValueError(
            "sequence at position %d has %d nts, expected at least %d"
            % (i, len(encoded), seq_len)
        )
    # Only the first seq_len positions are used, as before.
    features[i] = encoded[:seq_len]

In [138]:
if model_type != "regression":  # 21-label classification labels
    num_classes = 21
    labels = data['effect']
    # shift each effect by 10 and truncate to an integer class id
    labels = np.array([int(effect + 10) for effect in labels])
    # row k of the identity matrix is the one-hot vector for class k
    labels = np.eye(num_classes)[labels]
else:
    def convert_labels_reg(y):
        """Rescale an effect value ``y`` to ``(y + 10) / 20``."""
        return (y + 10)/float(20)
    labels = data['effect']
    labels = np.array([convert_labels_reg(effect) for effect in labels])

In [139]:
# split shuffled data into training, validation, testing
n_examples = len(one_hot_un_seq)
train_stop = int(n_train*n_examples)
val_stop = train_stop + int(n_val*n_examples)

# rows [0, train_stop) -> training set
train_data, train_labels = features[:train_stop], labels[:train_stop]

# rows [train_stop, val_stop) -> validation set
val_data, val_labels = features[train_stop:val_stop], labels[train_stop:val_stop]

# everything from val_stop onward -> test set
test_data, test_labels = features[val_stop:n_examples], labels[val_stop:len(labels)]

In [140]:
##################################################################

In [141]:
# write output files as pickle
def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path``, closing the file even if dumping fails."""
    # The context manager guarantees the handle is flushed and closed;
    # a bare open() passed straight to pickle.dump is never explicitly closed.
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)

# A split is written only when its configured fraction is non-zero.
if n_train != 0:
    _dump_pickle(train_data, "data/" + outfile_prefix + "_train_data.p")
    _dump_pickle(train_labels, "data/" + outfile_prefix + "_train_labels.p")

if n_test != 0:
    _dump_pickle(test_data, "data/" + outfile_prefix + "_test_data.p")
    _dump_pickle(test_labels, "data/" + outfile_prefix + "_test_labels.p")

if n_val != 0:
    _dump_pickle(val_data, "data/" + outfile_prefix + "_val_data.p")
    _dump_pickle(val_labels, "data/" + outfile_prefix + "_val_labels.p")