In [1]:
import numpy as np
import pandas as pd
from itertools import product
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Seed NumPy's global random number generator so that any np.random calls
# made later in the notebook produce the same values on every run.
np.random.seed(1337)

In [2]:
# Load the library and retain the 280k UTRs with the highest read counts
# (the subset the authors used).
df = (
    pd.read_csv('../data/GSM3130435_egfp_unmod_1.csv')
    .sort_values('total_reads', ascending=False)
    .reset_index(drop=True)
    .iloc[:280000]
)

# Flag every UTR whose sequence contains an upstream AUG (written ATG here).
df = df.assign(uAUG=df['utr'].str.contains("ATG"))

# 90/10 train/test split, stratified so both parts keep the same uAUG proportion.
train, test = train_test_split(df, test_size=0.1, random_state=42, stratify=df['uAUG'])

# Carve the validation set (1/9 of "train") out of the training part, again stratified.
train, valid = train_test_split(train, test_size=1/9, random_state=42, stratify=train['uAUG'])

# Discard every sequence that contains a uAUG, in all three splits.
train, valid, test = (split.loc[~split['uAUG']] for split in (train, valid, test))

# Standardize 'rl' with statistics fitted on the training split only, then
# apply that same transform to each split as a new 'scaled_rl' column.
scaler = preprocessing.StandardScaler()
scaler.fit(train['rl'].values.reshape(-1,1))
train, valid, test = (
    split.assign(scaled_rl=scaler.transform(split.loc[:,'rl'].values.reshape(-1,1)))
    for split in (train, valid, test)
)


In [3]:
def structure_2D(seqs, min_sep=3):
    '''
    Build a base-pairing potential matrix for every sequence.

    For each sequence a (length x length) matrix is produced whose entry [i, j] is 1
    when nucleotides i and j could potentially base pair and 0 otherwise. Pairs must
    be AT, GC, or GU wobble (GT here because sequences contain T). The pair must be
    separated by at least `min_sep` nucleotides, i.e. |i - j| > min_sep.

    Parameters
    ----------
    seqs : iterable of str
        Equal-length nucleotide sequences (a pandas Series, list, ...). Matching is
        case-insensitive; any character other than A/C/G/T never pairs.
    min_sep : int, default 3
        Minimum number of nucleotides that must lie between two positions for them
        to be considered a potential base pair.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (n_sequences, length, length) holding 0.0 / 1.0.
        An empty input yields an array of shape (0, 0, 0).

    Raises
    ------
    ValueError
        If the sequences do not all share the same length.
    '''
    bp = (('A', 'T'), ('T', 'A'), ('G', 'C'), ('C', 'G'), ('G', 'T'), ('T', 'G'))

    # Materialize once so any iterable works (not only a Series) and normalize
    # case so the result agrees with the case-insensitive one-hot encoder.
    seqs = [seq.upper() for seq in seqs]
    if not seqs:
        return np.zeros((0, 0, 0))

    l = len(seqs[0])

    # True wherever two positions are far enough apart to pair. Built from index
    # distances, so it is also valid for sequences shorter than the separation.
    positions = np.arange(l)
    far_enough = np.abs(positions[:, None] - positions[None, :]) > min_sep

    # Preallocate the output instead of collecting a list of matrices and copying
    # them afterwards; this halves peak memory for large libraries.
    result = np.zeros((len(seqs), l, l))
    for row, seq in enumerate(seqs):
        if len(seq) != l:
            raise ValueError(
                'all sequences must have the same length: expected %d, got %d at position %d'
                % (l, len(seq), row)
            )
        nucs = np.array(list(seq), dtype='U1')
        pairable = np.zeros((l, l), dtype=bool)
        for first, second in bp:
            # Outer comparison: rows carry nucleotide i, columns nucleotide j.
            pairable |= (nucs[:, None] == first) & (nucs[None, :] == second)
        result[row] = pairable & far_enough
    return result

def one_hot_encode(df, col='utr', seq_len=50):
    '''
    One-hot encode the first `seq_len` nucleotides of every sequence in df[col].

    Channels are ordered A, C, G, T; 'n' is encoded as all zeros. Encoding is
    case-insensitive. Sequences shorter than `seq_len` are padded at the end with
    all-zero rows (the same encoding as 'n').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sequences.
    col : str, default 'utr'
        Name of the column containing the sequence strings.
    seq_len : int, default 50
        Number of leading nucleotides to encode per sequence.

    Returns
    -------
    numpy.ndarray
        float64 array of shape (len(df), seq_len, 4).

    Raises
    ------
    KeyError
        If a sequence contains a character other than a/c/g/t/n (any case).
    '''
    # Dictionary returning one-hot encoding of nucleotides.
    nuc_d = {'a':[1,0,0,0],'c':[0,1,0,0],'g':[0,0,1,0],'t':[0,0,0,1], 'n':[0,0,0,0]}

    # Create a zero-filled matrix so positions past the end of a short sequence
    # are well defined instead of uninitialized memory.
    vectors = np.zeros([len(df), seq_len, 4])

    # Iterate through UTRs and one-hot encode
    for i, seq in enumerate(df[col].str[:seq_len]):
        encoded = [nuc_d[x] for x in seq.lower()]
        if encoded:
            # Fill only the covered positions; the remainder stays zero-padded.
            vectors[i, :len(encoded)] = encoded
    return vectors

In [4]:
# One-hot encode the first 50 nucleotides of each split.
train_one_hot, valid_one_hot, test_one_hot = (
    one_hot_encode(split, seq_len=50) for split in (train, valid, test)
)

# Base-pairing potential matrices for the 'utr' sequences of each split.
train_structure, valid_structure, test_structure = (
    structure_2D(split['utr']) for split in (train, valid, test)
)

In [5]:
# Alternative output names, currently disabled (kept for reference):
# train.to_csv('egfp_unmod_1_train.csv', index=False)
# valid.to_csv('egfp_unmod_1_valid.csv', index=False)
# test.to_csv('egfp_unmod_1_test.csv', index=False)

# np.save('egfp_unmod_1_onehot_train.npy', train_one_hot)
# np.save('egfp_unmod_1_onehot_valid.npy', valid_one_hot)
# np.save('egfp_unmod_1_onehot_test.npy', test_one_hot)

# np.save('egfp_unmod_1_structure_train.npy', train_structure)
# np.save('egfp_unmod_1_structure_valid.npy', valid_structure)
# np.save('egfp_unmod_1_structure_test.npy', test_structure)

# Write the filtered (no-uAUG) split tables as CSV without the index column.
for _split_frame, _csv_path in (
    (train, 'train_no_uAUG.csv'),
    (valid, 'valid_no_uAUG.csv'),
    (test, 'test_no_uAUG.csv'),
):
    _split_frame.to_csv(_csv_path, index=False)

# Write the encoded arrays: one-hot tensors first, then the structure matrices.
for _npy_path, _encoded in (
    ('onehot_train_no_uAUG.npy', train_one_hot),
    ('onehot_valid_no_uAUG.npy', valid_one_hot),
    ('onehot_test_no_uAUG.npy', test_one_hot),
    ('structure_train_no_uAUG.npy', train_structure),
    ('structure_valid_no_uAUG.npy', valid_structure),
    ('structure_test_no_uAUG.npy', test_structure),
):
    np.save(_npy_path, _encoded)