In [None]:
import random
import pickle
import numpy as np
import pandas as pd
import scipy.stats as stats
from itertools import product
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable across runs.
np.random.seed(1337)

def one_hot_encode(df, col='utr', seq_len=50):
    """One-hot encode the first `seq_len` nucleotides of every sequence in `df[col]`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the sequences as strings in column `col`.
    col : str
        Name of the column that contains the sequences.
    seq_len : int
        Number of leading positions to encode. Longer sequences are truncated;
        shorter sequences are padded with all-zero rows.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(df), seq_len, 4). The last axis is ordered
        a, c, g, t; an 'n' is encoded as all zeros.

    Raises
    ------
    KeyError
        If a sequence contains a character other than a/c/g/t/n (case-insensitive).
    """
    # Dictionary returning one-hot encoding of nucleotides.
    nuc_d = {'a':[1,0,0,0],'c':[0,1,0,0],'g':[0,0,1,0],'t':[0,0,0,1], 'n':[0,0,0,0]}

    # Create a zero-filled matrix. Zeros (rather than uninitialised memory) guarantee
    # that positions past the end of a short sequence are encoded as "no nucleotide".
    vectors = np.zeros([len(df), seq_len, 4])

    # Iterate through UTRs and one-hot encode
    for i, seq in enumerate(df[col].str[:seq_len]):
        seq = seq.lower()
        if seq:
            # Fill only the positions the sequence covers. Assigning the whole row
            # would fail for short sequences (or broadcast a single nucleotide
            # across every position).
            vectors[i, :len(seq)] = [nuc_d[x] for x in seq]
    return vectors


def structure_2D(seqs):
    '''
    Creates a matrix of sequence x sequence with a 1 for any nucleotide pair that could
    potentially base pair and a 0 for any pair that could not base pair. Pairs must be AT,
    GC, or GU wobble (GT here because sequences contain T). The pair must be separated by
    at least 3 nucleotides to be considered a potential base pair interaction.

    Parameters
    ----------
    seqs : iterable of str
        Sequences of equal length (a pandas Series, list, tuple, ...). Matching is
        case-insensitive.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_sequences, length, length); entry [k, i, j] is 1 when
        positions i and j of sequence k could pair, else 0. Empty input returns an
        array of shape (0, 0, 0).

    Raises
    ------
    ValueError
        If a sequence's length differs from that of the first sequence.
    '''
    # Watson-Crick pairs plus the G-U wobble (written with T). Built once, as a set,
    # so each membership test in the hot loop below is a hash lookup.
    bp = {('A', 'T'), ('T', 'A'), ('G', 'C'), ('C', 'G'), ('G', 'T'), ('T', 'G')}

    # Materialise the input so any iterable is accepted, and upper-case it so that
    # lower-case sequences are not silently scored as having no possible pairs.
    seqs = [seq.upper() for seq in seqs]
    if not seqs:
        return np.zeros((0, 0, 0))

    l = len(seqs[0])

    # Mask that is 0 on the main diagonal and the three diagonals on either side
    # (positions fewer than 3 nucleotides apart) and 1 elsewhere. Built from index
    # distances so it also works for sequences shorter than 3 nucleotides.
    idx = np.arange(l)
    diag_zeros = (np.abs(idx[:, None] - idx[None, :]) > 3).astype(float)

    result = []
    for seq in seqs:
        # product() yields (seq[i], seq[j]) in row-major order, matching the reshape.
        bp_potential = [int(pair in bp) for pair in product(seq, repeat=2)]
        bp_2D = np.reshape(bp_potential, (l, l)) * diag_zeros
        result.append(bp_2D)
    return np.asarray(result)

Test the `structure_2D` function:
`structure_2D` applied to the single sequence `CCCCAAAGGGG` should give 4x4 blocks of 1s in the top-right
and bottom-left corners (C-G pairs), separated by three rows and columns of 0s through the center (the unpaired A's).

In [None]:
# Sanity-check structure_2D on a toy hairpin: four C's, three A's, four G's.
test = pd.DataFrame(['CCCCAAAGGGG'])
toy_structure = structure_2D(test[0])
print(toy_structure)

# Pin the expected pattern so a regression fails loudly instead of relying on a visual
# check: C-G pairs form 4x4 blocks of 1s in the two off-diagonal corners, and
# everything else (including the rows/columns of the A's) is 0.
toy_expected = np.zeros((11, 11))
toy_expected[0:4, 7:11] = 1
toy_expected[7:11, 0:4] = 1
assert toy_structure.shape == (1, 11, 11)
assert np.array_equal(toy_structure[0], toy_expected)

In [None]:
# Read the library, rank UTRs by read count (highest first) and keep the top 280k
# (what the authors used). The index is renumbered 0..n-1 after sorting.
df = (
    pd.read_csv('../data/GSM3130435_egfp_unmod_1.csv')
    .sort_values('total_reads', ascending=False)
    .reset_index(drop=True)
    .iloc[:280000]
)

# Flag every UTR that contains an upstream start codon (ATG)
df = df.assign(uAUG=lambda frame: frame['utr'].str.contains("ATG"))

# Hold out 10% of the data as the test set, keeping the uAUG proportion the same
train, test = train_test_split(
    df,
    test_size=0.1,
    random_state=42,
    stratify=df['uAUG'],
)

# Carve the validation set out of the remaining "train" rows: 1/9 of the remaining
# 90% is the same number of rows as the test set
train, valid = train_test_split(
    train,
    test_size=1/9,
    random_state=42,
    stratify=train['uAUG'],
)

# One-hot encode the training, validation and test UTRs (first 50 positions)
train_one_hot, valid_one_hot, test_one_hot = (
    one_hot_encode(split, seq_len=50) for split in (train, valid, test)
)

# Build the 2D base-pairing potential matrices for the same three splits
train_structure, valid_structure, test_structure = (
    structure_2D(split['utr']) for split in (train, valid, test)
)

# Standardise mean ribosome load ('rl'). The scaler is fitted on the training split
# only and then applied unchanged to all three splits.
scaler = preprocessing.StandardScaler()
scaler.fit(train['rl'].values.reshape(-1,1))
train, valid, test = (
    split.assign(scaled_rl=scaler.transform(split.loc[:,'rl'].values.reshape(-1,1)))
    for split in (train, valid, test)
)

In [None]:
# Persist the splits, their one-hot encodings and their structure matrices.
# pickle.dump needs a writable binary file object, not a filename, so each target is
# opened explicitly; the with-block closes the file even if dumping fails.
# Protocol 4 supports very large objects.
# NOTE(review): the structure arrays may be several GiB at this dataset size — confirm.
with open("egfp_unmod_1_split_data.pkl", "wb") as f:
    pickle.dump((train, valid, test), f, protocol=4)
with open("egfp_unmod_1_split_one_hot.pkl", "wb") as f:
    pickle.dump((train_one_hot, valid_one_hot, test_one_hot), f, protocol=4)
with open("egfp_unmod_1_split_structure.pkl", "wb") as f:
    pickle.dump((train_structure, valid_structure, test_structure), f, protocol=4)