In [4]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Seed NumPy's legacy global RNG. The train_test_split calls visible in this
# notebook pass their own random_state, so this seed presumably only matters
# for code that draws from np.random without an explicit seed -- TODO confirm.
np.random.seed(1337)

In [5]:
# Keep the 280k UTRs with the most reads (what the authors used)
N_TOP_UTRS = 280000
SPLIT_SEED = 42         # used by both splits so the partition is reproducible
TEST_FRACTION = 0.1     # share of the kept rows held out as the test set
VALID_FRACTION = 1 / 9  # share of the remaining 90%, i.e. roughly 10% of the kept rows

# Load, rank by read count and truncate in one chain; no in-place mutation, so
# re-running the cell always starts again from the file on disk.
df = (
    pd.read_csv('../data/GSM3130435_egfp_unmod_1.csv')
    .sort_values('total_reads', ascending=False)
    .reset_index(drop=True)
    .iloc[:N_TOP_UTRS]
)

# Create column that identifies upstream AUGs: True when the 'utr' string
# contains "ATG". regex=False makes this a plain substring search.
df = df.assign(uAUG=df['utr'].str.contains("ATG", regex=False))

# Split off the test set, keeping the proportion of uAUG the same in both parts
train, test = train_test_split(
    df, test_size=TEST_FRACTION, random_state=SPLIT_SEED, stratify=df['uAUG']
)

# Split the remainder into training and validation, again stratified on uAUG
train, valid = train_test_split(
    train, test_size=VALID_FRACTION, random_state=SPLIT_SEED, stratify=train['uAUG']
)

# The three splits must partition the kept rows exactly.
assert len(train) + len(valid) + len(test) == len(df)

# Fit the scaler on the training split only, so validation and test rows do
# not contribute to the statistics used for scaling.
scaler = preprocessing.StandardScaler()
scaler.fit(train['rl'].values.reshape(-1, 1))


def _add_scaled_rl(frame):
    """Return a copy of `frame` with a `scaled_rl` column.

    `scaled_rl` is the `rl` column passed through the already-fitted
    module-level `scaler`. The scaler works on an (n, 1) array, so the result
    is flattened back to 1-D before being stored as a column.
    """
    rl_column = frame['rl'].values.reshape(-1, 1)
    return frame.assign(scaled_rl=scaler.transform(rl_column).ravel())


train, valid, test = (_add_scaled_rl(part) for part in (train, valid, test))


In [6]:
# Write each split to its own CSV using a relative path; index=False keeps the
# DataFrame index out of the files.
split_outputs = [
    ('egfp_unmod_1_train.csv', train),
    ('egfp_unmod_1_valid.csv', valid),
    ('egfp_unmod_1_test.csv', test),
]
for csv_name, split_frame in split_outputs:
    split_frame.to_csv(csv_name, index=False)