In [1]:

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai import *
from fastai.text.all import *
from pathlib import Path

In [4]:
from Bio import Seq
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
import networkx as nx
from utils import *

In [6]:
path = Path('../data/genomic_data/')

In [7]:
# select train, test and challenge data for 16K
mrna_train16k = 'mRNAs.train16K.fa'  
lncrna_train16k = 'lncRNAs.train16K.fa'

mrna_test ='mRNAs.TEST500.fa'
lncrna_test = 'lncRNAs.TEST500.fa'

mrna_challenge = 'mRNAs.CHALLENGE500.fa'
lncrna_challenge = 'lncRNAs.CHALLENGE500.fa'

# build data set for FULL length sequence files
mrna_train = 'mRNAs.TRAIN.fa'
lncrna_train = 'lncRNAs.TRAIN.fa'

In [8]:
# parse fasta files
def parse_fdata(filename, label):
    fasta = SeqIO.parse(path/filename, 'fasta')
    files = [i for i in fasta]
    seqs = [str(i.seq) for i in files]
    df = pd.DataFrame(seqs, columns=['Sequence'])
    df['Target'] = label
    df = df.drop_duplicates()
    return df

In [10]:
def split_data(df):
    train_sz = int(len(df) * 0.90)
    val_sz = int(len(df) - train_sz)
    
    train_df = df.sample(train_sz)
    val_df = df.drop(train_df.index)
    
    train_df['set'] = 'train'
    val_df['set'] = 'validation'
    
    return pd.concat([train_df, val_df])

In [12]:
# 16K files
mrna16k_df = split_data(parse_fdata(mrna_train16k, 'mRNA'))
lncrna16k_df = split_data(parse_fdata(lncrna_train16k, 'lncRNA'))

mrna_test_df = split_data(parse_fdata(mrna_test, 'mRNA'))
mrna_test_df['set'] = 'test'
lncrna_test_df = split_data(parse_fdata(lncrna_test, 'lncRNA'))
lncrna_test_df['set'] = 'test'


mrna_chal_df = split_data(parse_fdata(mrna_challenge, 'mRNA'))
mrna_chal_df['set'] = 'challenge set'

lncrna_chal_df = split_data(parse_fdata(lncrna_challenge, 'lncRNA'))
lncrna_chal_df['set'] = 'challenge set'

# Full length sequence
mrna_df = split_data(parse_fdata(mrna_train, 'mRNA'))
lncrna_df = split_data(parse_fdata(lncrna_train, 'lncRNA'))


In [13]:
dfs = [mrna16k_df, lncrna16k_df, mrna_test_df, lncrna_test_df, mrna_chal_df, lncrna_chal_df]

In [14]:
[i.shape for i in dfs]

[(15978, 3), (15950, 3), (500, 3), (500, 3), (499, 3), (500, 3)]

In [15]:
data_df = pd.concat(dfs)

In [16]:
data_df.tail()

Unnamed: 0,Sequence,Target,set
439,GCTGCCCAGAGAGGTCTTTAATCCATGACTCCAAAGCAAGTCCCTGAGTCGCTGCACCCACAACTCACGATCTCGGGGTCCGGCGTTAGTCTTCAGCACCGGGATCTCAGACACAACACGTCGCGTGGCTTCATCCGCCATCTTGGACCAGGGCGCAAGAGAAACGCGGAAGTGGTGTTGCCTGTAGCTTACCTCAAGGCGCCGTCATCTTTACTATGGGAAGATGAGACGTTGGGTTGCTGTAAGACCTCTTTTGAATAAGATCTTTGTTTTCTTGTCACCTAGTTACCCGCTTGTGAGTGCATCGAGAAGAACAGAATGTAAACAGATTACTCCGTCCTATATATGACACCTATCATTAAAATCACCGCATAAGGCA,lncRNA,challenge set
448,AAATAAAAGCAAGTCTCATTACTTTTCAAGGGGAGTGTAGAAGCGATTAGTCTCTGATGATAATAAGGGTACCCCCCATGTAAAACTGCCACAAGCTCTCAGAAGTGAGACTTTAGCTCCTTACAAATGCTGAGCAGACGTGAACACCCTCCTGGGAGATTGATGAGTCCTATTAACCTTGTTGCATAGATGGAAACTCAGAAACAAAGTGACTCTGAGGATTAAGAGGAAGTCAATGTCAGCACCAGGATTAGAATTCACACTTGCCTGGTTTCTCAGCCCACAACCAGCTCCCCCATGGATTCCTGCTTTGTCTCCCTCCCCAAGTGAGGGGACAGGGTGTTTCCAAACTTTAAAAGAATCCTGTTATCCTCAGGCCTCTGCCTTGTAGGAAGATCCTTTCTTTTAATAAATTACAGAGCTGAGAA,lncRNA,challenge set
458,GTGTTCACACAGTGCTAAGGGAACACGCTGACGTGTGTGACATCAAGAACTCCAGTGCCCACACAGCCCGAGTCCGGATGGGACGTTTGAAATCAGGAAGGAGTGTGGGTTGGATCCACGTTTCTGTTCATACCAGGCAGGCACATCGGAAGGCTCACGGTCAGAGGATGCTGGGACAAACGCCACGAGAGGTTCCTTTTTGTCTGGCTCACCCAGAAGAGAGGCTCCTCTCCTCCCTTCCCTGCAGCACAACGGCCTGGCTGGCACCCAAAGGGCAGCCGGCATGGGCTGACCTCAGAGGAGCAAGGCGTGTTTGCAGCTCTGCCGCTCACATCCACAGTGGCCATGACAACAGGTGCAGCACCATCCATTAAAGCCCTGTGTCATTTCA,lncRNA,challenge set
480,TTGGCTACACATTGGAGTCATTTGGGAAGCTAGAAAAAAAAAATGCCCAGGACCTGGGCAAGACGGCTGAATAAGAACAGCTCTGGTCTGCAGTTCCTAGCAAGATCAACGCAGAAGGGAAGCTGGCAGCCTGGGCTCTTGGAGATTGTATCTGGTGCCGGAGAGGACTGGCTCCCACACTCAGTGAGGGAGGAATTTTACTGACCCAGCATCCAGATGAACTTTGGTGAGGAGAGGAGGTGAGGAAGTCAATGAGCCTTGCTTTTTCACTTGCAATTACGGCATTTCCTTTGGAAATGGACATAAGAGCCTGGTGGTTGATGATGGAAGAGATGAAACTGTGAAGAGACCGAAGAAGCTAATCTATCAATAGTTAAGATTCCATTTCTAAAAGATCTTACTTTGAGTCTACCTATGCCTGGG,lncRNA,challenge set
499,AGACCCGGTGAGAGGTACTGGGCGATGAGGAAGTACAGCTCCAACTCCATGAGAGGCACCGGGCCTCAGAACCAGGAAGATCCGGATAAAGACACTGTCTGCACTCTACTCAGATGCCCTTTGAGTCTATGCGCTGTTTCTGGGCTCATCCTCCAGCCTCAACGTGTTTGCTGCCACACAGCATGTCCTGTGGGACTGGAGAAATATGGAGAATGCAGCACACTTGCTTCCTACTGAA,lncRNA,challenge set


In [17]:
data_df.to_csv(path/'lncRNA_data.csv', index=False)

In [19]:
full_len_dfs = [mrna_df, lncrna_df, mrna_test_df, lncrna_test_df, mrna_chal_df, lncrna_chal_df]
[i.shape for i in full_len_dfs]

[(86978, 3), (24339, 3), (500, 3), (500, 3), (499, 3), (500, 3)]

In [20]:
full_len_df = pd.concat(full_len_dfs)

In [21]:
full_len_df.shape

(113316, 3)

In [23]:
full_len_df.head()

Unnamed: 0,Sequence,Target,set
54494,CAAGGCATTCTTCCATGTCCTCAGCCTCCTCTTTCCTTCCTAGGACTGGCTTCCATGGAGGTGAAGAACTGCTGCATGGTGACAGAGTTCATCCTTTTGGGAATCCCACACACAGAGGGGCTGGAGATGACACTTTTTGTCTTATTCTTGCCCTTCTATGCCTGCACTCTACTGGGAAATGTGTCTATCCTTGTTGCTGTTATGTCTTCTGCTCGCCTTCACACACCTATGTATTTCTTCCTGGGAAACTTGTCTGTGTTTGACATGGGTTTCTCCTCAGTGACTTGTCCCAAAATGCTGCTCTACCTTATGGGGCTGAGCCGACTCATCTCCTACAAAGACTGTGTCTGCCAGCTTTTCTTCTTCCATTTCCTCGGGAGCATTGAGTGCTTCTTGTTTACGGTGATGGCCTATGACCGCTTCACTGCCATCTGTTATCCTCTGCGATACACAGTCATCATGAACCCAAGGATCTGTGTGGCCCTGGCTGTGGGCACATGGCTGTTAGGGTGCATTCATTCCAGTATCTTGACCTCCCTCACCTTCACCTTGCCATACTGTGGTCCCAATGAAGTGGATCACTTCTTCTGTGACAT...,mRNA,train
70168,TCCCAGCCCGGCGACTGCTCGGGCCCGGCCGCCACCTGCACGGCGGGGGAGCCGCTCGCCGCGGGAGCGTCAGGAGGGCACGCGTCTGCGGCTGAACCGCGGAAGGGCCGGTGAGGAACCGGGCCTCGGGAGATGGCCCTGAGGGCCCCCGCACTGCTGCCGCTGCTGCTGCTACTACTGCCGCTCCGCGCCGCCGGCTGCCCAGCAGCCTGCCGCTGCTACAGCGCCACGGTGGAGTGTGGCGCCCTGCGGTTGCGCGTCGTCCCGCTGGGAATCCCGCCAGGGACGCAGACACTGTTCCTGCAGGACAACAACATCGCCCGCCTAGAGCCGGGAGCCCTGGCGCCACTCGCCGCTCTGCGCCGGCTCTACCTGCACAACAACAGCCTGCGCGCCCTGGAGGCCGGCGCCTTCCGCGCGCAGCCGCGCCTGCTGGAGCTGGCGCTCACTAGCAACCGGCTGCGCGGCTTGCGCAGCGGCGCCTTCGTAGGCCTGGCCCAGCTGCGCGTGCTCTACCTGGCGGGCAACCAGCTGGCGCGGCTGCTGGATTTCACCTTCTTGCACCTGCCGGAGCTTCACCTGCAAGAAAACAGCAT...,mRNA,train
5216,TTTTTTTTTTTTTCTTTTTTTTTTTTTGCCGGAGTCGAGCGGGTGCTGCTAGCGGAGGCGCCATATTGGAGGGGACAAAACTCCGGCGACAGCGAGTGACACAAATAAACCCCTGGACCCCCTTGTTCCCTCAGCTCTAAGGGCCGCGATGTTGTACCTAGAAGACTATCTGGAAATGATTGAGCAGCTTCCTATGGATCTGCGGGACCGCTTCACGGAAATGCGCGAGATGGACCTGCAGGTGCAGAATGCAATGGATCAACTAGAACAAAGAGTCAGTGAATTCTTTATGAATGCAAAGAAAAATAAACCTGAGTGGAGGGAAGAGCAAATGGCATCCATCAAAAAAGACTACTATAAAGCTTTGGAAGATGCAGATGAGAAGGTTCAGTTGGCAAACCAGATATATGACTTGGTAGATCGACACTTGAGAAAGCTGGATCAGGAACTGGCTAAGTTTAAAATGGAGCTGGAAGCTGATAATGCTGGAATTACAGAAATATTAGAGAGGCGATCTTTGGAATTAGACACTCCTTCACAGCCAGTGAACAATCACCATGCTCATTCACATACTCCAGTGGAAAAAAGGAAATATA...,mRNA,train
78696,GAGACTTTTAGTTTCGCTTTCGCTAAAGGGGCCCCAGACCCTTGCTGCGGAGCGACGGAGAGAGACTGTGCCAGTCCCAGCCGCCCTACCGCCGTGGGAACGATGGCAGATGATCAGGGCTGTATTGAAGAGCAGGGGGTTGAGGATTCAGCAAATGAAGATTCAGTGGATGCTAAGCCAGACCGGTCCTCGTTTGTACCGTCCCTCTTCAGTAAGAAGAAGAAAAATGTCACCATGCGATCCATCAAGACCACCCGGGACCGAGTGCCTACATATCAGTACAACATGAATTTTGAAAAGCTGGGCAAATGCATCATAATAAACAACAAGAACTTTGATAAAGTGACAGGTATGGGCGTTCGAAACGGAACAGACAAAGATGCCGAGGCGCTCTTCAAGTGCTTCCGAAGCCTGGGTTTTGACGTGATTGTCTATAATGACTGCTCTTGTGCCAAGATGCAAGATCTGCTTAAAAAAGCTTCTGAAGAGGACCATACAAATGCCGCCTGCTTCGCCTGCATCCTCTTAAGCCATGGAGAAGAAAATGTAATTTATGGGAAAGATGGTGTCACACCAATAAAGGATTTGACAGCCCA...,mRNA,train
53585,GATTTAACCCAGGAGAGCCGCTGGTGGGAGGCGCGGCTGGCGCCGCTGCGCGCATGGGCCTGTTCCTGGCCCGCAGCCGCCACCTACCCAGTGACCATGATAGTGTTTGTCAGGTTCAACTCCAGCCATGGTTTCCCAGTGGAGGTCGATTCTGACACCAGCATCTTCCAGCTCAAGGAGGTGGTTGCTAAGCGACAGGGGGTTCCGGCTGACCAGTTGCGTGTGATTTTCGCAGGGAAGGAGCTGAGGAATGACTGGACTGTGCAGAATTGTGACCTGGATCAGCAGAGCATTGTTCACATTGTGCAGAGACCGTGGAGAAAAGGTCAAGAAATGAATGCAACTGGAGGCGACGACCCCAGAAACGCGGCGGGAGGCTGTGAGCGGGAGCCCCAGAGCTTGACTCGGGTGGACCTCAGCAGCTCAGTCCTCCCAGGAGACTCTGTGGGGCTGGCTGTCATTCTGCACACTGACAGCAGGAAGGACTCACCACCAGCTGGAAGTCCAGCAGGTAGATCAATCTACAACAGCTTTTATGTGTATTGCAAAGGCCCCTGTCAAAGAGTGCAGCCGGGAAAACTCAGGGTACAGTGCAG...,mRNA,train


In [25]:
full_len_df = full_len_df[~full_len_df.Sequence.map(lambda x: 'N' in x.upper())]

In [26]:
full_len_df.shape

(113315, 3)

In [27]:
full_len_df.to_csv(path/'lncRNA_Full_len_data.csv')