In [1]:
!pwd

/content


In [11]:
import argparse
import itertools
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle
import pathlib

# truncate sequences longer than max_len
MAX_LEN = 4000
KMER = 3 # or 6

# read_dir = pathlib.Path('.../misannotated_lncrna/data/external/cppred')
# write_dir = pathlib.Path('.../misannotated_lncrna/data/interim')

def tokenize_sequences(sequences, kmer=KMER, max_len=MAX_LEN):
    """
    Convert nucleotide sequences into fixed-length arrays of k-mer token ids.

    Tokenization uses a sliding window that shifts by one base per step, so
    the 3-mer tokens of "AAAAAC" are [AAA, AAA, AAA, AAC].
    (A non-overlapping tokenization, where the 3-mer tokens of "AAAAAC" would
    be [AAA, AAC] only, is not implemented here.)

    e.g. if the tokenizer is {aaa: 1, aac: 2, aag: 3, ...}, then
    "AAAAAC" -> [AAA, AAA, AAA, AAC] -> [1, 1, 1, 2]

    The vocabulary is built in lower case; the Tokenizer is created with its
    default settings (Keras documents lower-casing of input as the default),
    which is what lets upper-case sequences match it.

    Parameters
    ----------
    sequences : iterable of str
        Nucleotide sequences to tokenize.
    kmer : int
        Window length. Any positive integer is accepted; the vocabulary is
        every string of that length over a/c/g/t (4**kmer entries).
    max_len : int
        Passed to ``pad_sequences`` as ``maxlen``; every output row has this
        length. Only ``maxlen`` is passed, so the library's default padding
        and truncation sides apply.

    Returns
    -------
    The array returned by ``pad_sequences``, one row per input sequence.

    Raises
    ------
    ValueError
        If ``kmer`` is smaller than 1.

    Most of this function has been adapted from
    https://github.com/hzy95/EPIVAN/blob/master/sequence_processing.py
    """
    if kmer < 1:
        raise ValueError("kmer must be a positive integer, got %r" % (kmer,))

    def sentence2word(str_set, k_mer=kmer):
        # One space-separated "sentence" of overlapping k-mers per sequence.
        return [
            ' '.join(sr[i:i + k_mer] for i in range(len(sr) - (k_mer - 1)))
            for sr in str_set
        ]

    def word2num(wordseq, tokenizer, max_len):
        # k-mer sentences -> integer ids -> rows of length max_len.
        numseq = tokenizer.texts_to_sequences(wordseq)
        return pad_sequences(numseq, maxlen=max_len)

    def sentence2num(str_set, tokenizer, max_len):
        return word2num(sentence2word(str_set), tokenizer, max_len)

    def get_tokenizer(k_mer):
        alphabet = ['a', 'c', 'g', 't']
        vocab = [''.join(p) for p in itertools.product(alphabet, repeat=k_mer)]
        # Keras keeps only word indices strictly below num_words, and indices
        # start at 1, so the limit has to be len(vocab) + 1. Using len(vocab)
        # silently dropped the last k-mer from every tokenized sequence.
        tokenizer = Tokenizer(num_words=len(vocab) + 1)
        tokenizer.fit_on_texts(vocab)
        # Register a placeholder word for index 0.
        tokenizer.word_index['null'] = 0
        return tokenizer

    tokenizer = get_tokenizer(kmer)
    return sentence2num(sequences, tokenizer, max_len)


def _read_fasta_lines(path):
    """Return the lines of the text file at `path`, closing the handle afterwards."""
    with open(path, 'r') as handle:
        return handle.read().splitlines()


def _write_pickle(obj, path):
    """Pickle `obj` to `path`."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


def _preprocess_coding(fasta_path, tokens_path, ids_path):
    """Tokenize every sequence of a coding-RNA file and pickle tokens and ids."""
    lines = _read_fasta_lines(fasta_path)
    # Records are read as alternating header / sequence lines.
    seqs = lines[1::2]
    # The id is the fourth '|'-separated field of the header line.
    ids = [header.split('|')[3] for header in lines[0::2]]

    tokens = tokenize_sequences(seqs, KMER, MAX_LEN)

    _write_pickle(tokens, tokens_path)
    _write_pickle(ids, ids_path)


def _preprocess_noncoding(fasta_path, tokens_path, ids_path):
    """Tokenize the sequences longer than 200 characters of a non-coding-RNA
    file and pickle the tokens together with the matching ids."""
    lines = _read_fasta_lines(fasta_path)
    seqs = lines[1::2]
    # Keep only sequences longer than 200 (presumably the lncRNA length
    # cut-off, given the *_lnc naming -- confirm).
    keep = [i for i, seq in enumerate(seqs) if len(seq) > 200]
    seqs_lnc = [seqs[i] for i in keep]
    # The id is the first space-separated field of the header, minus its
    # leading character.
    ids = [header.split(' ')[0][1:] for header in lines[0::2]]
    ids_lnc = [ids[i] for i in keep]

    tokens = tokenize_sequences(seqs_lnc, KMER, MAX_LEN)

    _write_pickle(tokens, tokens_path)
    _write_pickle(ids_lnc, ids_path)


def main(read_dir, write_dir):
    """
    Tokenize the four input FASTA files and pickle the results.

    read_dir: pathlib.Path of the directory holding the input .fa files
    write_dir: pathlib.Path of the directory receiving the X_*.pickle outputs

    Each input file is read through a context manager so its handle is closed
    before the next one is opened. Processing order is: test coding, test
    non-coding, train coding, train non-coding.
    """
    # test- coding rnas
    _preprocess_coding(
        read_dir / 'Human_coding_RNA_test.fa',
        write_dir / 'X_c_test.pickle',
        write_dir / 'X_c_ids_test.pickle',
    )

    # test- noncoding rnas
    _preprocess_noncoding(
        read_dir / 'Homo38_ncrna_test.fa',
        write_dir / 'X_nc_test.pickle',
        write_dir / 'X_nc_ids_test.pickle',
    )

    # train- coding rnas
    _preprocess_coding(
        read_dir / 'Human.coding_RNA_training.fa',
        write_dir / 'X_c_train.pickle',
        write_dir / 'X_c_ids_train.pickle',
    )

    # train- noncoding rnas
    _preprocess_noncoding(
        read_dir / 'Homo38.ncrna_training.fa',
        write_dir / 'X_nc_train.pickle',
        write_dir / 'X_nc_ids_train.pickle',
    )


def parse_args():
    """Parse the two required command-line options: the directory to read
    from (-i/--read_dir) and the directory to write to (-o/--write_dir)."""
    option_specs = (
        ("-i", "--read_dir", "Path to data to read in"),
        ("-o", "--write_dir", "Path to where new data should be written"),
    )
    cli = argparse.ArgumentParser()
    for short_flag, long_flag, help_text in option_specs:
        cli.add_argument(short_flag, long_flag, help=help_text, required=True)
    return cli.parse_args()


# Command-line entry point. When this cell is executed inside a notebook
# kernel the required -i/-o options are not on the kernel's argv, so argparse
# prints its usage message and exits; run the code as a script instead.
if __name__ == "__main__":
    args = parse_args()
    main(pathlib.Path(args.read_dir), pathlib.Path(args.write_dir))

usage: ipykernel_launcher.py [-h] -i READ_DIR -o WRITE_DIR
ipykernel_launcher.py: error: the following arguments are required: -i/--read_dir, -o/--write_dir


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [13]:
!python3 src/data/preprocess.py -i data/external/cppred -o data/temp

Preprocessing test coding RNAs...
Preprocessing test non-coding RNAs...
Preprocessing train coding RNAs...
Preprocessing train coding RNAs...


In [14]:
!rm -rf data/temp

# dna2vec

In [20]:
%cd /content/drive/MyDrive/00Projects/lncrna/misannotated_lncrna
!ls

/content/drive/MyDrive/00Projects/lncrna/misannotated_lncrna
data	 Makefile   README.md	requirements.txt  src
docs	 models     references	sample_runs.txt   test_environment.py
LICENSE  notebooks  reports	setup.py	  tox.ini


In [None]:
import itertools
import numpy as np
from keras.preprocessing.text import Tokenizer
import pickle  
import pathlib


# read_dir = pathlib.Path('/data/external/dna2vec')
# write_dir = pathlib.Path('/data/interim')


def getKmerEmbeddingMatrix(read_dir: pathlib.Path, kmer: int) -> np.ndarray:
    """
    Build the embedding matrix for all k-mers of length `kmer`.

    read_dir: path at which the w2v file for dna embeddings is located
        dna2vec representations are obtained from https://arxiv.org/abs/1701.06279
    kmer: length of the k-mers to build the matrix for. Any positive integer
        is accepted; k-mers that have no vector in the w2v file keep an
        all-zero row (the file name suggests k = 3..8 are available --
        confirm against the file).

    Returns an array of shape (4**kmer + 1, 100). Row i holds the vector of
    the k-mer with tokenizer index i; row 0 belongs to the 'null' entry.

    Raises ValueError if `kmer` is smaller than 1.
    """
    if kmer < 1:
        raise ValueError("kmer must be a positive integer, got %r" % (kmer,))

    embedding_dim = 100
    w2v_path = read_dir / 'dna2vec-20161219-0153-k3to8-100d-10c-29320Mbp-sliding-Xat.w2v'

    # Read all embeddings: first field of each line is the word, the rest are
    # its coefficients. The context manager closes the file even if a line
    # fails to parse.
    embeddings_index = {}
    with open(w2v_path) as w2v_file:
        for line in w2v_file:
            values = line.split()
            if not values:
                # Blank line: nothing to index.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

    # Get word indices for the relevant kmer.
    alphabet = ['a', 'c', 'g', 't']
    vocab = [''.join(p) for p in itertools.product(alphabet, repeat=kmer)]
    # +1 because tokenizer indices start at 1 and index 0 is used for 'null'.
    tokenizer = Tokenizer(num_words=len(vocab) + 1)
    tokenizer.fit_on_texts(vocab)
    word_index = tokenizer.word_index
    word_index['null'] = 0

    # Get embedding matrix; words are looked up in upper case.
    embedding_matrix = np.zeros((len(word_index), embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word.upper())
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

def main(read_dir, write_dir):
    """Build the 6-mer and 3-mer embedding matrices from `read_dir` and pickle
    each one into `write_dir`. Both matrices are computed before any file is
    written."""
    outputs = (
        (6, 'embedding_matrix_6mer.pickle'),
        (3, 'embedding_matrix_3mer.pickle'),
    )
    matrices = {k: getKmerEmbeddingMatrix(read_dir, k) for k, _ in outputs}

    for k, filename in outputs:
        with open(write_dir / filename, 'wb') as handle:
            pickle.dump(matrices[k], handle)

def parse_args():
    """Parse the two required command-line options: the directory holding the
    dna2vec file (-i/--read_dir) and the directory the k-mer embeddings are
    written to (-o/--write_dir)."""
    # Imported here because the import lines accompanying this code do not
    # bring argparse into scope.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--read_dir",
        help="Path to dna2vec file",
        required=True,
    )
    parser.add_argument(
        "-o",
        "--write_dir",
        help="Path to where kmer embeddings should be written",
        required=True,
    )
    return parser.parse_args()


# Command-line entry point: turn the parsed directory options into Path
# objects and hand them to main().
if __name__ == "__main__":
    args = parse_args()
    main(pathlib.Path(args.read_dir), pathlib.Path(args.write_dir))

In [12]:
!python3 src/data/get_embeddings.py -i data/external/dna2vec/ -o data/temp/

# data_preprocessing_utils

In [None]:
def tokenizerGetWordIndex(kmerLen = 3):
    '''
    Return the tokenizer's word -> index mapping for all k-mers of length
    kmerLen over a/c/g/t, with an extra 'null' entry at index 0.

    Only kmerLen 6 and 3 produce k-mers; any other value leaves the
    vocabulary empty, so the mapping then holds the 'null' entry only.
    '''
    bases = ['a', 'c', 'g', 't']

    if kmerLen == 6:
        kmers = [''.join(combo) for combo in itertools.product(bases, repeat=6)]
    elif kmerLen == 3:
        kmers = [''.join(combo) for combo in itertools.product(bases, repeat=3)]
    else:
        kmers = []

    vocab = np.array(kmers)
    tok = Tokenizer(num_words=len(vocab) + 1)
    tok.fit_on_texts(vocab)

    index_by_word = tok.word_index
    index_by_word['null'] = 0
    return index_by_word

def readEmbeddingMatrix(kmer, dataDir):
    '''
    Read the pickled embedding matrix for the relevant kmer.

    kmer: '6mer' or '3mer'
    dataDir: directory containing embedding_matrix_6mer.pickle /
        embedding_matrix_3mer.pickle, as a str or os.PathLike. A trailing
        path separator is accepted but no longer required.

    Returns the object stored in the pickle file.

    Raises ValueError for any other `kmer` value (previously this surfaced
    as an UnboundLocalError at the return statement).
    '''
    import os

    filenames = {
        '6mer': 'embedding_matrix_6mer.pickle',
        '3mer': 'embedding_matrix_3mer.pickle',
    }
    if kmer not in filenames:
        raise ValueError(
            "kmer must be one of %s, got %r" % (sorted(filenames), kmer)
        )

    path = os.path.join(os.fspath(dataDir), filenames[kmer])
    # NOTE: pickle.load can execute arbitrary code; only point dataDir at
    # files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

# train and valid data splits

In [1]:
%cd /content/drive/MyDrive/00Projects/lncrna/misannotated_lncrna

/content/drive/MyDrive/00Projects/lncrna/misannotated_lncrna


In [2]:
import sys
sys.path.append('/content/drive/MyDrive/00Projects/lncrna/misannotated_lncrna')

In [6]:
!ls

data	 Makefile   README.md	requirements.txt  src
docs	 models     references	sample_runs.txt   test_environment.py
LICENSE  notebooks  reports	setup.py	  tox.ini


In [3]:
from src.data.data_utils import tokenizerGetWordIndex

In [4]:
tokenizerGetWordIndex

<function src.data.data_utils.tokenizerGetWordIndex>

In [6]:
!python3 src/data/trial.py

hello this works


In [10]:
import itertools
import numpy as np
from keras.preprocessing.text import Tokenizer
import pickle  
import pathlib


def get_tokenizer(k_mer):
    """
    Build a Tokenizer fitted on every k-mer of length `k_mer` over a/c/g/t.

    k_mer: positive integer window length; the vocabulary has 4**k_mer entries.

    Returns the fitted Tokenizer; its word_index additionally maps 'null' to 0.

    Raises ValueError if `k_mer` is smaller than 1.
    """
    if k_mer < 1:
        raise ValueError("k_mer must be a positive integer, got %r" % (k_mer,))

    alphabet = ['a', 'c', 'g', 't']
    vocab = [''.join(p) for p in itertools.product(alphabet, repeat=k_mer)]

    # Keras keeps only word indices strictly below num_words, and indices
    # start at 1, so the limit must be len(vocab) + 1; with len(vocab) the
    # last k-mer was dropped when converting texts to sequences.
    tokenizer = Tokenizer(num_words=len(vocab) + 1)
    tokenizer.fit_on_texts(vocab)
    tokenizer.word_index['null'] = 0
    return tokenizer

t = get_tokenizer(3)
t

<keras_preprocessing.text.Tokenizer at 0x7f53d3590090>

In [11]:
dir(t)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_keras_api_names',
 '_keras_api_names_v1',
 'char_level',
 'document_count',
 'filters',
 'fit_on_sequences',
 'fit_on_texts',
 'get_config',
 'index_docs',
 'index_word',
 'lower',
 'num_words',
 'oov_token',
 'sequences_to_matrix',
 'sequences_to_texts',
 'sequences_to_texts_generator',
 'split',
 'texts_to_matrix',
 'texts_to_sequences',
 'texts_to_sequences_generator',
 'to_json',
 'word_counts',
 'word_docs',
 'word_index']