In [1]:
import sentencepiece as spm
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import datetime
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.metrics.scorer import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from tqdm import tqdm

In [3]:
# Fit a SentencePiece model on the raw sequence file; the resulting model is
# what later cells use to split sequences into sub-word pieces.
# NOTE(review): both paths are absolute and machine-specific. The prefix below
# presumably yields `.../iGEM_2019/seg.model`, while the loading cell reads
# `.../iGEM_2019/models/seg.model` — confirm the file is moved/copied there.
spm_train_flags = [
    '--input=/Users/Kbutterstrap/Desktop/iGEM_2019/Tokenization/seq_for_seperation.txt',
    '--model_prefix=/Users/Kbutterstrap/Desktop/iGEM_2019/seg',
]
spm.SentencePieceTrainer.Train(' '.join(spm_train_flags))

True

In [3]:
# Instantiate the SentencePiece tokenizer and load the trained model into it.
# `sp` is the encoder used by the tokenization cell; the trailing call's return
# value is displayed as the cell output.
sp = spm.SentencePieceProcessor()
model_path = '/Users/Kbutterstrap/Desktop/iGEM_2019/models/seg.model'
sp.Load(model_path)

True

In [5]:
def _iter_sequences(handle):
    """Yield every sequence found in `handle` as one contiguous string.

    A line whose first character is '_' opens a new record; that marker
    character is dropped. The very first line of the file is always treated
    as the start of a record (its first character is dropped as well, matching
    the original cell). Every other line is a continuation of the current
    record. Line endings are stripped so that they never reach the tokenizer.
    """
    chunks = None
    for raw in handle:
        text = raw.rstrip('\r\n')
        if chunks is None or text.startswith('_'):
            if chunks is not None:
                # A new record begins: emit the one collected so far.
                yield ''.join(chunks)
            chunks = [text[1:]]
        else:
            chunks.append(text)
    # Emit the final record; without this the last sequence would be lost.
    if chunks is not None:
        yield ''.join(chunks)


# Tokenize each sequence with the SentencePiece model `sp` and write one
# space-separated line of pieces per sequence. The input is opened first so a
# missing source file does not truncate an existing output file.
with open('data/seq_for_seperation.txt', 'r') as f_a, \
        open('data/tokenized_fake.txt', 'w') as f:
    for current_protein in _iter_sequences(f_a):
        token_list = sp.EncodeAsPieces(current_protein)
        # Skip records that produce no pieces so the output has no blank lines.
        if token_list:
            f.write(' '.join(token_list) + '\n')

In [11]:
def train_w2v(filename, model_path='w2v_model'):
    """Train a skip-gram Word2Vec model on a tokenized corpus and save it.

    Args:
        filename: Path to a text file read through `LineSentence` (one
            whitespace-separated sentence per line).
        model_path: Where the trained model is saved. Defaults to
            'w2v_model', the location the original cell always used; pass
            e.g. 'models/w2v_model' to match the path `seq_to_vector` loads.

    Returns:
        The trained `Word2Vec` model (previously nothing was returned).
    """
    print("Training data loading...")
    train_sentence = LineSentence(filename)
    print("Start training w2v...", datetime.datetime.now())
    # NOTE(review): `iter` is the gensim 3.x keyword (renamed `epochs` in
    # gensim 4); kept as-is for the environment this notebook ran under.
    model = Word2Vec(train_sentence, sg=1, workers=10, iter=50, min_count=1)
    print("Training finished. Time", datetime.datetime.now())
    print('Saving trained model...')
    model.save(model_path)
    print('DONE')
    return model

In [12]:
# Train and save the Word2Vec model on the tokenized corpus.
# NOTE(review): this path has no 'data/' prefix, whereas the tokenization cell
# writes 'data/tokenized_fake.txt' — confirm the working directory / file.
train_w2v('tokenized_fake.txt')

Training data loading...
Start training w2v... 2019-09-19 15:28:42.925196
Training finished. Time 2019-09-19 15:32:40.840828
Saving trained model...
DONE


In [13]:
def seq_to_vector(filename, model_path='models/w2v_model'):
    """Embed every tokenized line of a file as the sum of its token vectors.

    Args:
        filename: Path to a text file with one space-separated token
            sequence per line.
        model_path: Location of the saved Word2Vec model. Defaults to
            'models/w2v_model', the path the original code always loaded.

    Returns:
        A list with one 1-D numpy array per non-blank line: the element-wise
        sum of the Word2Vec vectors of that line's tokens. Blank lines are
        skipped (they previously raised IndexError).

    Raises:
        KeyError: If a token is not in the model's vocabulary.
    """
    model = Word2Vec.load(model_path)
    with open(filename, 'r') as f:
        lines = list(f)

    vectors = []
    for line in tqdm(lines, total=len(lines), position=0):
        # The first character of each line is dropped before splitting —
        # presumably SentencePiece's leading word-boundary marker; confirm.
        tokens = line[1:].strip().split()
        if not tokens:
            continue
        # Copy so the in-place accumulation never touches the model's weights.
        sentence_vec = np.copy(model.wv[tokens[0]])
        for token in tokens[1:]:
            sentence_vec += model.wv[token]
        # Store the accumulated sum, not the last token's vector.
        vectors.append(sentence_vec)
    return vectors

In [14]:
# Build the list of embedding vectors from the tokenized sequence file.
vec_list = seq_to_vector('data/tokenized_fake.txt')

100%|██████████| 999999/999999 [00:09<00:00, 104832.85it/s]


In [16]:
# Sanity check: number of vectors returned by seq_to_vector.
len(vec_list)

999999

In [17]:
# Stack the per-sequence vectors into a single numpy array; the trailing
# expression displays its shape as the cell output.
vec_np = np.array(vec_list)
vec_np.shape

(999999, 100)

In [18]:
# Persist the stacked vectors to disk in numpy's .npy format.
np.save('data/seq_vectors.npy', vec_np)