In [26]:
import sentencepiece as spm
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import datetime
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.metrics.scorer import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from tqdm import tqdm

In [3]:
# Train a sentence piece model to segment sequences
spm.SentencePieceTrainer.Train('--input=/Users/Kbutterstrap/Desktop/iGEM_2019/Tokenization/seq_for_seperation.txt --model_prefix=/Users/Kbutterstrap/Desktop/iGEM_2019/seg')

True

In [5]:
model_path = '/Users/Kbutterstrap/Desktop/iGEM_2019/seg.model'
sp = spm.SentencePieceProcessor()
sp.Load(model_path)

True

In [7]:
with open('tokenized_fake.txt', 'w') as f, \
    open('seq_for_seperation.txt', 'r') as f_a:
    start = 0
    for lines in f_a:
        #first line of the file
        if start == 0:
            current_protein = lines[1:]
        #first line of another sequence
        if (lines[0] == '_') & (start != 0):
            token_list = sp.EncodeAsPieces(current_protein)
            for t in token_list:
                if token_list.index(t) == 0:
                    f.write('\n' + t + ' ')
                else:
                    f.write(t + ' ')
            current_protein = lines[1:]
        #within the range of one sequence
        else:
            current_protein += lines
            start += 1

In [11]:
def train_w2v(filename):
    print("Training data loading...")
    train_sentence = LineSentence(filename)
    print("Start training w2v...", datetime.datetime.now())
    model = Word2Vec(train_sentence, sg=1, workers=10, iter=50, min_count=1)
    print("Training finished. Time", datetime.datetime.now())
    print('Saving trained model...')
    model.save('w2v_model')
    print('DONE')

In [12]:
train_w2v('tokenized_fake.txt')

Training data loading...
Start training w2v... 2019-09-19 15:28:42.925196
Training finished. Time 2019-09-19 15:32:40.840828
Saving trained model...
DONE


In [27]:
def seq_to_vector(filename):
    vectors = []
    model = Word2Vec.load('w2v_model')
    with open(filename, 'r') as f:
        lines = list(f)
    for l in tqdm(lines, total=len(lines), position=0):
        l = l[1:].strip().split()
        sentence_vec = np.copy(model.wv.word_vec(l[0]))
        l = l[1:]
        for l_p in l:
            w2v = model.wv.word_vec(l_p)
            word_vec = np.copy(w2v)
            sentence_vec += word_vec
        vectors.append(w2v)
    return vectors

In [28]:
vec_list = seq_to_vector('tokenized_fake.txt')

100%|██████████| 999999/999999 [00:10<00:00, 97023.12it/s]


In [33]:
vec_list[0]

array([ 0.49376258, -0.04863574, -0.08934297,  0.5419286 , -0.02641163,
       -0.5598214 ,  0.7205441 , -0.50720066, -0.25759354,  0.33942303,
       -0.11762138,  0.28954136,  0.21736398,  0.3994069 , -0.05856253,
        0.44108242,  0.19643083, -0.39851502,  0.42666486, -0.0207038 ,
       -0.02349987,  0.14633283,  0.0283577 , -0.2827237 , -0.17783149,
       -0.16719225,  0.6900813 , -0.24106178, -0.28042758,  0.20822479,
       -0.02559084,  0.04714456,  0.05040161, -0.33121407, -0.20649062,
        0.30674982,  0.0639196 ,  0.0524102 , -0.11731397,  0.24896349,
        0.41546154,  0.30132976, -0.19770172,  0.00688319,  0.17285962,
        0.26493493, -0.16574487,  0.079319  , -0.13747118,  0.31354335,
        0.22191371, -0.21060339,  0.17862594,  0.42991298,  0.7693046 ,
        0.06751509,  0.23081692, -0.13026395,  0.10514796,  0.3012194 ,
       -0.18611918, -0.5031955 , -0.10528275, -0.36862406,  0.2936876 ,
        0.26366392,  0.35900638, -0.05209184,  0.40159002, -0.38