In [1]:
import gzip
import random
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import os
import pickle
from Bio import SeqIO
import six.moves.cPickle

import r2v_functions as r2v

In [2]:
##################################################################################
##### Define all parameters for model tuning
##################################################################################

# Length of the k-mers that every sequence is split into. The same value is
# later handed to the r2v helpers and embedded in the output file names.
k = 10

# Root data directory, relative to the notebook's working directory.
# Inputs are read from <path>/input and results are written to <path>/output.
path = "embedding_data"


### Word2Vec MODEL

#### Load Training Data

In [3]:
## read fasta file
# SeqIO.parse returns a one-shot iterator tied to the open handle. Parse all
# records into a list while the file is open so that (a) the handle is closed
# deterministically and (b) the records can be iterated more than once, e.g.
# when the extraction cell below is re-run on its own.
with open(os.path.join(path, "input", "kegg_subset.fasta")) as subset_openFile:
    subset_fastaSequences = list(SeqIO.parse(subset_openFile, "fasta"))

In [4]:
##################################################################################
##### extract data from the current fasta file
##################################################################################

# Two parallel lists: entry i of each describes the i-th FASTA record.
subset_id_List, subset_seq_List = [], []

for subset_fasta in subset_fastaSequences:
    # Record identifier and its sequence converted to a plain Python string.
    # Note: `name` and `sequence` stay bound to the last record after the loop.
    name = subset_fasta.id
    sequence = str(subset_fasta.seq)

    subset_id_List.append(name)
    subset_seq_List.append(sequence)

In [5]:
##################################################################################
##### Generate k-mers and write to file
##################################################################################

# k-mers of every sequence, in the same order as subset_seq_List.
subset_kmers = []

# One output line per input sequence: its overlapping k-mers (stride 1),
# separated by single spaces. The `with` block guarantees the gzip stream is
# flushed and closed even if an error occurs part-way through.
with gzip.open(os.path.join(path, "output", "kegg_subset_model_input.gz"), 'wb') as out_subset_kmers:
    for subset_sequence in subset_seq_List:
        # Slide a window of width k over *this* sequence. A sequence shorter
        # than k yields no k-mers (the range is empty) and produces an empty line.
        curr_seq_kmers = [subset_sequence[i:i + k]
                          for i in range(len(subset_sequence) - k + 1)]
        subset_kmers.append(curr_seq_kmers)

        # Write the k-mers of the current sequence (not those of the first one).
        curr_seq_kmers_joined = " ".join(curr_seq_kmers) + "\n"
        out_subset_kmers.write(curr_seq_kmers_joined.encode())

#### Train on stored file

In [6]:
##################################################################################
##### Model Training parameters
##################################################################################

# --- bookkeeping ---
prefix = "test"      # first component of the saved model / total-kmers file names
work_dir = 0         # placeholder only; reassigned to the output directory further down

# --- Word2Vec hyper-parameters (each is passed to the Word2Vec constructor) ---
d = 64               # -> size      (embedding dimensionality)
w = 50               # -> window
neg_samps = 10       # -> negative
samp_freq = 0.0001   # -> sample
n_min = 100          # -> min_count
epochs = 3           # -> iter
n_cores = 1          # -> workers

# NOTE(review): the seed is drawn at random on every run and is not stored in
# the model file name, so a trained model cannot be regenerated exactly.
seed = random.randint(1, 9999999)

In [7]:
# The model file name encodes the hyper-parameters, underscore-separated, in
# the order: prefix, k, d, epochs, w, neg_samps, samp_freq, n_min.
model_fn = '_'.join([
    prefix,
    str(k),
    str(d),
    str(epochs),
    str(w),
    str(neg_samps).replace('0.', ''),   # drops a "0." substring if one is present
    str(samp_freq),
    str(n_min),
]) + '_model.pkl'

# Both the trained model and its k-mer training input live in <path>/output.
output_dir = os.path.join(path, "output")
model_path = os.path.join(output_dir, model_fn)
kmers_path = os.path.join(output_dir, "kegg_subset_model_input.gz")

In [8]:
# Sentence source for training: the gzipped k-mer file written earlier, one
# sequence per line.
# NOTE(review): per the gensim docs, lines with more than max_sentence_length
# tokens are split into several sentences - confirm against the installed version.
kmers_init = LineSentence(kmers_path, max_sentence_length=100000)

# Train the k-mer embedding. The keyword names (`size`, `iter`) are the ones
# this notebook was written against; presumably gensim < 4.0 - TODO confirm.
model = Word2Vec(
    kmers_init,
    sg=1,
    size=d,
    window=w,
    min_count=n_min,
    negative=neg_samps,
    sample=samp_freq,
    iter=epochs,
    workers=n_cores,
    seed=seed,
)

# Persist the trained model so the embedding step can load it by path.
model.save(model_path)

#### Embed on full data

In [9]:
##################################################################################
##### Needed Parameters 
##################################################################################

# Settings forwarded to the r2v helpers below.
nr = True      # passed as `normread` to embed_reads
a = 1e-05      # passed as `a` to embed_reads (presumably a weighting constant - TODO confirm in r2v_functions)
v = 1000       # passed as `v` to both r2v calls; the captured log reports progress every 1000 reads

# Reads to embed (gzipped FASTA) and the Word2Vec model trained above.
path_model = model_path
path_reads = os.path.join(path, "input", "kegg_subset.fasta.gz")

In [10]:
# Output directory for everything produced by the embedding step.
work_dir = os.path.join(path, "output")

# Pickle file holding the total k-mer counts, named "<prefix>_<k>_totalkmers.pkl".
fn_totalkmers = '{}_{}_totalkmers.pkl'.format(prefix, str(k))
path_totalkmers = os.path.join(path, "output", fn_totalkmers)

In [11]:
# Step 1: compute the total k-mer counts for the reads file via the r2v helper.
# NOTE(review): the exact structure of the returned object is defined in
# r2v_functions - confirm there before relying on it.
total_kmers = r2v.calc_total_kmers(path_reads, path_model, k, verbose=True, v=v)

# Step 2: persist the counts. embed_reads is given the *path* to this file, so
# it must be completely written and closed before the next call; the `with`
# block guarantees that, where the inline open() relied on garbage collection.
with open(path_totalkmers, 'wb') as totalkmers_file:
    six.moves.cPickle.dump(total_kmers, totalkmers_file, protocol=4)

# Step 3: embed every read with the trained model and write the results into work_dir.
r2v.embed_reads(
    path_sample=path_reads,
    path_totalkmers=path_totalkmers,
    path_model=path_model,
    path_out=work_dir,
    normread=nr,
    k=k,
    a=a,
    verbose=True,
    v=v,
)

Processing read 0. Last batch: 0.000 minutes. Total time: 0.000 hours.
Processing read 1000. Last batch: 0.095 minutes. Total time: 0.002 hours.
Processing read 2000. Last batch: 0.096 minutes. Total time: 0.003 hours.
Loading total kmers.
Loading model.
Total reads in sample kegg_subset: 2500.
Normalizing each read by total number of kmers in that read.
Processing eco:b020: 0/2500.
Processing esc:Entcl_R007: 1000/2500.
Processing sfw:WN53_1327: 2000/2500.
Saving reads to embedding_data\output\kegg_subset_10_64_3_50_10_0.0001_100_1e-05_remb_raw.csv.gz.
Performing SVD: (64,2500).
Saving reads to embedding_data\output\kegg_subset_10_64_3_50_10_0.0001_100_1e-05_remb.csv.gz.


In [12]:
# Echo the flag that was passed to embed_reads as `normread`.
nr

True

In [13]:
##################################################################

In [14]:
# ## read fasta file
# subset_openFile = open(os.path.join(path, "input", "hmp_subset.fasta"))
# subset_fastaSequences = SeqIO.parse(subset_openFile, "fasta")

In [15]:
# ##################################################################################
# ##### extract data from the current fasta file
# ##################################################################################

# subset_id_List = []
# subset_seq_List = []

# for subset_fasta in subset_fastaSequences:
    
#     name, sequence = subset_fasta.id, str(subset_fasta.seq)
    
#     subset_id_List.append(name)
#     subset_seq_List.append(sequence)

In [16]:
# evaluations = pickle.load(open(os.path.join(path, "output", "exread2_10_totalkmers.pkl"), "rb"))

In [17]:
# len(evaluations.keys())

In [18]:
# evaluations.keys()

In [19]:
# evaluations