In [1]:
import gzip
import random
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import os
import pickle
from Bio import SeqIO
import six.moves.cPickle

import r2v_functions_mod as r2v

In [2]:
##################################################################################
##### Tunable settings shared by every cell below
##################################################################################

# Length of the k-mers the sequences are split into.
k = 9

# Input dataset root and the sub-directory (setting) to process.
dataset_dir_path = "CORENup-Datasets\\Datasets"
setting_dir = "Setting1"

# Output root: the text before the first backslash of the input root gets the
# "-Embeddings-Word2Vec" suffix, and the text after that backslash is kept as-is.
out_dataset_path = os.path.join(
    dataset_dir_path[:dataset_dir_path.find('\\')] + "-Embeddings-Word2Vec",
    dataset_dir_path[dataset_dir_path.find('\\') + 1:],
)

In [3]:
# Directory that holds the input files for the selected setting.
dataset_setting_dir_path = os.path.join(dataset_dir_path, setting_dir)

# One (directory, [file name]) entry per input file found under the setting
# directory. The file name is wrapped in a single-element list so that
# consumers reading `entry[1][0]` get this entry's own file; appending the whole
# directory listing instead would repeat the first file of each directory once
# per file and never reach the remaining ones.
file_list = []

for root, dirs, files in os.walk(dataset_setting_dir_path):
    for file in files:
        file_list.append((root, [file]))

In [4]:
# For every discovered input file: read its FASTA records, write the k-mer
# "sentences" to a gzip file, train and save a word2vec model on them, then
# build the read embeddings through the r2v helper module.
for current_file in file_list:

    ##################################################################################
    ##### getting file in/out paths and file names ready
    ##################################################################################

    # Each entry is (directory, file-name list); the first listed name is the
    # file handled in this iteration.
    in_current_file_name = current_file[1][0]

    in_current_file_path = os.path.join(current_file[0], in_current_file_name)

    print("\nProcessing started for file:", in_current_file_path, "\n")

    # Split the file name at its first dot so a tag can be inserted before it.
    ext_start = in_current_file_name.find(".")
    name_stem = in_current_file_name[:ext_start]
    name_ext = in_current_file_name[ext_start:]

    out_current_kmers_file_name = name_stem + "_{}mers".format(k) + name_ext

    out_current_file_name = name_stem + "_word2vec_embedded" + name_ext

    # Mirror the input directory under "<first path component>-Embeddings-Word2Vec".
    root_sep = current_file[0].find("\\")
    out_current_dir_path = os.path.join(current_file[0][:root_sep] + "-Embeddings-Word2Vec",
                                        current_file[0][root_sep + 1:])

    ##################################################################################
    ##### read FASTA file and extract ids / sequences
    ##################################################################################

    id_List = []
    seq_List = []

    # The context manager closes the handle once all records have been read.
    with open(in_current_file_path) as openFile:
        for fasta in SeqIO.parse(openFile, "fasta"):
            id_List.append(fasta.id)
            seq_List.append(str(fasta.seq))

    ##################################################################################
    ##### Generate k-mers and write to file
    ##################################################################################

    os.makedirs(out_current_dir_path, exist_ok=True)

    kmers_path = os.path.join(out_current_dir_path, out_current_kmers_file_name+".gz")

    # One line per sequence, k-mers separated by single spaces.
    with gzip.open(kmers_path, 'w') as out_kmers:
        for sequence in seq_List:
            # The window count comes from this sequence's own length, so
            # sequences whose length differs from the first one neither emit
            # truncated k-mers nor lose their tail. A sequence shorter than k
            # produces an empty line.
            curr_seq_kmers = [sequence[i:i+k] for i in range(len(sequence) - k + 1)]

            curr_seq_kmers_joined = " ".join(curr_seq_kmers)+"\n"
            out_kmers.write(curr_seq_kmers_joined.encode())

    ##################################################################################
    ##### word2vec Model Training parameters
    ##################################################################################

    # A fresh random seed is drawn for every file, so repeated runs are not
    # reproducible unless this is replaced by a fixed value.
    seed = random.randint(1,9999999)
    d = 64
    w = 5
    neg_samps = 5
    samp_freq = 0.0001
    n_min = 10
    epochs = 3
    n_cores = 1

    # prefix = "<setting>_<name of the directory that contains the input file>"
    parent_dir_path = in_current_file_path[:in_current_file_path.rfind("\\")]
    prefix = setting_dir + "_" + parent_dir_path[parent_dir_path.rfind("\\")+1:]

    # The hyper-parameters are encoded in the model file name. The replace()
    # call is a no-op for the integer neg_samps used here; it is kept so the
    # generated name stays unchanged.
    # NOTE(review): r2v appears to derive its output name from this file name
    # (see the "Saving reads to ..." line of a recorded run) - confirm before
    # changing the format.
    model_fn = prefix + '_' + str(k) + '_' + str(d) + \
        '_' + str(epochs) + '_' + str(w) + '_' + \
        str(neg_samps).replace('0.','') + '_' + \
        str(samp_freq) + '_' + str(n_min) + '_model.pkl'

    model_file_path = os.path.join(out_current_dir_path, model_fn)

    ##################################################################################
    ##### Train word2vec Model
    ##################################################################################

    kmers_init = LineSentence(kmers_path)

    # Skip-gram (sg=1) model; `size` and `iter` are the keyword names this
    # notebook's gensim version accepts.
    model = Word2Vec(kmers_init, sg=1, size=d, window=w, min_count=n_min, negative=neg_samps,
                     sample=samp_freq, iter=epochs, workers=n_cores, seed=seed)

    model.save(model_file_path)

    ##################################################################################
    ##### Embedding Parameters
    ##################################################################################

    nr = True
    a = 1e-05
    v = 1000

    path_reads = in_current_file_path
    path_model = model_file_path

    fn_totalkmers = '%s_%s_totalkmers.pkl' % (prefix,str(k))

    path_totalkmers = os.path.join(out_current_dir_path, fn_totalkmers)

    work_dir = out_current_dir_path

    ##################################################################################
    ##### Generating the read embeddings from the sequences using word2vec
    ##################################################################################

    total_kmers = r2v.calc_total_kmers(path_reads, path_model, k, verbose=True, v=v)

    # Close the pickle file explicitly before embed_reads is given its path.
    with open(path_totalkmers, 'wb') as totalkmers_file:
        six.moves.cPickle.dump(total_kmers, totalkmers_file, protocol=4)

    # NOTE(review): a recorded run of this cell stopped with "ValueError: Input
    # contains NaN, infinity ..." right after "Performing SVD", preceded by a
    # warning on `reads[:,i] /= n_kmer`. Presumably some reads have no k-mer in
    # the trained vocabulary (min_count=n_min) - confirm inside r2v.embed_reads.
    r2v.embed_reads(path_sample = path_reads, path_totalkmers = path_totalkmers, path_model = path_model, path_out = work_dir, normread=nr, k=k, a=a, verbose=True, v=v)

    print("\nProcessing completed for current file.\n============================================================================")


Processing started for file: CORENup-Datasets\Datasets\Setting1\Drosophila\nucleosomes_vs_linkers_melanogaster.fas 

Processing read 0. Last batch: 0.000 minutes. Total time: 0.000 hours.
Processing read 1000. Last batch: 0.003 minutes. Total time: 0.000 hours.
Processing read 2000. Last batch: 0.004 minutes. Total time: 0.000 hours.
Processing read 3000. Last batch: 0.004 minutes. Total time: 0.000 hours.
Processing read 4000. Last batch: 0.004 minutes. Total time: 0.000 hours.
Processing read 5000. Last batch: 0.004 minutes. Total time: 0.000 hours.
Loading total kmers.
Loading model.
path_sample: CORENup-Datasets\Datasets\Setting1\Drosophila\nucleosomes_vs_linkers_melanogaster.fas
Total reads in sample nucleosomes_vs_linkers_melanogaster: 5750.
Normalizing each read by total number of kmers in that read.
Processing nucleosomal_sequence_1: 0/5750.


  reads[:,i] /= n_kmer


Processing nucleosomal_sequence_1001: 1000/5750.
Processing nucleosomal_sequence_2001: 2000/5750.
Processing linker_sequence_101: 3000/5750.
Processing linker_sequence_1101: 4000/5750.
Processing linker_sequence_2101: 5000/5750.
Saving reads to CORENup-Datasets-Embeddings-Word2Vec\Datasets\Setting1\Drosophila\nucleosomes_vs_linkers_melanogaster_Drosophila_9_64_3_5_5_0.0001_10_1e-05_remb_raw.csv.gz.
Performing SVD: (64,5750).


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').