In [1]:
import csv
import gzip
import random
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import os
import pickle
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import six.moves.cPickle

import r2v_functions_mod as r2v

In [2]:
# ---------------------------------------------------------------------------------
# Tunable parameters for the whole notebook
# ---------------------------------------------------------------------------------

# Length of the k-mers cut from every sequence; also embedded in output file names.
k = 6

# Directory holding the input CSV. Outputs go to the same directory, so both
# names are bound to the same string.
out_dataset_path = dataset_dir_path = "piRNA"

In [3]:
# Each entry is a (directory, file name) pair consumed by the processing loop.
#
# To process every file found under dataset_dir_path instead of a single one,
# build the list by walking the directory tree:
#
#     file_list = [(root, file)
#                  for root, dirs, files in os.walk(dataset_dir_path)
#                  for file in files]

file_list = [
    (dataset_dir_path, 'piRNA_layer1.csv'),
]

In [4]:
def generator_of_sequences(seq_list, id_list):
    """Yield one ``SeqRecord`` per (sequence, label) pair with a per-class running ID.

    Args:
        seq_list: iterable of sequence strings.
        id_list: iterable of integer class labels (0 or 1), paired with
            ``seq_list`` via ``zip``.

    Yields:
        ``SeqRecord`` whose id is ``linker_sequence_<n>`` for label 0 and
        ``nucleosomal_sequence_<n>`` for label 1; each counter starts at 1.

    Raises:
        ValueError: if a label is neither 0 nor 1. Previously such a row either
            crashed with ``UnboundLocalError`` or silently reused the previous ID.
    """
    # NOTE(review): the "linker"/"nucleosomal" wording looks inherited from a
    # different dataset; it is kept because these IDs are written to the FASTA
    # file that is handed to r2v - confirm before renaming.
    id_n = 0
    id_l = 0
    for string_seq, id_01 in zip(seq_list, id_list):
        if id_01 == 0:
            id_l += 1
            push_id = "linker_sequence_{}".format(id_l)
        elif id_01 == 1:
            id_n += 1
            push_id = "nucleosomal_sequence_{}".format(id_n)
        else:
            raise ValueError("Unsupported class label {!r}; expected 0 or 1".format(id_01))
        yield SeqRecord(Seq(string_seq), id=push_id)


# For every (directory, file name) entry: read the labelled sequences, write them
# as FASTA and as gzipped k-mer "sentences", train a word2vec model on the k-mers,
# then let r2v turn the FASTA reads into embeddings.
for current_file in file_list:

    print("\nProcessing started for file:", os.path.join(current_file[0], current_file[1]), "\n")

    ##################################################################################
    ##### getting file in/out paths and file names ready
    ##################################################################################

    in_current_file_name = current_file[1]

    in_current_file_path = os.path.join(current_file[0],
                                        current_file[1])

    # Names are split at the FIRST dot of the input file name.
    first_dot = in_current_file_name.find(".")

    out_current_kmers_file_name = in_current_file_name[0:first_dot]+"_{}mers".format(k)+".fas"

    out_current_file_name = in_current_file_name[0:first_dot]+"_word2vec_embedded"+in_current_file_name[first_dot:]

    out_current_dir_path = out_dataset_path

    # Create the output directory before anything is written into it (the FASTA
    # file below used to be opened before the directory was guaranteed to exist).
    os.makedirs(out_current_dir_path, exist_ok=True)

    ##################################################################################
    ##### read CSV file
    ##################################################################################

    org_ACGU_data_list = []
    max_len = 0

    # newline='' is what the csv module asks for so it can handle line endings itself.
    with open(in_current_file_path, newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # csv.reader returns [] for a blank line; skip it rather than
                # fail on row[0].
                continue
            max_len = max(max_len, len(row[0]))
            org_ACGU_data_list.append(row)

    ##################################################################################
    ##### pre process data as required by the models
    ##################################################################################

    # Column 0 is the sequence (U rewritten as T), column 1 the integer class label.
    org_ACGT_data_list = [[row[0].replace("U", "T"), int(row[1])]
                          for row in org_ACGU_data_list]

    id_List = [row[1] for row in org_ACGT_data_list]
    seq_List = [row[0] for row in org_ACGT_data_list]

    ##################################################################################
    ##### Write SEQ information to a fasta sequence file for word2vec processing
    ##################################################################################

    path_reads = os.path.join(out_current_dir_path, in_current_file_name.split(".")[0]+".fas")
    with open(path_reads, "w") as output_handle:
        SeqIO.write(generator_of_sequences(seq_List, id_List), output_handle, "fasta")

    ##################################################################################
    ##### Generate k-mers and write to file
    ##################################################################################

    kmers_path = os.path.join(out_current_dir_path, out_current_kmers_file_name+".gz")

    # One line per sequence: its overlapping k-mers separated by single spaces.
    with gzip.open(kmers_path, 'wb') as out_kmers:
        for sequence in seq_List:
            # Window count comes from THIS sequence's length. Using the first
            # sequence's length dropped trailing k-mers of longer sequences and
            # produced truncated k-mers for shorter ones.
            curr_seq_kmers = [sequence[i:i+k] for i in range(len(sequence) - k + 1)]
            out_kmers.write((" ".join(curr_seq_kmers) + "\n").encode())

    ##################################################################################
    ##### word2vec Model Training parameters
    ##################################################################################

    # The seed is drawn fresh on every run and is not stored in the model file
    # name, so the trained model differs from run to run.
    seed = random.randint(1,9999999)
    d = 64              # embedding dimensionality
    w = 5               # context window
    neg_samps = 5       # negative samples
    samp_freq = 0.0001  # down-sampling threshold for frequent k-mers
    n_min = 10          # k-mers seen fewer times than this are ignored
    epochs = 3
    n_cores = 1
    prefix = in_current_file_name[0:in_current_file_name.rfind(".")]

    model_fn = prefix + '_' + str(k) + '_' + str(d) + \
        '_' + str(epochs) + '_' + str(w) + '_' + \
        str(neg_samps).replace('0.','') + '_' + \
        str(samp_freq) + '_' + str(n_min) + '_model.pkl'

    model_file_path = os.path.join(out_current_dir_path, model_fn)

    ##################################################################################
    ##### Train word2vec Model
    ##################################################################################

    kmers_init = LineSentence(kmers_path)

    # sg=1 selects skip-gram. NOTE(review): `size=` and `iter=` are the gensim 3.x
    # keyword names (gensim 4 uses `vector_size=` / `epochs=`); left as-is because
    # this notebook was evidently run against 3.x - confirm the pinned version.
    model = Word2Vec(kmers_init, sg=1, size=d, window=w, min_count=n_min, negative=neg_samps,
                     sample=samp_freq, iter=epochs, workers=n_cores, seed=seed)

    model.save(model_file_path)

    ##################################################################################
    ##### Embedding Parameters
    ##################################################################################

    nr = True   # forwarded as normread= to r2v.embed_reads
    a = 1e-05   # forwarded as a= to r2v.embed_reads
    v = 1000    # forwarded as v= to both r2v calls

    #path_reads = in_current_file_path
    path_model = model_file_path

    fn_totalkmers = '%s_%s_totalkmers.pkl' % (prefix,str(k))

    path_totalkmers = os.path.join(out_current_dir_path, fn_totalkmers)

    work_dir = out_current_dir_path

    ##################################################################################
    ##### Generating the read embeddings from the sequences using word2vec
    ##################################################################################

    total_kmers = r2v.calc_total_kmers(path_reads, path_model, k, verbose=True, v=v)

    # Close the pickle file deterministically so it is fully flushed before
    # embed_reads is given its path.
    with open(path_totalkmers, 'wb') as totalkmers_handle:
        six.moves.cPickle.dump(total_kmers, totalkmers_handle, protocol=4)

    # Results are written under work_dir by r2v; the exact file names are chosen
    # inside r2v_functions_mod, not here.
    r2v.embed_reads(path_sample = path_reads, path_totalkmers = path_totalkmers, path_model = path_model, path_out = work_dir, normread=nr, k=k, a=a, verbose=True, v=v)

    print("\nProcessing completed for current file.\n============================================================================")


Processing started for file: piRNA\piRNA_layer1.csv 

Processing read 0. Last batch: 0.000 minutes. Total time: 0.000 hours.
Processing read 1000. Last batch: 0.001 minutes. Total time: 0.000 hours.
Processing read 2000. Last batch: 0.001 minutes. Total time: 0.000 hours.
Loading total kmers.
Loading model.
path_sample: piRNA\piRNA_layer1.fas
Total reads in sample piRNA_layer1: 2835.
Normalizing each read by total number of kmers in that read.
Processing linker_sequence_1: 0/2835.
Processing linker_sequence_1001: 1000/2835.
Processing nucleosomal_sequence_584: 2000/2835.
Saving reads to piRNA\piRNA_layer1_layer1_6_64_3_5_5_0.0001_10_1e-05_remb_raw.csv.gz.
Performing SVD: (64,2835).
Saving reads to piRNA\piRNA_layer1_layer1_6_64_3_5_5_0.0001_10_1e-05_remb.csv.gz.

Processing completed for current file.


In [5]:
# Show the FASTA path assigned inside the processing loop; after the loop it
# holds the value from the last file processed.
path_reads

'piRNA\\piRNA_layer1.fas'

In [6]:
# Show the saved word2vec model path assigned inside the processing loop; after
# the loop it holds the value from the last file processed.
path_model

'piRNA\\piRNA_layer1_6_64_3_5_5_0.0001_10_model.pkl'