In [1]:
import os
import threading
import time

import pandas as pd
import tensorflow as tf
import numpy as np


def split_dataframe_list_to_rows(df, target_column, separator):
    """Splits a column holding delimited strings into one row per element.

    Args:
      df: dataframe to split
      target_column: the column containing the string values to split
      separator: the symbol used to perform the split (passed to ``str.split``)

    Returns:
      A new dataframe (with a fresh default index) in which every entry of
      the target column has been split on ``separator`` and each piece moved
      into its own row.  The values in the other columns are duplicated
      across the newly divided rows.  An input without rows yields an empty
      dataframe that keeps the columns of ``df``.

    """
    new_rows = []
    # Iterate explicitly rather than collecting rows as a side effect of
    # ``df.apply``: apply does not promise to invoke its callback exactly once
    # per row, which would silently duplicate accumulated rows.
    for _, row in df.iterrows():
        template = row.to_dict()
        for piece in row[target_column].split(separator):
            new_row = dict(template)
            new_row[target_column] = piece
            new_rows.append(new_row)

    if not new_rows:
        # Keep the schema so callers can still select columns on the result.
        return pd.DataFrame(columns=df.columns)
    return pd.DataFrame(new_rows)


def get_id_character_mapping(data, columns):
    """Creating a mapping between characters and ids given dataframe.

    Every distinct character found in any of the given columns receives an
    id; ids are assigned in sorted character order, starting from 0.

    Args:
      data: dataframe that contains characters that need to be converted to ids
      columns: iterable of column names whose values (iterables of
        characters, e.g. strings) are scanned

    Returns:
      id_to_character: dictionary of ids and characters
      character_to_id: dictionary of characters and ids

    """
    characters = set()
    for column in columns:
        # Iterate the values directly: ``Series.iteritems`` no longer exists
        # in pandas 2.x and the index was never used here.
        for val in data[column]:
            characters.update(val)
    ordered = sorted(characters)

    id_to_character = dict(enumerate(ordered))
    character_to_id = {character: i for i, character in enumerate(ordered)}
    return id_to_character, character_to_id


def get_category_to_id_mapping(data, column):
    """Builds the id <-> category lookup tables for one categorical column.

    Ids are consecutive integers starting from 0, handed out in the sorted
    order of the distinct values found in the column.  The number of distinct
    categories is printed as a side effect.

    Args:
      data: dataframe that contains categorical values
      column: name of the column whose distinct values are to be numbered

    Returns:
      id_to_category: dictionary of ids and categories
      category_to_id: dictionary of categories and ids

    """
    categories = list(data[column].unique())
    categories.sort()
    print("There are {} unique categories".format(len(categories)))
    id_to_category = dict(enumerate(categories))
    category_to_id = dict(zip(categories, range(len(categories))))
    return id_to_category, category_to_id



def to_int_feature(data):
    """
    Wraps a list of ints into a tf Feature
    Args:
        data: int list to be stored in tf record

    Returns:
        tf Feature (holding an Int64List) that is used in building tfrecord
    """
    int64_values = tf.train.Int64List(value=data)
    return tf.train.Feature(int64_list=int64_values)


def to_float_feature(data):
    """
    Wraps a list of floats into a tf Feature
    Args:
        data: float list to be stored in tf record

    Returns:
        tf Feature (holding a FloatList) that is used in building tfrecord
    """
    float_values = tf.train.FloatList(value=data)
    return tf.train.Feature(float_list=float_values)





def save_as_npy(path, original_data, columns=None):
    """Stores the selected columns of a dataframe in ``<path>/data.npy``.

    The directory ``path`` is created when missing.  Failures while writing
    are reported on stdout instead of being raised (best-effort behaviour).

    Args:
      path: directory in which the file ``data.npy`` is written
      original_data: dataframe containing data to be stored
      columns: list (or tuple) of columns that should be stored
        (Default value = ["Label", "sequence"])
    Returns:
      None

    """
    if columns is None:
        columns = ["Label", "sequence"]
    elif isinstance(columns, tuple):
        # A tuple would be interpreted by pandas as one single column key.
        columns = list(columns)

    os.makedirs(path, exist_ok=True)
    # Computed outside the try block so the error message can always use it.
    filename = os.path.join(path, "data.npy")
    try:
        np.save(filename, original_data[columns].values)
    except Exception as e:
        print("Something went wrong went writting in to npy file ({})".format(filename))
        print(e)
    else:
        print("Data was stored in {}".format(filename))


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import os
from collections import Counter

from common.bio.sequence import Sequence
from Bio.SeqIO.FastaIO import SimpleFastaParser
from common.bio.constants import ID_TO_AMINO_ACID, AMINO_ACID_TO_ID, NON_STANDARD_AMINO_ACIDS
import pandas as pd
import numpy as np


def fasta_to_numpy(path, length):
    """Reads a fasta file into a fixed-width numpy array of amino acid ids.

    Every sequence is truncated to ``length`` characters and, when shorter,
    padded on the right with '0' up to ``length`` before each character is
    translated through AMINO_ACID_TO_ID.

    NOTE: a function with the same name is defined again further down in this
    file; the later definition is the one in effect after the file is run.

    Args:
        path: of the fasta file
        length: number of positions every sequence is cut or padded to

    Returns: numpy array of sequences (one row per fasta entry)

    Raises:
        ValueError: from np.stack when the file contains no entries

    """
    with open(path) as fasta_file:
        sequences = []
        for _title, sequence in SimpleFastaParser(fasta_file):
            # Right-padding only.  The previous ``rjust`` call used a width
            # that never exceeded the sequence length, so it never padded
            # anything, and the debug branch after ``ljust`` was unreachable.
            # NOTE(review): if centred padding was intended, that would be a
            # behaviour change to decide separately.
            sequence = sequence[:length].ljust(length, '0')
            sequences.append(np.asarray([AMINO_ACID_TO_ID[a] for a in sequence]))
        return np.stack(sequences, axis=0)

def from_amino_acid_to_id(data, column):
    """Translates amino acid sequences into lists of ids

    Args:
      data: data that contains amino acid that need to be converted to ids
      column: a column of the dataframe that contains amino acid that need to be converted to ids

    Returns:
      the column with every sequence replaced by its list of ids
      (looked up character by character in AMINO_ACID_TO_ID)

    """

    def encode(sequence):
        return [AMINO_ACID_TO_ID[residue] for residue in sequence]

    return data[column].apply(encode)


def from_id_from_amino_acid(data, column):
    """Converts sequences from ids to amino acid characters

    Args:
      data: data that contains ids that need to be converted to amino acid
      column: a column of the dataframe that contains ids that need to be converted to amino acid

    Returns:
      list with, for every entry of the column, the list of amino acid
      characters looked up in ID_TO_AMINO_ACID

    """
    # Iterate the values directly: ``Series.iteritems`` no longer exists in
    # pandas 2.x and the index was never used.  The loop variable also no
    # longer shadows the builtin ``id``.
    return [[ID_TO_AMINO_ACID[amino_id] for amino_id in ids] for ids in data[column]]


def filter_non_standard_amino_acids(data, column):
    """Drops the rows whose sequence matches any non-standard amino acid.

    The entries of NON_STANDARD_AMINO_ACIDS are joined with "|" and used as
    the pattern for ``str.contains``.

    Args:
      data: dataframe containing amino acid sequence
      column: a column of dataframe that contains amino acid sequence

    Returns:
      filtered data frame

    """
    pattern = "|".join(NON_STANDARD_AMINO_ACIDS)
    has_non_standard = data[column].str.contains(pattern)
    return data[~has_non_standard]


def get_protein_sequences(sequences, labels=None, d_scores=None):
    """Wraps raw sequences into Sequence objects.

    Each Sequence receives its position in ``sequences`` as first argument,
    together with the label and discriminator score found at the same
    position (or None when the corresponding collection is not given).

    Args:
      sequences: Protein sequences
      labels: Ids  of Enzyme classes (Default value = None)
      d_scores: Values of discriminator (Default value = None)

    Returns:
      array of Sequence objects
    """
    return [
        Sequence(
            position,
            protein,
            label=labels[position] if labels is not None else None,
            d_score=d_scores[position] if d_scores is not None else None,
        )
        for position, protein in enumerate(sequences)
    ]


def numpy_seqs_to_fasta(sequences, id_to_enzyme_class, labels=None, d_scores=None, strip_zeros=False):
    """Renders raw sequences as fasta-like text, always with escaping enabled.

    Args:
      sequences: Protein sequences
      id_to_enzyme_class: a dictionary to get enzyme class from its id
      labels: Ids  of Enzyme classes (Default value = None)
      d_scores: Values of discriminator (Default value = None)
      strip_zeros: flag forwarded to sequences_to_fasta (Default value = False)
    Returns:
      the text produced by sequences_to_fasta for the wrapped sequences

    """
    return sequences_to_fasta(
        get_protein_sequences(sequences, labels, d_scores),
        id_to_enzyme_class,
        escape=True,
        strip_zeros=strip_zeros,
    )


def sequences_to_fasta(sequences, id_to_enzyme_class, escape=True, strip_zeros=False):
    """Joins the fasta rendering of every sequence with the OS line separator.

    Args:
      sequences: a list of Sequences object
      id_to_enzyme_class: a dictionary to get enzyme class from its id
      escape: flag forwarded to each sequence's ``get_seq_in_fasta``
      strip_zeros: flag forwarded to each sequence's ``get_seq_in_fasta``
    Returns:
      string with sequences and additional information that mimics fasta format

    """
    entries = []
    for seq in sequences:
        entries.append(seq.get_seq_in_fasta(id_to_enzyme_class, escape, strip_zeros))
    return os.linesep.join(entries)


def print_protein_seq(sequences, id_to_enzyme_class, labels=None, d_scores=None):
    """Prints the given sequences in fasta-like format.

    Args:
      sequences: Protein sequences
      id_to_enzyme_class: a dictionary to get enzyme class from its id
      labels: Ids  of Enzyme classes (Default value = None)
      d_scores: Values of discriminator (Default value = None)

    Returns:
      Signal for DONE

    """
    # numpy_seqs_to_fasta already returns one joined string; running
    # "\n".join over it again would put a newline between every character.
    print(numpy_seqs_to_fasta(sequences, id_to_enzyme_class, labels, d_scores))
    return "DONE"




def fasta_to_numpy(path, length):
    """Reads a fasta file into a fixed-width numpy array of amino acid ids.

    Every sequence is truncated to ``length`` characters and, when shorter,
    padded on the right with '0' up to ``length`` before each character is
    translated through AMINO_ACID_TO_ID.

    NOTE: this repeats (and therefore replaces) the definition of the same
    name that appears earlier in this file.

    Args:
        path: of the fasta file
        length: number of positions every sequence is cut or padded to

    Returns: numpy array of sequences (one row per fasta entry)

    Raises:
        ValueError: from np.stack when the file contains no entries

    """
    with open(path) as fasta_file:
        sequences = []
        for _title, sequence in SimpleFastaParser(fasta_file):
            # Right-padding only.  The previous ``rjust`` call used a width
            # that never exceeded the sequence length, so it never padded
            # anything, and the debug branch after ``ljust`` was unreachable.
            # NOTE(review): if centred padding was intended, that would be a
            # behaviour change to decide separately.
            sequence = sequence[:length].ljust(length, '0')
            sequences.append(np.asarray([AMINO_ACID_TO_ID[a] for a in sequence]))
        return np.stack(sequences, axis=0)


def generate_random_seqs(data, column='sequence', n_seqs=1000):
    """Generates random fasta entries that follow the observed character frequencies.

    Lengths are drawn (without replacement) from the lengths found in
    ``data[column]`` and shifted by a single normally distributed integer
    offset that is shared by all generated sequences.  Every entry has the
    form ">R_<i>\\nM<random residues>".

    Args:
        data: Dataframe that contains sequences
        column: a name of the column which contains sequences
        n_seqs: number of sequences to generate; must not exceed the number
            of rows in ``data`` because the lengths are sampled without
            replacement

    Returns:
        Randomly generated sequences based on frequency of each element

    Raises:
        ValueError: from np.random.choice when the column does not contain
            exactly 20 distinct characters (the ids 1..20 are hard-coded)

    """
    results = Counter(data[column].str.cat())
    # Frequencies ordered by character.
    # NOTE(review): this presumes ids 1..20 of ID_TO_AMINO_ACID follow the
    # same sorted character order - confirm against the constants module.
    counts = [count for _, count in sorted(results.items())]
    prop = np.asarray(counts) / sum(counts)
    # Use the requested column (previously hard-coded to ``data.sequence``).
    lengths = data[column].str.len().sample(n_seqs).values + int(np.random.normal(scale=3))
    # The random offset can push short lengths below zero, which
    # np.random.choice rejects; treat those as empty sequences.
    lengths = np.maximum(lengths, 0)
    seqs = []
    for i in range(n_seqs):
        r = np.random.choice(np.arange(1, 21), p=prop, size=lengths[i])
        residues = "".join(ID_TO_AMINO_ACID[a] for a in r)
        seqs.append(">R_{}\nM".format(i) + residues)
    return seqs


In [13]:
def fasta_to_pandas(path, separator=";"):
    """Loads the entries of a fasta file into a dataframe.

    The titles found in the file are not used: every entry gets a running
    number (starting from 0) as id, the title "37.0.0.0_<id>" and the
    constant label "labelx".

    Args:
        path: of the fasta file
        separator: kept for backward compatibility; currently unused because
            the original titles are discarded

    Returns: pandas dataframe with 4 columns (id, title, Label, sequence)

    """
    with open(path) as fasta_file:
        identifiers, sequences, titles, labels = [], [], [], []
        # enumerate supplies the running id without shadowing the builtin id()
        for entry_id, (_title, sequence) in enumerate(SimpleFastaParser(fasta_file)):
            identifiers.append(entry_id)
            titles.append("37.0.0.0_" + str(entry_id))
            sequences.append(sequence)
            labels.append("labelx")
        return pd.DataFrame({"id": identifiers, "title": titles,"Label":labels, "sequence": sequences})


In [26]:
# Load the entries of ./a/1.fasta into a dataframe via fasta_to_pandas.
df = fasta_to_pandas("./a/1.fasta")


In [31]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


# Append every row of df to new_fasta1.fasta as one fasta record.
# The context manager makes sure the handle is flushed and closed once all
# records are written (it was previously left open).
with open("new_fasta1.fasta", "a") as output_handle1:
    for index, row in df.iterrows():
        record = SeqRecord(
            Seq(row['sequence']),
            id=str(row['id']),
            name='',
            description=row['title'],
        )
        SeqIO.write(record, output_handle1, 'fasta')

id                                                          0
title                                              37.0.0.0_0
Label                                                  labelx
sequence    AVGPVADNTITDAATSPDGFSRQAVVVNGVTPGPLVAGNIGDRFQL...
Name: 0, dtype: object
id                                                          1
title                                              37.0.0.0_1
Label                                                  labelx
sequence    MNLEKFVDELPIPEVAEPVKKNPRQTYYEIAMEEVFLKVHRDLPPT...
Name: 1, dtype: object
id                                                          2
title                                              37.0.0.0_2
Label                                                  labelx
sequence    MSFSSLRRALVFLGACSSALASIGPVTELDIVNKVIAPDGVARDTV...
Name: 2, dtype: object
id                                                          3
title                                              37.0.0.0_3
Label                                                  labelx
s

In [12]:
# Replace every amino acid string of the "sequence" column by its list of ids.
df["sequence"] = from_amino_acid_to_id(df , "sequence")
# Bare expression: shows the dataframe as the output of the notebook cell.
df


Unnamed: 0,id,title,Label,sequence
0,0,37.0.0.0_0,labelx,"[1, 18, 6, 13, 18, 1, 3, 12, 17, 8, 17, 3, 1, ..."
1,1,37.0.0.0_1,labelx,"[11, 12, 10, 4, 9, 5, 18, 3, 4, 10, 13, 8, 13,..."
2,2,37.0.0.0_2,labelx,"[11, 16, 5, 16, 16, 10, 15, 15, 1, 10, 18, 5, ..."
3,3,37.0.0.0_3,labelx,"[11, 3, 15, 9, 6, 8, 18, 10, 6, 16, 9, 10, 1, ..."
4,4,37.0.0.0_4,labelx,"[13, 6, 13, 17, 10, 20, 1, 3, 19, 6, 3, 11, 8,..."
...,...,...,...,...
1148,1148,37.0.0.0_1148,labelx,"[8, 7, 19, 7, 6, 5, 5, 14, 9, 6, 16, 16, 19, 1..."
1149,1149,37.0.0.0_1149,labelx,"[7, 19, 7, 6, 5, 5, 14, 1, 6, 17, 12, 19, 1, 3..."
1150,1150,37.0.0.0_1150,labelx,"[1, 8, 6, 13, 18, 17, 12, 10, 17, 8, 16, 12, 1..."
1151,1151,37.0.0.0_1151,labelx,"[7, 19, 7, 6, 5, 5, 14, 9, 6, 17, 12, 19, 1, 3..."


In [15]:
def save_as_tfrecords_multithreaded(path, original_data, columns=None, group_by_col="Label"):
    """Provided data gets splitted in to groups and processed concurrently.
    The outcome of this is a file per group.

    Each group is written by its own thread through save_as_tfrecords; the
    file name is derived from the group key (for string keys "." and "-" are
    replaced by "_", the parts of tuple keys are joined with "_", any other
    key is converted with str()).

    Args:
      path: Location where files should be stored
      original_data: dataframe which should be converted into files
      columns: a  list of columns which should be stored as sequences (Default value = ["sequence"])
      group_by_col: a column name by which split data into groups (Default value = "Label")
    Returns:
      None

    """
    if columns is None:
        columns = ["sequence"]
    os.makedirs(path, exist_ok=True)
    threading_start = time.time()
    threads = []
    data = original_data.groupby(group_by_col)
    for group_id in data.groups:
        if isinstance(group_id, str):
            group_name = group_id.replace(".", "_").replace("-", "_")
        elif isinstance(group_id, tuple):
            group_name = "_".join([str(e) for e in group_id])
        else:
            # Any scalar key (python int, numpy integer, float, ...).  Only
            # plain ``int`` was recognised before, so other scalars were
            # iterated like a tuple and raised TypeError.
            group_name = str(group_id)
        filename = os.path.join(path, group_name)
        args = (filename, data.get_group(group_id), columns)
        t = threading.Thread(target=save_as_tfrecords, args=args)
        t.start()
        threads.append(t)
    # The workers are plain threads that never interact with a TensorFlow
    # coordinator, so joining them directly is sufficient.
    for t in threads:
        t.join()
    print("Completed all threads in {} seconds".format(time.time() - threading_start))


def save_as_tfrecords(filename, data, columns=["sequence"], extension="tfrecords"):
    """Processes a dataframe and stores data into tfrecord file

    One tf.train.Example is written per dataframe row.  Any exception raised
    while writing is caught and printed; nothing is re-raised.

    Args:
      filename: path of the tfrecords file WITHOUT extension; "." and the extension are appended
      data: dataframe containing data will be converted into tfrecord
      columns: list of columns that should be stored as varying-length sequences (Default value = ["sequence"])
      extension: file extension
    Returns:
      None

    """
    try:
        filename = "{}.{}".format(filename, extension)
        with tf.python_io.TFRecordWriter(filename) as writer:
            for index, row in data.iterrows():
                # 'label' is read with row[0], not by column name.
                # NOTE(review): presumably this picks the first column of the
                # row; confirm that this is the intended label column.
                feature = {
                    'label': to_int_feature([row[0]])
                }
                for col_name in columns:
                    value = row[col_name]
                    # Scalars are wrapped into a one-element list.
                    if isinstance(value, int):
                        feature[col_name] = to_int_feature([value])
                    elif isinstance(value, float):
                        feature[col_name] = to_float_feature([value])
                    # Taken when value is not a list, not an int and its dtype
                    # is neither float32 nor float64; the value is then stored
                    # as a float feature.
                    # NOTE(review): this reads as inverted (float-dtype values
                    # fall through to the int branch below) - confirm intent.
                    # Values without a ``dtype`` attribute (e.g. str) raise
                    # AttributeError here, which the except below prints.
                    elif not isinstance(value, (list,)) and not (isinstance (value, int) or ((value.dtype == np.float32) or (value.dtype == np.float64))):
                        feature[col_name] = to_float_feature(value)
                    else:
                        # Everything else (including lists) is stored as an int
                        # feature, with its length under 'length_<column>'.
                        feature[col_name] = to_int_feature(value)
                        feature['length_' + col_name] = to_int_feature([len(value)])

                example = tf.train.Example(features=tf.train.Features(feature=feature))
                writer.write(example.SerializeToString())

        print("Data was stored in {}".format(filename))
    except Exception as e:
        # Best effort: report the failure on stdout instead of raising.
        print("Something went wrong went writting in to tfrecords file")
        print("Error is ", str(e))
# Write df into ./a, one tfrecords file per distinct value of the default
# group_by_col ("Label").
save_as_tfrecords_multithreaded("./a",df)

Data was stored in ./a/labelx.tfrecords
Completed all threads in 1.0019659996032715 seconds
