# In [1]:
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import acovf
from sklearn.base import TransformerMixin, BaseEstimator
import tensorflow as tf

# In [3]:
class ConjointTriadEncoder(TransformerMixin, BaseEstimator):
    """
    Encoder class that transforms protein sequences into Conjoint Triad representation.

    Every amino acid is replaced by the label of its cluster, then each run of
    three consecutive labels (a triad) is counted. The counts, in sorted triad
    order, form the feature vector of a sequence.

    Source: Shen et. al. 2007. “Predicting protein-protein interactions based only on
            sequences information.”

    Attributes
    ----------
    num_clusters : int
        Number of groupings of amino acids (Default = 7)
    triads : list
        Sorted list of all three-character strings over the cluster labels.
    aa_clusters : dict
        Cluster assignments for amino acids


    Methods
    -------
    __init__(num_clusters)
        Constructor initializes member variables.

    fit(X)
        Called automatically by pipeline.

    transform(X)
        Main function for transforming sequence data.

    reshape(X)
        Reformat feature vector for neural network.

    generate_clusters()
        Cluster amino acids.

    generate_all_permutations()
        Build dictionary keys for counting triads.

    create_feature_vector(sequence)
        Count triads and return feature vector.

    apply_cluster(sequence)
        Apply cluster labels over AA sequence.
    """

    def __init__(self, num_clusters=7):
        super().__init__()
        self.num_clusters = num_clusters
        self.generate_all_permutations()
        self.generate_clusters()

    def fit(self, X, y=None):
        """Do nothing and return the encoder itself (stateless transformer)."""
        return self

    def transform(self, X, y=None):
        """
        Encode the "seq1" and "seq2" columns of X as triad-count features.

        Parameters
        ----------
        X : DataFrame
            Must contain the columns "seq1" and "seq2" holding amino acid
            strings. Any other column is ignored.
        y : ignored

        Returns
        -------
        Tensor of dtype float32 produced by reshape().

        Raises
        ------
        KeyError
            If a sequence contains a residue missing from aa_clusters, or a
            triad that is not present in triads.
        """
        X_copy = X.copy()
        # Only the two sequence columns are encoded; running the residue
        # mapping over unrelated columns (labels, ids, ...) would fail.
        for seq in ("seq1", "seq2"):
            clustered = X_copy[seq].apply(self.apply_cluster)
            X_copy[seq] = clustered.apply(self.create_feature_vector)
        X_copy["seq_vec"] = X_copy[["seq1", "seq2"]].values.tolist()
        return self.reshape(X_copy["seq_vec"])

    def reshape(self, X):
        """
        Stack per-sample feature pairs and reshape to (n_samples, n_triads, 2).

        Parameters
        ----------
        X : iterable of array-likes
            One entry per sample; each entry must hold 2 * n_triads values.

        Returns
        -------
        Tensor of dtype float32 with shape (n_samples, len(self.triads), 2).
        """
        X_np = np.stack(X)
        # Use the real number of triads instead of a hard-coded 343 (= 7**3)
        # so that other num_clusters values produce a valid shape.
        # NOTE(review): when called from transform(), np.stack yields
        # (n_samples, 2, n_triads); reshape reinterprets the flat buffer rather
        # than swapping axes, so the last axis does not separate seq1 from
        # seq2. Left unchanged so existing outputs stay identical -- confirm
        # whether np.transpose(X_np, (0, 2, 1)) was intended.
        X_np = X_np.reshape(X_np.shape[0], len(self.triads), 2)
        return tf.cast(X_np, tf.float32)

    def generate_clusters(self):
        """Set aa_clusters, the fixed residue -> cluster label mapping.

        The mapping always uses the labels 1..7 and does not depend on
        num_clusters.
        """
        self.aa_clusters = {
            "A": 1, "G": 1, "V": 1,
            "I": 2, "L": 2, "F": 2, "P": 2,
            "Y": 3, "M": 3, "T": 3, "S": 3,
            "H": 4, "N": 4, "Q": 4, "W": 4,
            "R": 5, "K": 5,
            "D": 6, "E": 6,
            "C": 7,
        }

    def generate_all_permutations(self):
        """Set triads to the sorted list of all label triples."""
        cluster_string = "".join(str(i) for i in range(1, self.num_clusters + 1))
        # A set removes duplicates that appear once labels have two digits.
        self.triads = sorted(
            {
                aa1 + aa2 + aa3
                for aa1 in cluster_string
                for aa2 in cluster_string
                for aa3 in cluster_string
            }
        )

    def create_feature_vector(self, sequence):
        """
        Count the overlapping triads of a cluster-label string.

        Parameters
        ----------
        sequence : str
            Sequence already converted to cluster labels by apply_cluster().

        Returns
        -------
        list of int
            One count per entry of triads, in the same (sorted) order.
            Sequences shorter than three labels give all zeros.

        Raises
        ------
        KeyError
            If the sequence contains a triad that is not in triads.
        """
        counts = dict.fromkeys(self.triads, 0)
        for start in range(len(sequence) - 2):
            counts[sequence[start:start + 3]] += 1
        return list(counts.values())

    def apply_cluster(self, sequence):
        """
        Replace every residue of an amino acid string by its cluster label.

        Raises
        ------
        KeyError
            If a residue is not a key of aa_clusters.
        """
        return "".join(str(self.aa_clusters[residue]) for residue in sequence)

# In [2]:
class AutocovarianceEncoder(TransformerMixin, BaseEstimator):
    """
    Encoder class that transforms protein sequences into Autocovariance representation.

    Source: Sun et. al. 2017.“Sequence-based prediction of protein protein interaction
            using a deep-learning algorithm.”

    Attributes
    ----------
    lag : int
        Lag between timepoints for calculating autocovariance.
    max_len : int
        Number of rows of the feature vector built for every sample; rows for
        which no autocovariance value exists are filled with 0.
    prop_path : str
        Path of the CSV file the amino acid properties were read from.
    aa_props : DataFrame
        DataFrame containing properties of amino acids.


    Methods
    -------
    __init__(lag, max_len, prop_path)
        Constructor initializes member variables.

    fit(X)
        Called automatically by Pipeline.

    transform(X)
        Main function for transforming sequence data into feature vector.

    reshape(X)
        Prepare feature vector for neural network.

    seq_to_props(seq, prop)
        Replace amino acids with property values.

    get_acvs(sequence)
        Generate autocovariance for sequence of properties.

    generate_acv_vector(row)
        Reorganize feature vector for time series friendly format.
    """

    def __init__(self, lag=30, max_len=500, prop_path="../data/amino_acid_properties.csv"):
        super().__init__()
        self.lag = lag
        self.max_len = max_len
        # Every constructor parameter has to be stored under its own name:
        # BaseEstimator.get_params() (used by clone, repr and grid search)
        # reads them back with getattr and fails otherwise.
        self.prop_path = prop_path
        self.aa_props = pd.read_csv(prop_path, index_col="Unnamed: 0")

    def fit(self, X, y=None):
        """Do nothing and return the encoder itself (stateless transformer)."""
        return self

    def transform(self, X, y=None):
        """
        Encode every row of X as an autocovariance feature matrix.

        Parameters
        ----------
        X : DataFrame
            Must contain the columns "seq1" and "seq2" holding amino acid
            strings.
        y : ignored

        Returns
        -------
        ndarray
            Stacked output of generate_acv_vector(), i.e. shape
            (n_samples, max_len, 2 * number of property columns).
        """
        X_copy = X.copy()
        X_copy["autocovariances"] = X_copy.apply(self.generate_acv_vector, axis=1)
        return self.reshape(X_copy["autocovariances"])

    def reshape(self, X):
        """Stack the per-sample feature matrices into one ndarray."""
        return np.stack(X)

    def seq_to_props(self, seq, prop):
        """
        Replace amino acids with the values of one property.

        Parameters
        ----------
        seq : str
            Amino acid sequence.
        prop : label
            Column of aa_props to look the residues up in.

        Returns
        -------
        list
            Property value of every residue, in sequence order.

        Raises
        ------
        KeyError
            If prop is not a column or a residue is not in the index of
            aa_props.
        """
        # Select the column once instead of once per residue.
        column = self.aa_props[prop]
        return [column.loc[residue] for residue in seq]

    def get_acvs(self, sequence):
        """
        Compute the autocovariance of a sequence for every property.

        Returns
        -------
        list of ndarray
            One acovf result (computed with nlag=self.lag) per column of
            aa_props, in column order.
        """
        return [
            acovf(self.seq_to_props(sequence, prop), nlag=self.lag)
            for prop in self.aa_props.columns
        ]

    def generate_acv_vector(self, row):
        """
        Interleave the autocovariances of both sequences of a row.

        Parameters
        ----------
        row : mapping
            Must provide the keys "seq1" and "seq2".

        Returns
        -------
        list of list
            max_len rows. Row i holds, for each property in turn, the i-th
            autocovariance value of seq1 followed by that of seq2; positions
            beyond the available values are 0.
        """
        per_sequence = [self.get_acvs(row["seq1"]), self.get_acvs(row["seq2"])]
        n_props = self.aa_props.shape[1]

        vec = []
        for i in range(self.max_len):
            props = []
            for j in range(n_props):
                for acvs in per_sequence:
                    values = acvs[j]
                    # Explicit bounds check instead of a bare "except:", which
                    # also hid unrelated errors and KeyboardInterrupt.
                    props.append(values[i] if i < len(values) else 0)
            vec.append(props)
        return vec

# In [15]:
class Res2VecEncoder(TransformerMixin, BaseEstimator):
    """
    Encoder class that transforms protein sequences with Res2Vec feature embeddings.

    Source: Yao et.al. 2019. “Integration of deep learning with feature embedding
            for protein–protein interaction prediction.”

    Attributes
    ----------
    eigen_vectors : DataFrame
        DataFrame containing feature embeddings for each amino acid.
    embedding_len : int
        Length of one feature embedding.
    max_len : int
        Maximum number of residues encoded per sequence.
    embeddings_path : str
        Path of the CSV file the embeddings were read from.
    vector_cap : int
        Length of every transformed sequence (embedding_len * max_len).


    Methods
    -------
    __init__(embedding_len, max_len, embeddings_path)
        Constructor initializes member variables.

    fit(X)
        Called automatically by Pipeline.

    transform(X)
        Main function for transforming sequence data.

    reshape(X)
        Prepare feature vector for neural network.

    apply_eigens(row)
        Transform sequences.
    """

    def __init__(self, embedding_len=20, max_len=500, embeddings_path="../data/res2vec.csv"):
        super().__init__()
        # Every constructor parameter has to be stored under its own name:
        # BaseEstimator.get_params() (used by clone, repr and grid search)
        # reads them back with getattr and fails otherwise.
        self.embedding_len = embedding_len
        self.max_len = max_len
        self.embeddings_path = embeddings_path
        self.eigen_vectors = pd.read_csv(embeddings_path, index_col="Unnamed: 0")
        self.vector_cap = embedding_len * max_len

    def fit(self, X, y=None):
        """Do nothing and return the encoder itself (stateless transformer)."""
        return self

    def transform(self, X, y=None):
        """
        Encode every row of X with the residue embeddings.

        Parameters
        ----------
        X : DataFrame
            Must contain the columns "seq1" and "seq2" holding amino acid
            strings.
        y : ignored

        Returns
        -------
        ndarray
            Stacked output of apply_eigens(), i.e. shape
            (n_samples, 2, vector_cap).
        """
        X_copy = X.copy()
        X_copy["res2vec"] = X_copy.apply(self.apply_eigens, axis=1)
        return self.reshape(X_copy["res2vec"])

    def reshape(self, X):
        """Stack the per-sample encodings into one ndarray."""
        return np.stack(X)

    def apply_eigens(self, row):
        """
        Concatenate the embeddings of the residues of both sequences of a row.

        Parameters
        ----------
        row : mapping
            Must provide the keys "seq1" and "seq2".

        Returns
        -------
        list of list of float
            Two vectors (seq1, seq2), each exactly vector_cap long: longer
            sequences are cut off, shorter ones are padded with 0.0.

        Raises
        ------
        KeyError
            If a residue is not in the index of eigen_vectors.
        """
        eigens = []
        for col in ["seq1", "seq2"]:
            vec = []
            # Residues beyond max_len cannot fit into the fixed-size vector;
            # without the cut-off the rows would differ in length and could
            # not be stacked.
            for aa in row[col][:self.max_len]:
                vec.extend(list(self.eigen_vectors.loc[aa]))
            # Enforce the exact length even if the stored embeddings are wider
            # or narrower than embedding_len.
            del vec[self.vector_cap:]
            vec.extend([0.0] * (self.vector_cap - len(vec)))
            eigens.append(vec)
        return eigens

# In [None]:
class MultiscaleDescriptorEncoder(TransformerMixin, BaseEstimator):
    """
    Encoder class that transforms protein sequences with Multiscale Descriptor.

    NOTE: placeholder, the descriptor is not implemented yet and transform()
    returns None.

    Source: You et. al. "Predicting Protein-Protein Interactions from Primary
            Protein Sequences Using a Novel Multi-Scale Local Feature
            Representation Scheme and the Random Forest."

    Methods
    -------
    fit(X)
        Called automatically by Pipeline.

    transform(X)
        Not implemented yet; returns None.
    """

    def fit(self, X, y=None):
        """Do nothing and return the encoder itself.

        Required so that fit_transform() and Pipeline can call the encoder.
        """
        return self

    def transform(self, X, y=None):
        """Placeholder for the Multiscale Descriptor encoding; returns None."""
        # TODO: implement the multi-scale local descriptor encoding.
        return None