In [71]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from statsmodels.tsa.stattools import acovf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Conv1D, MaxPooling1D

## Autocovariance

The autocovariance technique is taken from Sun et al. It treats each amino acid sequence like time series data: autocovariances, up to a lag of 30, are computed for each of 7 categories of amino acid physicochemical properties. This notebook is separated into two parts. Part One involves preprocessing data on the physicochemical properties of amino acids. Part Two involves compiling functions to perform autocovariance.

### Part One: Prepare The Amino Acid Properties Dataset
This dataset is taken from Sun et al. and includes the hydrophobicity, hydrophilicity, net charge index, polarity, polarizability, solvent-accessible surface area, and side-chain volume of each amino acid.

In [45]:
# Load the training pairs (columns: interacts, seq1, seq2); the path is relative to the notebook's working directory.
df = pd.read_csv("../data/train_cleaned.csv")
# Preview the first rows.
df.head()

Unnamed: 0,interacts,seq1,seq2
0,1,MHKTASQRLFPGPSYQNIKSIMEDSTILSDWTNSNKQKMKYDFSCE...,MERRRITSAARRSYVSSGEMMVGGLAPGRRLGPGTRLSLARMPPPL...
1,1,MPYNFCLPSLSCRTSCSSRPCVPPSCHGYTLPGACNIPANVSNCNW...,MSQAYSSSQRVSSYRRTFGGAPGFPLGSPLSSPVFPRAGFGSKGSS...
2,1,MSFSEMNRRTLAFRGGGLVTASGGGSTNNNAGGEASAWPPQPQPRQ...,MALCLKQVFAKDKTFRPRKRFEPGTQRFELYKKAQASLKSGLDLRS...
3,1,MKFQYKEDHPFEYRKKEGEKIRKKYPDRVPVIVEKAPKARVPDLDK...,MEPQVTLNVTFKNEIQSFLVSDPENTTWADIEAMVKVSFDLNTIQI...
4,1,MTILGTTFGMVFSLLQVVSGESGYAQNGDLEDAELDDYSFSCYSQL...,MRVAGAAKLVVAVAVFLLTFYVISQVFEIKMDASLGNLFARSALDT...


In [46]:
# Physicochemical properties for the 20 standard amino acids, keyed by
# one-letter code. Each list holds seven values in this fixed order:
#   [hydrophobicity, hydrophilicity, net_charge_index, polarity,
#    polarizability, solvent_accessible_surface_area, volume]
# The order must match the `columns` list assigned to aa_props later on.
aa_prop_dict = {
    "A": [0.62, -0.5, 0.007187, 8.1, 0.046, 1.181, 27.5],
    "C": [0.29, -1, -0.03661, 5.5, 0.128, 1.461, 44.6],
    "D": [-0.9, 3, -0.02382, 13, 0.105, 1.587, 40],
    "E": [-0.74, 3, 0.006802, 12.3, 0.151, 1.862, 62],
    "F": [1.19, -2.5, 0.037552, 5.2, 0.29, 2.228, 115.5],
    "G": [0.48, 0, 0.179052, 9, 0, 0.881, 0],
    "H": [-0.4, -0.5, -0.01069, 10.4, 0.23, 2.025, 79],
    "I": [1.38, -1.8, 0.021631, 5.2, 0.186, 1.81, 93.5],
    "K": [-1.5, 3, 0.017708, 11.3, 0.219, 2.258, 100],
    "L": [1.06, -1.8, 0.051672, 4.9, 0.186, 1.931, 93.5],
    "M": [0.64, -1.3, 0.002683, 5.7, 0.221, 2.034, 94.1],
    "N": [-0.78, 2, 0.005392, 11.6, 0.134, 1.655, 58.7],
    "P": [0.12, 0, 0.239531, 8, 0.131, 1.468, 41.9],
    "Q": [-0.85, 0.2, 0.049211, 10.5, 0.18, 1.932, 80.7],
    "R": [-2.53, 3, 0.043587, 10.5, 0.291, 2.56, 105],
    "S": [-0.18, 0.3, 0.004627, 9.2, 0.062, 1.298, 29.3],
    "T": [-0.05, -0.4, 0.003352, 8.6, 0.108, 1.525, 51.3],
    "V": [1.08, -1.5, 0.057004, 5.9, 0.14, 1.645, 71.5],
    "W": [0.81, -3.4, 0.037977, 5.4, 0.409, 2.663, 145.5],
    # Fixed: this row previously had net_charge_index and volume swapped
    # (117.3 in position 2 and 0.023599 in position 6).
    "Y": [0.26, -2.3, 0.023599, 6.2, 0.298, 2.368, 117.3],
}

In [47]:
# Build a frame from the dict: each amino acid key becomes a column, each list position a row.
aa_props = pd.DataFrame(aa_prop_dict)

In [48]:
# Rebuild from the dict and transpose in one step so this cell is idempotent:
# re-running a bare `aa_props = aa_props.T` would flip the frame back to one
# column per amino acid. After this, rows are amino acids and columns are
# the positional property indices.
aa_props = pd.DataFrame(aa_prop_dict).T

In [49]:
# Preview: one row per amino acid; columns are still the positional indices 0-6.
aa_props.head()

Unnamed: 0,0,1,2,3,4,5,6
A,0.62,-0.5,0.007187,8.1,0.046,1.181,27.5
C,0.29,-1.0,-0.03661,5.5,0.128,1.461,44.6
D,-0.9,3.0,-0.02382,13.0,0.105,1.587,40.0
E,-0.74,3.0,0.006802,12.3,0.151,1.862,62.0
F,1.19,-2.5,0.037552,5.2,0.29,2.228,115.5


In [50]:
# Maximum lag for the autocovariance; passed as `nlag` to acovf in generate_timeseries.
lag = 30

In [51]:
# Property names, listed in the same order as the values stored for each
# amino acid in aa_prop_dict. Also iterated by generate_timeseries.
columns = [
    "hydrophobicity",
    "hydrophilicity",
    "net_charge_index",
    "polarity",
    "polarizability",
    "solvent_accessible_surface_area",
    "volume",
]
# Replace the positional column labels with the descriptive names.
aa_props.columns = columns
aa_props.head()

Unnamed: 0,hydrophobicity,hydrophilicity,net_charge_index,polarity,polarizability,solvent_accessible_surface_area,volume
A,0.62,-0.5,0.007187,8.1,0.046,1.181,27.5
C,0.29,-1.0,-0.03661,5.5,0.128,1.461,44.6
D,-0.9,3.0,-0.02382,13.0,0.105,1.587,40.0
E,-0.74,3.0,0.006802,12.3,0.151,1.862,62.0
F,1.19,-2.5,0.037552,5.2,0.29,2.228,115.5


### Part Two: Prepare Autocovariance Functions

We will make three functions.

1.) seq_to_props will be called privately and transforms an amino acid sequence into a sequence of values for a given amino acid property.

2.) generate_timeseries will be called privately and computes the autocovariances of the transformed sequences.

3.) apply_autocovariance will be called publicly and adds autocovariances to the dataframe it is passed.

In [52]:
def seq_to_props(seq, prop):
    """Map each residue of a sequence to its value for one property.

    Parameters
    ----------
    seq : iterable of str
        Amino acid sequence; each element is used as a row label of the
        module-level ``aa_props`` frame.
    prop : str
        Column label of ``aa_props`` to read.

    Returns
    -------
    list
        One ``aa_props.loc[residue, prop]`` value per residue, in sequence
        order. A residue that is not a row label of ``aa_props`` makes the
        ``.loc`` lookup fail.
    """
    return [aa_props.loc[residue, prop] for residue in seq]

In [53]:
def generate_timeseries(row):
    """Compute per-property autocovariances for both sequences of a row.

    Parameters
    ----------
    row : mapping
        Must provide ``"seq1"`` and ``"seq2"`` entries (e.g. a DataFrame
        row); each is passed to ``seq_to_props``.

    Returns
    -------
    list
        Two-element list ``[seq1_result, seq2_result]``. Each element is a
        list with one ``acovf(..., nlag=lag)`` result per name in the
        module-level ``columns`` list, in that order.
    """
    # NOTE(review): acovf is given nlag=lag regardless of sequence length;
    # confirm sequences shorter than `lag` are filtered out upstream.
    def _autocov_per_property(sequence):
        # One autocovariance series per physicochemical property.
        return [acovf(seq_to_props(sequence, prop), nlag=lag) for prop in columns]

    return [_autocov_per_property(row[seq_key]) for seq_key in ("seq1", "seq2")]

In [54]:
def apply_autocovariance(df):
    """Attach an ``"autocovariances"`` column to ``df`` and return it.

    ``generate_timeseries`` is applied to every row (``axis=1``) and the
    result is stored in a new ``"autocovariances"`` column. The frame is
    modified in place; the same object is returned for convenience.
    """
    per_row_features = df.apply(generate_timeseries, axis=1)
    df["autocovariances"] = per_row_features
    return df

