In [166]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer

## Conjoint Triad Method
The technique for counting conjoint triads is taken from Sun et al., and the technique for clustering amino acids with similar physicochemical properties is taken from Shen et al. The method is performed in three steps.

### Step 1: Separate the amino acids into different ranks of side chain volume and dipole moment.

We will take our ranks from Shen et al. The following dictionaries give each amino acid's rank in each category.

In [142]:
# Side-chain volume rank per amino acid (one-letter code): A, G and V are rank 0,
# every other residue is rank 1. Key order is kept fixed because later cells build
# a DataFrame whose row order follows it.
volume_dict = {
    residue: (0 if residue in "AGV" else 1)
    for residue in "ARNDCQEGHLIKMFPSTWYV"
}

In [143]:
# Dipole rank per amino acid (one-letter code), written as rank -> members.
# The outer loop walks the residues in a fixed order so the resulting dict keeps
# the same key order as volume_dict; each residue belongs to exactly one group.
dipole_dict = {
    residue: rank
    for residue in "ARNDCQEGHLIKMFPSTWYV"
    for rank, members in {
        0: "AGLIFPV",
        1: "MSTY",
        2: "NQHW",
        3: "RK",
        -1: "C",
        -3: "DE",
    }.items()
    if residue in members
}

In [144]:
# Gather both rank tables into one frame: after the transpose there is one row
# per amino acid and one column per property ("volume", "dipole").
clustering_df = pd.DataFrame(
    [volume_dict, dipole_dict],
    index=["volume", "dipole"],
).T

In [145]:
# Preview the first rows to check the volume/dipole columns were assembled as intended.
clustering_df.head()

Unnamed: 0,volume,dipole
A,0,0
R,1,3
N,1,2
D,1,-3
C,1,-1


### Step 2: Cluster the amino acids into groups according to these ranks.

In [146]:
# Standardise the two rank columns before clustering.
# The feature columns are selected explicitly: a later cell adds a "clusters"
# column to clustering_df, and re-running this cell afterwards must not feed
# those labels back into the scaler (and from there into KMeans) as a feature.
ss = StandardScaler()
X = ss.fit_transform(clustering_df[["volume", "dipole"]])

In [147]:
# Ask KMeans for 7 clusters, matching the number of amino-acid classes in Shen et al.
# random_state is pinned so repeated runs start from the same initialisation.
km = KMeans(
    random_state=42,
    n_clusters=7,
)
km.fit(X)

KMeans(n_clusters=7, random_state=42)

In [148]:
# Look at the frame again before cluster labels are attached to it in the next cell.
clustering_df.head()

Unnamed: 0,volume,dipole
A,0,0
R,1,3
N,1,2
D,1,-3
C,1,-1


In [149]:
# Store the fitted KMeans label of each amino acid in a "clusters" column.
# This modifies clustering_df in place; the label values are whatever ids KMeans assigned.
clustering_df["clusters"] = km.labels_

In [161]:
# Amino acid -> cluster id (1-7), written as an explicit lookup table.
# Each string lists the members of one cluster; its position (starting at 1) is the id.
aa_clusters = {
    residue: cluster_id
    for cluster_id, members in enumerate(
        ("AGV", "ILFP", "YMTS", "HNQW", "RK", "DE", "C"),
        start=1,
    )
    for residue in members
}

### Step 3: Create functions to allow conjoint modules to be built from inputted sequences.

We will make three functions:

1.) conjoint_triad_preprocess will be called publicly on an entire dataframe.

2.) apply_cluster will be called privately to replace an individual sequence with cluster numbers.

3.) create_feature_vector will be called privately to apply a sliding window that counts the appearances of triads and translates them into a fixed feature vector.

In [2]:
# Helper data for create_feature_vector: every ordered triple of cluster ids,
# from "111" to "777" (7**3 = 343 strings), as a sorted list.
cluster_string = "1234567"

triads = sorted(
    first + second + third
    for first in cluster_string
    for second in cluster_string
    for third in cluster_string
)

In [163]:
# called privately
def create_feature_vector(sequence):
    """Count every overlapping three-character window of ``sequence``.

    Parameters
    ----------
    sequence : str
        Sequence already translated to cluster ids (see ``apply_cluster``).

    Returns
    -------
    dict
        One entry per string in the module-level ``triads`` list, mapped to the
        number of times it occurs in ``sequence``. Triads that never occur stay
        at 0, so the key set is the same for every input; a sequence shorter
        than three characters yields all zeros.

    Raises
    ------
    KeyError
        If a window of ``sequence`` is not one of ``triads``.
    """
    # Start from a zeroed table so the output always has the full key set.
    counts = dict.fromkeys(triads, 0)

    # Slide a width-3 window one position at a time.
    for start in range(len(sequence) - 2):
        window = sequence[start:start + 3]
        counts[window] += 1

    return counts

In [164]:
# called privately
def apply_cluster(sequence):
    """Translate an amino-acid sequence into a string of cluster ids.

    Every character of ``sequence`` is looked up in the module-level
    ``aa_clusters`` table and the ids are concatenated in order.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence in the one-letter codes used as ``aa_clusters`` keys.

    Returns
    -------
    str
        One cluster-id character per input residue; ``""`` for an empty input.

    Raises
    ------
    KeyError
        If a character has no entry in ``aa_clusters``.
    """
    return "".join(str(aa_clusters[residue]) for residue in sequence)

In [165]:
# called publicly
def conjoint_triad_preprocess(df):
    """Add conjoint-triad feature columns to a frame of sequence pairs.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"seq1"`` and ``"seq2"``. Each value is passed
        to ``apply_cluster``, so it may only contain keys of ``aa_clusters``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with three added columns:

        * ``"seq1_clusters"`` / ``"seq2_clusters"`` - the triad-count dict from
          ``create_feature_vector`` for each sequence;
        * ``"seq_vec"`` - the ``DictVectorizer`` output for that row's
          ``[seq1 counts, seq2 counts]`` pair (one matrix row per sequence).

        For a frame with no rows the columns are added but left empty.

    Raises
    ------
    KeyError
        If ``"seq1"`` / ``"seq2"`` is missing, or a sequence contains a
        character unknown to ``aa_clusters``.
    """
    df["seq1_clusters"] = df["seq1"].apply(apply_cluster).apply(create_feature_vector)
    df["seq2_clusters"] = df["seq2"].apply(apply_cluster).apply(create_feature_vector)
    df["seq_vec"] = df[["seq1_clusters", "seq2_clusters"]].values.tolist()

    # Nothing to vectorise (and no first row to fit on) when the frame has no rows.
    if len(df) == 0:
        return df

    vec = DictVectorizer()
    # Every count dict carries the full triad key set, so fitting on a single row
    # is enough to learn the complete vocabulary. Use positional access: the frame
    # may have been filtered or re-indexed and need not contain the label 0.
    vec.fit(df["seq_vec"].iloc[0])

    # Vectorise exactly once. A second fit/transform pass would be handed the
    # already-vectorised matrices instead of the dicts DictVectorizer expects.
    df["seq_vec"] = df["seq_vec"].apply(vec.transform)
    return df