### Imports

In [1]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
import pandas as pd
import numpy as np
import os
import json
import string
from collections import Counter

### On full DSSP files

In [2]:
topdir, _, files = next(os.walk('clean_dssp_csv/'))

In [3]:
df = pd.DataFrame()

for fi in files:
    data = pd.read_csv(topdir+fi)
    df = df.append(data)

In [4]:
df.shape

(268675, 32)

In [5]:
df.head()

Unnamed: 0,DSSP,PDB,CHAIN,AA,SS,3H,4H,5H,BEND,CHIR,...,ON2I,ON2E,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA,Z-CA
0,1,1.0,A,V,C,?,?,?,?,?,...,428,-0.0,0.0,360.0,360.0,360.0,157.5,10.2,20.8,6.8
1,2,2.0,A,L,C,?,?,?,?,-,...,0,0.0,-0.785,360.0,-142.6,-89.6,124.7,6.6,21.5,7.8
2,3,3.0,A,S,C,?,>,?,?,-,...,5,-0.3,-0.26,26.2,-103.3,-75.8,167.2,4.8,23.2,4.9
3,4,4.0,A,P,H,?,>,?,S,+,...,5,-0.2,0.929,126.4,49.2,-56.6,-43.6,2.3,26.0,5.3
4,5,5.0,A,A,H,?,>,?,S,+,...,5,-0.2,0.92,109.3,52.7,-66.7,-37.7,-0.5,23.4,4.5


In [17]:
print Counter(df['SS'].values)

Counter({'H': 92221, 'C': 53458, 'E': 51024, 'T': 31321, 'S': 24113, 'G': 11450, 'B': 3861, 'I': 1227})


In [7]:
encoding = {l:n for n, l in enumerate(list(string.punctuation)+list(string.letters))}
print encoding

{'!': 0, '#': 2, '"': 1, '%': 4, '$': 3, "'": 6, '&': 5, ')': 8, '(': 7, '+': 10, '*': 9, '-': 12, ',': 11, '/': 14, '.': 13, ';': 16, ':': 15, '=': 18, '<': 17, '?': 20, '>': 19, 'A': 32, '@': 21, 'C': 34, 'B': 33, 'E': 36, 'D': 35, 'G': 38, 'F': 37, 'I': 40, 'H': 39, 'K': 42, 'J': 41, 'M': 44, 'L': 43, 'O': 46, 'N': 45, 'Q': 48, 'P': 47, 'S': 50, 'R': 49, 'U': 52, 'T': 51, 'W': 54, 'V': 53, 'Y': 56, 'X': 55, '[': 22, 'Z': 57, ']': 24, '\\': 23, '_': 26, '^': 25, 'a': 58, '`': 27, 'c': 60, 'b': 59, 'e': 62, 'd': 61, 'g': 64, 'f': 63, 'i': 66, 'h': 65, 'k': 68, 'j': 67, 'm': 70, 'l': 69, 'o': 72, 'n': 71, 'q': 74, 'p': 73, 's': 76, 'r': 75, 'u': 78, 't': 77, 'w': 80, 'v': 79, 'y': 82, 'x': 81, '{': 28, 'z': 83, '}': 30, '|': 29, '~': 31}


In [8]:
features = df.copy().drop(['SS', 'CHAIN'], axis=1)
features.head()

Unnamed: 0,DSSP,PDB,AA,3H,4H,5H,BEND,CHIR,BB1,BB2,...,ON2I,ON2E,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA,Z-CA
0,1,1.0,V,?,?,?,?,?,?,?,...,428,-0.0,0.0,360.0,360.0,360.0,157.5,10.2,20.8,6.8
1,2,2.0,L,?,?,?,?,-,?,?,...,0,0.0,-0.785,360.0,-142.6,-89.6,124.7,6.6,21.5,7.8
2,3,3.0,S,?,>,?,?,-,?,?,...,5,-0.3,-0.26,26.2,-103.3,-75.8,167.2,4.8,23.2,4.9
3,4,4.0,P,?,>,?,S,+,?,?,...,5,-0.2,0.929,126.4,49.2,-56.6,-43.6,2.3,26.0,5.3
4,5,5.0,A,?,>,?,S,+,?,?,...,5,-0.2,0.92,109.3,52.7,-66.7,-37.7,-0.5,23.4,4.5


In [9]:
X = [[encoding[c] if c in encoding else float(c) for c in row] for row in features.values]

In [10]:
y = df.copy()['SS']

In [11]:
print Counter(y)

Counter({'H': 92221, 'C': 53458, 'E': 51024, 'T': 31321, 'S': 24113, 'G': 11450, 'B': 3861, 'I': 1227})


In [12]:
# without
X_train, X_test, y_train, y_test = train_test_split(X, y)

# with stratification
# X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [23]:
rf = RandomForestClassifier()
erf = ExtraTreesClassifier()

In [14]:
rf.fit(X_train, y_train)
erf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [15]:
print cross_val_score(RandomForestClassifier(), X, y).mean()
print cross_val_score(ExtraTreesClassifier(), X, y).mean()

0.938423800379
0.936421360263


In [16]:
# get an idea of the feature importances 
# NOTE: AA feature is one of the lowest signal features ... great
ind = np.argsort(rf.feature_importances_)[::-1]
features.columns.values[ind]

array(['BEND', 'TCO', 'PSI', 'BSL', 'KAPPA', 'BP1', '3H', 'ALPHA', '4H',
       'ON1I', 'CHIR', 'PHI', 'ON1E', 'NO1I', 'BP2', 'BB1', 'NO2E', 'NO1E',
       '5H', 'BB2', 'ON2I', 'ON2E', 'PDB', 'ACC', 'AA', 'NO2I', 'Y-CA',
       'Z-CA', 'X-CA', 'DSSP'], dtype=object)

### Now try with just AA sequence files

In [25]:
seq = np.asarray([encoding[v] for v in df['AA'].values])
seq = seq.reshape(-1, 1)
seq.shape

(268675, 1)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(seq, y)

In [27]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:
cross_val_score(rf, X_test, y_test)

array([ 0.3727224 ,  0.36768056,  0.37180632])

In [31]:
Counter(y_test)

Counter({'B': 1012,
         'C': 13328,
         'E': 12573,
         'G': 2788,
         'H': 23202,
         'I': 307,
         'S': 6060,
         'T': 7899})

In [30]:
Counter(rf.predict(X_test))

Counter({'B': 1, 'C': 6763, 'E': 8901, 'G': 3, 'H': 46332, 'T': 5169})