In [87]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import train_test_split, cross_val_score
import pandas as pd
import numpy as np
import os
import string
from collections import Counter

In [6]:
topdir, _, files = next(os.walk('clean_dssp_csv/'))

In [16]:
df = pd.DataFrame()

for fi in files:
    data = pd.read_csv(topdir+fi)
    df = df.append(data)

In [17]:
df.shape

(269022, 32)

In [18]:
df.head()

Unnamed: 0,DSSP,PDB,CHAIN,AA,SS,3H,4H,5H,BEND,CHIR,...,ON2I,ON2E,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA,Z-CA
0,1.0,1.0,A,V,C,?,?,?,?,?,...,428.0,-0.0,0.0,360.0,360.0,360.0,157.5,10.2,20.8,6.8
1,2.0,2.0,A,L,C,?,?,?,?,-,...,0.0,0.0,-0.785,360.0,-142.6,-89.6,124.7,6.6,21.5,7.8
2,3.0,3.0,A,S,C,?,>,?,?,-,...,5.0,-0.3,-0.26,26.2,-103.3,-75.8,167.2,4.8,23.2,4.9
3,4.0,4.0,A,P,H,?,>,?,S,+,...,5.0,-0.2,0.929,126.4,49.2,-56.6,-43.6,2.3,26.0,5.3
4,5.0,5.0,A,A,H,?,>,?,S,+,...,5.0,-0.2,0.92,109.3,52.7,-66.7,-37.7,-0.5,23.4,4.5


In [69]:
encoding = {l:n for n, l in enumerate(list(string.punctuation)+list(string.letters))}
encoding

{'!': 0,
 '"': 1,
 '#': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 '+': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 ':': 15,
 ';': 16,
 '<': 17,
 '=': 18,
 '>': 19,
 '?': 20,
 '@': 21,
 'A': 32,
 'B': 33,
 'C': 34,
 'D': 35,
 'E': 36,
 'F': 37,
 'G': 38,
 'H': 39,
 'I': 40,
 'J': 41,
 'K': 42,
 'L': 43,
 'M': 44,
 'N': 45,
 'O': 46,
 'P': 47,
 'Q': 48,
 'R': 49,
 'S': 50,
 'T': 51,
 'U': 52,
 'V': 53,
 'W': 54,
 'X': 55,
 'Y': 56,
 'Z': 57,
 '[': 22,
 '\\': 23,
 ']': 24,
 '^': 25,
 '_': 26,
 '`': 27,
 'a': 58,
 'b': 59,
 'c': 60,
 'd': 61,
 'e': 62,
 'f': 63,
 'g': 64,
 'h': 65,
 'i': 66,
 'j': 67,
 'k': 68,
 'l': 69,
 'm': 70,
 'n': 71,
 'o': 72,
 'p': 73,
 'q': 74,
 'r': 75,
 's': 76,
 't': 77,
 'u': 78,
 'v': 79,
 'w': 80,
 'x': 81,
 'y': 82,
 'z': 83,
 '{': 28,
 '|': 29,
 '}': 30,
 '~': 31}

In [70]:
features = df.copy().drop(['SS', 'CHAIN'], axis=1)
features.head()

Unnamed: 0,DSSP,PDB,AA,3H,4H,5H,BEND,CHIR,BB1,BB2,...,ON2I,ON2E,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA,Z-CA
0,1.0,1.0,V,?,?,?,?,?,?,?,...,428.0,-0.0,0.0,360.0,360.0,360.0,157.5,10.2,20.8,6.8
1,2.0,2.0,L,?,?,?,?,-,?,?,...,0.0,0.0,-0.785,360.0,-142.6,-89.6,124.7,6.6,21.5,7.8
2,3.0,3.0,S,?,>,?,?,-,?,?,...,5.0,-0.3,-0.26,26.2,-103.3,-75.8,167.2,4.8,23.2,4.9
3,4.0,4.0,P,?,>,?,S,+,?,?,...,5.0,-0.2,0.929,126.4,49.2,-56.6,-43.6,2.3,26.0,5.3
4,5.0,5.0,A,?,>,?,S,+,?,?,...,5.0,-0.2,0.92,109.3,52.7,-66.7,-37.7,-0.5,23.4,4.5


In [72]:
X = [[encoding[c] if c in encoding else float(c) for c in row] for row in features.values]

In [74]:
y = df.copy()['SS']

In [75]:
Counter(y)

Counter({'B': 3861,
         'C': 53805,
         'E': 51024,
         'G': 11450,
         'H': 92221,
         'I': 1227,
         'S': 24113,
         'T': 31321})

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [77]:
rf = RandomForestClassifier()
erf = ExtraTreesClassifier()

In [78]:
rf.fit(X_train, y_train)
erf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [79]:
rf.score(X_test, y_test)

0.96449334621961191

In [80]:
erf.score(X_test, y_test)

0.96504349119024613

In [104]:
print cross_val_score(RandomForestClassifier(), X, y).mean()
print cross_val_score(ExtraTreesClassifier(), X, y).mean()

0.938060874975
0.936856500567


In [102]:
ind = np.argsort(rf.feature_importances_)[::-1]
features.columns.values[ind]

array(['KAPPA', 'TCO', 'BB1', '4H', 'PSI', 'BEND', 'BSL', '3H', 'PHI',
       'ALPHA', 'BB2', 'BP1', 'NO1I', 'ON1I', 'CHIR', 'ON2E', 'ON1E',
       'BP2', 'NO2E', 'NO1E', '5H', 'ON2I', 'PDB', 'ACC', 'NO2I', 'AA',
       'DSSP', 'Y-CA', 'Z-CA', 'X-CA'], dtype=object)