In [13]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Lambda, Activation
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from sklearn.metrics import matthews_corrcoef, confusion_matrix

# ================== Configuration ==================
AMINO_ACIDS = ['A','R','N','D','C','Q','E','G','H','I','L','K','M','F','P','S','T','W','Y','V']
SPACER_IDX = 20  # 21st unit for spacer/padding
LABEL_MAP = {'h':0, 'e':1, '_':2}  # 3-class mapping as per paper

# ================== Data Loading & Preprocessing ==================
def parse_data(file_path):
    sequences = []
    current_seq = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('#') or not line:
                continue
            if line in ('<>', '<end>'):
                if current_seq:
                    sequences.append(current_seq)
                    current_seq = []
                continue
            parts = line.split()
            if len(parts) == 2 and parts[1].lower() in LABEL_MAP:
                current_seq.append((parts[0].upper(), parts[1].lower()))
    return sequences

def create_dataset(sequences):
    X, y = [], []
    for seq in sequences:
        seq_len = len(seq)
        for i in range(seq_len):
            # Create 13-residue window (i-6 to i+6)
            window = []
            for j in range(i-6, i+7):
                if j < 0 or j >= seq_len:
                    window.append(SPACER_IDX)
                else:
                    aa = seq[j][0]
                    window.append(AMINO_ACIDS.index(aa) if aa in AMINO_ACIDS else SPACER_IDX)
            
            # One-hot encode (13 positions × 21 units)
            encoded = np.zeros((13, 21), dtype=np.float32)
            for pos, idx in enumerate(window):
                encoded[pos, idx] = 1.0
            X.append(encoded.flatten())
            
            # Encode label
            label = LABEL_MAP[seq[i][1]]
            y.append(label)
    
    return np.array(X), tf.keras.utils.to_categorical(y, num_classes=3)

# ================== Model Architecture ==================
def create_model():
    model = Sequential([
        # Input: 273 units (13×21), Hidden: 40 units (paper specification)
        Dense(40, activation='sigmoid', input_dim=273),
        
        # Output with temperature scaling (T=1/2 as in paper)
        Dense(3, activation='linear'),
        Lambda(lambda x: x / 2),  # Temperature parameter
        Activation('softmax')
    ])
    
    # Original optimizer: SGD with lr=0.1, no momentum
    model.compile(
        loss='categorical_crossentropy',
        optimizer=SGD(learning_rate=0.1, momentum=0.0),
        metrics=['accuracy']
    )
    return model

# ================== Evaluation Metrics ==================
def print_metrics(y_true, y_pred, name="Dataset"):
    labels = list(LABEL_MAP.keys())
    y_true_labels = np.argmax(y_true, axis=1)
    y_pred_labels = np.argmax(y_pred, axis=1)
    
    # Q3 Accuracy
    acc = np.mean(y_true_labels == y_pred_labels)
    print(f"\n{name} Q3 Accuracy: {acc*100:.2f}%")
    
    # Matthews Correlation Coefficients
    print("\nMatthews CC:")
    for i, label in enumerate(labels):
        mcc = matthews_corrcoef(y_true_labels == i, y_pred_labels == i)
        print(f"C_{label}: {mcc:.3f}")
    
    # Confusion Matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true_labels, y_pred_labels))

# ================== Main Execution ==================
if __name__ == "__main__":
    # Load data
    train_data = parse_data("protein-secondary-structure.train.txt")
    test_data = parse_data("protein-secondary-structure.test.txt")
    
    # Create datasets
    X_train, y_train = create_dataset(train_data)
    X_test, y_test = create_dataset(test_data)
    
    # Verify input dimension matches paper (13×21=273)
    assert X_train.shape[1] == 273, "Invalid input dimension!"
    
    # Initialize model
    model = create_model()
    
    # Early stopping based on validation accuracy
    early_stop = EarlyStopping(
        monitor='val_accuracy',
        patience=15,
        restore_best_weights=True,
        verbose=1
    )
    
    # Train with paper's parameters (200 epochs, batch size 64)
    print("Training model...")
    history = model.fit(
        X_train, y_train,
        epochs=200,
        batch_size=64,
        validation_data=(X_test, y_test),
        callbacks=[early_stop],
        verbose=1
    )
    
    # Final evaluation
    print("\n===== Final Evaluation =====")
    print_metrics(y_train, model.predict(X_train), "Training")
    print_metrics(y_test, model.predict(X_test), "Test")
    print("Best Validation Accuracy:", max(history.history['val_accuracy']))

Training model...
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 34: early stopping

===== Final Evaluation =====

Training Q3 Accuracy: 64.34%

Matthews CC:
C_h: 0.372
C_e: 0.343
C__: 0.387

Confusion Matrix:
[[2214  334 2053]
 [ 629 1219 1788]
 [1102  551 8215]]

Test Q3 Accuracy: 62.87%

Matthews CC:
C_h: 0.324
C_e: 0.289
C__: 0.386

Confusion Matrix:
[[ 375   56  418]
 [ 181  213  354]
 [ 178  120 1625]]
Best Validation Accuracy: 0.628693163394928


In [25]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Lambda, Activation
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from sklearn.metrics import matthews_corrcoef, confusion_matrix

# ================== Configuration ==================
AMINO_ACIDS = list("ARNDCQEGHILKMFPSTWYV")
SPACER = 'X'
LABEL_MAP = {'h':0, 'e':1, '_':2}

#  ================== BLOSUM62 MATRIX ==================
# standard 20×20 + spacer row
BLOSUM62 = {
    'A': [ 4, -1, -2, -2,  0, -1, -1,  0, -2, -1, -1, -1, -1, -2, -1,  1,  0, -3, -2,  0],
    'R': [-1,  5,  0, -2, -3,  1,  0, -2,  0, -3, -2,  2, -1, -3, -2, -1, -1, -3, -2, -3],
    'N': [-2,  0,  6,  1, -3,  0,  0,  0,  1, -3, -3,  0, -2, -3, -2,  1,  0, -4, -2, -3],
    'D': [-2, -2,  1,  6, -3,  0,  2, -1, -1, -3, -4, -1, -3, -3, -1,  0, -1, -4, -3, -3],
    'C': [ 0, -3, -3, -3,  9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1],
    'Q': [-1,  1,  0,  0, -3,  5,  2, -2,  0, -3, -2,  1,  0, -3, -1,  0, -1, -2, -1, -2],
    'E': [-1,  0,  0,  2, -4,  2,  5, -2,  0, -3, -3,  1, -2, -3, -1,  0, -1, -3, -2, -2],
    'G': [ 0, -2,  0, -1, -3, -2, -2,  6, -2, -4, -4, -2, -3, -3, -2,  0, -2, -2, -3, -3],
    'H': [-2,  0,  1, -1, -3,  0,  0, -2,  8, -3, -3, -1, -2, -1, -2, -1, -2, -2,  2, -3],
    'I': [-1, -3, -3, -3, -1, -3, -3, -4, -3,  4,  2, -3,  1,  0, -3, -2, -1, -3, -1,  3],
    'L': [-1, -2, -3, -4, -1, -2, -3, -4, -3,  2,  4, -2,  2,  0, -3, -2, -1, -2, -1,  1],
    'K': [-1,  2,  0, -1, -3,  1,  1, -2, -1, -3, -2,  5, -1, -3, -1,  0, -1, -3, -2, -2],
    'M': [-1, -1, -2, -3, -1,  0, -2, -3, -2,  1,  2, -1,  5,  0, -2, -1, -1, -1, -1,  1],
    'F': [-2, -3, -3, -3, -2, -3, -3, -3, -1,  0,  0, -3,  0,  6, -4, -2, -2,  1,  3, -1],
    'P': [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4,  7, -1, -1, -4, -3, -2],
    'S': [ 1, -1,  1,  0, -1,  0,  0,  0, -1, -2, -2,  0, -1, -2, -1,  4,  1, -3, -2, -2],
    'T': [ 0, -1,  0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1,  1,  5, -2, -2,  0],
    'W': [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1,  1, -4, -3, -2, 11,  2, -3],
    'Y': [-2, -2, -2, -3, -2, -1, -2, -3,  2, -1, -1, -2, -1,  3, -3, -2, -2,  2,  7, -1],
    'V': [ 0, -3, -3, -3, -1, -2, -2, -3, -3,  3,  1, -2,  1, -1, -2, -2,  0, -3, -1,  4],
    'X': [0]*20
}

# ================== Robust Data Parsing ==================
def parse_data(path):
    sequences = []
    current = []
    with open(path) as f:
        for line in f:
            L = line.strip()
            if not L or L.startswith('#'):
                continue
            if L in ('<>','<end>'):
                if current:
                    sequences.append(current)
                    current = []
                continue
            parts = L.split()
            if len(parts)==2 and parts[1].lower() in LABEL_MAP:
                aa, lab = parts
                current.append((aa.upper(), lab.lower()))
            # else: skip any malformed lines
    # catch last
    if current:
        sequences.append(current)
    return sequences

# ================== Window & Encoding ==================
def make_windows(seqs, use_profile=False):
    X, y = [], []
    w = 6
    for seq in seqs:
        L = len(seq)
        for i in range(L):
            feats = []
            for j in range(i-w, i+w+1):
                aa = seq[j][0] if 0 <= j < L else SPACER
                if use_profile:
                    feats.extend(BLOSUM62[aa])
                else:
                    onehot = [0]*21
                    idx = AMINO_ACIDS.index(aa) if aa in AMINO_ACIDS else 20
                    onehot[idx] = 1
                    feats.extend(onehot)
            X.append(feats)
            y.append(LABEL_MAP[seq[i][1]])
    X = np.array(X, dtype=np.float32)
    y = tf.keras.utils.to_categorical(y, num_classes=3)
    return X, y

# ================== Model Definition ==================
def make_model(input_dim):
    m = Sequential([
        Dense(40, activation='sigmoid', input_dim=input_dim),
        Dense(3, activation='linear'),
        Lambda(lambda x: x/2),
        Activation('softmax')
    ])
    m.compile(
        loss='categorical_crossentropy',
        optimizer=SGD(learning_rate=0.1),
        metrics=['accuracy']
    )
    return m

# ================== Q3 Printer ==================
def print_Q3(name, y_true, y_pred):
    yt = np.argmax(y_true, axis=1)
    yp = np.argmax(y_pred, axis=1)
    acc = np.mean(yt == yp)
    print(f"{name} Q3 Accuracy: {acc*100:.2f}%")
    return acc

# ================== Main ==================
if __name__ == "__main__":
    # 1) Load
    train = parse_data("protein-secondary-structure.train.txt")
    test  = parse_data("protein-secondary-structure.test.txt")

    # 2) Baseline (Qian & Sejnowski 1988)
    X1, y1 = make_windows(train, use_profile=False)
    X1t, y1t = make_windows(test,  use_profile=False)
    model1 = make_model(X1.shape[1])
    es = EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True, verbose=1)
    model1.fit(X1, y1, epochs=100, batch_size=64,
               validation_data=(X1t, y1t), callbacks=[es], verbose=1)
    acc1 = print_Q3("Qian & Sejnowski (1988)", y1t, model1.predict(X1t))

    # 3) Rost & Sander (1993) with BLOSUM62 profiles
    X2, y2   = make_windows(train, use_profile=True)
    X2t, y2t = make_windows(test,  use_profile=True)
    model2 = make_model(X2.shape[1])
    model2.fit(X2, y2, epochs=100, batch_size=64,
               validation_data=(X2t, y2t), callbacks=[es], verbose=1)
    acc2 = print_Q3("Rost & Sander (1993)", y2t, model2.predict(X2t))

    print(f"\nImprovement: +{(acc2-acc1)*100:.2f} percentage points")


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 28: early stopping
Qian & Sejnowski (1988) Q3 Accuracy: 62.95%
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 31: early stopping
Rost & Sander (1993) Q3 Accuracy: 64.15%

Improvement: +1.19 percentage points
