In [None]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Lambda, Activation
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping
from sklearn.metrics import matthews_corrcoef, confusion_matrix

# ================== Configuration ==================
# One-letter residue codes; create_dataset uses a residue's position in this
# list as its one-hot unit index.
AMINO_ACIDS = list('ARNDCQEGHILKMFPSTWYV')
# Index of the extra (21st) unit used for spacer/padding positions.
SPACER_IDX = len(AMINO_ACIDS)
# Structure label character -> class index (3 classes).
# NOTE(review): presumably helix / strand / coil -- confirm against the data set description.
LABEL_MAP = {label: class_idx for class_idx, label in enumerate('he_')}

# ================== Data Loading & Preprocessing ==================
def parse_data(file_path, valid_labels=None):
    """Read a labelled residue file and return its sequences.

    Lines are stripped of surrounding whitespace. Blank lines and lines
    starting with '#' are ignored. A line equal to '<>' or '<end>' closes the
    sequence collected so far. Any other line is kept only if it splits into
    exactly two whitespace-separated fields and the second field (lower-cased)
    is an accepted label; all other lines are skipped silently.

    Args:
        file_path: Path of the text file to read.
        valid_labels: Container of accepted lower-case label strings.
            Defaults to the module-level LABEL_MAP.

    Returns:
        A list of sequences; each sequence is a non-empty list of
        (RESIDUE, label) tuples with the residue upper-cased and the label
        lower-cased.
    """
    if valid_labels is None:
        valid_labels = LABEL_MAP

    sequences = []
    current_seq = []
    with open(file_path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if line in ('<>', '<end>'):
                # Delimiter: close the running sequence (empty ones are not stored).
                if current_seq:
                    sequences.append(current_seq)
                    current_seq = []
                continue
            parts = line.split()
            if len(parts) == 2 and parts[1].lower() in valid_labels:
                current_seq.append((parts[0].upper(), parts[1].lower()))

    # A file that does not end with a delimiter still has a pending sequence;
    # without this flush its residues would be silently lost.
    if current_seq:
        sequences.append(current_seq)
    return sequences

def create_dataset(sequences):
    """Turn parsed sequences into one-hot window features and one-hot labels.

    For every residue a 13-position window (6 neighbours on each side) is
    encoded with 21 units per position and flattened to 273 values. Window
    positions that fall outside the sequence, and residues not listed in
    AMINO_ACIDS, use the SPACER_IDX unit.

    Args:
        sequences: Iterable of sequences, each a list of (residue, label)
            tuples as produced by parse_data.

    Returns:
        A tuple (X, y). X is a float32 array of shape (n_residues, 273);
        for empty input it has shape (0, 273) so callers can still check the
        feature dimension. y is the result of to_categorical with 3 classes.

    Raises:
        KeyError: If a label is not a key of LABEL_MAP.
    """
    half_window = 6
    window_size = 2 * half_window + 1  # 13 positions
    units = 21                         # 20 residues + spacer
    # Dict lookup replaces the repeated linear `in` + `.index` scans of the list.
    aa_to_idx = {aa: idx for idx, aa in enumerate(AMINO_ACIDS)}
    padding = [SPACER_IDX] * half_window

    X, y = [], []
    for seq in sequences:
        # Pad once per sequence so every window is a plain slice.
        codes = padding + [aa_to_idx.get(item[0], SPACER_IDX) for item in seq] + padding
        for i, item in enumerate(seq):
            encoded = np.zeros((window_size, units), dtype=np.float32)
            for pos, idx in enumerate(codes[i:i + window_size]):
                encoded[pos, idx] = 1.0
            X.append(encoded.flatten())
            y.append(LABEL_MAP[item[1]])

    if X:
        features = np.array(X)
    else:
        # Keep the array 2-D even with no residues.
        features = np.zeros((0, window_size * units), dtype=np.float32)
    return features, tf.keras.utils.to_categorical(y, num_classes=3)

# ================== Model Architecture ==================
def create_model():
    """Build and compile the window classifier.

    Layers, in order: a 40-unit sigmoid Dense layer fed by 273 inputs, a
    3-unit linear Dense layer, a Lambda that halves the logits, and a softmax
    activation. The model is compiled with categorical cross-entropy, plain
    SGD (learning rate 0.1, momentum 0.0) and the accuracy metric.

    Returns:
        The compiled Sequential model.
    """
    network = Sequential()

    # Hidden layer: 273 inputs (13 x 21) -> 40 sigmoid units.
    network.add(Dense(40, activation='sigmoid', input_dim=273))

    # Raw class scores, scaled before the softmax.
    # NOTE(review): dividing the logits by 2 is a softmax temperature of 2,
    # while the previous comment said "T=1/2 as in paper" -- confirm which
    # one is intended.
    network.add(Dense(3, activation='linear'))
    network.add(Lambda(lambda logits: logits / 2))
    network.add(Activation('softmax'))

    # SGD without momentum.
    sgd = SGD(learning_rate=0.1, momentum=0.0)
    network.compile(
        optimizer=sgd,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# ================== Evaluation Metrics ==================
def print_metrics(y_true, y_pred, name="Dataset"):
    """Print Q3 accuracy, per-class Matthews coefficients and a confusion matrix.

    Args:
        y_true: 2-D array of reference class scores (e.g. one-hot rows); the
            class of each row is taken as its argmax.
        y_pred: 2-D array of predicted class scores, same layout as y_true.
        name: Label used in the accuracy headline.

    Returns:
        None. All results are written to standard output.
    """
    # Pair each label with its mapped class index explicitly, so the printed
    # names do not depend on the insertion order of LABEL_MAP.
    indexed_labels = sorted(LABEL_MAP.items(), key=lambda item: item[1])
    class_ids = [class_idx for _, class_idx in indexed_labels]

    y_true_labels = np.argmax(y_true, axis=1)
    y_pred_labels = np.argmax(y_pred, axis=1)

    # Q3 Accuracy: fraction of rows whose predicted class equals the true class.
    acc = np.mean(y_true_labels == y_pred_labels)
    print(f"\n{name} Q3 Accuracy: {acc*100:.2f}%")

    # Matthews Correlation Coefficients, one-vs-rest per class.
    print("\nMatthews CC:")
    for label, class_idx in indexed_labels:
        mcc = matthews_corrcoef(y_true_labels == class_idx, y_pred_labels == class_idx)
        print(f"C_{label}: {mcc:.3f}")

    # Confusion Matrix; explicit labels keep one row/column per class even
    # when a class never occurs in either array.
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true_labels, y_pred_labels, labels=class_ids))

# ================== Main Execution ==================
if __name__ == "__main__":
    # Load data
    train_data = parse_data("protein-secondary-structure.train.txt")
    test_data = parse_data("protein-secondary-structure.test.txt")

    # Create datasets
    X_train, y_train = create_dataset(train_data)
    X_test, y_test = create_dataset(test_data)

    # Verify the feature dimension (13 x 21 = 273). An explicit raise is used
    # because assert statements are stripped under `python -O`; the ndim check
    # covers the empty-data case where there is no second axis at all.
    if X_train.ndim != 2 or X_train.shape[1] != 273:
        raise ValueError("Invalid input dimension!")

    # Initialize model
    model = create_model()

    # Early stopping based on validation accuracy
    early_stop = EarlyStopping(
        monitor='val_accuracy',
        patience=15,
        restore_best_weights=True,
        verbose=1
    )

    # Train for up to 200 epochs with batch size 64. Validation uses a
    # held-out slice of the TRAINING data: monitoring the test set here would
    # let early stopping / restore_best_weights pick the weights that score
    # best on the test data and bias the reported test metrics.
    print("Training model...")
    history = model.fit(
        X_train, y_train,
        epochs=200,
        batch_size=64,
        validation_split=0.1,
        callbacks=[early_stop],
        verbose=1
    )

    # Final evaluation; the test set is touched only here.
    print("\n===== Final Evaluation =====")
    print_metrics(y_train, model.predict(X_train), "Training")
    print_metrics(y_test, model.predict(X_test), "Test")

Training model...
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 34: early stopping

===== Final Evaluation =====

Training Q3 Accuracy: 64.36%

Matthews CC:
C_h: 0.375
C_e: 0.350
C__: 0.391

Confusion Matrix:
[[2262  378 1961]
 [ 632 1324 1680]
 [1159  643 8066]]

Test Q3 Accuracy: 63.30%

Matthews CC:
C_h: 0.344
C_e: 0.287
C__: 0.399

Confusion Matrix:
[[ 394   67  388]
 [ 179  227  342]
 [ 179  137 1607]]
