In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

# Initialize lists to store primary and secondary structures
primary_structures = []
secondary_structures = []

In [None]:
# Read data from file
with open('Data_RS126.txt', 'r') as file:
    for count, line in enumerate(file, start=1):
        if count % 2 == 0:  # Secondary structures are on even lines
            secondary_structures.append(line.strip())
        else:  # Primary structures are on odd lines
            primary_structures.append(line.strip())

In [None]:
# Display counts
print("Primary Protein Structures: ", len(primary_structures))
print("Secondary Protein Structures: ", len(secondary_structures))

Primary Protein Structures:  126
Secondary Protein Structures:  126


In [None]:
# Check for mismatched lengths between primary and secondary sequences
for i in range(len(secondary_structures)):
    sec_len = len(secondary_structures[i])
    pri_len = len(primary_structures[i])

    if sec_len != pri_len:
        print(i, " ", secondary_structures[i], " ", primary_structures[i])

109   CCCCCECCCCCCCCECCHHHHHHHHHHHHHHHCCCCCHHHHHHHCCECCCCCCCCCCCCCEEEECCCEEEEEHHHHHHHHHHHHHHCCCCCCCCEEEECCCCCCECCCHHHHHHHHHHHHHHHHHHHHHHHHHHCCEEECCCCECHHHHHHHHCCCCCCEEEECCCCHHHHHHHCCEEEECCHHHHHHHHHCCCEEEECCEEEEEEEECCCCCCCCHHHHHHHHHHHHHHHHCCCEEEEEECHHHHHCCHHHHHHHHHHHHHHCCCEEECCCCHHHHHHHHHCCCHHHHHHHHCCHHHCCCCCCCCCCEEEEEECCCCCCECECCCCCCCCCEHHHHHHHHHHHCCCCEEEEEEEECCCECCHHHHHHHHHHHHHHHCCCCCCCCEEEEECCEHHHHHHHHHCCHHHHHHHCCEEECCCCCHHHCCCECCCCCCCCCCEEEEECCCCCCCCCCCCCCCEEEEECCHHHHHHHHHHCECCCCCCCCCEECCCCCEECCCCCCCCCCCCCCCCCCCCCEECCCCCCCCCCCCCCCCCCCECCCCCCCCCCCCCEEEEEEEEEECCCCEHHHHECCHHHHHHCECHHHHHHHECCCCEECCCCEECCEECCCCCCEECHHHHHHHHHCCCCCEEEECCCCECECCCCCHHHHHHHHCCEEEEEECCECHHHHHHHHHCCCEEEEECCHHHHHCCCCCCEEEEECHHHCCCCCCEEEEEECCCCCEEEEEEECCCCHHHHHHHHHCCHHHHHHHCCCC   RAKVAMSHFEPHEYIRYDLLEKNIDIVRKRLNRPLTLSEKIVYGHLDDPANQEIERGKTYLRLRPDRVAMQDATAQMAMLQFISSGLPKVAVPSTIHCDHLIEAQLGGEKDLRRAKDINQEVYNFLATAGAKYGVGFWRPGSGIIHQIILENYAYPGVLLIGTDSHTPNGGGLGGICIGVGGADAVDVMAGIPWELKCPKVIGVKLTGSLSGWTSPKDVILKVAGILTVKGGTGAIV

In [None]:

# Remove mismatched sequences
primary_structures.pop(109)
secondary_structures.pop(109)

'CCCCCECCCCCCCCECCHHHHHHHHHHHHHHHCCCCCHHHHHHHCCECCCCCCCCCCCCCEEEECCCEEEEEHHHHHHHHHHHHHHCCCCCCCCEEEECCCCCCECCCHHHHHHHHHHHHHHHHHHHHHHHHHHCCEEECCCCECHHHHHHHHCCCCCCEEEECCCCHHHHHHHCCEEEECCHHHHHHHHHCCCEEEECCEEEEEEEECCCCCCCCHHHHHHHHHHHHHHHHCCCEEEEEECHHHHHCCHHHHHHHHHHHHHHCCCEEECCCCHHHHHHHHHCCCHHHHHHHHCCHHHCCCCCCCCCCEEEEEECCCCCCECECCCCCCCCCEHHHHHHHHHHHCCCCEEEEEEEECCCECCHHHHHHHHHHHHHHHCCCCCCCCEEEEECCEHHHHHHHHHCCHHHHHHHCCEEECCCCCHHHCCCECCCCCCCCCCEEEEECCCCCCCCCCCCCCCEEEEECCHHHHHHHHHHCECCCCCCCCCEECCCCCEECCCCCCCCCCCCCCCCCCCCCEECCCCCCCCCCCCCCCCCCCECCCCCCCCCCCCCEEEEEEEEEECCCCEHHHHECCHHHHHHCECHHHHHHHECCCCEECCCCEECCEECCCCCCEECHHHHHHHHHCCCCCEEEECCCCECECCCCCHHHHHHHHCCEEEEEECCECHHHHHHHHHCCCEEEEECCHHHHHCCCCCCEEEEECHHHCCCCCCEEEEEECCCCCEEEEEEECCCCHHHHHHHHHCCHHHHHHHCCCC'

In [None]:
# Count total characters in all sequences
count_secondary = 0
count_primary = 0
for i in range(len(secondary_structures)):
    sec_len = len(secondary_structures[i])
    pri_len = len(primary_structures[i])
    count_secondary += sec_len
    count_primary += pri_len

print("Count of Secondary Structure (Labels): ", count_secondary)
print("Count of Primary Structure (Features): ", count_primary)

Count of Secondary Structure (Labels):  22594
Count of Primary Structure (Features):  22594


In [None]:
# Function to split sequences into individual characters
def split(sequence):
    return [char for char in sequence]

split_primary = [split(seq) for seq in primary_structures]
split_secondary = [split(seq) for seq in secondary_structures]

In [None]:
# Function to encode primary structures using one-hot encoding
def encode_primary(char):
    encoding = {
        'A': np.array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        'C': np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        # (Add encodings for all amino acids)
        'Y': np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])
    }
    return encoding.get(char)

In [None]:
# Function to encode secondary structures into numerical labels
def encode_secondary(char):
    encoding = {'H': 0, 'C': 1, 'E': 2}
    return encoding.get(char)

In [None]:
# Apply encoding to primary and secondary structures
for i in range(len(split_primary)):
    split_primary[i] = [encode_primary(char) for char in split_primary[i]]

for i in range(len(split_secondary)):
    split_secondary[i] = [encode_secondary(char) for char in split_secondary[i]]

In [None]:
# Flatten secondary structure labels into a single list
def flatten_labels(sequences):
    return [label for sequence in sequences for label in sequence]

y_labels = flatten_labels(split_secondary)


In [None]:
# Function to pad sequences and create sliding windows
def create_sliding_window(size, sequences):
    pad_size = size // 2
    zero_vector = np.zeros(20)
    for seq in sequences:
        for _ in range(pad_size):
            seq.append(zero_vector)
            seq.insert(0, zero_vector)

    features = []
    for seq in sequences:
        for i in range(len(seq) - (size - 1)):
            features.append(seq[i:i + size])
    return features



In [None]:
X = create_sliding_window(15, split_primary)
num_features = 15 * 20

In [None]:
!pip install numpy
import numpy as np

# Function to pad sequences and create sliding windows
def create_sliding_window(size, sequences, labels):  # Include labels as input
    pad_size = size // 2
    zero_vector = [0.0] * 20  # Change to list of zeros

    # Pad sequences with zero vectors
    padded_sequences = []
    for seq in sequences:
        padded_seq = [zero_vector] * pad_size + seq + [zero_vector] * pad_size
        padded_sequences.append(padded_seq)

    features = []
    window_labels = []  # Store labels for each window
    for seq, label in zip(padded_sequences, labels):  # Iterate through sequences and labels together
        for i in range(len(seq) - size + 1):  # Iterate with consistent window size
            window = seq[i : i + size]
            # Check if all elements in the window are iterable
            if all(isinstance(sublist, (list, np.ndarray)) for sublist in window):
                # Flatten the window before appending to features
                flat_window = [element for sublist in window for element in sublist]
                features.append(flat_window)
                window_labels.append(label)  # Assign the label to the window
    return features, window_labels  # Return both features and labels

# Assuming X is created using create_sliding_window
X, y_labels = create_sliding_window(15, split_primary, y_labels)  # Pass y_labels to the function

# Converting to NumPy array
X = np.array(X) # No need to reshape, as features are already flattened within create_sliding_window

# Converting y_labels to NumPy array
y_labels = np.array(y_labels)

# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.10)



In [None]:

# Converting to numpy arrays
X = np.array(X).reshape(len(X), -1)
y_labels = np.array(y_labels)


In [None]:
# Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_labels, test_size=0.10)

In [None]:
# Initializing and training the SVM model
svc = SVC()
svc.fit(X_train, y_train)

In [None]:
# Performing grid search for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, verbose=3)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ....................C=0.1, gamma=1;, score=0.583 total time=   0.0s
[CV 2/5] END ....................C=0.1, gamma=1;, score=0.636 total time=   0.0s
[CV 3/5] END ....................C=0.1, gamma=1;, score=0.545 total time=   0.0s
[CV 4/5] END ....................C=0.1, gamma=1;, score=0.545 total time=   0.0s
[CV 5/5] END ....................C=0.1, gamma=1;, score=0.545 total time=   0.0s
[CV 1/5] END ..................C=0.1, gamma=0.1;, score=0.583 total time=   0.0s
[CV 2/5] END ..................C=0.1, gamma=0.1;, score=0.636 total time=   0.0s
[CV 3/5] END ..................C=0.1, gamma=0.1;, score=0.545 total time=   0.0s
[CV 4/5] END ..................C=0.1, gamma=0.1;, score=0.545 total time=   0.0s
[CV 5/5] END ..................C=0.1, gamma=0.1;, score=0.545 total time=   0.0s
[CV 1/5] END .................C=0.1, gamma=0.01;, score=0.583 total time=   0.0s
[CV 2/5] END .................C=0.1, gamma=0.01

In [None]:
# Outputting results
print("Best Parameters:", grid.best_params_)
print("Best Estimator:", grid.best_estimator_)

Best Parameters: {'C': 0.1, 'gamma': 1}
Best Estimator: SVC(C=0.1, gamma=1)


In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def load_dataset(file_path):
    primary_sequences = []
    secondary_structures = []
    with open(file_path, "r") as file:
        for line in file:
            if line.startswith(">"):
                continue
            elif len(line.strip()) > 0:

                if len(primary_sequences) == len(secondary_structures):
                    primary_sequences.append(line.strip())
                else:
                    secondary_structures.append(line.strip())
    return primary_sequences, secondary_structures


def encode_sequences(sequences, window_size=15):
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}

    def encode_sequence(seq):
        padded_seq = "X" * (window_size // 2) + seq + "X" * (window_size // 2)
        encoded = []
        for i in range(len(seq)):
            window = padded_seq[i:i + window_size]
            encoded.append([aa_to_idx.get(aa, -1) for aa in window])
        return encoded

    all_encoded = []
    for seq in sequences:
        all_encoded.extend(encode_sequence(seq))
    return np.array(all_encoded)


def encode_labels(labels):
    label_mapping = {"H": 0, "E": 1, "C": 2}
    all_labels = []
    for label_seq in labels:
        all_labels.extend([label_mapping[ss] for ss in label_seq])
    return np.array(all_labels)

def decode_labels(encoded_labels):
    label_mapping = {0: "H", 1: "E", 2: "C"}
    return [label_mapping[label] for label in encoded_labels]


def train_and_predict(primary_sequences, secondary_structures):

    X = encode_sequences(primary_sequences)
    y = encode_labels(secondary_structures)


    min_len = min(X.shape[0], y.shape[0])
    X = X[:min_len]
    y = y[:min_len]


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


    model = SVC(kernel="rbf", C=1.0, gamma="scale")
    model.fit(X_train, y_train)


    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=["Helix", "Strand", "Coil"]))


    example_sequence = input("Enter a primary sequence for prediction: ").strip()
    example_features = encode_sequences([example_sequence])
    example_prediction = model.predict(example_features)
    decoded_prediction = decode_labels(example_prediction)
    print("Predicted Secondary Structure:", "".join(decoded_prediction))


if __name__ == "__main__":
    dataset_path = "Data_RS126.txt"
    primary_sequences, secondary_structures = load_dataset(dataset_path)

    train_and_predict(primary_sequences, secondary_structures)


Accuracy: 0.4601713062098501
              precision    recall  f1-score   support

       Helix       0.43      0.20      0.27      1478
      Strand       0.57      0.00      0.01      1067
        Coil       0.47      0.87      0.61      2125

    accuracy                           0.46      4670
   macro avg       0.49      0.36      0.30      4670
weighted avg       0.48      0.46      0.36      4670

Predicted Secondary Structure: CCCCCEHCCCCCCCCCCCCCCCCCCCCCCECCCCCCCCCCCCCHHCCCCCCCCCCCCHCCCCCCCCCCCCCCCCCCCCCCCCHCCCHHHHHHHCCCCCCCCCCCCCCC
