In [None]:
!pip install biopython



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio.Align import substitution_matrices
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import time
import os
from multiprocessing import Pool
from tqdm import tqdm
import traceback
from sklearn.utils.class_weight import compute_class_weight
from google.colab import drive
drive.mount('/content/drive')

class IndependentPSSMGenerator:
    def __init__(self, window_size=3):
        self.window_size = window_size
        self.amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
        # implementing BLOSUM62 matrix directly
        self.blosum62_dict = {
            ('A', 'A'): 4, ('R', 'R'): 5, ('N', 'N'): 6, ('D', 'D'): 6, ('C', 'C'): 9,
            ('Q', 'Q'): 5, ('E', 'E'): 5, ('G', 'G'): 6, ('H', 'H'): 8, ('I', 'I'): 4,
            ('L', 'L'): 4, ('K', 'K'): 5, ('M', 'M'): 5, ('F', 'F'): 6, ('P', 'P'): 7,
            ('S', 'S'): 4, ('T', 'T'): 5, ('W', 'W'): 11, ('Y', 'Y'): 7, ('V', 'V'): 4,
            ('A', 'R'): -1, ('A', 'N'): -2, ('A', 'D'): -2, ('A', 'C'): 0, ('A', 'Q'): -1,
            ('A', 'E'): -1, ('A', 'G'): 0, ('A', 'H'): -2, ('A', 'I'): -1, ('A', 'L'): -1,
            ('A', 'K'): -1, ('A', 'M'): -1, ('A', 'F'): -2, ('A', 'P'): -1, ('A', 'S'): 1,
            ('A', 'T'): 0, ('A', 'W'): -3, ('A', 'Y'): -2, ('A', 'V'): 0, ('R', 'N'): 0,
        }
        self.blosum_matrix = self._creating_blosum_matrix()

    def _creating_blosum_matrix(self):
        n = len(self.amino_acids)
        matrix = np.zeros((n, n), dtype=np.float32)
        for i, aa1 in enumerate(self.amino_acids):
            for j, aa2 in enumerate(self.amino_acids):
                # Try both orientations of the pair
                score = self.blosum62_dict.get((aa1, aa2)) or self.blosum62_dict.get((aa2, aa1))
                if score is None:
                    score = -4  # Default penalty for unknown pairs
                matrix[i, j] = score
        return matrix

    def _calculating_frequency_profile(self, sequence_profile):
        profile = sequence_profile.copy()
        totals = np.sum(profile, axis=1, keepdims=True)
        totals[totals == 0] = 1.0
        return profile / totals

    def _calculating_pssm_scores(self, freq_profile):
        background_freq = np.ones(len(self.amino_acids)) / len(self.amino_acids)
        pssm_scores = np.zeros_like(freq_profile)

        for i in range(freq_profile.shape[0]):
            for j in range(freq_profile.shape[1]):
                if freq_profile[i, j] > 0:
                    pssm_scores[i, j] = np.log2(freq_profile[i, j] / background_freq[j])
                else:
                    # giving penalty for zero frequency
                    pssm_scores[i, j] = -5

        return pssm_scores

    def generating_pssm(self, sequence_features):
        sequence_profile = sequence_features[:, :20].astype(np.float32)
        freq_profile = self._calculating_frequency_profile(sequence_profile)

        seq_len = len(sequence_profile)
        pssm = np.zeros((seq_len, 20), dtype=np.float32)
        conservation = np.zeros(seq_len, dtype=np.float32)

        pad_size = self.window_size // 2
        padded_profile = np.pad(freq_profile, ((pad_size, pad_size), (0, 0)))
        epsilon = 1e-10

        for i in range(seq_len):
            window = padded_profile[i:i+self.window_size]
            pos_freq = np.mean(window, axis=0)

            # calculating the position-specific scores
            weighted_freq = np.dot(pos_freq, self.blosum_matrix)
            weighted_freq = np.clip(weighted_freq, epsilon, None)
            weighted_freq /= np.sum(weighted_freq)

            # calculating the PSSM scores
            pssm[i] = self._calculating_pssm_scores(weighted_freq.reshape(1, -1))[0]

            # calculating the conservation
            entropy = -np.sum(weighted_freq * np.log2(np.maximum(weighted_freq, epsilon)))
            conservation[i] = 1 - (entropy / np.log2(20))

        return pssm, conservation
    def comparing_pssm_with_original_pssm(self, sequence_features, generated_pssm):
        try:
            if sequence_features is None or generated_pssm is None:
                print("Invalid inputs for PSSM comparison")
                return {'correlation': 0, 'mse': 0}
            original_pssm = sequence_features[:, 21:41]

            if original_pssm.shape[1] != 20 or generated_pssm.shape[1] != 20:
                print(f"Invalid PSSM dimensions - Original: {original_pssm.shape}, Generated: {generated_pssm.shape}")
                return {'correlation': 0, 'mse': 0}

            if original_pssm.shape[0] != generated_pssm.shape[0]:
                min_len = min(original_pssm.shape[0], generated_pssm.shape[0])
                original_pssm = original_pssm[:min_len]
                generated_pssm = generated_pssm[:min_len]
                print(f"Truncated matrices to length {min_len}")

            original_pssm = (original_pssm - np.mean(original_pssm)) / np.std(original_pssm)
            generated_pssm = (generated_pssm - np.mean(generated_pssm)) / np.std(generated_pssm)

            correlation = np.corrcoef(original_pssm.flatten(), generated_pssm.flatten())[0,1]
            mse = np.mean((original_pssm - generated_pssm) ** 2)

            plt.figure(figsize=(15, 5))
            plt.subplot(121)
            sns.heatmap(original_pssm, cmap='coolwarm', center=0)
            plt.title('Original PSSM')
            plt.xlabel('Amino Acid Position')
            plt.ylabel('Sequence Position')

            plt.subplot(122)
            sns.heatmap(generated_pssm, cmap='coolwarm', center=0)
            plt.title('Generated PSSM')
            plt.xlabel('Amino Acid Position')
            plt.ylabel('Sequence Position')

            plt.suptitle(f'PSSM Comparison\nCorrelation: {correlation:.3f}, MSE: {mse:.3f}')
            plt.tight_layout()
            plt.show()

            return {
                'correlation': correlation,
                'mse': mse,
                'shape': original_pssm.shape
            }
        except Exception as e:
            print(f"Error in PSSM comparison: {str(e)}")
            print(traceback.format_exc())
            return {'correlation': 0, 'mse': 0}

class SVM:
    """Linear soft-margin SVM trained with full-batch sub-gradient steps.

    Labels equal to 1 are treated as the positive class; every other label is
    mapped to -1.
    """

    def __init__(self, learning_rate=0.01, lambda_param=0.01, n_iters=100, class_weight=None):
        """Store the hyper-parameters.

        Args:
            learning_rate: Base step size.
            lambda_param: L2 shrinkage applied to the weights on every update.
            n_iters: Number of full-batch update steps per ``fit`` call.
            class_weight: Optional scalar weight for positive samples
                (negative samples always weigh 1.0).
        """
        self.lr = learning_rate
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.class_weight = class_weight
        self.w = None
        self.b = 0

    def fit(self, X, y):
        """Fit the model on ``X`` (n_samples, n_features) and labels ``y``.

        Every call starts from a fresh model: both the weight vector and the
        bias are reset, so the result does not depend on earlier ``fit`` calls.
        """
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features, dtype=np.float32)
        # Reset the bias together with the weights; otherwise a refit would
        # combine zeroed weights with a stale bias from the previous call.
        self.b = 0.0
        y = np.where(y == 1, 1, -1)

        # Per-sample weights: positives get class_weight, negatives 1.0.
        if self.class_weight is not None:
            pos_weight = self.class_weight
            neg_weight = 1.0
            sample_weights = np.where(y == 1, pos_weight, neg_weight)
        else:
            sample_weights = np.ones(n_samples)

        # Step size is re-derived from the base rate after every update.
        adaptive_lr = self.lr

        for _ in range(self.n_iters):
            # Reshuffle the (local copies of the) data on every iteration.
            shuffle_idx = np.random.permutation(n_samples)
            X = X[shuffle_idx]
            y = y[shuffle_idx]
            sample_weights = sample_weights[shuffle_idx]

            margins = y * (np.dot(X, self.w) + self.b)
            mask = margins < 1
            # Update only when at least one sample violates the margin.
            if np.sum(mask) > 0:
                weighted_grad = np.sum(
                    X[mask] * y[mask, np.newaxis] * sample_weights[mask, np.newaxis],
                    axis=0,
                )

                # L2 shrinkage of the weights plus the hinge sub-gradient step.
                self.w = self.w * (1 - self.lambda_param * adaptive_lr) + adaptive_lr * weighted_grad
                self.b += adaptive_lr * np.sum(y[mask] * sample_weights[mask])

                # More violations -> smaller next step (between lr/2 and lr).
                violation_rate = np.mean(mask)
                adaptive_lr = self.lr * (1.0 / (1.0 + violation_rate))

    def predict(self, X):
        """Return ``(sign, confidence)`` for every row of ``X``.

        ``confidence`` is the absolute value of the unscaled decision value;
        ``sign`` is ``np.sign`` of the decision value (0 when it is exactly 0).
        """
        raw_predictions = np.dot(X, self.w) + self.b
        confidence = np.abs(raw_predictions)

        # Scale positive decision values by the class weight. The weight only
        # changes the magnitude, and confidence was taken before scaling.
        if self.class_weight is not None:
            raw_predictions *= np.where(raw_predictions > 0, self.class_weight, 1.0)

        return np.sign(raw_predictions), confidence

def calculating_class_weights(labels):
    """Return inverse-frequency weights, one per distinct label.

    The weight of a class is ``n_samples / (n_classes * class_count)``.
    """
    classes, occurrences = np.unique(labels, return_counts=True)
    n_samples = len(labels)
    n_classes = len(classes)

    weights = {}
    for cls, occurrence in zip(classes, occurrences):
        weights[cls] = n_samples / (n_classes * occurrence)
    return weights

def display_metrics_table(metrics_dict):
    """Print one fixed-width row of accuracy/precision/recall/F1 per state.

    ``metrics_dict`` maps a state name to a dict with the keys ``accuracy``,
    ``precision``, ``recall`` and ``f1``.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print("\n" + heavy_rule)
    print(f"{'State':<10} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10}")
    print(light_rule)
    for state in metrics_dict:
        vals = metrics_dict[state]
        print(f"{state:<10} {vals['accuracy']:<10.4f} {vals['precision']:<10.4f} {vals['recall']:<10.4f} {vals['f1']:<10.4f}")
    print(heavy_rule + "\n")

def display_epoch_summary(epoch, train_loss, val_acc):
    """Print a short summary block for one epoch."""
    report = (
        f"\nEpoch {epoch} Summary",
        "-" * 30,
        f"Training Loss: {train_loss:.4f}",
        f"Validation Accuracy: {val_acc:.4f}",
    )
    for line in report:
        print(line)

def display_final_results(results):
    """Print the overall accuracy, macro F1 and the per-state metrics table.

    ``results`` must provide ``overall_acc``, ``macro_f1`` and
    ``state_metrics``.
    """
    print("\nFINAL MODEL PERFORMANCE")
    print("=" * 50)
    print("\nOverall Metrics:")
    overall_rows = (
        ("Average Accuracy", 'overall_acc'),
        ("Macro F1-Score", 'macro_f1'),
    )
    for caption, key in overall_rows:
        print(f"{caption}: {results[key]:.4f}")

    print("\nPer-State Performance:")
    per_state = results['state_metrics']
    display_metrics_table(per_state)

def plotting_training_metrics(metrics):
    """Plot the loss curves and the per-state F1 curves.

    Nothing is plotted (a message is printed instead) when ``metrics`` is
    empty or any of its series is empty. All curves are truncated to the
    length of the shortest series so x and y always have matching lengths.

    Args:
        metrics: Dict of lists. ``train_loss``, ``val_loss`` and
            ``state_metrics`` (a list of ``{state: {'f1': ...}}`` dicts) are
            the series that get plotted.
    """
    if not metrics or not all(metrics.values()):
        print("No metrics to plot yet")
        return

    # Every series is non-empty here, so the shortest length is at least 1.
    min_len = min(len(series) for series in metrics.values())

    epochs = range(1, min_len + 1)
    plt.figure(figsize=(12, 4))

    plt.subplot(121)
    plt.plot(epochs, metrics['train_loss'][:min_len], 'b-', label='Training Loss')
    plt.plot(epochs, metrics['val_loss'][:min_len], 'r-', label='Validation Loss')
    plt.title('Loss over Training')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(122)
    # Truncate the F1 history like the loss curves; plotting the full history
    # against the shorter epoch axis raises a shape-mismatch error.
    state_history = metrics['state_metrics'][:min_len]
    for state in metrics['state_metrics'][-1]:
        plt.plot(epochs,
                 [m[state]['f1'] for m in state_history],
                 label=f'{state} F1')
    # Build the legend once, after all state curves exist.
    plt.legend()
    plt.tight_layout()
    plt.show()
def extracting_features_batch(sequence):
    """Return ``sequence`` unchanged (identity pass-through)."""
    return sequence
class ProteinStructurePredictor:

    def __init__(self, window_size=13):
        """Create an untrained predictor.

        Args:
            window_size: Number of neighbouring positions concatenated into
                the feature vector of each residue. The PSSM generator uses
                its own fixed window of 3.
        """
        self.scaler = StandardScaler()
        self.svm_models = []
        self.pssm_generator = IndependentPSSMGenerator(window_size=3)
        self.window_size = window_size

    def extracting_features(self, sequence_features):
        return self.optimizing_feature_extraction(sequence_features)
    def _calculating_importance(self, y_train, indices, window_size=5):

        importance = np.ones(len(indices), dtype=np.float32)
        padded_y = np.pad(y_train, window_size//2, mode='edge')

        for i, idx in enumerate(indices):
            # getting the window of labels
            window = padded_y[idx:idx + window_size]

            # calculating transition score
            transitions = np.sum(np.abs(np.diff(window)))

            # calculating minority presence
            minority_count = np.sum(window != 0)  # non-helix states

            # calculating position importance
            boundary_distance = min(idx, len(y_train) - idx)
            position_weight = 1.0 / (1.0 + np.exp(-boundary_distance/100))

            # combining all the scores
            importance[i] = (1 + transitions) * (1 + minority_count) * position_weight

        return importance

    def extract_balanced_batch(self, X, y, batch_size):
        indices = self.weighted_sampling(y, batch_size)
        return X[indices], y[indices]

    def weighted_sampling(self, y_train, batch_size):
        class_counts = np.bincount(y_train.astype(int))

        base_ratios = {
            0: 0.25,
            1: 0.40,
            2: 0.35
        }
        batch_sizes = {i: max(int(batch_size * ratio), 50)
                       for i, ratio in base_ratios.items()}
        indices = []
        for class_idx in range(3):
            class_indices = np.where(y_train == class_idx)[0]
            if len(class_indices) == 0:
                continue
            # calculating structural importance
            importance = self._calculating_importance(
                y_train, class_indices, window_size=5
            )

            if np.all(importance == 0):
                importance = np.ones_like(importance)
            # normalizing probabilities
            p = importance / importance.sum()

            selected_indices = np.random.choice(
                class_indices,
                size=batch_sizes[class_idx],
                p=p,
                replace=True
            )
            indices.extend(selected_indices)
        return np.array(indices)

    def validating_with_uncertainty(self, X_val, y_val):
        results = {}
        uncertainties = []
        predictions = []

        for i, model in enumerate(self.svm_models):
            pred, conf = model.predict(X_val)
            binary_val = (y_val == i).astype(np.float32)
            uncertainty = 1 / (1 + np.exp(-conf))  # Transform confidence to uncertainty
            uncertainties.append(uncertainty)

            weights = 1 - uncertainty
            weighted_acc = np.average(binary_val == (pred > 0), weights=weights)
            results[f'class_{i}_accuracy'] = weighted_acc
            results[f'class_{i}_uncertainty'] = np.mean(uncertainty)
            results[f'class_{i}_conf_mean'] = np.mean(conf)

            predictions.append((pred > 0).astype(int))

        # Aggregate predictions for all classes
        predictions = np.argmax(np.array(predictions), axis=0)
        # Calculate overall accuracy
        overall_acc = np.mean(predictions == y_val)
        results['overall_acc'] = overall_acc
        # Add state-wise metrics to results
        state_metrics = self.validating_per_state(predictions, y_val)

        results['state_metrics'] = state_metrics  # Add this line here
        f1_scores = [state_metrics[state]['f1'] for state in state_metrics]

        macro_f1 = np.mean(f1_scores)
        results['macro_f1'] = macro_f1

        results['predictions'] = predictions
        return results, np.array(uncertainties)

    def balance_dataset(self, X, y):
        """Rebalance the classes towards 70% of the majority-class size.

        NOTE: this class defines ``balance_dataset`` a second time further
        down. The later definition replaces this one in the class body, so
        this version is never the one that gets called.

        Classes smaller than the target are oversampled as whole copies of
        the class with Gaussian jitter (sigma 0.01), which can overshoot the
        target; larger classes are randomly subsampled without replacement.

        Returns:
            ``(balanced_X, balanced_y)`` as arrays.
        """
        class_counts = np.bincount(y.astype(int))
        target_count = int(0.7 * np.max(class_counts))  # 70% of majority class
        balanced_X = []
        balanced_y = []
        for i in range(len(class_counts)):
            idx = np.where(y == i)[0]
            if len(idx) < target_count:
                # Whole-class copies: ceil() means the class may exceed target.
                n_copies = int(np.ceil(target_count / len(idx)))
                for _ in range(n_copies):
                    # Small noise so the copies are not exact duplicates.
                    sampled_X = X[idx] + np.random.normal(0, 0.01, X[idx].shape)
                    balanced_X.extend(sampled_X)
                    balanced_y.extend([i] * len(idx))
            else:
                selected_idx = np.random.choice(idx, target_count, replace=False)
                balanced_X.extend(X[selected_idx])
                balanced_y.extend([i] * len(selected_idx))
        return np.array(balanced_X), np.array(balanced_y)

    def optimizing_feature_extraction(self, sequence_features):
        pssm, conservation = self.pssm_generator.generating_pssm(sequence_features)
        pad_size = self.window_size // 2

        padded_features = {
            'pssm': np.pad(pssm, ((pad_size, pad_size), (0, 0)), constant_values=0),
            'cons': np.pad(conservation, pad_size, constant_values=0),
            'profile': np.pad(sequence_features[:, :20], ((pad_size, pad_size), (0, 0)), constant_values=0)
        }
        n_positions = len(sequence_features)
        feature_size = self.window_size * (20 + 1 + 20)
        # preallocating the output array
        features = np.zeros((n_positions, feature_size), dtype=np.float32)# Vectorized feature extraction
        for i in range(n_positions):
            window_slice = slice(i, i + self.window_size)
            features[i] = np.concatenate([
                padded_features['pssm'][window_slice].ravel(),
                padded_features['cons'][window_slice],
                padded_features['profile'][window_slice].ravel()
            ])
        return features, pssm

    def analyzing_epoch_distribution(self, y_true, y_pred, epoch):
        states = ['Helix', 'Sheet', 'Coil']
        distributions = {}

        for i, state in enumerate(states):
            state_mask = (y_true == i)
            tp = np.sum((y_true == i) & (y_pred == i))
            fp = np.sum((y_true != i) & (y_pred == i))
            fn = np.sum((y_true == i) & (y_pred != i))

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0

            distributions[f'{state}_precision'] = precision
            distributions[f'{state}_recall'] = recall

            # tracking transitions
            for j, other_state in enumerate(states):
                if i != j:
                    transitions = np.sum((y_true == i) & (y_pred == j))
                    distributions[f'{state}_to_{other_state}'] = transitions


        return distributions
    def tracking_pssm_quality(self, sequence_features, generated_pssm, epoch):
        try:
            metrics = self.pssm_generator.comparing_pssm_with_original_pssm(sequence_features, generated_pssm)
            if metrics['correlation'] != 0:
                return metrics
        except Exception as e:
            print(f"Error tracking PSSM Quality: {e}")
            return {'correlation':0, 'mse':0}
    def plotting_pssm_progress(self, metrics):
        """Plot PSSM correlation and MSE of sequence 0 against the epoch.

        Epochs are taken from the second ``_``-separated token of every key
        that starts with ``epoch_``; the values are read from
        ``metrics[f'epoch_{e}_pssm_seq_0']``. Any error is printed rather
        than raised.
        """
        try:
            # A set removes the repeats that occur when one epoch has several
            # keys (for example one per sequence).
            tokens = {k.split('_')[1] for k in metrics.keys() if k.startswith('epoch_')}
            # Sort numerically: plain string sorting puts '10' before '5'.
            epochs = sorted(
                tokens,
                key=lambda tok: (not tok.isdigit(), int(tok) if tok.isdigit() else 0, tok),
            )
            correlations = [metrics[f'epoch_{e}_pssm_seq_0']['correlation'] for e in epochs]
            mses = [metrics[f'epoch_{e}_pssm_seq_0']['mse'] for e in epochs]

            # Use a numeric x axis when every epoch token is a number.
            if all(e.isdigit() for e in epochs):
                x_values = [int(e) for e in epochs]
            else:
                x_values = epochs

            plt.figure(figsize=(12, 4))
            plt.subplot(121)
            plt.plot(x_values, correlations, 'b-')
            plt.title('PSSM Correlation over Epochs')
            plt.xlabel('Epoch')
            plt.ylabel('Correlation')

            plt.subplot(122)
            plt.plot(x_values, mses, 'r-')
            plt.title('PSSM MSE over Epochs')
            plt.xlabel('Epoch')
            plt.ylabel('MSE')
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"Error plotting PSSM progress: {e}")

    def checking_convergence(self, metrics_history, window=7, threshold=0.003):
        if len(metrics_history) < window:
            return False
        recent_metrics = metrics_history[-window:]
        # checking both the stability and minimum performance
        stable = all(np.std(m) < threshold for m in zip(*recent_metrics))
        min_f1 = min(min(metrics) for metrics in recent_metrics)
        # Only converge if we have both stability and good performance
        return stable and min_f1 > 0.4

    def balance_dataset(self, X, y):

        # calculating the structural importance scores
        importance_scores = self._calculating_structural_scores(X, y)

        class_counts = np.bincount(y.astype(int))
        target_count = int(0.7 * np.max(class_counts))

        balanced_X = []
        balanced_y = []

        for i in range(len(class_counts)):
            idx = np.where(y == i)[0]

            if len(idx) < target_count:
                n_copies = int(np.ceil(target_count / len(idx)))
                for _ in range(n_copies):
                    sampled_X = X[idx] + np.random.normal(0, 0.01, X[idx].shape)
                    balanced_X.extend(sampled_X)
                    balanced_y.extend([i] * len(idx))
            else:
                # adding the importance-based selection for majority class
                scores = importance_scores[idx]
                selected_idx = self._selecting_important_samples(
                    idx, scores, target_count
                )
                balanced_X.extend(X[selected_idx])
                balanced_y.extend([i] * len(selected_idx))

        return np.array(balanced_X), np.array(balanced_y)

    def _calculating_structural_scores(self, X, y, window=5):
        scores = np.zeros(len(y))
        padded_y = np.pad(y, (window//2, window//2), mode='edge')

        for i in range(len(y)):
            window_labels = padded_y[i:i+window]
            transitions = np.sum(np.abs(np.diff(window_labels)))
            minority_presence = np.sum(window_labels > 0)
            scores[i] = transitions + 0.5 * minority_presence

        return scores

    def _selecting_important_samples(self, indices, scores, target_count):
        if len(indices) <= target_count:
            return indices

        # combining the random selection with importance based selection
        n_important = int(target_count * 0.7)
        n_random = target_count - n_important

        # selecting the important samples
        important_idx = indices[np.argsort(scores)[-n_important:]]

        # selecting the random samples from the remaining
        remaining_idx = np.setdiff1d(indices, important_idx)
        random_idx = np.random.choice(
            remaining_idx,
            size=min(n_random, len(remaining_idx)),
            replace=False
        )

        return np.concatenate([important_idx, random_idx])

    def train(self, sequences, labels):
        """Extract features, balance the data and fit one SVM per state.

        Steps: windowed feature extraction in a process pool, standard
        scaling, class balancing, a stratified 90/10 train/validation split,
        then one one-vs-rest ``SVM`` per state (Helix, Sheet, Coil), each
        fitted on 20 importance-sampled batches and evaluated every fifth
        epoch.

        Args:
            sequences: Iterable of per-sequence feature arrays.
            labels: Iterable of per-sequence label arrays (flattened and
                concatenated in the same order as the extracted features).

        Returns:
            The result dict of the final ``validating_with_uncertainty`` call.

        NOTE(review): the data is balanced (including jittered copies of
        minority samples) BEFORE the train/validation split, so validation
        rows can be near-duplicates of training rows - confirm whether the
        reported validation metrics are meant to be read that way.
        """
        start_time = time.time()
        print("Extracting features and training...")

        metrics = {
            'train_loss': [], 'val_loss': [], 'accuracy': [],
            'Helix_recall': [], 'Sheet_recall': [], 'Coil_recall': [],
            'state_metrics': [], 'uncertainties': []
        }
        # os.cpu_count() may return None; fall back to a single worker then.
        n_processes = max((os.cpu_count() or 1) - 1, 1)
        with Pool(processes=n_processes) as pool:
            feature_results = list(tqdm(
                pool.imap(self.optimizing_feature_extraction, sequences, chunksize=8),
                total=len(sequences),
                desc="Extracting features"
            ))
        # Stack the per-sequence feature matrices into one design matrix.
        all_features = [f[0] for f in feature_results]
        X = np.vstack(all_features).astype(np.float32)
        y = np.concatenate([label.flatten() for label in labels]).astype(np.float32)
        print(f"Feature extraction complete. Shape: {X.shape}")

        X_scaled = self.scaler.fit_transform(X)
        X_balanced, y_balanced = self.balance_dataset(X_scaled, y)
        # The split happens after balancing (see the NOTE in the docstring).
        train_indices, val_indices = train_test_split(
            np.arange(len(y_balanced)),
            test_size=0.1,
            stratify=y_balanced
        )
        X_train = X_balanced[train_indices]
        y_train = y_balanced[train_indices]
        X_val = X_balanced[val_indices]
        y_val = y_balanced[val_indices]

        print("training model..")
        self.svm_models = []
        n_epochs = 20
        batch_size = 1024
        state_names = ['Helix', 'Sheet', 'Coil']
        # NOTE(review): nothing is ever appended to this history, and the
        # check below needs epoch > 15 on an epoch divisible by 5 with
        # n_epochs = 20, so early stopping cannot trigger as written.
        state_metrics_history = []

        for i in range(3):
            print(f"\nTraining model for structure type {state_names[i]}")
            model = SVM(
                learning_rate=0.01,
                lambda_param=0.001,
                n_iters=200,
                # Class imbalance is handled through the batch sampling.
                class_weight=None
            )
            self.svm_models.append(model)
            for epoch in range(n_epochs):
                X_batch, y_batch = self.extract_balanced_batch(X_train, y_train, batch_size)
                y_batch = (y_batch == i).astype(np.float32)
                model.fit(X_batch, y_batch)

                # Evaluate every 5 epochs.
                if epoch % 5 == 0:
                    val_results, uncertainties = self.validating_with_uncertainty(X_val, y_val)
                    metrics['uncertainties'].append(uncertainties)

                    state_metrics = self.validating_per_state(
                        (val_results['predictions']).astype(int),
                        y_val.astype(int)
                    )
                    metrics['state_metrics'].append(state_metrics)

                    # Hinge loss needs labels in {-1, +1} and the signed
                    # decision value (sign * confidence). Using the {0, 1}
                    # batch labels with sign-only predictions gave every
                    # negative sample a constant loss of 1.
                    train_sign, train_conf = model.predict(X_batch)
                    signed_labels = np.where(y_batch == 1, 1.0, -1.0)
                    train_loss = np.mean(
                        np.maximum(0, 1 - signed_labels * train_sign * train_conf)
                    )
                    metrics['train_loss'].append(train_loss)

                    display_epoch_summary(epoch, train_loss, val_results[f'class_{i}_accuracy'])
                    if epoch > 15 and self.checking_convergence(state_metrics_history):
                        print("Training converged - stopping early")
                        break
        # Final evaluation with all three models in place.
        final_results, _ = self.validating_with_uncertainty(X_val, y_val)
        print(f"\nTotal training time: {time.time() - start_time:.2f} seconds")
        display_final_results(final_results)
        return final_results
    def validating_per_state(self, predictions, true_labels):
        metrics = {}
        states = ['Helix', 'Sheet', 'Coil']
        for state in states:
            idx = states.index(state)
            mask = true_labels == idx
            if np.sum(mask) > 0:
                accuracy = np.mean(predictions[mask] == true_labels[mask])
                tp = np.sum((predictions == idx) & (true_labels == idx))
                fp = np.sum((predictions == idx) & (true_labels != idx))
                fn = np.sum((predictions != idx) & (true_labels == idx))

                precision = tp / (tp + fp) if (tp + fp) > 0 else 0
                recall = tp / (tp + fn) if (tp + fn) > 0 else 0
                f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

                metrics[state] = {
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1': f1
                }
            else:
                metrics[state] = {
                    'accuracy': 0.0,
                    'precision': 0.0,
                    'recall': 0.0,
                    'f1': 0.0
                }
        return metrics

    def predict(self, sequence):
        features, _ = self.extracting_features(sequence)
        X_scaled = self.scaler.transform(features)
        # getting the predictions with the confidence scores
        predictions = []
        confidences = []
        for model in self.svm_models:
            pred, conf = model.predict(X_scaled)
            predictions.append(pred)
            confidences.append(conf)
        predictions = np.array(predictions)
        confidences = np.array(confidences)
        # predicting weights by confidence and the class weights
        class_weights = np.array([1.5, 6.0, 8.0])
        confidence_scaling = np.power(confidences, 0.5)
        weighted_predictions = predictions * confidence_scaling * class_weights[:, np.newaxis]
        return np.argmax(weighted_predictions, axis=0)

def analyzing_state_distribution(y_true, y_pred, states=('Helix', 'Sheet', 'Coil')):
    """Compute, print and return per-state classification metrics.

    Args:
        y_true: True state indices.
        y_pred: Predicted state indices (same length as ``y_true``).
        states: State names; the position of a name is its class index.

    Returns:
        ``{state: {'accuracy', 'precision', 'recall', 'f1'}}`` where
        ``accuracy`` is the share of correct predictions among the samples
        whose true label is that state (0.0 when the state never occurs).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    metrics = {}
    for i, state in enumerate(states):
        state_mask = (y_true == i)
        tp = np.sum(state_mask & (y_pred == i))
        fp = np.sum((y_true != i) & (y_pred == i))
        fn = np.sum(state_mask & (y_pred != i))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

        # The mean of an empty selection is NaN (with a warning); report 0.0
        # for a state that does not occur in y_true instead.
        if state_mask.any():
            accuracy = np.mean(y_true[state_mask] == y_pred[state_mask])
        else:
            accuracy = 0.0

        metrics[state] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        }

    display_metrics_table(metrics)
    return metrics

def loading_and_processing_data():
    """Load the dataset file and split it into features and labels.

    The array is reshaped to ``(-1, 700, 57)``. Features are the first 21
    columns as float32; labels are the argmax over columns 42..44. The class
    distribution is printed.

    Returns:
        ``(features, labels)``, or ``(None, None)`` after printing the error
        if anything fails (including a missing file).
    """
    try:
        print("Loading data...")
        file_path = "/content/drive/MyDrive/CB513.npy"

        file_is_present = os.path.exists(file_path)
        if not file_is_present:
            raise FileNotFoundError(f"Data file not found at {file_path}")

        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects; only load this file from a trusted source.
        raw = np.load(file_path, allow_pickle=True)
        print(f"Raw data shape: {raw.shape}")

        # One (700, 57) block per sequence.
        per_sequence = raw.reshape(-1, 700, 57)
        features = per_sequence[:, :, :21].astype(np.float32)
        labels = np.argmax(per_sequence[:, :, 42:45], axis=2)

        print(f"Features shape: {features.shape}")
        print(f"Labels shape: {labels.shape}")

        # Report how many positions fall into each class.
        class_ids, class_sizes = np.unique(labels, return_counts=True)
        for class_id, class_size in zip(class_ids, class_sizes):
            print(f"Class {class_id}: {class_size} ({class_size/labels.size*100:.2f}%)")

        return features, labels

    except Exception as e:
        print(f"Error loading data: {e}")
        return None, None

def main():
    """Load the data, train a predictor and evaluate it on a held-out split.

    Returns:
        The trained ``ProteinStructurePredictor``, or ``None`` when the data
        could not be loaded.
    """
    features, labels = loading_and_processing_data()
    if features is None:
        # Return a single None, matching the single predictor returned on
        # success; callers assign the result to one name.
        return None

    train_seqs, test_seqs, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.2, random_state=42)

    predictor = ProteinStructurePredictor()
    predictor.train(train_seqs, train_labels)

    print("\n Predictions Generated...")
    all_predictions = []
    all_true_labels = []

    # Use a distinct loop name so the full `labels` array is not shadowed.
    for seq, seq_labels in zip(test_seqs, test_labels):
        all_predictions.extend(predictor.predict(seq))
        all_true_labels.extend(seq_labels)

    analyzing_state_distribution(np.array(all_true_labels), np.array(all_predictions))
    return predictor

# Script entry point: run the load / train / evaluate pipeline and keep the
# value returned by main() in a module-level name.
if __name__ == "__main__":
    predictor = main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading data...
Raw data shape: (514, 39900)
Features shape: (514, 700, 21)
Labels shape: (514, 700)
Class 0: 294744 (81.92%)
Class 1: 47187 (13.11%)
Class 2: 17869 (4.97%)
Starting feature extraction and training...


Extracting features: 100%|██████████| 411/411 [01:09<00:00,  5.93it/s]


Feature extraction complete. Shape: (287700, 533)
Starting model training...

Training model for structure type Helix

Epoch 0 Summary
------------------------------
Training Loss: 0.7498
Validation Accuracy: 0.6524

Epoch 5 Summary
------------------------------
Training Loss: 0.7498
Validation Accuracy: 0.4534

Epoch 10 Summary
------------------------------
Training Loss: 0.7498
Validation Accuracy: 0.4790

Epoch 15 Summary
------------------------------
Training Loss: 0.7498
Validation Accuracy: 0.4772

Training model for structure type Sheet

Epoch 0 Summary
------------------------------
Training Loss: 1.0108
Validation Accuracy: 0.9816

Epoch 5 Summary
------------------------------
Training Loss: 0.6217
Validation Accuracy: 0.5147

Epoch 10 Summary
------------------------------
Training Loss: 0.6061
Validation Accuracy: 0.5516

Epoch 15 Summary
------------------------------
Training Loss: 0.6373
Validation Accuracy: 0.5540

Training model for structure type Coil

Epoch 0 Summ