In [5]:
import os
import numpy as np
import pandas as pd
import wfdb
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Define a function to extract features from a signal
def extract_features(signal):
    """Build a fixed-length feature vector from a 1-D signal.

    Args:
        signal: Array-like of numeric samples (NumPy array, list, ...).

    Returns:
        A list of 16 values: mean amplitude, maximum amplitude and RMS
        amplitude, followed by 13 placeholder "MFCC" values.

    Raises:
        ValueError: If ``signal`` contains no samples.

    Note:
        The 13 MFCC entries are NOT derived from ``signal``. They are random
        draws from ``np.random.rand`` standing in for a real MFCC
        computation, so they differ between calls and carry no information.
    """
    # Work in float64: squaring an integer-typed array (e.g. int16 samples)
    # wraps around silently and corrupts the RMS value. This also lets plain
    # Python sequences be passed in.
    samples = np.asarray(signal, dtype=np.float64)
    if samples.size == 0:
        raise ValueError("signal must contain at least one sample")

    mean_amplitude = np.mean(samples)
    max_amplitude = np.max(samples)
    rms_amplitude = np.sqrt(np.mean(samples ** 2))
    mfccs = np.random.rand(13)  # Placeholder for actual MFCC computation
    return [mean_amplitude, max_amplitude, rms_amplitude] + list(mfccs)

# Folder that holds the .hea header files
data_dir = "data"

# Accumulators: one feature vector and one label per record
features_list = []
labels = []  # Replace with actual labels if available

# Walk the folder and turn every record into a feature vector
for file in os.listdir(data_dir):
    # Only .hea files identify a record; skip everything else
    if not file.endswith(".hea"):
        continue

    # rdrecord is given the base name, i.e. the path without the ".hea" suffix
    record_path = os.path.join(data_dir, file)
    record = wfdb.rdrecord(record_path[:-len(".hea")])

    # Flatten the signal array into one 1-D vector, then extract its features
    signal = record.p_signal.flatten()
    features = extract_features(signal)

    # Store the features together with a simulated (random 0/1) label
    features_list.append(features)
    labels.append(np.random.randint(0, 2))  # Replace with actual labels
# Assemble the feature table: three amplitude statistics plus 13 MFCC slots
feature_columns = ['Mean Amplitude', 'Max Amplitude', 'RMS Amplitude']
feature_columns += [f'MFCC_{i}' for i in range(1, 14)]
columns = feature_columns + ['Label']
data = pd.DataFrame(features_list, columns=feature_columns)
data['Label'] = labels

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X = data[feature_columns]  # Every column except the label
y = data['Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a linear-kernel support vector classifier on the training split
model = SVC(kernel='linear', C=1)
model.fit(X_train, y_train)

# Predict a class for every row in X (training and test rows alike) and
# attach a human-readable name: 0 -> 'Healthy', anything else -> 'Pathological'
data['Predicted'] = model.predict(X)
data['Classification'] = [
    'Healthy' if predicted == 0 else 'Pathological'
    for predicted in data['Predicted']
]
# Write the full table (features, label, prediction, class name) to disk
data.to_csv('classification_results.csv', index=False)

# Print how many records fell into each predicted class
classification_counts = data['Classification'].value_counts()
print("Classification Counts:")
print(classification_counts)

# Compute average feature values.
# Select the feature columns by name: a positional slice such as
# iloc[:, :-2] only drops 'Predicted' and 'Classification', which lets the
# 'Label' column leak into the reported averages.
average_features = data[X_train.columns].mean()
print("Average Feature Values:")
print(average_features)

# Classify a single synthetic sample made of the per-feature averages.
# The one-row frame uses the training column names and order so the model
# receives the same feature layout it was fitted on.
average_features_df = pd.DataFrame([average_features], columns=X_train.columns)
predicted_class = model.predict(average_features_df)[0]
classification = 'Healthy' if predicted_class == 0 else 'Pathological'
print(f"Classification based on average features: {classification}")

     Mean Amplitude  Max Amplitude  RMS Amplitude    MFCC_1    MFCC_2  \
0         -0.000418       0.962555       0.201915  0.898719  0.538762   
1         -0.000411       0.856445       0.229094  0.988874  0.273644   
2         -0.000388       0.788055       0.160187  0.351455  0.874227   
3         -0.000289       0.925049       0.245334  0.851126  0.722589   
4         -0.000443       0.811493       0.137984  0.112460  0.601794   
..              ...            ...            ...       ...       ...   
203       -0.000493       0.739319       0.239025  0.459558  0.199841   
204       -0.000447       0.910004       0.250331  0.419526  0.243055   
205       -0.000111       0.999969       0.290358  0.343433  0.065662   
206       -0.000500       0.461639       0.118340  0.553403  0.040922   
207       -0.000384       0.657288       0.183795  0.309452  0.421080   

       MFCC_3    MFCC_4    MFCC_5    MFCC_6    MFCC_7    MFCC_8    MFCC_9  \
0    0.473598  0.755231  0.884832  0.126410  0