In [6]:
import os
import re
import numpy as np
import pandas as pd
import wfdb
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Define a function to extract features from a signal
def extract_features(signal):
    mean_amplitude = np.mean(signal)
    max_amplitude = np.max(signal)
    rms_amplitude = np.sqrt(np.mean(signal ** 2))
    mfccs = np.random.rand(13)  # Placeholder for actual MFCC computation
    return [mean_amplitude, max_amplitude, rms_amplitude] + list(mfccs)

# Function to parse actual diagnosis from -info.txt files
def parse_diagnosis(info_file_path):
    with open(info_file_path, 'r') as f:
        content = f.read()
        # Extract the diagnosis from the file
        diagnosis_match = re.search(r"Diagnosis:\s+(.+)", content)
        if diagnosis_match:
            return diagnosis_match.group(1).strip()
    return None

# Directory containing the .hea files
data_dir = "data"

# List to store feature vectors, labels, filenames, and actual diagnoses
features_list = []
labels = []  # Replace with actual labels if available
filenames = []
actual_diagnoses = []

# Process each file in the directory
for file in os.listdir(data_dir):
    if file.endswith(".hea"):
        # Load the record
        record_path = os.path.join(data_dir, file)
        record = wfdb.rdrecord(record_path[:-4])  # Remove .hea to get base name

        # Flatten the signal and extract features
        signal = record.p_signal.flatten()
        features = extract_features(signal)
        
        # Append features, filename, and simulated label
        features_list.append(features)
        labels.append(np.random.randint(0, 2))  # Replace with actual labels
        filenames.append(file)
        
        # Extract actual diagnosis from corresponding -info.txt file
        info_file_path = os.path.join(data_dir, file.replace(".hea", "-info.txt"))
        if os.path.exists(info_file_path):
            actual_diagnoses.append(parse_diagnosis(info_file_path))
        else:
            actual_diagnoses.append(None)

# Create a DataFrame
columns = ['Mean Amplitude', 'Max Amplitude', 'RMS Amplitude'] + [f'MFCC_{i+1}' for i in range(13)] + ['Label']
data = pd.DataFrame(features_list, columns=columns[:-1])
data['Label'] = labels
data['File'] = filenames  # Add filenames to the DataFrame
data['Actual Diagnosis'] = actual_diagnoses  # Add actual diagnosis

# Split data into training and test sets
X = data.iloc[:, :-3]  # All columns except 'Label', 'File', and 'Actual Diagnosis'
y = data['Label']  # The 'Label' column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM model
model = SVC(kernel='linear', C=1)
model.fit(X_train, y_train)

# Predict and classify for each file
data['Predicted'] = model.predict(X)
data['Classification'] = data['Predicted'].apply(lambda x: 'Healthy' if x == 0 else 'Pathological')

# Compare predicted classifications with actual diagnoses
data['Match'] = data.apply(
    lambda row: 'Match' if (row['Classification'] == 'Healthy' and 'healthy' in (row['Actual Diagnosis'] or '').lower()) or
                (row['Classification'] == 'Pathological' and 'healthy' not in (row['Actual Diagnosis'] or '').lower())
    else 'Mismatch',
    axis=1
)

# Save results to a CSV file
data.to_csv('classification_results_with_comparison.csv', index=False)

# Print file-specific classifications
print("File-wise Classifications:")
for _, row in data.iterrows():
    print(f"File: {row['File']}, Predicted: {row['Classification']}, Actual: {row['Actual Diagnosis']}, Match: {row['Match']}")

# Print summary statistics
classification_counts = data['Classification'].value_counts()
print("\nClassification Counts:")
print(classification_counts)

# Print match/mismatch counts
match_counts = data['Match'].value_counts()
print("\nMatch/Mismatch Counts:")
print(match_counts)


File-wise Classifications:
File: voice201.hea, Predicted: Healthy, Actual: hypokinetic dysphonia (glottic insufficiency), Match: Mismatch
File: voice150.hea, Predicted: Healthy, Actual: hyperkinetic dysphonia, Match: Mismatch
File: voice092.hea, Predicted: Pathological, Actual: healthy, Match: Mismatch
File: voice098.hea, Predicted: Healthy, Actual: healthy, Match: Match
File: voice110.hea, Predicted: Healthy, Actual: healthy, Match: Match
File: voice160.hea, Predicted: Healthy, Actual: hyperkinetic dysphonia, Match: Mismatch
File: voice042.hea, Predicted: Pathological, Actual: reflux laryngitis, Match: Match
File: voice113.hea, Predicted: Healthy, Actual: hyperkinetic dysphonia (Prolapse), Match: Mismatch
File: voice100.hea, Predicted: Healthy, Actual: healthy, Match: Match
File: voice007.hea, Predicted: Healthy, Actual: hyperkinetic dysphonia (nodule), Match: Mismatch
File: voice006.hea, Predicted: Pathological, Actual: hypokinetic dysphonia, Match: Match
File: voice106.hea, Predicte