<a href="https://colab.research.google.com/github/mirrorball108/audio_processing_deeplearning/blob/main/final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bioread

Collecting bioread
  Downloading bioread-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting docopt>=0.6.1 (from bioread)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading bioread-3.0.1-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.3/54.3 kB[0m [31m146.1 kB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=dc595fb89bfcf3ac6bd35f497fb6d880df9d45c9642db1c9141817f67ed8f276
  Stored in directory: /root/.cache/pip/wheels/1a/b0/8c/4b75c4116c31f83c8f9f047231251e13cc74481cca4a78a9ce
Successfully built docopt
Installing collected packages: docopt, bioread
Successfully installed bioread-3.0.1 docopt-0.6.2


In [4]:
import bioread
import numpy as np
import pandas as pd
from scipy.integrate import simps

def extract_features(filename, interval=5):
    # Read the ACQ file
    data = bioread.read(filename)

    # Initialize a list to store features for each interval
    features_list = []

    # Calculate the number of samples in the specified interval
    sampling_rate = data.channels[0].samples_per_second
    samples_per_interval = int(sampling_rate * interval)

    # Process the signal in chunks of the specified interval for both channels
    for start in range(0, len(data.channels[0].data), samples_per_interval):
        end = start + samples_per_interval

        # Initialize a dictionary to store features for this interval
        features = {}

        for channel in data.channels:
            segment = channel.data[start:end]

            # Skip if the segment is empty or shorter than expected
            if len(segment) < samples_per_interval:
                continue

            # Calculate frequency spectrum
            freqs = np.fft.fftfreq(len(segment), d=1/sampling_rate)
            fft_values = np.fft.fft(segment)

            # Calculate max and min frequency
            max_frequency = freqs[np.argmax(np.abs(fft_values))]
            min_frequency = freqs[np.argmin(np.abs(fft_values))]

            # Calculate area under the curve
            auc = simps(segment, dx=1/sampling_rate)

            # Calculate additional features
            mean_value = np.mean(segment)
            std_dev = np.std(segment)
            rms = np.sqrt(np.mean(segment**2))
            peak_to_peak = np.max(segment) - np.min(segment)
            #skewness = skew(segment)
            #kurt = kurtosis(segment)

            # Store the features in the dictionary with channel name as key
            features[f"{channel.name} Max Frequency"] = max_frequency
            features[f"{channel.name} Min Frequency"] = min_frequency
            features[f"{channel.name} Area Under Curve"] = auc
            features[f"{channel.name} Mean"] = mean_value
            features[f"{channel.name} Std Dev"] = std_dev
            features[f"{channel.name} RMS"] = rms
            features[f"{channel.name} Peak-to-Peak"] = peak_to_peak
            #features[f"{channel.name} Skewness"] = skewness
            #features[f"{channel.name} Kurtosis"] = kurt

        # Append the features for this interval to the list
        features_list.append(features)

    # Convert the list of features to a DataFrame
    features_df = pd.DataFrame(features_list)

    return features_df

# Example usage
filename = '/content/control rat 1 day 1.acq'
features = extract_features(filename)

# Save the features to a CSV file
features.to_csv('features_output.csv', index=False)
print("Features extracted and saved to features_output.csv")

  auc = simps(segment, dx=1/sampling_rate)


Features extracted and saved to features_output.csv


In [5]:
normalfilename = '/content/rat 2 day 2.acq'
normalfeatures = extract_features(normalfilename)

# Save the features to a CSV file
normalfeatures.to_csv('normalfeatures_output.csv', index=False)
print("Features extracted and saved to normalfeatures_output.csv")

  auc = simps(segment, dx=1/sampling_rate)


Features extracted and saved to normalfeatures_output.csv


data is skewed

720 records for control rat

1690 records for normal rat

In [6]:
#creating labels
sick=[1]*720
normal=[0]*1690

print(sick[0:5])
print(normal[0:5])



[1, 1, 1, 1, 1]
[0, 0, 0, 0, 0]


In [7]:
labels=sick+normal
print(len(labels))

2410


In [8]:
type(labels)

list

In [9]:
import pandas as pd
df_sick=pd.read_csv('/content/features_output.csv')
df_sick.head()

Unnamed: 0,EEG (.5 - 35 Hz) Max Frequency,EEG (.5 - 35 Hz) Min Frequency,EEG (.5 - 35 Hz) Area Under Curve,EEG (.5 - 35 Hz) Mean,EEG (.5 - 35 Hz) Std Dev,EEG (.5 - 35 Hz) RMS,EEG (.5 - 35 Hz) Peak-to-Peak,ECG (.5 - 35 Hz) Max Frequency,ECG (.5 - 35 Hz) Min Frequency,ECG (.5 - 35 Hz) Area Under Curve,ECG (.5 - 35 Hz) Mean,ECG (.5 - 35 Hz) Std Dev,ECG (.5 - 35 Hz) RMS,ECG (.5 - 35 Hz) Peak-to-Peak
0,0.0,-56.2,-845.562683,-169.148328,50.520709,176.531863,334.793091,0.0,66.6,-2.739165,-0.548084,0.09657,0.556526,0.813904
1,0.0,-38.0,-856.819,-171.484656,53.295287,179.575541,295.715332,0.0,101.2,-2.74528,-0.549501,0.088074,0.556514,0.37323
2,0.0,88.4,-854.866679,-171.127332,56.066304,180.077744,352.432251,0.0,-101.2,-2.731768,-0.546789,0.088548,0.553912,0.435791
3,0.0,61.0,-868.321864,-173.821936,45.09617,179.57653,307.632446,0.0,-87.0,-2.713606,-0.543056,0.088324,0.550192,0.370178
4,0.0,-125.0,-862.504211,-172.673474,50.993456,180.04572,248.657227,0.0,-60.6,-2.688851,-0.538134,0.087334,0.545174,0.367432


In [10]:
df_normal=pd.read_csv('/content/normalfeatures_output.csv')
df_normal.head()

Unnamed: 0,EEG (.5 - 35 Hz) Max Frequency,EEG (.5 - 35 Hz) Min Frequency,EEG (.5 - 35 Hz) Area Under Curve,EEG (.5 - 35 Hz) Mean,EEG (.5 - 35 Hz) Std Dev,EEG (.5 - 35 Hz) RMS,EEG (.5 - 35 Hz) Peak-to-Peak,ECG (.5 - 35 Hz) Max Frequency,ECG (.5 - 35 Hz) Min Frequency,ECG (.5 - 35 Hz) Area Under Curve,ECG (.5 - 35 Hz) Mean,ECG (.5 - 35 Hz) Std Dev,ECG (.5 - 35 Hz) RMS,ECG (.5 - 35 Hz) Peak-to-Peak
0,0.4,-90.8,100.474287,20.104378,57.290795,60.715906,255.03872,-6.6,-118.8,0.012213,0.002447,0.089114,0.089147,0.39073
1,-0.6,-89.6,-72.8785,-14.567583,48.019303,50.180355,226.834361,-6.6,-115.4,-0.003372,-0.000707,0.100873,0.100876,0.51718
2,-0.6,-56.4,76.359761,15.277996,54.534531,56.634197,255.525002,6.6,-120.6,0.024686,0.004905,0.095939,0.096064,0.432775
3,-0.6,-92.6,9.221442,1.849972,49.061626,49.096492,219.118686,6.6,-116.8,-0.01874,-0.003819,0.093051,0.093129,0.434671
4,-0.4,-96.6,-76.704913,-15.356592,58.999819,60.965593,280.163291,0.8,68.2,0.031232,0.00622,0.101263,0.101454,0.4764


In [11]:
combined_df = pd.concat([df_sick, df_normal], ignore_index=True)
combined_df.head()

Unnamed: 0,EEG (.5 - 35 Hz) Max Frequency,EEG (.5 - 35 Hz) Min Frequency,EEG (.5 - 35 Hz) Area Under Curve,EEG (.5 - 35 Hz) Mean,EEG (.5 - 35 Hz) Std Dev,EEG (.5 - 35 Hz) RMS,EEG (.5 - 35 Hz) Peak-to-Peak,ECG (.5 - 35 Hz) Max Frequency,ECG (.5 - 35 Hz) Min Frequency,ECG (.5 - 35 Hz) Area Under Curve,ECG (.5 - 35 Hz) Mean,ECG (.5 - 35 Hz) Std Dev,ECG (.5 - 35 Hz) RMS,ECG (.5 - 35 Hz) Peak-to-Peak
0,0.0,-56.2,-845.562683,-169.148328,50.520709,176.531863,334.793091,0.0,66.6,-2.739165,-0.548084,0.09657,0.556526,0.813904
1,0.0,-38.0,-856.819,-171.484656,53.295287,179.575541,295.715332,0.0,101.2,-2.74528,-0.549501,0.088074,0.556514,0.37323
2,0.0,88.4,-854.866679,-171.127332,56.066304,180.077744,352.432251,0.0,-101.2,-2.731768,-0.546789,0.088548,0.553912,0.435791
3,0.0,61.0,-868.321864,-173.821936,45.09617,179.57653,307.632446,0.0,-87.0,-2.713606,-0.543056,0.088324,0.550192,0.370178
4,0.0,-125.0,-862.504211,-172.673474,50.993456,180.04572,248.657227,0.0,-60.6,-2.688851,-0.538134,0.087334,0.545174,0.367432


In [12]:
combined_df.shape

(2410, 14)

In [13]:
y = pd.DataFrame(labels, columns=['MyColumn'])
y.head()

Unnamed: 0,MyColumn
0,1
1,1
2,1
3,1
4,1


In [14]:
y.shape

(2410, 1)

In [15]:
count_any_nan = combined_df.isna().any(axis=1).sum()

In [16]:
count_any_nan

1

In [17]:
df_filled = combined_df.fillna(0)

In [18]:
count_any_nan2 = df_filled.isna().any(axis=1).sum()

In [19]:
count_any_nan2

0

In [20]:
#feature scaling

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df_filled_scaled = sc.fit_transform(df_filled)

In [21]:
#compute covariance matrix
cov_matrix=np.cov(df_filled_scaled.T)
cov_matrix

array([[ 1.00041511e+00, -3.03718666e-02,  9.50184814e-02,
         9.50225887e-02,  2.37240815e-02, -1.04132654e-01,
         2.53026646e-02,  2.09006607e-02,  6.92964502e-04,
         1.08249940e-01,  1.08249894e-01, -3.66829452e-02,
        -1.10739849e-01, -3.73319365e-02],
       [-3.03718666e-02,  1.00041511e+00, -7.49303091e-03,
        -7.49507697e-03, -1.72558730e-02, -1.00457303e-02,
        -1.39369773e-02,  4.00516524e-02,  6.23576292e-02,
        -5.31762194e-03, -5.32082108e-03,  3.70335391e-03,
         4.04523245e-03, -5.22543488e-03],
       [ 9.50184814e-02, -7.49303091e-03,  1.00041511e+00,
         1.00041502e+00,  2.35905801e-01, -8.43490300e-01,
         1.83921012e-01,  2.99322758e-01,  3.98374550e-02,
         8.86585296e-01,  8.86582498e-01, -1.31630435e-01,
        -8.75980633e-01, -1.45648164e-01],
       [ 9.50225887e-02, -7.49507697e-03,  1.00041502e+00,
         1.00041511e+00,  2.35857738e-01, -8.43531296e-01,
         1.83911033e-01,  2.99323419e-01,  3.

In [23]:
#calculate eigen values and eigen vectors
eigen_values,eigen_vectors=np.linalg.eigh(cov_matrix)

In [24]:
#PCA

from sklearn.decomposition import PCA
pca=PCA(n_components=4)
train_pca=pca.fit_transform(df_filled_scaled)

In [25]:
#explain variance ratio

pca.explained_variance_ratio_

array([0.4157403 , 0.13808147, 0.13235927, 0.07666606])

In [26]:
pca_df=pd.DataFrame(data=train_pca,columns=['PC1','PC2','PC3','PC4'])
pca_df.head()

Unnamed: 0,PC1,PC2,PC3,PC4
0,3.839321,0.832204,1.400112,0.078461
1,3.736928,1.325692,0.505173,0.425801
2,3.734098,1.591921,0.784986,0.261766
3,3.785505,1.215388,0.334405,0.112594
4,3.756909,1.111235,0.263046,-1.221448


In [29]:

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(pca_df, y, test_size=0.3, random_state=42)

# Create and train the SVM model
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[492   0]
 [  2 229]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       492
           1       1.00      0.99      1.00       231

    accuracy                           1.00       723
   macro avg       1.00      1.00      1.00       723
weighted avg       1.00      1.00      1.00       723



  y = column_or_1d(y, warn=True)


In [30]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy}")


Accuracy Score: 0.9972337482710927


when 30% of data used for testing, accuracy= 99.7%