### Download Dataset

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("vijayveersingh/1-2m-brain-signal-data")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/vijayveersingh/1-2m-brain-signal-data?dataset_version_number=1...


100%|██████████| 618M/618M [00:22<00:00, 28.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/vijayveersingh/1-2m-brain-signal-data/versions/1


### List Dataset Contents

In [2]:
import os

dataset_path = path

# List the contents of the downloaded dataset directory
print("Contents of the dataset directory:")
for root, dirs, files in os.walk(dataset_path):
    print(f"Directory: {root}")
    for name in files:
        print(f"  File: {name}")

Contents of the dataset directory:
Directory: /root/.cache/kagglehub/datasets/vijayveersingh/1-2m-brain-signal-data/versions/1
Directory: /root/.cache/kagglehub/datasets/vijayveersingh/1-2m-brain-signal-data/versions/1/MindBigData-IN-v1.06
  File: IN.txt
Directory: /root/.cache/kagglehub/datasets/vijayveersingh/1-2m-brain-signal-data/versions/1/MindBigData-MW-v1.0
  File: MW.txt
Directory: /root/.cache/kagglehub/datasets/vijayveersingh/1-2m-brain-signal-data/versions/1/MindBigData-EP-v1.0
  File: EP1.01.txt
Directory: /root/.cache/kagglehub/datasets/vijayveersingh/1-2m-brain-signal-data/versions/1/MindBigData-MU-v1.0
  File: MU.txt


### Load Data into DataFrame

In [3]:
import pandas as pd
import os

file_path_in = os.path.join(dataset_path, 'MindBigData-IN-v1.06', 'IN.txt')
# Read the file again, specifying no header and providing column names
column_names = ["id", "event", "device", "channel", "code", "size", "data"]
df = pd.read_csv(file_path_in, delimiter='\t', header=None, names=column_names)

### Display DataFrame Head

In [5]:
df.head()

Unnamed: 0,id,event,device,channel,code,size,data
0,1142043,173652,IN,AF3,0,256,"4259.487179,4237.948717,4247.179487,4242.05128..."
1,1142044,173652,IN,AF4,0,256,"4103.076923,4100.512820,4102.564102,4087.69230..."
2,1142045,173652,IN,T7,0,256,"4245.128205,4218.461538,4242.051282,4245.12820..."
3,1142046,173652,IN,T8,0,256,"4208.717948,4188.717948,4204.102564,4198.46153..."
4,1142047,173652,IN,PZ,0,256,"4189.230769,4203.589743,4188.717948,4186.66666..."


In [12]:
import numpy as np
from scipy.signal import butter, filtfilt, iirnotch

class EEGPreprocessor:
    def __init__(self, fs=128, lowcut=0.5, highcut=40.0, notch_freq=50.0, order=4, Q=30.0):
        self.fs = fs
        self.lowcut = lowcut
        self.highcut = highcut
        self.notch_freq = notch_freq
        self.order = order
        self.Q = Q

        # Calculate bandpass filter coefficients
        nyquist = 0.5 * self.fs
        low = self.lowcut / nyquist
        high = self.highcut / nyquist
        self.b_bandpass, self.a_bandpass = butter(self.order, [low, high], btype='band')

        # Calculate notch filter coefficients
        self.b_notch, self.a_notch = iirnotch(self.notch_freq, self.Q, self.fs)

    def _bandpass_filter(self, signal):
        return filtfilt(self.b_bandpass, self.a_bandpass, signal)

    def _notch_filter(self, signal):
        return filtfilt(self.b_notch, self.a_notch, signal)

    def _clean_signal(self, signal):
        # Remove DC component
        signal = signal - np.mean(signal)

        # Apply bandpass filter
        signal = self._bandpass_filter(signal)

        # Apply notch filter
        signal = self._notch_filter(signal)

        return signal

    def _normalize(self, signal, epsilon=1e-6):
        # Zero mean and unit standard deviation (standardization)
        mean = np.mean(signal)
        std = np.std(signal)
        return (signal - mean) / (std + epsilon) # Add epsilon to avoid division by zero

    def transform(self, dataframe):
        processed_signals = []
        labels = []

        for index, row in dataframe.iterrows():
            # Convert the "data" column to a numpy array of floats
            signal_data = np.array(row["data"].split(','), dtype=float)

            # Clean the signal
            cleaned_signal = self._clean_signal(signal_data)

            # Normalize the signal
            normalized_signal = self._normalize(cleaned_signal)

            # Append the processed signal and label to the lists
            processed_signals.append(normalized_signal)
            labels.append(row["code"])

        return processed_signals, labels

In [14]:
import numpy as np
from scipy.stats import skew, kurtosis
from scipy.signal import welch
from scipy.integrate import simpson # Corrected import
from scipy.special import rel_entr # Relative entropy, also known as Kullback-Leibler divergence

class EEGFeatureExtractor:
    def __init__(self, fs=128):
        self.fs = fs

    def _time_features(self, signal):
        features = {}
        features['mean'] = np.mean(signal)
        features['var'] = np.var(signal)
        features['skew'] = skew(signal)
        features['kurtosis'] = kurtosis(signal)
        features['ptp'] = np.ptp(signal)

        # Hjorth parameters
        # Activity: variance of the signal
        features['hjorth_activity'] = np.var(signal)

        # Mobility: square root of the ratio of the variance of the first derivative to the variance of the signal
        # np.diff calculates the difference between consecutive elements
        first_deriv = np.diff(signal)
        features['hjorth_mobility'] = np.sqrt(np.var(first_deriv) / np.var(signal)) if np.var(signal) != 0 else 0

        # Complexity: square root of the ratio of the variance of the second derivative to the variance of the first derivative
        second_deriv = np.diff(first_deriv)
        features['hjorth_complexity'] = np.sqrt(np.var(second_deriv) / np.var(first_deriv)) if np.var(first_deriv) != 0 else 0

        return features

    def _freq_features(self, signal):
        features = {}

        # Calculate Power Spectral Density (PSD) using Welch's method
        freqs, psd = welch(signal, self.fs, nperseg=256, average='median')

        # Calculate total power
        total_power = simpson(psd, freqs) # Corrected function call

        # Define EEG bands
        eeg_bands = {'delta': (0.5, 4),
                     'theta': (4, 8),
                     'alpha': (8, 12),
                     'beta': (13, 30),
                     'gamma': (30, 40)}

        # Calculate power in each band and normalized power
        for band, (low, high) in eeg_bands.items():
            # Find the indices corresponding to the current band
            idx_band = np.logical_and(freqs >= low, freqs <= high)
            # Calculate the power in the band using Simpson's rule for integration
            band_power = simpson(psd[idx_band], freqs[idx_band]) # Corrected function call
            features[f'{band}_power'] = band_power
            features[f'{band}_power_norm'] = band_power / total_power if total_power != 0 else 0

        # Calculate Spectral Entropy
        # Normalize PSD to get a probability distribution
        psd_norm = psd / total_power if total_power != 0 else psd
        # Calculate spectral entropy using relative entropy (KL divergence from uniform distribution)
        # Add a small value to avoid log(0)
        features['spectral_entropy'] = -np.sum(psd_norm * np.log2(psd_norm + 1e-10)) if total_power != 0 else 0


        return features

    def extract_features(self, signal):
        time_features = self._time_features(signal)
        freq_features = self._freq_features(signal)
        # Combine all features into one dictionary
        all_features = {**time_features, **freq_features}
        return all_features

In [15]:
# Define EEG sampling rate
fs_eeg = 128

# Instantiate the preprocessor and feature extractor
preprocessor = EEGPreprocessor(fs=fs_eeg)
feature_extractor = EEGFeatureExtractor(fs=fs_eeg)

# Create an empty list to store features
feature_list = []

# Loop through all rows of the DataFrame df
for index, row in df.iterrows():
    try:
        # Step 1: Convert raw data
        signal_data = np.array(row["data"].split(','), dtype=float)

        # Step 2: Preprocess the signal
        # Assuming clean_signal function is defined globally or within the preprocessor class
        # If clean_signal is within the preprocessor class, use preprocessor._clean_signal
        # Based on the previous turn, clean_signal is a method within EEGPreprocessor
        cleaned_signal = preprocessor._clean_signal(signal_data)


        # Step 3: Extract features
        # Assuming extract_features function is defined globally or within the feature_extractor class
        # If extract_features is within the feature_extractor class, use feature_extractor.extract_features
        # Based on the previous turn, extract_features is a method within EEGFeatureExtractor
        features = feature_extractor.extract_features(cleaned_signal)

        # Step 4: Add metadata
        features['digit'] = row['code']
        features['channel'] = row['channel']

        # Step 5: Store features
        feature_list.append(features)

    except Exception as e:
        # Handle errors (e.g., incomplete or non-numeric data)
        print(f"Skipping row {index} due to error: {e}")
        continue

print(f"Feature extraction complete. Extracted features for {len(feature_list)} rows.")

  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.fs, nperseg=256, average='median')
  freqs, psd = welch(signal, self.

Feature extraction complete. Extracted features for 65250 rows.


In [16]:
import pandas as pd

# Convert the list of features into a pandas DataFrame
features_df = pd.DataFrame(feature_list)

# Define the output filename
output_filename = 'features_preprocessed_full.csv'

# Save the DataFrame to a CSV file
features_df.to_csv(output_filename, index=False)

# Print a success message
print("Feature extraction and saving complete.")
print(f"Features saved to: {output_filename}")

# Display the shape and head of the DataFrame
print("\nShape of the features DataFrame:", features_df.shape)
print("\nFirst 5 rows of the features DataFrame:")
display(features_df.head())

Feature extraction and saving complete.
Features saved to: features_preprocessed_full.csv

Shape of the features DataFrame: (65250, 21)

First 5 rows of the features DataFrame:


Unnamed: 0,mean,var,skew,kurtosis,ptp,hjorth_activity,hjorth_mobility,hjorth_complexity,delta_power,delta_power_norm,...,theta_power_norm,alpha_power,alpha_power_norm,beta_power,beta_power_norm,gamma_power,gamma_power_norm,spectral_entropy,digit,channel
0,-0.994682,181.571713,0.065043,-0.488986,69.658823,181.571713,0.72456,1.216738,34.522125,0.257902,...,0.066469,8.616027,0.064367,54.726136,0.408839,16.86267,0.125975,8.96457,0,AF3
1,4.542899,652.240584,0.176188,-0.591324,117.256654,652.240584,0.47175,1.217368,179.490302,0.397498,...,0.037451,23.48786,0.052016,74.408266,0.164784,20.419909,0.045222,6.24306,0,AF4
2,-0.595764,242.957278,0.748985,0.588104,81.0912,242.957278,0.679946,1.296242,144.240653,0.604,...,0.057903,10.929314,0.045766,38.521637,0.161307,22.666292,0.094914,6.854228,0,T7
3,0.516059,338.819865,0.495659,0.187774,108.565438,338.819865,0.724805,1.263866,63.714731,0.256768,...,0.024806,16.862073,0.067953,93.025695,0.37489,30.732667,0.123851,8.55083,0,T8
4,-0.207519,159.987263,-0.038581,-0.679504,63.265561,159.987263,0.617211,1.195543,104.951056,0.631048,...,0.074482,15.544452,0.093465,23.171652,0.139326,10.91869,0.065652,6.561226,0,PZ


## Notebook Summary

This notebook performs the following steps to preprocess EEG brain signal data and extract features for machine learning:

1.  **Download Dataset:** The dataset "1-2m-brain-signal-data" from Kaggle is downloaded using `kagglehub`. The path to the downloaded dataset files is then printed.
2.  **List Dataset Contents:** The contents of the downloaded dataset directory are listed using the `os` module to understand the file structure.
3.  **Load Data into DataFrame:** The EEG data from the `IN.txt` file (located within the downloaded dataset) is loaded into a pandas DataFrame. The data is tab-delimited and loaded without a header, with custom column names assigned: "id", "event", "device", "channel", "code", "size", and "data".
4.  **Display DataFrame Head:** The first few rows of the loaded DataFrame are displayed to inspect the data format and content.
5.  **Define Preprocessing Functions (Initial):** Initial functions for preprocessing are defined:
    *   `bandpass_filter`: Applies a 4th order bandpass filter (0.5-40 Hz) using `scipy.signal.butter` and `scipy.signal.filtfilt`.
    *   `notch_filter`: Applies a 50 Hz notch filter with Q=30 using `scipy.signal.iirnotch` and `scipy.signal.filtfilt` to remove power line noise.
6.  **Define Preprocessing Classes:** More structured classes for preprocessing and feature extraction are defined:
    *   **`EEGPreprocessor` Class:** Encapsulates the signal cleaning and normalization steps.
        *   `__init__`: Initializes with sampling rate and filter parameters, calculating and storing filter coefficients.
        *   `_bandpass_filter`: Applies the pre-calculated bandpass filter.
        *   `_notch_filter`: Applies the pre-calculated notch filter.
        *   `_clean_signal`: Removes the DC component and applies both bandpass and notch filters.
        *   `_normalize`: Performs Z-score normalization (mean=0, std=1).
        *   `transform`: Iterates through a DataFrame, cleans and normalizes the 'data' column, and collects the processed signals and corresponding 'code' labels.
    *   **`EEGFeatureExtractor` Class:** Extracts time and frequency domain features.
        *   `__init__`: Initializes with the sampling rate.
        *   `_time_features`: Calculates basic statistical features (mean, var, skew, kurtosis, ptp) and Hjorth parameters (activity, mobility, complexity).
        *   `_freq_features`: Calculates the Power Spectral Density (PSD) using Welch's method, computes normalized power in standard EEG bands (delta, theta, alpha, beta, gamma), and calculates spectral entropy.
        *   `extract_features`: Combines time and frequency domain features into a single dictionary.
7.  **Process Data and Extract Features:**
    *   An EEG sampling rate (`fs_eeg`) of 128 Hz is set.
    *   Instances of `EEGPreprocessor` and `EEGFeatureExtractor` are created.
    *   An empty list `feature_list` is initialized.
    *   The code iterates through all rows of the loaded DataFrame (`df`).
    *   For each row, the raw signal string is converted to a numpy array of floats.
    *   The `_clean_signal` method from the `EEGPreprocessor` is applied.
    *   The `extract_features` method from the `EEGFeatureExtractor` is applied to the cleaned signal.
    *   Metadata ('digit' from the 'code' column and 'channel') is added to the extracted features dictionary.
    *   The features dictionary is appended to `feature_list`.
    *   Error handling is included to skip rows that cause processing errors.
8.  **Convert Features to DataFrame and Save:**
    *   The `feature_list` (a list of dictionaries) is converted into a pandas DataFrame called `features_df`.
    *   The `features_df` is saved to a CSV file named `features_preprocessed_full.csv` without the pandas index.
    *   A success message is printed, including the output filename.
    *   The shape and the first 5 rows of the `features_df` are displayed to show the resulting features.

This process successfully downloads the data, defines and utilizes classes for signal preprocessing and feature extraction, and saves the extracted features into a structured format (CSV) for further analysis or machine learning model training.

In [18]:
from google.colab import files

# Download the CSV file
files.download('features_preprocessed_full.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>