# insatll library

In [2]:
!pip install XGBoost
!pip install mne
!pip install optuna
!pip install imblearn



# Import library

In [3]:
import mne
import numpy as np
import pandas as pd
from scipy.signal import butter, filtfilt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import RandomOverSampler
import xgboost as xgb
import optuna

In [4]:
dataset = "/content/drive/MyDrive/Dataset_EEG/eeg_dataset.csv"

In [5]:
eeg_state = pd.read_csv(dataset)

In [6]:
eeg_state.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768000 entries, 0 to 767999
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   F7      768000 non-null  float64
 1   F3      768000 non-null  float64
 2   P7      768000 non-null  float64
 3   O1      768000 non-null  float64
 4   O2      768000 non-null  float64
 5   P8      768000 non-null  float64
 6   AF4     768000 non-null  float64
 7   state   768000 non-null  object 
dtypes: float64(7), object(1)
memory usage: 46.9+ MB


In [7]:
eeg_state = eeg_state.reset_index(drop=True)

In [8]:
eeg_state['state'].value_counts()

Unnamed: 0_level_0,count
state,Unnamed: 1_level_1
focussed,384000
unfocussed,384000


In [9]:
eeg_state

Unnamed: 0,F7,F3,P7,O1,O2,P8,AF4,state
0,3804.102564,5210.769231,4490.769231,3862.051282,3511.794872,4321.025641,4024.102564,focussed
1,3796.410256,5205.641026,4489.230769,3858.461538,3505.128205,4311.794872,4017.948718,focussed
2,3798.974359,5207.179487,4489.230769,3865.128205,3511.794872,4313.846154,4018.974359,focussed
3,3801.538462,5210.256410,4490.256410,3866.153846,3511.282051,4315.897436,4022.051282,focussed
4,3799.487179,5210.256410,4490.256410,3865.641026,3506.153846,4311.282051,4020.512821,focussed
...,...,...,...,...,...,...,...,...
767995,4012.820513,5053.333333,4296.923077,4309.230769,4082.051282,4198.974359,4137.948718,unfocussed
767996,4018.974359,5057.948718,4302.564103,4314.358974,4087.179487,4205.128205,4143.076923,unfocussed
767997,4019.487179,5062.564103,4303.076923,4313.846154,4090.256410,4205.641026,4138.974359,unfocussed
767998,4016.923077,5060.512821,4301.538462,4311.794872,4087.692308,4202.564103,4132.820513,unfocussed


In [9]:
eeg_state.to_csv("/content/eeg_dataset_new.csv", index=False)

In [10]:
import numpy as np
from sklearn.metrics import confusion_matrix

def custom_classification_report(y_true, y_pred, target_names, class_indices):
    # Compute the confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Initialize lists to store precision, recall, F1-score, and support for each class
    precision = []
    recall = []
    f1_score = []
    support = []

    # Calculate precision, recall, F1-score for each class
    for i in range(len(target_names)):
        true_positives = cm[i, i]
        false_positives = cm[:, i].sum() - true_positives
        false_negatives = cm[i, :].sum() - true_positives
        true_negatives = cm.sum() - (true_positives + false_positives + false_negatives)

        # Precision: TP / (TP + FP)
        if true_positives + false_positives > 0:
            precision_i = true_positives / (true_positives + false_positives)
        else:
            precision_i = 0.0

        # Recall: TP / (TP + FN)
        if true_positives + false_negatives > 0:
            recall_i = true_positives / (true_positives + false_negatives)
        else:
            recall_i = 0.0

        # F1-Score: 2 * (Precision * Recall) / (Precision + Recall)
        if precision_i + recall_i > 0:
            f1_i = 2 * (precision_i * recall_i) / (precision_i + recall_i)
        else:
            f1_i = 0.0

        # Support: The number of true instances of each class
        support_i = cm[i, :].sum()

        # Append calculated metrics for this class
        precision.append(precision_i)
        recall.append(recall_i)
        f1_score.append(f1_i)
        support.append(support_i)

    # Calculate accuracy
    accuracy = np.trace(cm) / np.sum(cm)

    # Calculate average F1-score for specified classes
    f1_average = np.mean([f1_score[i] for i in class_indices])

    # Print the aesthetically improved report
    print("\n" + "Classification Report".center(65, "="))
    print(f"{'Class':<15}{'Precision':>12}{'Recall':>12}{'F1-Score':>12}{'Support':>12}")
    print("=" * 65)
    for i, label in enumerate(target_names):
        print(f"{label:<15}{precision[i]:>12.4f}{recall[i]:>12.4f}{f1_score[i]:>12.4f}{support[i]:>12}")
    print("=" * 65)
    print(f"Average F1-Score for classes : {f1_average:.4f}")
    print("=" * 65)

class_indices = [0, 1]


# LIGHTGBM

In [11]:

# Handle missing values
eeg_state = eeg_state.dropna()

# Encode the 'state' column
le = LabelEncoder()
eeg_state['state'] = le.fit_transform(eeg_state['state'])

# Features and target
X = eeg_state.drop('state', axis=1)
y = eeg_state['state']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize LightGBM parameters
params = {
    'objective': 'multiclass',
    'num_class': 2,  # number of classes
    'learning_rate': 0.1,
    'num_iterations': 700,
    'max_depth': 8,
    'random_seed': 42,
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

# Create dataset for LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Train the model
lgb_model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],

)

# Predict the class labels
y_pred = lgb_model.predict(X_test)

# Convert probabilities to predicted class labels
y_pred_class = np.argmax(y_pred, axis=1)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



KeyboardInterrupt: 

# Tune-Hyperparameters XGboost

In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import optuna

# Handle missing values
eeg_state = eeg_state.dropna()

# Encode the 'state' column
le = LabelEncoder()
eeg_state['state'] = le.fit_transform(eeg_state['state'])

# Features and target
X = eeg_state.drop('state', axis=1)
y = eeg_state['state']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    params = {
        'objective': 'multi:softmax',
        'num_class': 2,  # Number of classes
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'tree_method': 'gpu_hist',  # Use GPU
        'gpu_id': 0,  # GPU ID, change if you have multiple GPUs
        'random_state': 42,
    }

    # Create DMatrix for XGBoost
    train_data = xgb.DMatrix(X_train, label=y_train)

    # Train the model
    xgb_model = xgb.train(**params, train_data)

    # Create DMatrix for test data
    test_data = xgb.DMatrix(X_test, label=y_test)

    # Predict the class labels
    y_pred = xgb_model.predict(test_data)

    # Evaluate the model
    accuracy = np.mean(y_pred == y_test)

    return accuracy  # Return the accuracy as the objective

# Create a study object and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Number of trials to run





[I 2024-10-28 19:02:06,498] A new study created in memory with name: no-name-a11a40c3-f34f-4057-875d-014c10d70dc0

    E.g. tree_method = "hist", device = "cuda"

Parameters: { "n_estimators" } are not used.

[W 2024-10-28 19:02:06,814] Trial 0 failed with parameters: {'learning_rate': 0.16390550871738618, 'max_depth': 4, 'n_estimators': 447, 'subsample': 0.6251987094776064, 'colsample_bytree': 0.7101803429848812} because of the following error: XGBoostError('[19:02:06] /workspace/src/tree/updater_gpu_hist.cu:861: Exception in gpu_hist: [19:02:06] /workspace/src/tree/updater_gpu_hist.cu:867: Check failed: ctx_->Ordinal() >= 0 (-1 vs. 0) : Must have at least one device\nStack trace:\n  [bt] (0) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0x22dbbc) [0x7f9313c2dbbc]\n  [bt] (1) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0xd2b4c3) [0x7f931472b4c3]\n  [bt] (2) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0xd2bf84) [0x7f931

XGBoostError: [19:02:06] /workspace/src/tree/updater_gpu_hist.cu:861: Exception in gpu_hist: [19:02:06] /workspace/src/tree/updater_gpu_hist.cu:867: Check failed: ctx_->Ordinal() >= 0 (-1 vs. 0) : Must have at least one device
Stack trace:
  [bt] (0) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0x22dbbc) [0x7f9313c2dbbc]
  [bt] (1) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0xd2b4c3) [0x7f931472b4c3]
  [bt] (2) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0xd2bf84) [0x7f931472bf84]
  [bt] (3) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0x57f196) [0x7f9313f7f196]
  [bt] (4) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0x580203) [0x7f9313f80203]
  [bt] (5) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0x5cae68) [0x7f9313fcae68]
  [bt] (6) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x6f) [0x7f9313b3742f]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7f937fae5e2e]
  [bt] (8) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7f937fae2493]



Stack trace:
  [bt] (0) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0x22dbbc) [0x7f9313c2dbbc]
  [bt] (1) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0xd2c154) [0x7f931472c154]
  [bt] (2) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0x57f196) [0x7f9313f7f196]
  [bt] (3) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0x580203) [0x7f9313f80203]
  [bt] (4) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(+0x5cae68) [0x7f9313fcae68]
  [bt] (5) /usr/local/lib/python3.10/dist-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x6f) [0x7f9313b3742f]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7f937fae5e2e]
  [bt] (7) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7f937fae2493]
  [bt] (8) /usr/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xa3e9) [0x7f937fb0b3e9]



In [49]:
# Print the best parameters
print("Best hyperparameters: ", study.best_params)



Best hyperparameters:  {'learning_rate': 0.2087759602169422, 'max_depth': 10, 'n_estimators': 462, 'subsample': 0.6478457734751482, 'colsample_bytree': 0.9431214021788126}


In [12]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler

# Handle missing values
eeg_state = eeg_state.dropna()

# Encode the 'state' column
le = LabelEncoder()
eeg_state['state'] = le.fit_transform(eeg_state['state'])

# Features and target
X = eeg_state.drop('state', axis=1)
y = eeg_state['state']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize XGBoost parameters
params = {
    'objective': 'multi:softmax',
    'num_class': 2,  # Number of classes
    'learning_rate': 0.2087759602169422,
    'max_depth': 10,
    'subsample': 0.6478457734751482,
    'colsample_bytree': 0.9431214021788126,
    # 'device': 'cuda',  # Use GPU
    'random_state': 42,
}

# Create DMatrix for XGBoost
train_data = xgb.DMatrix(X_train, label=y_train)
test_data = xgb.DMatrix(X_test, label=y_test)

# Train the model
xgb_model = xgb.train(params, train_data,num_boost_round=462)

# Predict the class labels
y_pred = xgb_model.predict(test_data)



In [16]:
print("Expected features:", xgb_model.feature_names)
print("Dataset columns:", eeg_state.columns.tolist())


Expected features: ['F7', 'F3', 'P7', 'O1', 'O2', 'P8', 'AF4']
Dataset columns: ['F7', 'F3', 'P7', 'O1', 'O2', 'P8', 'AF4', 'state']


In [13]:
target_names=list(map(str,le.classes_))

In [14]:
# Generate and print classification report
report = custom_classification_report(y_test, y_pred, target_names=target_names,class_indices = [0, 1])
print(report)



Class             Precision      Recall    F1-Score     Support
0                    0.9238      0.9344      0.9291       76800
1                    0.9337      0.9229      0.9283       76800
Average F1-Score for classes : 0.9287
None


In [17]:
import pickle

In [19]:
# Save the model as a pickle file
model_filename = 'xgb_model.pkl'  # You can change the filename as needed
with open(model_filename, 'wb') as f:
    pickle.dump(xgb_model, f)

print(f"Model saved to {model_filename}")

# To load the model later
with open(model_filename, 'rb') as f:
    loaded_model = pickle.load(f)

Model saved to xgb_model.pkl


In [20]:
import joblib
# save model
joblib.dump(xgb_model, '/content/xgb_model.pkl')

['/content/xgb_model.pkl']

In [26]:
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
test_path = "/content/drive/MyDrive/Dataset_EEG/eeg_dataset.csv"
data = pd.read_csv(test_path)

# Load the pre-trained model
xgb_model = joblib.load('/content/xgb_model.pkl')

# Encode the 'state' column (Assuming the original labels are in the dataset)
label_encoder = LabelEncoder()
data['state'] = label_encoder.fit_transform(data['state'])

# Prepare your data
X = data.drop('state', axis=1)  # Drop the label column
y = data['state']  # Use the encoded labels

# Create a DMatrix for XGBoost
dtest = xgb.DMatrix(X)

# Make predictions (for binary classification)
predictions = xgb_model.predict(dtest)  # This gives predicted class labels

# Evaluate the model
accuracy = accuracy_score(y, predictions)
report = classification_report(y, predictions, target_names=label_encoder.classes_)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)


Accuracy: 0.9750
Classification Report:
              precision    recall  f1-score   support

    focussed       0.97      0.98      0.98    384000
  unfocussed       0.98      0.97      0.97    384000

    accuracy                           0.97    768000
   macro avg       0.98      0.97      0.97    768000
weighted avg       0.98      0.97      0.97    768000



# Experiment

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Handle missing values
eeg_state =filtered_data.dropna()

# Encode the 'state' column
le = LabelEncoder()
filtered_data['state'] = le.fit_transform(filtered_data['state'])

# Features and target
X = filtered_data.drop('state', axis=1)
y =filtered_data['state']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initialize LightGBM parameters
params = {
    'objective': 'multiclass',
    'num_class': 2,  # number of classes
    'learning_rate': 0.1,
    'num_iterations': 700,
    'max_depth': 8,
    'random_seed': 42,
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

# Create dataset for LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Train the model
lgb_model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],

)

# Predict the class labels
y_pred = lgb_model.predict(X_test)

# Convert probabilities to predicted class labels
y_pred_class = np.argmax(y_pred, axis=1)


In [None]:
# Import the classification report
from sklearn.metrics import classification_report

# Generate the classification report
report = classification_report(y_test, y_pred_class, target_names=le.classes_)

# Print the classification report
print("Classification Report:\n", report)


In [None]:
import pandas as pd
import numpy as np
from scipy.signal import butter, filtfilt

# Sample data loaded into a DataFrame
# Replace this with your actual data loading code if needed
eeg_data = eeg_state.copy()  # assuming eeg_state is your DataFrame

# Bandpass filter parameters
lowcut = 1.0  # Lower bound of the bandpass filter
highcut = 40.0  # Upper bound of the bandpass filter
fs = 128  # Sampling rate in Hz

# Bandpass filter function
def bandpass_filter(data, lowcut, highcut, fs, order=5):
    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    return filtfilt(b, a, data)

# Apply the bandpass filter to each EEG channel (excluding the 'state' column)
filtered_data = eeg_data.copy()
eeg_channels = ['F7', 'F3', 'P7', 'O1', 'O2', 'P8', 'AF4']

for channel in eeg_channels:
    filtered_data[channel] = bandpass_filter(eeg_data[channel], lowcut, highcut, fs)

# Display the filtered data
filtered_data.head()
