# Slow Oscillations Sleep Challenge - Machine Learning
## Predict brain deep sleep slow oscillation, by Dreem
### In this challenge, the data consists of EEG signals acquired with the Dreem headband in sham condition, i.e. without any kind of sound stimulation. We therefore aim to predict brain activity in normal conditions.
### Goals: In this dataset, we try to predict whether or not a slow oscillation will be followed by another one in sham condition, i.e. without any stimulation. This will allow us to:
#### Predict normal brain activity
#### Know when it’s interesting to stimulate
#### Better quantify the impact of an individual stimulation by comparing to what would have occurred without stimulation.

###  Found on : https://challengedata.ens.fr/professors/challenges/10/

##  Benchmark code
#### Updated to reflect changes in the packages 
#### Comments added for readability and interpretation
#### Model evaluated 

In [None]:
import os
# Change working directory so that the relative data paths used by the
# loading cells ("X_train.h5", "y_train_2.csv", ...) resolve.
# NOTE(review): this is a hard-coded, machine-specific absolute path —
# adjust it to wherever the challenge files live in your environment.
os.chdir("/Users/mickey.rice/Desktop/Practice_Python")

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import h5py
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [None]:
## Troubleshooting
# pip install --upgrade --force-reinstall scipy
# pip install numba --upgrade
# pip install scikit-learn==1.0.2
# pip install numpy==1.23
# pip install h5py

In [None]:
## Load data sets
# The HDF5 files are opened directly rather than inside a `with` block:
# the handles are read later by the feature-extraction cells, so they must
# stay open after this cell finishes.
X_train = h5py.File("X_train.h5", "r")

# Training labels are taken from column index 1 of the CSV.
_train_label_table = pd.read_csv("y_train_2.csv")
y_train = _train_label_table.values[:, 1].squeeze()

X_test = h5py.File("X_test.h5", "r")

In [None]:
## Extract features
def extract_features(h5):
    """Build the benchmark feature matrix from a feature container.

    Parameters
    ----------
    h5 : mapping-like
        Object whose ``"features"`` entry yields a 2-D array when sliced
        with ``[:]`` (an opened HDF5 file in this notebook).

    Returns
    -------
    numpy.ndarray
        One row per input row: the first 11 input columns unchanged,
        followed by three per-row summaries of the remaining columns —
        maximum, minimum, and mean of the absolute values.
    """
    raw = h5["features"][:]
    # Columns from index 11 onward are only used through per-row summaries
    # (presumably the raw signal window — confirm against the dataset docs).
    leading, remainder = raw[:, :11], raw[:, 11:]
    summaries = (
        remainder.max(axis=1, keepdims=True),
        remainder.min(axis=1, keepdims=True),
        np.abs(remainder).mean(axis=1, keepdims=True),
    )
    return np.hstack((leading, *summaries))

In [None]:
## Build the feature matrices for the training and the test set
features_train = extract_features(X_train)
features_test = extract_features(X_test)

# Fit the benchmark model: a 10-tree random forest.
# No random_state is passed, so the fitted forest (and therefore the
# predictions) can differ from one run to the next.
clf = RandomForestClassifier(n_estimators=10).fit(features_train, y_train)

# Predict a class label for every test row
y_pred = clf.predict(features_test)

In [None]:
# Save Predictions to CSV: a header line followed by one "id,label" row per
# prediction, where id is the row position in y_pred.
with open("y_benchmark.csv", "w") as f:
    f.write("id,label\n" + "\n".join(f"{i},{y}" for i, y in enumerate(y_pred)))

# Evaluate Performance
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

# Load `y_test` from the supplementary file (labels are in column index 1).
# NOTE(review): the class counts printed for y_test later in this notebook
# are almost perfectly uniform — confirm that y_lol_2.csv really contains
# ground-truth labels and not a random/sample submission.
y_test = pd.read_csv("y_lol_2.csv").values[:, 1].squeeze()

# Class probabilities are needed by both log loss and AUC; compute them once
# instead of running the forest over the test set twice.
y_proba = clf.predict_proba(features_test)

accuracy = accuracy_score(y_test, y_pred)
logloss = log_loss(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba, multi_class="ovr")  # one-vs-rest multiclass AUC

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ Log Loss: {logloss:.4f}")
print(f"✅ AUC: {auc:.4f}")

## Try to improve over the benchmark code!

In [13]:
## Check class balance: number of samples per integer class label
for _split_name, _split_labels in (("y_train", y_train), ("y_test", y_test)):
    print(f"Class Distribution in {_split_name}:", np.bincount(_split_labels))

Class Distribution in y_train: [114032  78620  68982]
Class Distribution in y_test: [79541 78942 79883]


In [None]:
## Improve feature extraction
def extract_features(h5):
    """Build the extended feature matrix from a feature container.

    Parameters
    ----------
    h5 : mapping-like
        Object whose ``"features"`` entry yields a 2-D array when sliced
        with ``[:]`` (an opened HDF5 file in this notebook).

    Returns
    -------
    numpy.ndarray
        One row per input row: the first 11 input columns unchanged,
        followed by five per-row summaries of the remaining columns, in
        this order: maximum, minimum, mean of absolute values, variance
        of absolute values, median.
    """
    table = h5["features"][:]
    signal = table[:, 11:]
    # Mean and variance are taken over |signal|, not over the signed values.
    magnitude = np.abs(signal)
    # column_stack keeps the 2-D leading block as-is and turns every 1-D
    # per-row summary into a single column.
    return np.column_stack(
        [
            table[:, :11],
            signal.max(axis=1),
            signal.min(axis=1),
            magnitude.mean(axis=1),
            magnitude.var(axis=1),
            np.median(signal, axis=1),
        ]
    )

In [21]:
## Recompute the train and test feature matrices (train first, then test)
## with the extract_features definition currently in scope
features_train, features_test = map(extract_features, (X_train, X_test))

In [22]:
## Add polynomial interactions
from sklearn.preprocessing import PolynomialFeatures

# degree=2 with interaction_only=True: products of distinct feature pairs are
# added, but no squared terms. The expansion is fitted on the training
# features and then applied unchanged to both sets.
poly = PolynomialFeatures(interaction_only=True, degree=2).fit(features_train)
features_train_poly = poly.transform(features_train)
features_test_poly = poly.transform(features_test)

In [26]:
## Improved Random Forest: more trees, bounded depth and leaf sizes,
## balanced class weights, fixed seed
from sklearn.preprocessing import StandardScaler

# Standardise the expanded features; the scaling statistics come from the
# training set only and are reused for the test set.
scaler = StandardScaler().fit(features_train_poly)
features_train_poly_scaled = scaler.transform(features_train_poly)
features_test_poly_scaled = scaler.transform(features_test_poly)

clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=42,
)
clf.fit(features_train_poly_scaled, y_train)

# Make Predictions
y_pred = clf.predict(features_test_poly_scaled)

In [28]:
# Evaluate new performance
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

# Load `y_test` from the supplementary file (labels are in column index 1).
# NOTE(review): the y_test class counts printed earlier are almost perfectly
# uniform and the metrics reported below this cell sit at chance level for
# three classes — confirm that y_lol_2.csv really contains ground-truth
# labels and not a random/sample submission before trusting these scores.
y_test = pd.read_csv("y_lol_2.csv").values[:, 1].squeeze()

# Class probabilities are needed by both log loss and AUC; compute them once
# instead of running the 200-tree forest over the test set twice.
y_proba = clf.predict_proba(features_test_poly_scaled)

accuracy = accuracy_score(y_test, y_pred)
logloss = log_loss(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba, multi_class="ovr")  # one-vs-rest multiclass AUC

print(f"✅ Accuracy: {accuracy:.4f}")
print(f"✅ Log Loss: {logloss:.4f}")
print(f"✅ AUC: {auc:.4f}")

✅ Accuracy: 0.3322
✅ Log Loss: 1.1985
✅ AUC: 0.4990
