In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from tqdm import tqdm

In [2]:
train = pd.read_parquet("processed/final_dataset.parquet")

In [3]:
len(train)


23060846

In [4]:
def make_features(df):
    df['timestamp'] = pd.to_datetime(df['timestamp']).apply(lambda t: t.tz_localize(None))
    df["hour"] = df["timestamp"].dt.hour
    
    periods = 20
    df["anglez"] = abs(df["anglez"])
    df["anglez_diff"] = df.groupby('series_id')['anglez'].diff(periods=periods).fillna(method="bfill").astype('float16')
    df["enmo_diff"] = df.groupby('series_id')['enmo'].diff(periods=periods).fillna(method="bfill").astype('float16')
    df["anglez_rolling_mean"] = df["anglez"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_rolling_mean"] = df["enmo"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["anglez_rolling_max"] = df["anglez"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_rolling_max"] = df["enmo"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["anglez_rolling_std"] = df["anglez"].rolling(periods,center=True).std().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_rolling_std"] = df["enmo"].rolling(periods,center=True).std().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["anglez_diff_rolling_mean"] = df["anglez_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_diff_rolling_mean"] = df["enmo_diff"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["anglez_diff_rolling_max"] = df["anglez_diff"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
    df["enmo_diff_rolling_max"] = df["enmo_diff"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
    
    return df

features = ["hour",
            "anglez",
            "anglez_rolling_mean",
            "anglez_rolling_max",
            "anglez_rolling_std",
            "anglez_diff",
            "anglez_diff_rolling_mean",
            "anglez_diff_rolling_max",
            "enmo",
            "enmo_rolling_mean",
            "enmo_rolling_max",
            "enmo_rolling_std",
            "enmo_diff",
            "enmo_diff_rolling_mean",
            "enmo_diff_rolling_max"]


In [5]:
train = make_features(train)

X_train = train[features]
y_train = train["awake"]


  df["anglez_diff"] = df.groupby('series_id')['anglez'].diff(periods=periods).fillna(method="bfill").astype('float16')
  df["enmo_diff"] = df.groupby('series_id')['enmo'].diff(periods=periods).fillna(method="bfill").astype('float16')
  df["anglez_rolling_mean"] = df["anglez"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
  df["enmo_rolling_mean"] = df["enmo"].rolling(periods,center=True).mean().fillna(method="bfill").fillna(method="ffill").astype('float16')
  df["anglez_rolling_max"] = df["anglez"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
  df["enmo_rolling_max"] = df["enmo"].rolling(periods,center=True).max().fillna(method="bfill").fillna(method="ffill").astype('float16')
  df["anglez_rolling_std"] = df["anglez"].rolling(periods,center=True).std().fillna(method="bfill").fillna(method="ffill").astype('float16')
  df["enmo_rolling_std"] = df["enmo"].rolling(periods,center=True

In [6]:
# Train the Random Forest Classifier
classifier = RandomForestClassifier(n_estimators=50, min_samples_leaf=300, random_state=42, n_jobs=-1)
classifier.fit(X_train, y_train)


In [7]:



# Predict probabilities for training data
train["not_awake"] = classifier.predict_proba(X_train)[:,0]
train["awake"] = classifier.predict_proba(X_train)[:,1]

# Smoothing the predictions
smoothing_length = 2*230
train["score"]  = train["awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")
train["smooth"] = train["not_awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")

# Re-binarize the smooth values
train["smooth"] = train["smooth"].round()

  train["score"]  = train["awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")
  train["smooth"] = train["not_awake"].rolling(smoothing_length,center=True).mean().fillna(method="bfill").fillna(method="ffill")


In [8]:

# Define the function to determine the onset and wakeup events
def get_event(df):
    lstCV = zip(df.series_id, df.smooth)
    lstPOI = []
    for (c, v), g in groupby(lstCV, lambda cv: (cv[0], cv[1] != 0 and not pd.isnull(cv[1]))):
        llg = sum(1 for item in g)
        if v is False:
            lstPOI.extend([0] * llg)
        else:
            lstPOI.extend(['onset'] + (llg - 2) * [0] + ['wakeup'] if llg > 1 else [0])
    return lstPOI


In [9]:
from itertools import groupby

In [10]:

train["event"] = get_event(train)

# Save the events for inspection
train_events = train.loc[train["event"] != 0][["series_id", "step", "event", "score"]].copy().reset_index(drop=True)
train_events.to_csv('result_events.csv', index=False)

print("\n✅ Sleep events for training data saved under: result_events.csv")


✅ Sleep events for training data saved under: result_events.csv


In [11]:
import pandas as pd

# Load the train events CSV file into a DataFrame
train_events = pd.read_csv('result_events.csv')

# Display the first few rows to check the data
print(train_events.head(30))

       series_id    step   event     score
0   08db4255286f    9313   onset  0.499278
1   08db4255286f   14420  wakeup  0.499938
2   08db4255286f   26558   onset  0.499162
3   08db4255286f   27111  wakeup  0.499733
4   08db4255286f   27138   onset  0.499836
5   08db4255286f   27892  wakeup  0.499428
6   08db4255286f   28109   onset  0.499788
7   08db4255286f   28206  wakeup  0.498957
8   08db4255286f   28262   onset  0.499286
9   08db4255286f   31342  wakeup  0.499927
10  08db4255286f   44173   onset  0.499886
11  08db4255286f   49108  wakeup  0.498149
12  08db4255286f   60304   onset  0.499288
13  08db4255286f   66357  wakeup  0.498073
14  08db4255286f   79142   onset  0.499049
15  08db4255286f   85344  wakeup  0.499896
16  08db4255286f   97992   onset  0.499634
17  08db4255286f   98283  wakeup  0.499584
18  08db4255286f   98662   onset  0.499115
19  08db4255286f  104114  wakeup  0.499103
20  08db4255286f  114169   onset  0.499035
21  08db4255286f  118831  wakeup  0.498822
22  08db425

In [12]:
df = pd.DataFrame(train_events)

# Set a threshold for valid pairings (step difference > 15000 is invalid)
step_diff_min_threshold = 2000

# Sort by series_id and step
df = df.sort_values(by=['series_id', 'step'])

# Initialize a list to store valid event pairs
valid_pairs = []

# Iterate through each unique series_id and match onsets and wakeups
for series_id in df['series_id'].unique():
    series_data = df[df['series_id'] == series_id]
    
    onset_event = None
    for index, row in series_data.iterrows():
        if row['event'] == 'onset':
            onset_event = row
        elif row['event'] == 'wakeup' and onset_event is not None:
            # Ensure the step difference between onset and wakeup is above the minimum threshold
            step_diff = abs(row['step'] - onset_event['step'])
            if step_diff >= step_diff_min_threshold:
                valid_pairs.append((onset_event, row))  # Add the valid pair
            onset_event = None  # Reset for next pairing

# Create a DataFrame for the valid pairs in the required format
output_data = []
for onset, wakeup in valid_pairs:
    output_data.append({
        'series_id': onset['series_id'],
        'step': onset['step'],
        'event': 'onset',
        'score': onset['score']
    })
    output_data.append({
        'series_id': wakeup['series_id'],
        'step': wakeup['step'],
        'event': 'wakeup',
        'score': wakeup['score']
    })

# Create a DataFrame from the output data
output_df = pd.DataFrame(output_data)

# Save the results to a CSV file
output_df.to_csv('valid_pairs_predictions.csv', index=False)

print("Results saved to 'valid_pairs_predictions.csv'.")

Results saved to 'valid_pairs_predictions.csv'.


In [13]:
from event_detection_ap import score, ParticipantVisibleError

In [14]:
import pandas as pd


# Load ground truth and predictions
solution = pd.read_csv('processed/event_cleaned_final.csv')             # Ground truth
submission = pd.read_csv('valid_pairs_predictions.csv')                      # Your predictions from train set

# Define tolerances
tolerances = {
    "onset":  [12, 36, 60, 90, 120, 150, 180, 240, 300, 360],
    "wakeup": [12, 36, 60, 90, 120, 150, 180, 240, 300, 360],
}

# Set correct column names used in your prediction file
column_names = {
    'series_id_column_name': 'series_id',
    'time_column_name': 'step',
    'event_column_name': 'event',
    'score_column_name': 'score',  # You named the prediction confidence 'score'
}

# Run scoring
try:
    ap_score = score(solution, submission, tolerances, **column_names)
    print(f"\n✅ Average Precision Score: {ap_score}")
except ParticipantVisibleError as e:
    print(f"\n❌ Error: {e}")


✅ Average Precision Score: 0.4823659738039304


In [15]:
solution = pd.read_csv('processed/event_cleaned_final.csv')
submission = pd.read_csv('valid_pairs_predictions.csv')

# Function to count events per series_id
def count_events(df, label):
    counts = df[df["event"].isin(["onset", "wakeup"])] \
                .groupby(["series_id", "event"]) \
                .size() \
                .unstack(fill_value=0) \
                .reset_index()
    counts["source"] = label
    return counts

# Apply to both datasets
solution_counts = count_events(solution, "ground_truth")
submission_counts = count_events(submission, "prediction")

# Summary
print(f"🔍 Ground Truth: {solution['series_id'].nunique()} series_ids")
print(f"🔍 Predictions : {submission['series_id'].nunique()} series_ids\n")

print("📊 Ground Truth Event Counts:")
print(solution_counts.head(50))

print("\n📊 Prediction Event Counts:")
print(submission_counts.head(50))

🔍 Ground Truth: 62 series_ids
🔍 Predictions : 62 series_ids

📊 Ground Truth Event Counts:
event     series_id  onset  wakeup        source
0      08db4255286f     25      25  ground_truth
1      0a96f4993bd7     15      15  ground_truth
2      0cfc06c129cc     21      21  ground_truth
3      0ef7d94fde99     21      21  ground_truth
4      1087d7b0ff2e     24      24  ground_truth
5      10f8bc1f7b07     23      23  ground_truth
6      1319a1935f48     33      33  ground_truth
7      1716cd4163b2     25      25  ground_truth
8      18b61dd5aae8     29      29  ground_truth
9      1955d568d987     29      29  ground_truth
10     2654a87be968      8       8  ground_truth
11     29c75c018220     26      26  ground_truth
12     3452b878e596     31      31  ground_truth
13     3664fe9233f9     22      22  ground_truth
14     483d6545417f     17      17  ground_truth
15     55a47ff9dc8a     23      23  ground_truth
16     5acc9d63b5fd     24      24  ground_truth
17     5c55a5e717d6     21  