In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import groupby
from event_detection_ap import score, ParticipantVisibleError

In [2]:
# Load the preprocessed training table. Later cells read its series_id, step,
# timestamp, anglez, enmo and awake columns.
train = pd.read_parquet("processed/final_dataset.parquet")

In [3]:
# Row count of the loaded table (displayed as the cell output).
len(train)


23060846

In [4]:
def make_features(df):
    """Add time, differential, rolling and lag features to ``df``.

    The frame is modified in place (columns are added, ``timestamp`` and
    ``anglez`` are overwritten) and the same object is returned.

    Every sequence feature (diff, rolling statistic, lag) is computed per
    ``series_id`` so that values from one recording never leak into the
    neighbouring one.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``series_id``, ``timestamp``, ``anglez`` and ``enmo``.
        Rows are expected to be in time order within each series.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the feature columns added.
    """
    print("🔄 Starting feature generation...")

    # === 1. Convert timestamp to datetime and remove timezone ===
    print("🕒 Converting timestamp to datetime...")
    timestamps = pd.to_datetime(df['timestamp'])
    if isinstance(timestamps.dtype, pd.DatetimeTZDtype):
        # Single timezone/offset: strip it in one vectorised call (keeps wall time).
        timestamps = timestamps.dt.tz_localize(None)
    elif not pd.api.types.is_datetime64_dtype(timestamps):
        # Object column (e.g. mixed UTC offsets): fall back to per-element stripping.
        timestamps = timestamps.apply(lambda t: t.tz_localize(None))
    df['timestamp'] = timestamps

    # === 2. Clock features ===
    print("⏰ Generating time-based features...")
    hour = df["timestamp"].dt.hour
    minute = df["timestamp"].dt.minute
    df["hour"] = hour.astype('int8')
    df["minute"] = minute.astype('int8')
    df["day_of_week"] = df["timestamp"].dt.dayofweek.astype('int8')
    # Computed in int32: multiplying the int8 ``hour`` column by 60 overflows
    # (23 * 60 = 1380 does not fit in int8) and silently wraps around.
    df["elapsed_time_from_midnight"] = (hour.astype('int32') * 60 + minute.astype('int32')).astype('int32')
    df["is_weekend"] = (df["day_of_week"] >= 5).astype('int8')
    df["is_night"] = ((df["hour"] >= 22) | (df["hour"] <= 6)).astype('int8')

    # === 3. Differential features (per series) ===
    print("📈 Calculating time-differential features...")
    periods = 20
    df["anglez"] = abs(df["anglez"])
    df["anglez_diff"] = df.groupby('series_id')['anglez'].diff(periods=periods).bfill().astype('float16')
    df["enmo_diff"] = df.groupby('series_id')['enmo'].diff(periods=periods).bfill().astype('float16')

    # === 4. Rolling statistical features (per series) ===
    print("🔁 Computing rolling statistics...")
    window_sizes = [12, 100, 360]
    signal_cols = ['anglez', 'enmo']
    # (column-name suffix, Rolling method) in the order the columns are created.
    rolling_stats = [
        ('mean', 'mean'), ('std', 'std'), ('min', 'min'),
        ('max', 'max'), ('median', 'median'), ('cumulative', 'sum'),
    ]
    # Build each grouper once; transform() keeps the original row order and index.
    signal_groups = {col: df[col].groupby(df['series_id'], sort=False) for col in signal_cols}
    for window in window_sizes:
        print(f"  ➤ Rolling features for window size: {window}")
        for col in signal_cols:
            for suffix, method in rolling_stats:
                df[f'{col}_{suffix}_{window}s'] = signal_groups[col].transform(
                    lambda s, w=window, m=method: getattr(s.rolling(w, min_periods=1), m)()
                ).astype('float16')

    # === 5. Lag features (per series) ===
    print("⏪ Adding lag features...")
    lag_targets = [
        "anglez", "enmo",
        "anglez_mean_12s", "anglez_std_12s",
        "enmo_mean_12s", "enmo_std_12s",
        "anglez_mean_100s", "anglez_std_100s",
        "enmo_mean_100s", "enmo_std_100s"
    ]
    lag_steps = [1, 2, 3]
    for col in lag_targets:
        values = df[col]
        if values.dtype == np.float16:
            # Shift in float32 (lossless for float16 data) and cast back below.
            values = values.astype('float32')
        grouped = values.groupby(df['series_id'], sort=False)
        for lag in lag_steps:
            df[f"{col}_lag_{lag}"] = grouped.shift(lag).astype('float16')

    # === 6. Handle NaN values ===
    # The fill runs over the whole frame: NaNs at the start of a series (lags,
    # single-sample std) are back-filled from the following rows.
    print("🧹 Filling missing values...")
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].astype('float32').bfill().ffill().astype(df[numeric_cols].dtypes.to_dict())

    print("✅ Feature generation complete. Total features:", len(df.columns))

    return df

In [5]:
# Core signals shared by every feature set.
base_features = [
    "hour",
    "anglez_mean_100s",
    "anglez_std_100s",
    "anglez_diff",
    "enmo",
    "enmo_mean_100s",
    "enmo_max_100s",
    "enmo_std_100s",
    "enmo_diff",
]

# Base set plus the long (360-sample) window statistics and calendar flags.
extended_features1 = [
    *base_features,
    "anglez_mean_360s",
    "anglez_std_360s",
    "enmo_std_360s",
    "is_night",
    "is_weekend",
]

# Base set plus lagged copies of two 100-sample rolling statistics.
extended_features2 = [
    *base_features,
    "anglez_std_100s_lag_3",
    "anglez_std_100s_lag_2",
    "anglez_std_100s_lag_1",
    "enmo_mean_100s_lag_1",
    "enmo_mean_100s_lag_2",
    "enmo_mean_100s_lag_3",
]

In [6]:
# Add the engineered feature columns. make_features writes them into the frame
# it is given and returns that same frame.
train = make_features(train)

🔄 Starting feature generation...
🕒 Converting timestamp to datetime...
⏰ Generating time-based features...
📈 Calculating time-differential features...
🔁 Computing rolling statistics...
  ➤ Rolling features for window size: 12
  ➤ Rolling features for window size: 100
  ➤ Rolling features for window size: 360
⏪ Adding lag features...
🧹 Filling missing values...
✅ Feature generation complete. Total features: 80


In [7]:
def showcor(X):
    """Plot an annotated heatmap of the pairwise column correlations of ``X``.

    ``X`` must provide a ``.corr()`` method (e.g. a pandas DataFrame). The
    figure is shown immediately; nothing is returned.
    """
    plt.figure(figsize=(12, 10))

    correlation_matrix = X.corr()
    heatmap_options = dict(annot=True, fmt=".2f", cmap='coolwarm', center=0)
    sns.heatmap(correlation_matrix, **heatmap_options)

    plt.title("Feature Correlation Heatmap (with values)")
    plt.show()

In [8]:
def run_cv_random_forest(train_df, features, n_splits=3, model_params=None):
    """Run grouped K-fold cross-validation of a random forest on ``awake``.

    Folds are built with ``GroupKFold`` on ``series_id`` so that all rows of a
    series land in the same fold.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the ``features`` columns plus ``awake`` (0/1 target) and
        ``series_id`` (grouping key).
    features : list of str
        Columns used as model inputs.
    n_splits : int, default 3
        Number of GroupKFold folds.
    model_params : dict, optional
        Keyword arguments for ``RandomForestClassifier``. When omitted, a
        100-tree forest with ``min_samples_leaf=300`` and a fixed seed is used.

    Returns
    -------
    tuple
        ``(oof_preds, oof_preds_not_awake, model, columns, importance_df)``:
        out-of-fold probabilities of class 1 and of class 0, the model fitted
        on the LAST fold only, the feature column index, and the fold-averaged
        feature importances sorted in descending order.
    """
    if model_params is None:
        model_params = {
            'n_estimators': 100,
            'min_samples_leaf': 300,
            'random_state': 42,
            'n_jobs': -1
        }

    X = train_df[features]
    y = train_df["awake"]
    groups = train_df["series_id"]

    gkf = GroupKFold(n_splits=n_splits)
    oof_preds = np.zeros(len(X))
    oof_preds_not_awake = np.zeros(len(X))
    feature_importances = np.zeros(X.shape[1])
    fold_metrics = []
    model = None  # stays None only if the splitter yields no fold

    for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
        print(f"\n🔁 Fold {fold + 1}")
        print(f"  ➤ Training size: {len(train_idx)} | Validation size: {len(val_idx)}")

        # iloc with an index array already returns new objects, so an extra
        # .copy() would only duplicate millions of rows in memory.
        X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = RandomForestClassifier(**model_params)
        model.fit(X_tr, y_tr)

        probs = model.predict_proba(X_val)
        # The forest's predict() is the argmax of predict_proba(); deriving the
        # labels here avoids running inference over the fold a second time.
        preds = model.classes_[np.argmax(probs, axis=1)]

        # Look the probability columns up by label instead of assuming that
        # both classes were present in the training fold in 0/1 order.
        class_labels = list(model.classes_)
        if 1 in class_labels:
            oof_preds[val_idx] = probs[:, class_labels.index(1)]
        if 0 in class_labels:
            oof_preds_not_awake[val_idx] = probs[:, class_labels.index(0)]

        precision = precision_score(y_val, preds)
        recall = recall_score(y_val, preds)
        f1 = f1_score(y_val, preds)

        fold_metrics.append({
            'fold': fold + 1,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        })

        print(f"  📈 Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")

        feature_importances += model.feature_importances_

    avg_importances = feature_importances / n_splits
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': avg_importances
    }).sort_values(by='importance', ascending=False)

    metrics_df = pd.DataFrame(fold_metrics)
    print("\n📊 Fold-wise metrics:\n", metrics_df)
    print(f"\n🔍 Mean F1 Score: {metrics_df['f1_score'].mean():.4f}")

    print("\n🔥 Top 10 Feature Importances:")
    print(importance_df.head(10))

    return oof_preds, oof_preds_not_awake, model, X.columns, importance_df

In [9]:
def smooth_predictions(df, awake_col="awake_pred", not_awake_col="not_awake_pred", smoothing_length=460, group_col="series_id"):
    """Smooth per-row class probabilities with a centred rolling mean.

    Adds two columns to a copy of ``df``:

    * ``score``  - rolling mean of ``awake_col``;
    * ``smooth`` - rolling mean of ``not_awake_col`` rounded to the nearest
      integer.

    When ``group_col`` is present in ``df`` the window is applied separately
    within each group, so predictions of one series are never averaged with
    those of the next one. Without that column (or with ``group_col=None``)
    the window runs over the whole frame.

    Window positions that do not have ``smoothing_length`` samples (the edges)
    are filled from the nearest complete window of the same group; a group
    shorter than the window therefore stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the probability columns; it is not modified.
    awake_col, not_awake_col : str
        Names of the two probability columns.
    smoothing_length : int, default 460
        Rolling window size in rows.
    group_col : str or None, default "series_id"
        Column delimiting independent sequences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``score`` and ``smooth`` added.
    """
    df = df.copy()

    def _centered_mean(values):
        # Centred window; incomplete windows at both ends are edge-filled.
        return values.rolling(smoothing_length, center=True).mean().bfill().ffill()

    def _smooth(col):
        if group_col is not None and group_col in df.columns:
            return df.groupby(group_col, sort=False)[col].transform(_centered_mean)
        return _centered_mean(df[col])

    df["score"] = _smooth(awake_col)
    df["smooth"] = _smooth(not_awake_col).round()
    return df

In [10]:

# Define the function to determine the onset and wakeup events
def get_event(df):
    """Label the first and last row of every sleep run with an event.

    A run is a maximal stretch of consecutive rows that share the same
    ``series_id`` and whose ``smooth`` value is non-zero and not null. For a
    run of at least two rows the first row is labelled ``'onset'`` and the
    last ``'wakeup'``; every other row (including single-row runs) gets ``0``.

    Parameters
    ----------
    df : object with ``series_id`` and ``smooth`` attributes
        Typically a DataFrame; the two attributes are iterated in parallel.

    Returns
    -------
    list
        One entry per row: ``'onset'``, ``'wakeup'`` or ``0``.
    """
    def _run_key(pair):
        series_id, value = pair
        # Force a plain bool: comparing a NumPy scalar yields numpy.bool_,
        # which an identity test such as ``is False`` never matches.
        return series_id, bool(value != 0 and not pd.isnull(value))

    events = []
    for (_, asleep), run in groupby(zip(df.series_id, df.smooth), key=_run_key):
        run_length = sum(1 for _ in run)
        if asleep and run_length > 1:
            events.extend(['onset'] + [0] * (run_length - 2) + ['wakeup'])
        else:
            events.extend([0] * run_length)
    return events


In [11]:

def apply_filter(result_events, step_diff_min, output_path='valid_pairs_predictions.csv'):
    """Keep onset/wakeup pairs that are at least ``step_diff_min`` steps apart.

    Within each series the events are walked in step order. Every ``'wakeup'``
    is paired with the most recent unpaired ``'onset'``; the pair is kept when
    the step distance is ``>= step_diff_min`` and dropped otherwise. The kept
    events are written to ``output_path`` as CSV.

    Parameters
    ----------
    result_events : DataFrame-like
        Events with ``series_id``, ``step``, ``event`` and ``score`` columns.
    step_diff_min : int
        Minimum allowed distance, in steps, between an onset and its wakeup.
    output_path : str, default 'valid_pairs_predictions.csv'
        Destination CSV file.

    Returns
    -------
    pandas.DataFrame
        The rows that were written (onset row followed by its wakeup row for
        each kept pair). Always carries the four output columns, even when no
        pair survives, so the CSV can be read back.
    """
    output_columns = ['series_id', 'step', 'event', 'score']
    events = pd.DataFrame(result_events)

    output_rows = []
    if not events.empty:
        events = events.sort_values(by=['series_id', 'step'])
        for _, series_events in events.groupby('series_id', sort=False):
            pending_onset = None
            for row in series_events[output_columns].itertuples(index=False):
                if row.event == 'onset':
                    pending_onset = row
                elif row.event == 'wakeup' and pending_onset is not None:
                    if abs(row.step - pending_onset.step) >= step_diff_min:
                        output_rows.append(pending_onset._asdict())
                        output_rows.append(row._asdict())
                    pending_onset = None  # the onset is consumed either way

    # Passing the columns explicitly keeps the header when nothing was kept;
    # a column-less frame would produce an empty file that read_csv rejects.
    output_df = pd.DataFrame(output_rows, columns=output_columns)
    output_df.to_csv(output_path, index=False)

    print(f"Results saved to '{output_path}'.")
    return output_df

In [12]:
def evaluate_full_pipeline(solution_path='processed/event_cleaned_final.csv',
                           submission_path='valid_pairs_predictions.csv'):
    """Score the predicted events against the ground truth.

    Both CSV files are loaded and handed to ``score`` together with the
    tolerance lists and column names defined below.

    Parameters
    ----------
    solution_path : str
        CSV with the ground-truth events.
    submission_path : str
        CSV with the predicted events (``series_id``, ``step``, ``event``,
        ``score``).

    Returns
    -------
    The value returned by ``score``, or ``None`` when scoring raised
    ``ParticipantVisibleError`` (the error message is printed).
    """
    solution = pd.read_csv(solution_path)      # Ground truth
    submission = pd.read_csv(submission_path)  # Predictions produced from the train set

    # The same tolerance list is passed for both event types.
    tolerance_steps = [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
    tolerances = {
        "onset": list(tolerance_steps),
        "wakeup": list(tolerance_steps),
    }

    # Column names used in the prediction file
    column_names = {
        'series_id_column_name': 'series_id',
        'time_column_name': 'step',
        'event_column_name': 'event',
        'score_column_name': 'score',
    }

    # Initialised up front so the return below is defined on the error path.
    ap_score = None
    try:
        ap_score = score(solution, submission, tolerances, **column_names)
        print(f"\n✅ Average Precision Score: {ap_score}")
    except ParticipantVisibleError as e:
        print(f"\n❌ Error: {e}")

    return ap_score

In [13]:
def plot_feature_importances(model, feature_names, top_n=None, save_path=None, title="Feature Importances"):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : estimator exposing ``feature_importances_``
    feature_names : sequence of str
        Names matching the entries of ``model.feature_importances_``.
    top_n : int, optional
        When truthy, only the ``top_n`` most important features are kept.
    save_path : str, optional
        When truthy, the (possibly truncated) table is also written there as CSV.
    title : str
        Figure title.

    Returns
    -------
    pandas.DataFrame
        ``feature`` / ``importance`` rows sorted by descending importance.
    """
    import matplotlib.pyplot as plt
    import pandas as pd

    ranked = pd.DataFrame({
        "feature": feature_names,
        "importance": model.feature_importances_,
    })
    ranked = ranked.sort_values(by="importance", ascending=False)
    feat_df = ranked.head(top_n) if top_n else ranked

    plt.figure(figsize=(10, 6))
    plt.barh(feat_df["feature"], feat_df["importance"])
    # Largest importance at the top of the chart.
    plt.gca().invert_yaxis()
    plt.title(title)
    plt.tight_layout()
    plt.show()

    if not save_path:
        return feat_df

    feat_df.to_csv(save_path, index=False)
    print(f"📁 Feature importances saved to: {save_path}")
    return feat_df

In [14]:
def main_pipeline(train, features, step_diff_min=2000, smoothing_length=460, n_splits=3, model_params=None):
    """Run cross-validation, event extraction, filtering and scoring once.

    Steps: grouped CV of the random forest, plot of the feature importances of
    the model returned by ``run_cv_random_forest``, smoothing of the
    out-of-fold probabilities, conversion to onset/wakeup events, filtering of
    short pairs, and scoring.

    ``train`` is left untouched: predictions are stored in a small working
    frame holding only ``series_id`` and ``step``, which also avoids copying
    the full feature table on every run.

    Parameters
    ----------
    train : pandas.DataFrame
        Feature table with ``series_id``, ``step``, ``awake`` and ``features``.
    features : list of str
        Model input columns.
    step_diff_min : int, default 2000
        Minimum onset-to-wakeup distance passed to ``apply_filter``.
    smoothing_length : int, default 460
        Rolling window passed to ``smooth_predictions``.
    n_splits : int, default 3
        Number of CV folds.
    model_params : dict, optional
        Random forest parameters.

    Returns
    -------
    The score returned by ``evaluate_full_pipeline``.

    Side effects: writes ``feature_importances.csv`` and ``result_events.csv``
    in the working directory, plus whatever ``apply_filter`` writes.
    """
    print("🚀 Starting full pipeline...")

    oof_preds, oof_preds_not_awake, model, cols, _ = run_cv_random_forest(
        train, features, n_splits=n_splits, model_params=model_params
    )
    plot_feature_importances(model, cols, top_n=15, save_path="feature_importances.csv")

    # Work on the identifying columns only instead of mutating the caller's frame.
    pred_df = train[["series_id", "step"]].copy()
    pred_df["awake_pred"] = oof_preds
    pred_df["not_awake_pred"] = oof_preds_not_awake

    pred_df = smooth_predictions(pred_df, smoothing_length=smoothing_length)

    pred_df["event"] = get_event(pred_df)
    train_events = pred_df.loc[pred_df["event"] != 0, ["series_id", "step", "event", "score"]].reset_index(drop=True)
    train_events.to_csv('result_events.csv', index=False)
    print("\n✅ Events saved to: result_events.csv")

    apply_filter(train_events, step_diff_min=step_diff_min)
    ap_score = evaluate_full_pipeline()

    return ap_score

In [15]:
from itertools import product
import pandas as pd

import pandas as pd
from itertools import product
from datetime import datetime

def parameter_sweep_with_features(
    train,
    feature_sets,
    param_grid,
    step_diff_min_values=(2000,),
    smoothing_lengths=(460,),
    n_splits=3,
    result_csv_path="sweep_results.csv"
):
    """Run ``main_pipeline`` for every combination of the given settings.

    The sweep iterates over feature sets, ``step_diff_min`` values, smoothing
    lengths and the Cartesian product of ``param_grid``. Each run is appended
    to ``result_csv_path`` as soon as it finishes, so partial results survive
    an interrupted sweep. A run that raises is reported and recorded with
    ``ap_score=None`` instead of aborting the sweep.

    Parameters
    ----------
    train : pandas.DataFrame
        Training table handed to ``main_pipeline``.
    feature_sets : dict[str, list[str]]
        Name -> feature columns.
    param_grid : dict[str, list]
        Model parameter name -> candidate values. An empty dict runs the
        default parameters once per setting.
    step_diff_min_values, smoothing_lengths : iterable of int
        Values to sweep.
    n_splits : int, default 3
        Number of CV folds.
    result_csv_path : str
        CSV file results are appended to; created with a header if missing
        or empty.

    Returns
    -------
    pandas.DataFrame
        One row per run of this call, sorted by ``ap_score`` descending.
    """
    result_columns = [
        'timestamp', 'feature_set', 'features', 'params',
        'step_diff_min', 'smoothing_length', 'ap_score'
    ]
    results = []
    default_params = {
        'random_state': 42,
        'n_jobs': -1
    }

    # Built without tuple unpacking so an empty grid is valid; product() of no
    # iterables yields a single empty combination.
    keys = list(param_grid)
    values = [param_grid[key] for key in keys]

    # Initialise the CSV with a header unless a readable file already exists.
    try:
        pd.read_csv(result_csv_path, nrows=0)
        print(f"📄 Appending to existing result file: {result_csv_path}")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        pd.DataFrame(columns=result_columns).to_csv(result_csv_path, index=False)
        print(f"📄 Created new result file: {result_csv_path}")

    for feature_set_name, feature_list in feature_sets.items():
        print(f"\n🧪 Testing feature set: {feature_set_name}")
        for step_diff_min in step_diff_min_values:
            for smoothing_length in smoothing_lengths:
                for comb in product(*values):
                    sweep_params = dict(zip(keys, comb))
                    model_params = {**default_params, **sweep_params}

                    print(f"\n🔍 Running with smoothing={smoothing_length}, step_diff_min={step_diff_min}, params={model_params}")

                    # Deliberate best-effort: one failing configuration must
                    # not discard the remaining runs of a long sweep.
                    try:
                        ap_score = main_pipeline(
                            train=train,
                            features=feature_list,
                            step_diff_min=step_diff_min,
                            smoothing_length=smoothing_length,
                            n_splits=n_splits,
                            model_params=model_params
                        )
                    except Exception as e:
                        print(f"❌ Failed: {e}")
                        ap_score = None

                    result = {
                        'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        'feature_set': feature_set_name,
                        'features': ', '.join(feature_list),
                        'params': model_params,
                        'step_diff_min': step_diff_min,
                        'smoothing_length': smoothing_length,
                        'ap_score': ap_score
                    }

                    results.append(result)

                    # Write the row immediately, in header order.
                    pd.DataFrame([result], columns=result_columns).to_csv(
                        result_csv_path, mode='a', index=False, header=False
                    )
                    print(f"📤 Saved result to {result_csv_path}")

    # Explicit columns keep the sort valid when no run was executed.
    results_df = pd.DataFrame(results, columns=result_columns)
    results_df = results_df.sort_values(by='ap_score', ascending=False)
    return results_df

In [None]:
# Random forest hyper-parameters to sweep (a single combination here).
param_grid = dict(n_estimators=[100], min_samples_leaf=[300])

# Named feature lists to compare.
feature_sets = dict(
    extended1=extended_features1,
    extended2=extended_features2,
)

# Every feature set is evaluated for each threshold / smoothing combination.
sweep_results = parameter_sweep_with_features(
    train=train,
    feature_sets=feature_sets,
    param_grid=param_grid,
    step_diff_min_values=[1500, 2000, 2500],
    smoothing_lengths=[300, 460, 600],
    result_csv_path="sweep_results.csv",
)

report_columns = ['feature_set', 'smoothing_length', 'step_diff_min', 'params', 'ap_score']
print(sweep_results[report_columns])

📄 Created new result file: sweep_results.csv

🧪 Testing feature set: extended1

🔍 Running with smoothing=300, step_diff_min=1500, params={'random_state': 42, 'n_jobs': -1, 'n_estimators': 100, 'min_samples_leaf': 300}
🚀 Starting full pipeline...

🔁 Fold 1
  ➤ Training size: 15330629 | Validation size: 7730217
