### Simple Random Forest classifier submission
**Author: [Carl McBride Ellis](https://www.kaggle.com/carlmcbrideellis)** ([LinkedIn](https://www.linkedin.com/in/carl-mcbride-ellis/))


We shall be using the training data from the reduced dataset ["Zzzs: Lightweight training dataset + target"](https://www.kaggle.com/datasets/carlmcbrideellis/zzzs-lightweight-training-dataset-target)

In [1]:
import numpy as np
import pandas as pd
from itertools import groupby

In [2]:
# Load the lightweight training set (features + "awake" target) and the
# competition test series.
train = pd.read_parquet("/kaggle/input/zzzs-lightweight-training-dataset-target/Zzzs_train.parquet")
test  = pd.read_parquet("/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet")

# DataFrame.dropna() returns a NEW frame and leaves the caller untouched, so the
# result has to be bound back to the name; otherwise no row is ever removed.
# reset_index(drop=True) keeps a contiguous 0..n-1 index after rows are dropped.
train = train.dropna().reset_index(drop=True)
test = test.dropna().reset_index(drop=True)

# parse the timestamp and create an "hour" feature
train["timestamp"] = pd.to_datetime(train["timestamp"],utc=True)
train["hour"] = train["timestamp"].dt.hour

test["timestamp"] = pd.to_datetime(test["timestamp"],utc=True)
test["hour"] = test["timestamp"].dt.hour

#### Random forest

%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample

X_train = train[["anglez","enmo","hour"]]
y_train = train["awake"]

classifier = RandomForestClassifier(n_estimators=100,
                                    max_depth=10,
                                    min_samples_leaf=500,random_state=42)



classifier.fit(X_train, y_train)

#### XGBoost

import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Target column first, then the three model inputs.
y_train = train["awake"]
feature_columns = ["anglez","enmo","hour"]
X_train = train[feature_columns]

# Gradient-boosted trees with the library's default hyper-parameters.
# NOTE: this rebinds `classifier`, replacing any model fitted in an earlier cell.
classifier = xgb.XGBClassifier()
classifier.fit(X_train, y_train)


#### AdaBoost

In [3]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Target column first, then the three model inputs.
y_train = train["awake"]
feature_columns = ["anglez","enmo","hour"]
X_train = train[feature_columns]

# AdaBoost with 50 boosting rounds and a fixed seed; the base learner is left
# at the library default.
# NOTE: this rebinds `classifier`, replacing any model fitted in an earlier cell.
boost_settings = {"n_estimators": 50, "random_state": 42}
classifier = AdaBoostClassifier(**boost_settings)
classifier.fit(X_train, y_train)



#### SVM

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features (X) and target (y), taken from the training frame.
# The template placeholder `X, y = ...` cannot be unpacked (it raises a
# TypeError), so the cell could never run as written.
X = train[["anglez","enmo","hour"]]
y = train["awake"]

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
# NOTE: this rebinds X_train / y_train / X_test for the rest of the session.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling (important for SVM): fit the scaler on the training split
# only, then apply the same transform to the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train an SVM classifier
# NOTE(review): SVC fit time grows much faster than linearly with the number
# of rows - confirm this is feasible on the full training frame, or subsample.
svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred = svm_classifier.predict(X_test)

# Evaluate the model's accuracy on the held-out split
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


In [4]:
# Score every test row with whichever model is currently bound to `classifier`.
feature_columns = ["anglez","enmo","hour"]
X_test = test[feature_columns]

# Keep the second probability column only.
# NOTE(review): presumably this is P(awake == 1), i.e. classes_ == [0, 1] - confirm.
class_probabilities = classifier.predict_proba(X_test)
test["score"] = class_probabilities[:,1]
test["score"]

0      0.536756
1      0.536756
2      0.536756
3      0.536756
4      0.536756
         ...   
445    0.540114
446    0.537152
447    0.533519
448    0.529884
449    0.530794
Name: score, Length: 450, dtype: float64

In [5]:
# Complement of the model score.
test["not_awake"] = 1-test["score"]
# exponential smoothing of the predictions, restarted for every series_id:
# the test frame holds several series one after another, and a single ewm()
# over the whole column would let the end of one series bleed into the start
# of the next one.
test["smooth"] = (
    test.groupby("series_id", sort=False)["not_awake"]
        .transform(lambda series_scores: series_scores.ewm(span = 100).mean())
)
# re-binarize
test["smooth"] = test["smooth"].round()
print(test["smooth"])
# https://stackoverflow.com/questions/73777727/how-to-mark-start-end-of-a-series-of-non-null-and-non-0-values-in-a-column-of-a
def get_event(df):
    """Mark the first and last row of every run of "active" rows.

    Rows are walked in order and split into consecutive runs that share both
    the same ``series_id`` and the same state, where a row is *active* when its
    ``smooth`` value is neither null nor 0.

    Parameters
    ----------
    df : object exposing ``series_id`` and ``smooth`` as equal-length iterables
        (for example a pandas DataFrame with those two columns).

    Returns
    -------
    list
        One entry per input row, in input order: ``'onset'`` on the first row
        and ``'wakeup'`` on the last row of every active run longer than one
        row, and ``0`` everywhere else (inactive rows, interior rows of a run,
        and active runs of a single row).
    """
    def run_key(row):
        series_id, value = row
        # Null is tested first so that pd.NA never reaches the comparison, and
        # bool(...) keeps the key a plain Python bool even when `value` is a
        # numpy scalar (an identity test against False would miss numpy.bool_).
        return series_id, bool(not pd.isnull(value) and value != 0)

    events = []
    for (_, is_active), run in groupby(zip(df.series_id, df.smooth), key=run_key):
        run_length = sum(1 for _ in run)
        if is_active and run_length > 1:
            events.extend(['onset'] + [0] * (run_length - 2) + ['wakeup'])
        else:
            events.extend([0] * run_length)
    return events

# Label every test row: the list returned by get_event has one entry per row,
# so it can be stored directly as a new column.
event_labels = get_event(test)
test["event"] = event_labels
print(test["event"])

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
445    0.0
446    0.0
447    0.0
448    0.0
449    0.0
Name: smooth, Length: 450, dtype: float64
0      0
1      0
2      0
3      0
4      0
      ..
445    0
446    0
447    0
448    0
449    0
Name: event, Length: 450, dtype: int64


In [6]:
# Keep only the rows that carry an event label (everything that is not 0).
event_rows = test.loc[test["event"] != 0]

# Restrict to the submitted columns, renumber the rows from 0, and expose that
# numbering as the "row_id" column.
sample_submission = event_rows[["series_id","step","event","score"]].copy()
sample_submission = sample_submission.reset_index(drop=True)
sample_submission = sample_submission.reset_index(names="row_id")

sample_submission.to_csv('submission.csv', index=False)