In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Mining Pipeline and Preprocessing

In [None]:
# Read the training events table and display it.
EVENTS_CSV_PATH = '/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv'
training_data_events = pd.read_csv(EVENTS_CSV_PATH)
training_data_events

In [None]:
def translate_time(data: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of ``data`` with a parsed ``datetime`` column added.

    Each value of ``data[col]`` is converted to text, its ``T`` characters are
    replaced by spaces and its last five characters are dropped before the
    result is handed to ``pd.to_datetime``.
    NOTE(review): dropping five trailing characters presumably removes a
    ``-0400``-style UTC offset, leaving naive local time -- confirm against
    the raw data.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the timestamp column. It is never modified, so it is
        safe to pass a slice of another frame.
    col : str
        Name of the column containing the timestamp strings.

    Returns
    -------
    pd.DataFrame
        A copy of ``data`` with a ``datetime`` column (missing timestamps
        become ``NaT``). If the conversion fails, the error is printed and the
        copy is returned without a new/changed ``datetime`` column.
    """
    result = data.copy()
    try:
        raw = result[col]
        cleaned = raw.astype(str).str.replace('T', ' ', regex=False).str[:-5]
        # astype(str) turns missing values into text such as 'nan'; mask them
        # back to missing so they parse to NaT explicitly rather than by accident.
        cleaned = cleaned.where(raw.notna())
        # Assign only after parsing succeeded, so a failure cannot leave a
        # half-converted string column behind.
        result['datetime'] = pd.to_datetime(cleaned)
    except Exception as e:
        # Best-effort by design: report the problem and return the data as-is.
        print(e)
    return result

# Add the parsed `datetime` column to the events frame, then show five random rows.
training_data_events = translate_time(training_data_events, "timestamp")
training_data_events.sample(5)

In [None]:
# Number of missing values in each column of the events frame.
training_data_events.isna().sum()

In [None]:
# Distinct-value count per column, computed after dropping every row that contains a missing value.
training_data_events.dropna().nunique()

In [None]:
# Read the events table (CSV) and the series table (parquet) from the competition input.
TRAIN_EVENTS_PATH = '/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv'
TRAIN_SERIES_PATH = '/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet'

train_events = pd.read_csv(TRAIN_EVENTS_PATH)
train_series = pd.read_parquet(TRAIN_SERIES_PATH)

In [None]:
# Count the non-null `event` entries of each series, then keep the five largest counts.
events_per_series = train_events.groupby('series_id')['event'].count()
top_five_series = events_per_series.sort_values(ascending=False).iloc[:5]
top_five_series

In [None]:
# Restrict both tables to the five selected series. `.copy()` makes each subset
# an independent frame, so adding the `datetime` column never writes into a
# slice of the full tables (which would raise SettingWithCopyWarning).
in_top_series = train_series['series_id'].isin(top_five_series.index)
train_series_subset = translate_time(train_series[in_top_series].copy(), 'timestamp')

in_top_events = train_events['series_id'].isin(top_five_series.index)
train_events_subset = translate_time(train_events[in_top_events].copy(), 'timestamp')

# Keep the events that have at least one of `step` / `timestamp` recorded.
mask_non_NULL = train_events_subset['step'].notna() | train_events_subset['timestamp'].notna()
train_events_subset_nonNULL = train_events_subset[mask_non_NULL]

# Attach each event to the sensor row with the same series and timestamp; rows
# without a matching event get NaN in the event columns (left join).
train_data_events = pd.merge(
    train_series_subset,
    train_events_subset_nonNULL,
    on=['series_id', 'timestamp', 'datetime'],
    how='left',
)

# Both inputs carry a `step` column; keep the one from the series table.
train_data_events = (
    train_data_events
    .drop(columns=['step_y'])
    .rename(columns={'step_x': 'step'})
)

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# update the frame when pandas copy-on-write is enabled.
train_data_events['event'] = train_data_events['event'].fillna("NoChange")

def event_cat_mapper(x):
    """Encode an event label as an integer category.

    "NoChange" maps to 0, "wakeup" maps to 1, and every other value maps to 2.
    """
    for code, label in enumerate(("NoChange", "wakeup")):
        if x == label:
            return code
    return 2
train_data_events["event_cat"] = train_data_events['event'].apply(event_cat_mapper)

# Build the per-row state label `y`:
#   * a 'wakeup' row sets the state to 0,
#   * any other event row sets it to 1,
#   * rows without an event ("NoChange" or missing) inherit the most recent
#     state of the same series,
#   * rows before the first event of a series get 0.
# This is the vectorised equivalent of walking every row with iterrows() and
# writing `y` one cell at a time, which is extremely slow on a frame with one
# row per sensor sample, and it yields an integer column instead of an object
# column seeded with the string "NULL".
event_labels = train_data_events['event']
is_no_change = event_labels.isna() | (event_labels == "NoChange")
state_at_event = pd.Series(
    np.where(is_no_change, np.nan, np.where(event_labels == 'wakeup', 0.0, 1.0)),
    index=train_data_events.index,
)
train_data_events["y"] = (
    state_at_event
    .groupby(train_data_events['series_id'])
    .ffill()          # carry the last state forward within each series
    .fillna(0)        # state before the first event of a series
    .astype('int64')
)

In [None]:
# Display the merged frame, now including the `event_cat` and `y` columns.
train_data_events

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: everything except identifiers, time columns and label-derived
# columns. The second drop removes columns that later cells add to
# `train_data_events` (`lin_norm`, `math_predictions`), so re-running this cell
# after them produces the same X as a fresh top-to-bottom run.
X = (
    train_data_events
    .drop(columns=['y', 'series_id', 'night', 'event', 'event_cat', 'timestamp', 'datetime'])
    .drop(columns=['lin_norm', 'math_predictions'], errors='ignore')
)
y = train_data_events['y'].astype('int')

# Fixed seed so the split (and every metric computed from it) is reproducible.
# NOTE(review): a random row-level split places neighbouring samples of the same
# series in both train and test; consider holding out whole series instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train

In [None]:
# Display the training feature matrix.
X_train

In [None]:
import xgboost as xgb

# Wrap the training features and labels in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)

# Booster configuration: binary logistic objective evaluated with log loss,
# trees of depth 3, learning rate 0.1.
params = dict(
    objective='binary:logistic',
    max_depth=3,
    learning_rate=0.1,
    eval_metric='logloss',
)
num_round = 100  # Number of boosting rounds
model = xgb.train(params, dtrain, num_boost_round=num_round)


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Booster output for the held-out rows, rounded to hard 0/1 predictions.
y_pred = model.predict(xgb.DMatrix(X_test, label=y_test))
predictions = list(map(round, y_pred))

accuracy = 100 * accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
classification_rep = classification_report(y_test, predictions)

# Print each metric under its heading.
for heading, metric in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", classification_rep),
):
    print(heading, metric)


In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with 100 estimators, fitted on the same training split.
# `random_state` is fixed so repeated runs fit the same model.
adb = AdaBoostClassifier(n_estimators=100, random_state=42)
adb.fit(X_train, y_train)

In [None]:
# Score the AdaBoost model on the held-out rows.
y_pred = adb.predict(X_test)

accuracy = 100 * accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print each metric under its heading.
for heading, metric in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", classification_rep),
):
    print(heading, metric)

In [None]:
# Display the merged, labelled frame again.
train_data_events

In [None]:
# Number of rows for each value of the label `y`.
train_data_events['y'].value_counts()

In [None]:
# Series ids present in the frame, as a list in value_counts() order.
train_data_events.series_id.value_counts().keys().tolist()

In [None]:
# Norm of the `anglez` values over the rows of series 'cfeb11428dd7' labelled y == 1.
np.linalg.norm(train_data_events[(train_data_events['y'] == 1) & (train_data_events['series_id'] == 'cfeb11428dd7')]['anglez'].values)

In [None]:
# Norm of the `anglez` values over the rows of series 'cfeb11428dd7' labelled y == 0.
np.linalg.norm(train_data_events[(train_data_events['y'] == 0) & (train_data_events['series_id'] == 'cfeb11428dd7')]['anglez'].values)

In [None]:
# Keep the list of series ids (in value_counts() order) for later use.
series_ids = train_data_events.series_id.value_counts().keys().tolist()

In [None]:
import math

# Initialise the rolling-norm column as float: the values written into it later
# are floating-point norms, and assigning floats into an integer column relies
# on a dtype upcast that pandas has deprecated.
train_data_events['lin_norm'] = 0.0

In [None]:
def _rolling_l2_norm(values, window=5):
    """Return the L2 norm of each trailing `window`-sample window of `values`.

    Position i holds the norm of values[i - window + 1 : i + 1]. The first
    `window - 1` positions, which have no complete window, are 0.
    """
    samples = np.asarray(values, dtype='float64')
    norms = np.zeros(len(samples))
    if len(samples) >= window:
        windows = np.lib.stride_tricks.sliding_window_view(samples, window)
        norms[window - 1:] = np.linalg.norm(windows, axis=1)
    return norms


# Rolling norm of `anglez` over the current row and the four rows before it,
# computed separately for every series. Grouping by series keeps a window from
# reaching back into the previous series' rows, which the global `index < 4`
# guard combined with `.loc[index-4:index]` allowed for the first rows of every
# series after the first. It also replaces a row-by-row iterrows() loop with one
# vectorised pass per series.
train_data_events['lin_norm'] = (
    train_data_events
    .groupby('series_id', sort=False)['anglez']
    .transform(_rolling_l2_norm)
)

In [None]:
# First ten rows, to inspect the new `lin_norm` column.
train_data_events.head(10)

In [None]:
import matplotlib.pyplot as plt

# Rolling anglez norm against the label (scaled by 200 to share the axis) for one series.
series_to_plot = '78569a801a38'
series_rows = train_data_events[train_data_events['series_id'] == series_to_plot]

# Create the figure before plotting: calling plt.figure(figsize=...) after the
# plot calls opens a new, empty figure and leaves the plot at the default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(series_rows['lin_norm'], label='lin_norm')
ax.plot(series_rows['y'] * 200, label='y * 200')
ax.set(title=f'series {series_to_plot}', xlabel='row index', ylabel='value')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Rolling anglez norm against the label (scaled by 200 to share the axis) for one series.
series_to_plot = 'cfeb11428dd7'
series_rows = train_data_events[train_data_events['series_id'] == series_to_plot]

# Create the figure before plotting: calling plt.figure(figsize=...) after the
# plot calls opens a new, empty figure and leaves the plot at the default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(series_rows['lin_norm'], label='lin_norm')
ax.plot(series_rows['y'] * 200, label='y * 200')
ax.set(title=f'series {series_to_plot}', xlabel='row index', ylabel='value')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Rolling anglez norm against the label (scaled by 200 to share the axis) for one series.
series_to_plot = 'f564985ab692'
series_rows = train_data_events[train_data_events['series_id'] == series_to_plot]

# Create the figure before plotting: calling plt.figure(figsize=...) after the
# plot calls opens a new, empty figure and leaves the plot at the default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(series_rows['lin_norm'], label='lin_norm')
ax.plot(series_rows['y'] * 200, label='y * 200')
ax.set(title=f'series {series_to_plot}', xlabel='row index', ylabel='value')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Rolling anglez norm against the label (scaled by 200 to share the axis) for one series.
series_to_plot = 'fb223ed2278c'
series_rows = train_data_events[train_data_events['series_id'] == series_to_plot]

# Create the figure before plotting: calling plt.figure(figsize=...) after the
# plot calls opens a new, empty figure and leaves the plot at the default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(series_rows['lin_norm'], label='lin_norm')
ax.plot(series_rows['y'] * 200, label='y * 200')
ax.set(title=f'series {series_to_plot}', xlabel='row index', ylabel='value')
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Rolling anglez norm against the label (scaled by 200 to share the axis) for one series.
series_to_plot = 'f56824b503a0'
series_rows = train_data_events[train_data_events['series_id'] == series_to_plot]

# Create the figure before plotting: calling plt.figure(figsize=...) after the
# plot calls opens a new, empty figure and leaves the plot at the default size.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(series_rows['lin_norm'], label='lin_norm')
ax.plot(series_rows['y'] * 200, label='y * 200')
ax.set(title=f'series {series_to_plot}', xlabel='row index', ylabel='value')
ax.legend()
plt.show()

In [None]:
# Threshold baseline: predict 1 where the rolling norm is strictly above 100, otherwise 0.
norm_above_threshold = train_data_events['lin_norm'].gt(100)
train_data_events['math_predictions'] = norm_above_threshold.astype('int64')
train_data_events

In [None]:
# Number of rows for each value predicted by the threshold rule.
train_data_events['math_predictions'].value_counts()

In [None]:
# Number of rows for each value of the label `y`, for comparison with the predictions.
train_data_events['y'].value_counts()

In [None]:
# Frequency of each distinct `lin_norm` value.
train_data_events['lin_norm'].value_counts()

In [None]:
# Display the threshold-rule prediction column.
train_data_events['math_predictions']

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is symmetric,
# but passing the predictions first transposes the confusion matrix and swaps
# precision with recall in the classification report.
y_true_baseline = train_data_events['y'].astype('int16')
y_pred_baseline = train_data_events['math_predictions']

accuracy = accuracy_score(y_true_baseline, y_pred_baseline) * 100
conf_matrix = confusion_matrix(y_true_baseline, y_pred_baseline)
classification_rep = classification_report(y_true_baseline, y_pred_baseline)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

In [None]:
# Score the held-out rows with the trained XGBoost booster `model`.
test_predictions = model.predict(xgb.DMatrix(X_test))

# `series_id` was dropped when the feature matrix X was built, so
# X_test['series_id'] raises KeyError. X_test keeps the row index of
# `train_data_events`, so the ids are looked up from there instead.
test_series_ids = train_data_events.loc[X_test.index, 'series_id']

# Create a submission DataFrame
# NOTE(review): `event` holds the raw model output and `score` is a constant
# placeholder -- confirm the format the competition expects before submitting.
submission = pd.DataFrame({
    'row_id': range(len(X_test)),
    'series_id': test_series_ids.to_numpy(),
    'step': X_test['step'].to_numpy(),
    'event': test_predictions,  # Adjust this based on your model output
    'score': 0.5  # You may need to adjust this depending on your problem
})

# Save the submission file
submission.to_csv('submission.csv', index=False)

In [None]:
# Normalise the existing `submission` frame and write it out again.
# The frame is no longer reset to an empty DataFrame first: an empty frame has
# no 'series_id' / 'step' columns, so the sort below raised KeyError.
submission = submission.sort_values(['series_id', 'step']).reset_index(drop=True)

# Renumber the rows after sorting and replace missing scores with the mean score.
submission['row_id'] = submission.index.astype(int)
submission['score'] = submission['score'].fillna(submission['score'].mean())

# Fix the column order. (The former `submission['step'] = submission['step']`
# was a no-op and has been removed.)
submission = submission[['row_id', 'series_id', 'step', 'event', 'score']]
submission.to_csv('submission.csv', index=False)