# IMPORTS

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.datatypes._panel._convert import from_2d_array_to_nested
from scipy.stats import skew, kurtosis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from tsfresh import extract_features
from sklearn.model_selection import TimeSeriesSplit

# LOAD DATA

In [5]:
# Read both CSV files in one pass. header=None tells pandas the files carry no
# header row, so the first line of each file is kept as data.
df_train, df_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('Sleep Train 5000.csv', 'Sleep Test 1000.csv')
)

# PREPROCESSING

In [6]:
# Report (rows, columns) of each split as loaded, one split per line.
print(
    f"Train shape: {df_train.shape}",
    f"Test shape: {df_test.shape}",
    sep="\n",
)

Train shape: (4999, 179)
Test shape: (1000, 178)


In [7]:
# Drop the column labelled 178 so the training frame ends up with the same
# number of columns as the test frame (179 -> 178 per the printed shapes).
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
# NOTE(review): confirm that column 178 is a spurious trailing column and not a
# real feature. If the test file simply has no label column, dropping the last
# training column leaves the two frames misaligned by one position.
df_train = df_train.drop(columns=[178], errors="ignore")

In [8]:
# Re-check the shapes after the column drop; both splits should now agree on
# the number of columns.
print(
    f"Train shape: {df_train.shape}",
    f"Test shape: {df_test.shape}",
    sep="\n",
)

Train shape: (4999, 178)
Test shape: (1000, 178)


# SPLIT LABELS AND FEATURES

In [9]:
# Column 0 is treated as the label; every remaining column is a feature.
# NOTE(review): the test accuracies reported further down are only ~3-4%.
# Confirm that column 0 of the test file really is a label and that the
# feature columns line up with the training file.
y_train, X_train = df_train.iloc[:, 0], df_train.iloc[:, 1:]
y_test, X_test = df_test.iloc[:, 0], df_test.iloc[:, 1:]

In [10]:
# Cast the label vectors of both splits to integers.
y_train, y_test = y_train.astype(int), y_test.astype(int)

# Normalize features

In [11]:
# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both splits so no test information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Naive Bayes

In [30]:
# Gaussian Naive Bayes baseline on the standardized features.
nb = GaussianNB().fit(X_train_scaled, y_train)
y_pred_nb = nb.predict(X_test_scaled)

print("Naive Bayes Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_nb))

Naive Bayes Accuracy: 0.034


# Gradient Boosting

In [31]:
# Gradient boosting baseline on the raw (unscaled) features.
# The seed is fixed so a Restart & Run All reproduces the same trees and the
# same score, matching the seeded MLP and TimeSeriesForest cells.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))

Gradient Boosting Accuracy: 0.031


# MLP Neural Network

In [32]:
# Two-hidden-layer perceptron (50 units each) trained on standardized features;
# the seed pins the weight initialization.
mlp = MLPClassifier(
    random_state=42,
    max_iter=500,
    hidden_layer_sizes=(50, 50),
).fit(X_train_scaled, y_train)
y_pred_mlp = mlp.predict(X_test_scaled)

print("MLP Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_mlp))

MLP Accuracy: 0.034


# Time Series Forest

In [12]:
# Convert each 2D feature matrix into sktime's nested format.
X_train_sktime, X_test_sktime = (
    from_2d_array_to_nested(split.to_numpy()) for split in (X_train, X_test)
)

# Build and fit a seeded forest of 100 estimators.
tsf = TimeSeriesForestClassifier(random_state=42, n_estimators=100)
tsf.fit(X_train_sktime, y_train)

# Score the predictions against the held-out labels.
y_pred_tsf = tsf.predict(X_test_sktime)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_tsf)

print("TimeSeriesForest Accuracy:", accuracy)

TimeSeriesForest Accuracy: 0.045


# Voting Classifier

In [14]:
def extract_features(X):
    """Summarize each row of ``X`` with four descriptive statistics.

    NOTE: this definition shadows ``tsfresh.extract_features``, which is
    imported under the same name at the top of the notebook. After this cell
    runs, the name refers to this function.

    Parameters
    ----------
    X : array-like, 2D
        One series per row. Anything ``np.asarray(X, dtype=float)`` accepts
        (NumPy array, DataFrame of numeric columns, nested lists).

    Returns
    -------
    numpy.ndarray of shape (n_rows, 4)
        Columns are, in order: mean, variance (population, ddof=0), skewness
        and excess kurtosis of each row. NaN entries are ignored for all four
        statistics.
    """
    values = np.asarray(X, dtype=float)

    # Use the NaN-aware reductions so mean/variance treat missing samples the
    # same way skew/kurtosis do (nan_policy='omit'). Otherwise a single NaN
    # in a row would turn the first two columns into NaN.
    means = np.nanmean(values, axis=1)
    variances = np.nanvar(values, axis=1)

    # np.asarray strips any masked-array wrapper scipy may return under
    # nan_policy='omit', so the result is always a plain ndarray.
    skewness = np.asarray(skew(values, axis=1, nan_policy='omit'), dtype=float)
    kurt = np.asarray(kurtosis(values, axis=1, nan_policy='omit'), dtype=float)

    return np.column_stack([means, variances, skewness, kurt])

# Summarize every series in both splits with the hand-crafted statistics.
X_train_features, X_test_features = map(extract_features, (X_train, X_test))

# Sanity check on the resulting training feature matrix.
print(X_train_features.shape)

(4999, 4)


In [16]:
# Hyperparameter grid for Gradient Boosting on the summary-statistic features
# (2 x 2 x 3 = 12 candidates).
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5, 7]
}

# Seed the estimator so the search picks the same winner on every
# Restart & Run All; GridSearchCV clones it (seed included) for each fit.
gb_model = GradientBoostingClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation.
grid_search = GridSearchCV(gb_model, param_grid, cv=5)
grid_search.fit(X_train_features, y_train)

# Best parameters found by GridSearchCV
print("Best Gradient Boosting Params:", grid_search.best_params_)

# Keep the winning estimator under its own name: `grid_search` and
# `param_grid` are reassigned by the cross-validation cell later on.
best_gb_model = grid_search.best_estimator_

Best Gradient Boosting Params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}


In [19]:
# Base estimators for the voting ensemble. The stochastic ones are seeded so
# the ensemble result is reproducible, matching the seeded MLP and
# TimeSeriesForest instances created earlier in the notebook.
nb_model = GaussianNB()
mlp_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
tsf_model = TimeSeriesForestClassifier(random_state=42)

In [20]:
# Combine models in a Voting Classifier
# NOTE(review): every member, including the TimeSeriesForest, is fitted on the
# 4-column summary-statistic matrix rather than on the raw series. Confirm that
# this is intended for the time-series model.
voting_clf = VotingClassifier(estimators=[
    ('nb', nb_model),
    ('gb', best_gb_model),
    ('mlp', mlp_model),
    ('tsf', tsf_model)
], voting='hard')  # You can also try 'soft' voting

# Train the Voting Classifier
voting_clf.fit(X_train_features, y_train)

# Evaluate the model
y_pred_voting = voting_clf.predict(X_test_features)

# Calculate accuracy (accuracy_score comes from the import cell at the top of
# the notebook; the redundant mid-notebook re-import was removed).
accuracy = accuracy_score(y_test, y_pred_voting)
print("Voting Classifier Accuracy:", accuracy)

Voting Classifier Accuracy: 0.042


# Cross Validation

In [25]:

# Define parameter grid (3 x 3 x 3 = 27 candidates)
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7]
}

# Seed the estimator so the selected parameters and score are reproducible.
model = GradientBoostingClassifier(random_state=42)

# Grid search over the raw features, using all CPU cores.
# NOTE(review): TimeSeriesSplit builds folds from row order (earlier rows
# train, later rows validate). If each row is an independent recording rather
# than a consecutive time step, a stratified K-fold is the usual choice.
# Confirm which applies.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=5),
    n_jobs=-1,
    verbose=2,
)

# Fit grid search (fixes the `y_t rain` typo, which was a SyntaxError)
grid_search.fit(X_train, y_train)

# Get the best parameters and accuracy
print(f'Best Parameters: {grid_search.best_params_}')
print(f'Best Cross-validation Accuracy: {grid_search.best_score_}')


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 100}
Best Cross-validation Accuracy: 0.3183673469387755
