# Headers

In [None]:
import os
import pandas as pd
import time
import numpy as np
import h5py

# Metrics
from sklearn.metrics import accuracy_score, f1_score


In [None]:
from utils import train_datapath, test_datapath
from utils import train_val_split

# Targets for the test rows (presumably the ground truth, going by the file
# name); the first CSV column is used as the DataFrame index.
targets_for_test_df = pd.read_csv(
    'data/targets_for_test.csv',
    index_col=0,
)


In [None]:

def save_submission(test_df, filled_test_predictions, filename='submission.csv'):
    """Write a ``row_id`` / ``target`` submission CSV.

    A placeholder target of 0 is prepended to the predictions, so
    ``filled_test_predictions`` must hold exactly one value fewer than
    ``test_df`` has rows.

    Args:
        test_df: DataFrame providing the ``row_id`` column.
        filled_test_predictions: Iterable of predictions (list, tuple or
            NumPy array).
        filename: Destination path of the CSV file.

    Raises:
        ValueError: If the number of targets does not match the number of
            rows in ``test_df`` (raised by pandas).
    """
    # Unpack instead of `[0] + predictions`: with a NumPy array `+` means
    # elementwise addition, so no placeholder would actually be prepended.
    targets = [0, *filled_test_predictions]

    # Create a new DataFrame for the submission
    submission_df = pd.DataFrame({
        'row_id': test_df['row_id'],
        'target': targets,
    })

    # Save the submission file
    submission_df.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")


# Data Processing


In [None]:

def create_sliding_windows_batch(df, window_size, batch_size):
    """Yield batches of overlapping row windows taken from ``df``.

    Window ``i`` holds rows ``i`` .. ``i + window_size - 1``; consecutive
    windows are shifted by one row.

    Args:
        df: DataFrame whose rows are windowed.
        window_size: Number of consecutive rows per window.
        batch_size: Maximum number of windows per yielded batch.

    Yields:
        Arrays of shape ``(n, window_size, df.shape[1])`` with
        ``n <= batch_size``; only the last batch can be smaller. Nothing is
        yielded when ``df`` has fewer than ``window_size`` rows.
    """
    # Convert once: slicing the array gives the same rows as slicing the
    # frame per window, without building a DataFrame for every window.
    values = df.values
    num_windows = len(df) - window_size + 1
    for start in range(0, num_windows, batch_size):
        end = min(start + batch_size, num_windows)
        yield np.array([values[i:i + window_size] for i in range(start, end)])
        
def create_h5py_file(df, window_size, batch_size, filename):
    """Store the sliding windows of ``df`` in a new HDF5 file.

    The file is opened in write mode (an existing file is replaced). Each
    batch produced by ``create_sliding_windows_batch`` becomes one dataset
    named ``batch_<k>`` (k = 0, 1, ...) of shape
    ``(n, window_size, df.shape[1])``.

    Args:
        df: DataFrame whose rows are windowed.
        window_size: Number of rows per window.
        batch_size: Maximum number of windows per dataset.
        filename: Path of the HDF5 file to create.
    """
    with h5py.File(filename, 'w') as h5f:
        batches = create_sliding_windows_batch(df, window_size, batch_size=batch_size)
        for batch_index, batch in enumerate(batches):
            # One dataset per batch, numbered in generation order.
            h5f.create_dataset(
                f'batch_{batch_index}',
                data=batch.reshape(-1, window_size, df.shape[1]),
            )

train_df = pd.read_csv(train_datapath)
test_df = pd.read_csv(test_datapath)


window_size = 60
batch_size = 1024

# split the train_df into train and val
X_train, y_train, X_val, y_val = train_val_split(train_df)

# Build the frames with `assign` (returns a new frame) instead of adding the
# column to X_train / X_val in place and copying afterwards.
# NOTE(review): the 'target' column ends up inside the stored train and
# validation windows, so whatever reads these files must not feed it to a
# model as an input feature.
train_slice_df = X_train.assign(target=y_train)
val_slice_df = X_val.assign(target=y_val)

# The train windows are rebuilt on every run: the existence check was
# commented out in the original cell, leaving the call below it with a stray
# indent that is a syntax error. The call is kept unconditional.
train_file = 'data/train_images.h5'
create_h5py_file(train_slice_df, window_size, batch_size, train_file)

# Validation and test windows are only built when their file is missing.
val_file = 'data/validation_images.h5'
if not os.path.exists(val_file):
    create_h5py_file(val_slice_df, window_size, batch_size, val_file)

test_file = 'data/test_images.h5'
if not os.path.exists(test_file):
    create_h5py_file(test_df, window_size, batch_size, test_file)

# Free the large frames; later cells reload what they need.
del train_df, test_df, X_train, y_train, X_val, y_val, train_slice_df, val_slice_df


In [None]:
def save_submission(test_df, filled_test_predictions, filename='submission.csv'):
    """Write a ``row_id`` / ``target`` submission CSV.

    A placeholder target of 0 is prepended to the predictions, so
    ``filled_test_predictions`` must hold exactly one value fewer than
    ``test_df`` has rows.

    Args:
        test_df: DataFrame providing the ``row_id`` column.
        filled_test_predictions: Iterable of predictions (list, tuple or
            NumPy array).
        filename: Destination path of the CSV file.

    Raises:
        ValueError: If the number of targets does not match the number of
            rows in ``test_df`` (raised by pandas).
    """
    # Unpack instead of `[0] + predictions`: with a NumPy array `+` means
    # elementwise addition, so no placeholder would actually be prepended.
    targets = [0, *filled_test_predictions]

    # Create a new DataFrame for the submission
    submission_df = pd.DataFrame({
        'row_id': test_df['row_id'],
        'target': targets,
    })

    # Save the submission file
    submission_df.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")




# Basic CNN

In [None]:
import torch

import torch.nn as nn
import torch.nn.functional as F

class BasicCNN(nn.Module):
    """Two-layer 1-D CNN with a sigmoid output head.

    Each of the two stages is ``Conv1d(kernel_size=3, padding=1)`` followed
    by ReLU and a max-pool that halves the sequence length. The pooled
    features are flattened and passed through two linear layers; the final
    activation is a sigmoid, so outputs lie in [0, 1].

    Args:
        input_channels: Number of channels (features per time step).
        num_classes: Number of output units.
        seq_len: Length of the input sequences. When omitted, the
            module-level ``window_size`` at construction time is used, which
            is what the original code relied on.
    """

    def __init__(self, input_channels, num_classes, seq_len=None):
        super().__init__()
        if seq_len is None:
            # Backward-compatible default: fall back to the notebook global.
            seq_len = window_size
        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        # Two pooling layers halve the length twice; conv2 emits 64 channels.
        self.fc1 = nn.Linear(64 * (seq_len // 2 // 2), 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return sigmoid scores of shape ``(batch, num_classes)``.

        ``x`` must be laid out as ``(batch, input_channels, seq_len)``, the
        layout ``nn.Conv1d`` expects.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten per sample. The previous `view(-1, K)` read the global
        # window size at call time and could silently fold extra sequence
        # length into the batch dimension; now a wrong length fails at fc1.
        x = x.flatten(start_dim=1)
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x


def load_data_for_prediction_batch(h5_file, start_index, batch_size, window_size):
    """Load windows ``start_index`` .. ``start_index + batch_size - 1``.

    Two file layouts are supported:

    * a single dataset named ``inputs`` holding all windows, and
    * the layout written by ``create_h5py_file``: consecutive datasets
      ``batch_0``, ``batch_1``, ... where every dataset except possibly the
      last holds the same number of windows.

    Args:
        h5_file: Path of the HDF5 file to read.
        start_index: Index of the first window to load.
        batch_size: Maximum number of windows to load.
        window_size: Unused; kept so existing callers keep working.

    Returns:
        Array of shape ``(n, window_size, num_features)`` with
        ``n <= batch_size``; ``n`` is smaller (possibly 0) when the request
        runs past the last stored window.

    Raises:
        KeyError: If the file contains neither ``inputs`` nor ``batch_0``.
    """
    stop_index = start_index + batch_size
    with h5py.File(h5_file, 'r') as h5f:
        if 'inputs' in h5f:
            return h5f['inputs'][start_index:stop_index]

        if 'batch_0' not in h5f:
            raise KeyError(f"{h5_file} contains neither 'inputs' nor 'batch_0'")

        # Every dataset before the last one has as many windows as batch_0,
        # so a global window index maps to (dataset number, offset).
        windows_per_dataset = h5f['batch_0'].shape[0]
        pieces = []
        position = start_index
        while position < stop_index:
            name = f'batch_{position // windows_per_dataset}'
            if name not in h5f:
                break  # requested range runs past the stored windows
            dataset = h5f[name]
            offset = position % windows_per_dataset
            take = min(stop_index - position, dataset.shape[0] - offset)
            if take <= 0:
                break  # the (shorter) last dataset is exhausted
            pieces.append(dataset[offset:offset + take])
            position += take

        if not pieces:
            # Empty result that still carries the window/feature dimensions.
            return h5f['batch_0'][0:0]

    return pieces[0] if len(pieces) == 1 else np.concatenate(pieces, axis=0)

In [None]:
train_df = pd.read_csv(train_datapath)

# split the train_df into train and val
X_train, y_train, X_val, y_val = train_val_split(train_df)

# `assign` returns a new frame with the extra column, so X_train / X_val are
# not modified in place (adding a column to a frame that may be a slice of
# train_df is the classic chained-assignment hazard in pandas).
train_slice_df = X_train.assign(target=y_train)
val_slice_df = X_val.assign(target=y_val)

input_channels = train_df.shape[1] - 1  # Columns minus the target
num_classes = 1  # Binary classification
window_size = 60
batch_size = 1024
train_targets = train_slice_df['target'].values  # NumPy array of labels

# The raw frames are no longer needed; only the *_slice_df frames are kept.
del train_df, X_train, y_train, X_val, y_val

In [None]:
def train_model(model, train_slice_df, train_file, window_size, batch_size, num_epochs=10, learning_rate=0.001):
    """Train ``model`` with Adam and binary cross-entropy on stored windows.

    Windows are read batch by batch from ``train_file`` through
    ``load_data_for_prediction_batch``; labels come from the ``target``
    column of ``train_slice_df``. The summed batch loss is printed after
    every epoch.

    Args:
        model: Module mapping ``(batch, channels, window_size)`` inputs to
            probabilities of shape ``(batch, 1)``.
        train_slice_df: Frame the windows in ``train_file`` were built from;
            must contain a ``target`` column.
        train_file: Path of the HDF5 file holding the windows.
        window_size: Number of rows per window.
        batch_size: Number of windows per optimisation step.
        num_epochs: Number of passes over the data.
        learning_rate: Adam learning rate.
    """
    # Define loss function and optimizer
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Take the labels from the frame passed in rather than from a global.
    target_values = train_slice_df['target'].to_numpy()
    num_windows = len(target_values) - window_size + 1

    # Column positions of the real features. If the stored windows carry
    # every column of train_slice_df, the label column is among them and
    # must not be shown to the model.
    feature_positions = [
        position
        for position, column in enumerate(train_slice_df.columns)
        if column != 'target'
    ]

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        for start_index in range(0, num_windows, batch_size):
            # Load a batch of data: (batch, window_size, num_columns)
            inputs = load_data_for_prediction_batch(train_file, start_index, batch_size, window_size)
            inputs = torch.tensor(inputs, dtype=torch.float32)
            if inputs.shape[0] == 0:
                break  # the file holds fewer windows than expected

            if inputs.shape[-1] == train_slice_df.shape[1]:
                inputs = inputs[:, :, feature_positions]
            # Conv1d wants (batch, channels, length).
            inputs = inputs.permute(0, 2, 1)

            # One label per loaded window; the last batch can be short.
            # NOTE(review): window i is paired with the label of its first
            # row, as in the original indexing; confirm that this is the
            # intended alignment (vs. the label of the window's last row).
            targets = torch.tensor(
                target_values[start_index:start_index + inputs.shape[0]],
                dtype=torch.float32,
            )

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass; squeeze only the class axis so that a batch of
            # one window keeps its batch dimension.
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(-1), targets)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')

start_time = time.time()

# Initialize the model
model = BasicCNN(input_channels, num_classes)

# Train the model
train_model(model, train_slice_df, train_file, window_size, batch_size)

# Report the training time only (the CSV used to be re-read and re-split
# inside the timed region just to recover the validation labels).
minutes, seconds = divmod(time.time() - start_time, 60)
print(f'Time elapsed: {minutes:.0f}m {seconds:.1f}s')
print('--------------------------------------')


# Evaluate the model on validation data in batches
model.eval()
val_predictions = []
val_targets = val_slice_df['target'].values
num_val_windows = len(val_targets) - window_size + 1

# Column positions of the real features; the label column must not be fed
# to the model when the stored windows contain every column of val_slice_df.
val_feature_positions = [
    position
    for position, column in enumerate(val_slice_df.columns)
    if column != 'target'
]

with torch.no_grad():
    for start_index in range(0, num_val_windows, batch_size):
        val_inputs = load_data_for_prediction_batch(val_file, start_index, batch_size, window_size)
        val_inputs = torch.tensor(val_inputs, dtype=torch.float32)
        if val_inputs.shape[0] == 0:
            break  # the file holds fewer windows than expected
        if val_inputs.shape[-1] == val_slice_df.shape[1]:
            val_inputs = val_inputs[:, :, val_feature_positions]
        # Conv1d wants (batch, channels, length).
        val_outputs = model(val_inputs.permute(0, 2, 1))
        # reshape(-1) keeps a 1-D array even for a batch of one window.
        batch_predictions = (val_outputs.reshape(-1).numpy() > 0.5).astype(int)
        val_predictions.extend(batch_predictions)

y_pred = np.array(val_predictions, dtype=int)

# There is one prediction per window, i.e. fewer predictions than rows.
# Score each prediction against the label at the same index.
# NOTE(review): this pairs window i with the label of its first row; confirm
# the intended alignment.
y_val = val_targets[:len(y_pred)]

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.5f}')

# Calculate F1 macro score
f1_macro = f1_score(y_val, y_pred, average='macro')
print(f'Validation F1 Macro Score: {f1_macro:.5f}')

# save_submission prepends one placeholder itself, so it needs exactly
# len(rows) - 1 predictions, passed as a plain list.
# NOTE(review): rows without a full window are filled with 0 here; confirm
# the intended fill value and alignment.
rows_needing_prediction = max(len(val_slice_df) - 1, 0)
filled_val_predictions = y_pred.tolist()[:rows_needing_prediction]
filled_val_predictions += [0] * (rows_needing_prediction - len(filled_val_predictions))
save_submission(val_slice_df, filled_val_predictions, filename='submissionCNN.csv')

del train_slice_df, val_slice_df, y_val
