# AN2DL [2025-2026]: Time Series Classification

**NOTEBOOK BY thenegatives**


**Burchini - Collovigh - Corti - Ravasio**

## üåê **Google Drive Connection**

In [30]:
import sys

if not 'google.colab' in sys.modules:
  !pip install torchsummary
  !pip install tensorboard
  !pip install seaborn

In [31]:
# Check if the code is running in Google Colab
if 'google.colab' in sys.modules:
  from google.colab import drive
  drive.mount("/gdrive")
  current_dir = "/gdrive/My Drive/[2025 2026] AN2DL/"
  %cd $current_dir

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/[2025 2026] AN2DL


## ‚öôÔ∏è **Libraries Import**

In [32]:
# Set seed for reproducibility
SEED = 42

# Import necessary libraries
import os

# Set environment variables before importing modules
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import logging
import random
import numpy as np

# Set seeds for random number generators in NumPy and Python
np.random.seed(SEED)
random.seed(SEED)

# Import PyTorch
import torch
torch.manual_seed(SEED)
from torch import nn
from torchsummary import summary
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
logs_dir = "tensorboard"
!pkill -f tensorboard
%load_ext tensorboard

if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")

print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Import other libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Configure plot display settings
sns.set(font_scale=1.4)
sns.set_style('white')
plt.rc('font', size=14)
%matplotlib inline

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
PyTorch version: 2.8.0+cu126
Device: cuda


## ‚è≥ **Data Loading**

In [33]:
import os

# load train data
pirate_pain_train = pd.read_csv(os.path.join(current_dir, 'pirate_pain_train.csv'))
pirate_pain_train_labels = pd.read_csv(os.path.join(current_dir, 'pirate_pain_train_labels.csv'))

# Merge them on the "sample_index" column
data = pd.merge(pirate_pain_train, pirate_pain_train_labels, on='sample_index')

pirate_pain_test = pd.read_csv(os.path.join(current_dir, 'pirate_pain_test.csv'))

## üîé **Exploration and Data Analysis**

In [34]:
# Print the shape of the dataset
print('Pirate pain dataset shape', data.shape)

Pirate pain dataset shape (105760, 41)


In [35]:
# Generate summary statistics for the dataset
data.describe()

Unnamed: 0,sample_index,time,pain_survey_1,pain_survey_2,pain_survey_3,pain_survey_4,joint_00,joint_01,joint_02,joint_03,...,joint_21,joint_22,joint_23,joint_24,joint_25,joint_26,joint_27,joint_28,joint_29,joint_30
count,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,...,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0
mean,330.0,79.5,1.633746,1.654851,1.65364,1.663134,0.943095,0.916955,0.779296,0.767921,...,3.972126e-05,4.176794e-05,3.56178e-05,3.138109e-05,0.0001024604,0.041905,0.058244,0.049886,0.062273,0.5
std,190.814948,46.187338,0.682423,0.669639,0.666649,0.661994,0.202051,0.197608,0.295605,0.300787,...,0.004974496,0.005472244,0.00123545,0.0004062914,0.003206128,0.060293,0.079819,0.060773,0.072597,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001015,0.005403,...,0.0,1.510494e-07,0.0,1.063144e-08,0.0,0.000203,0.0,0.0,0.0,0.5
25%,165.0,39.75,2.0,2.0,2.0,2.0,0.828277,0.811445,0.56885,0.52002,...,6.545878e-08,3.32165e-07,3.275038e-07,2.841805e-07,7.161332e-07,0.009885,0.012652,0.01629,0.019638,0.5
50%,330.0,79.5,2.0,2.0,2.0,2.0,1.005126,0.979468,0.909549,0.914834,...,8.302747e-07,1.095971e-06,1.024209e-06,8.746147e-07,3.126723e-06,0.021898,0.031739,0.031843,0.039041,0.5
75%,495.0,119.25,2.0,2.0,2.0,2.0,1.081039,1.056611,0.995187,0.994324,...,2.80009e-06,3.079465e-06,3.02183e-06,2.507548e-06,9.946107e-06,0.048579,0.071051,0.058741,0.079518,0.5
max,660.0,159.0,2.0,2.0,2.0,2.0,1.407968,1.334613,1.306046,1.254729,...,1.442198,1.305001,0.2742411,0.03643074,0.947354,1.223617,1.187419,1.412037,1.370765,0.5


In [36]:
# Get the target values from the dataset
target = data['label'].values
print('Target shape', target.shape)

# Find and report rows with missing labels
missing_labels = data[data['label'].isna()]
if not missing_labels.empty:
    print(f"\n‚ö†Ô∏è Found {len(missing_labels)} samples with no label:")
    print(missing_labels[['sample_index']].to_string(index=False))
else:
    print("\n‚úÖ All samples have labels.")

print("")

# Calculate the unique target labels and their counts
unique, count = np.unique(target, return_counts=True)
print(f"Target labels: {unique}\n")
for i in range(len(unique)):
    print(f'Class {unique[i]} has {count[i]} samples')


Target shape (105760,)

‚úÖ All samples have labels.

Target labels: ['high_pain' 'low_pain' 'no_pain']

Class high_pain has 8960 samples
Class low_pain has 15040 samples
Class no_pain has 81760 samples


## üîÑ **Data Preprocessing**

### Clean data

In [37]:
# delete useless columns

train = data.drop('sample_index', axis=1)
train = train.drop('time', axis=1)
train = train.drop('n_legs', axis=1)
train = train.drop('n_hands', axis=1)
train = train.drop('n_eyes', axis=1)

X_test = pirate_pain_test.drop('sample_index', axis=1)
X_test = X_test.drop('time', axis=1)
X_test = X_test.drop('n_legs', axis=1)
X_test = X_test.drop('n_hands', axis=1)
X_test = X_test.drop('n_eyes', axis=1)

In [38]:
# Map string labels to integers
label_encoder = {
    'no_pain': 0,
    'low_pain': 1,
    'high_pain': 2
}

# Create inverse mapping for later use (optional, but good practice)
inverse_label_encoder = {v: k for k, v in label_encoder.items()}

# Apply the mapping to the 'label' column in the 'data' DataFrame
train['label_encoded'] = train['label'].map(label_encoder)


In [39]:
# Prepare features and labels as float32 and int64 arrays
X = train.drop('label', axis=1).drop('label_encoded', axis=1).astype(np.float32).values
y = train['label_encoded'].values.astype(np.int64)

In [40]:
tmp = train.drop('label', axis=1).drop('label_encoded', axis=1)
tmp.head(10)

tmp.describe()

Unnamed: 0,pain_survey_1,pain_survey_2,pain_survey_3,pain_survey_4,joint_00,joint_01,joint_02,joint_03,joint_04,joint_05,...,joint_21,joint_22,joint_23,joint_24,joint_25,joint_26,joint_27,joint_28,joint_29,joint_30
count,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,...,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0,105760.0
mean,1.633746,1.654851,1.65364,1.663134,0.943095,0.916955,0.779296,0.767921,0.709186,0.711306,...,3.972126e-05,4.176794e-05,3.56178e-05,3.138109e-05,0.0001024604,0.041905,0.058244,0.049886,0.062273,0.5
std,0.682423,0.669639,0.666649,0.661994,0.202051,0.197608,0.295605,0.300787,0.418058,0.411783,...,0.004974496,0.005472244,0.00123545,0.0004062914,0.003206128,0.060293,0.079819,0.060773,0.072597,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.001015,0.005403,0.0,0.0,...,0.0,1.510494e-07,0.0,1.063144e-08,0.0,0.000203,0.0,0.0,0.0,0.5
25%,2.0,2.0,2.0,2.0,0.828277,0.811445,0.56885,0.52002,0.184585,0.210551,...,6.545878e-08,3.32165e-07,3.275038e-07,2.841805e-07,7.161332e-07,0.009885,0.012652,0.01629,0.019638,0.5
50%,2.0,2.0,2.0,2.0,1.005126,0.979468,0.909549,0.914834,0.930007,0.950628,...,8.302747e-07,1.095971e-06,1.024209e-06,8.746147e-07,3.126723e-06,0.021898,0.031739,0.031843,0.039041,0.5
75%,2.0,2.0,2.0,2.0,1.081039,1.056611,0.995187,0.994324,1.036497,1.0324,...,2.80009e-06,3.079465e-06,3.02183e-06,2.507548e-06,9.946107e-06,0.048579,0.071051,0.058741,0.079518,0.5
max,2.0,2.0,2.0,2.0,1.407968,1.334613,1.306046,1.254729,1.359204,1.387731,...,1.442198,1.305001,0.2742411,0.03643074,0.947354,1.223617,1.187419,1.412037,1.370765,0.5


### Split Dataset

In [41]:
# split: divide data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=2500,
    random_state=SEED,
    stratify=y
)

In [42]:
# Show final dataset sizes
print('Training set shape:\t', X_train.shape, y_train.shape)
print('Validation set shape:\t', X_val.shape, y_val.shape)
print('Test set shape:\t\t', X_test.shape)

Training set shape:	 (103260, 35) (103260,)
Validation set shape:	 (2500, 35) (2500,)
Test set shape:		 (211840, 35)


### Normalize features

In [43]:
# Find maximum value for each feature in training data
max_df = X_train.max(axis=0)
print('Dataset maximum values')
print(max_df)

# Find minimum value for each feature in training data
min_df = X_train.min(axis=0)
print('\nDataset minimum values')
print(min_df)

Dataset maximum values
[2.         2.         2.         2.         1.407968   1.3346131
 1.3060458  1.2547286  1.3592042  1.3877311  1.410532   1.3619686
 1.3572876  1.3881046  1.6314944  1.4641569  1.3967537  0.07385182
 0.26423228 0.10034701 0.18880701 0.4209474  0.7283824  0.6160614
 1.096014   1.4421984  1.3050011  0.27424106 0.03643074 0.947354
 1.2236171  1.1874188  1.4120367  1.3707654  0.5       ]

Dataset minimum values
[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.01503578e-03 5.40320901e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 1.22095488e-01 4.67140321e-03 1.07917055e-01 7.22331256e-02
 1.07135512e-01 0.00000000e+00 2.76304974e-07 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 1.51049434e-07 0.00000000e+00
 1.06314380e-08 0.00000000e+00 2.02932133e-04 0.00000000e+00
 0.00000000e+00 0.00000000e+00 5.00000000e-01]


In [44]:
# Apply min-max scaling using training data statistics
X_train = (X_train - min_df) / (max_df - min_df)
X_val = (X_val - min_df) / (max_df - min_df)
X_test = (X_test - min_df) / (max_df - min_df)

# Verify normalization worked (should be 0.0 to 1.0)
print(f"New maximum values: {X_train.max(axis=0)}")
print(f"New minimum values: {X_train.min(axis=0)}")

New maximum values: [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1. nan]
New minimum values: [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. nan]


In [45]:
# Convert numpy arrays to PyTorch datasets (pairs features with labels)
train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_ds   = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))

In [46]:
# Define the batch size, which is the number of samples in each batch
BATCH_SIZE = 32

In [47]:
def make_loader(ds, batch_size, shuffle, drop_last):
    # Determine optimal number of worker processes for data loading
    cpu_cores = os.cpu_count() or 2
    num_workers = max(2, min(4, cpu_cores))

    # Create DataLoader with performance optimizations
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        pin_memory=True,  # Faster GPU transfer
        pin_memory_device="cuda" if torch.cuda.is_available() else "",
        prefetch_factor=4,  # Load 4 batches ahead
    )

In [48]:
# Create data loaders with different settings for each phase
train_loader = make_loader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader   = make_loader(val_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

## üõ†Ô∏è **Model Building**

### Definizione del Modello

In [49]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size_1, hidden_size_2, output_size):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size_1)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden_size_2, output_size)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        return out

# Define input and output sizes
input_size = X_train.shape[1]
output_size = len(np.unique(y_train))

# Choose appropriate hidden layer sizes
hidden_size_1 = 64
hidden_size_2 = 32

# Instantiate the model and move it to the device
model = NeuralNetwork(input_size, hidden_size_1, hidden_size_2, output_size).to(device)

# Print the model's architecture
print("Model Architecture:")
print(model)

# Print a summary of the model
print("\nModel Summary:")
summary(model, input_size=(input_size,))


Model Architecture:
NeuralNetwork(
  (fc1): Linear(in_features=35, out_features=64, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=32, out_features=3, bias=True)
)

Model Summary:
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 64]           2,304
              ReLU-2                   [-1, 64]               0
            Linear-3                   [-1, 32]           2,080
              ReLU-4                   [-1, 32]               0
            Linear-5                    [-1, 3]              99
Total params: 4,483
Trainable params: 4,483
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.02
Estimated Total Size (MB): 0.02
-----------------------------------

## Inizializzazione del Modello, Funzione di Perdita e Ottimizzatore


## Addestramento della Rete Neurale

In [None]:
NUM_EPOCHS = 10
LEARNING_RATE = 0.001
print("Replacing NaN values in datasets...")
# Replace NaN values in X_train, X_val, and X_test with 0
X_train = np.nan_to_num(X_train, nan=0.0)
X_val = np.nan_to_num(X_val, nan=0.0)
X_test = np.nan_to_num(X_test, nan=0.0)
print(f"NaNs in X_train after fix: {np.isnan(X_train).sum()}")
print(f"NaNs in X_val after fix: {np.isnan(X_val).sum()}")
print(f"NaNs in X_test after fix: {np.isnan(X_test).sum()}")

# Convert numpy arrays to PyTorch datasets
train_ds = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_ds   = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))

# Create data loaders with different settings for each phase
train_loader = make_loader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader   = make_loader(val_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

print("NaN values replaced and DataLoaders re-created.")

# Initialize model
model = NeuralNetwork(input_size, hidden_size_1, hidden_size_2, output_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
writer = SummaryWriter(log_dir=os.path.join(logs_dir, 'runs/pain_classification'))


print("Starting re-training with fixed data...")

for epoch in range(NUM_EPOCHS):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs.data, 1)
        total_samples += labels.size(0)
        correct_predictions += (predicted == labels).sum().item()

    epoch_loss = running_loss / total_samples
    epoch_accuracy = correct_predictions / total_samples

    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")

    # Log to TensorBoard
    writer.add_scalar('Training Loss', epoch_loss, epoch)
    writer.add_scalar('Training Accuracy', epoch_accuracy, epoch)

print("Re-training complete.")

Replacing NaN values in datasets...
NaNs in X_train after fix: 0
NaNs in X_val after fix: 0
NaNs in X_test after fix: 0
NaN values replaced and DataLoaders re-created.
Starting re-training with fixed data...
Epoch [1/10], Loss: 0.4454, Accuracy: 0.8433
Epoch [2/10], Loss: 0.2832, Accuracy: 0.9017
Epoch [3/10], Loss: 0.2186, Accuracy: 0.9252
Epoch [4/10], Loss: 0.1827, Accuracy: 0.9372
Epoch [5/10], Loss: 0.1594, Accuracy: 0.9446
Epoch [6/10], Loss: 0.1423, Accuracy: 0.9501
Epoch [7/10], Loss: 0.1293, Accuracy: 0.9545
Epoch [8/10], Loss: 0.1201, Accuracy: 0.9576
Epoch [9/10], Loss: 0.1119, Accuracy: 0.9606
Epoch [10/10], Loss: 0.1064, Accuracy: 0.9626
Re-training complete.


## Valutazione della Rete Neurale con Metriche Avanzate


In [67]:
import torch.nn.functional as F
from sklearn.metrics import f1_score, precision_score, recall_score

print("Starting validation with advanced metrics...")

model.eval()  # Set the model to evaluation mode
val_running_loss = 0.0
val_correct_predictions = 0
val_total_samples = 0

all_labels = []
all_predictions = []

with torch.no_grad():  # Disable gradient calculation during validation
    for inputs, labels in val_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        val_running_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs.data, 1)
        val_total_samples += labels.size(0)
        val_correct_predictions += (predicted == labels).sum().item()

        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

val_epoch_loss = val_running_loss / val_total_samples
val_epoch_accuracy = val_correct_predictions / val_total_samples

# Calculate Confusion Matrix
conf_matrix = confusion_matrix(all_labels, all_predictions)

# Calculate F1 Score (weighted for class imbalance)
f1 = f1_score(all_labels, all_predictions, average='weighted')

# Calculate Precision (weighted)
precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=0)

# Calculate Recall (weighted)
recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=0)

print(f"\nValidation Loss: {val_epoch_loss:.4f}")
print(f"Validation Accuracy: {val_epoch_accuracy:.4f}")
print(f"Validation F1 Score (weighted): {f1:.4f}")
print(f"Validation Precision (weighted): {precision:.4f}")
print(f"Validation Recall (weighted): {recall:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

# Log to TensorBoard
writer.add_scalar('Validation Loss', val_epoch_loss, NUM_EPOCHS)
writer.add_scalar('Validation Accuracy', val_epoch_accuracy, NUM_EPOCHS)
writer.add_scalar('Validation F1 Score (weighted)', f1, NUM_EPOCHS)
writer.add_scalar('Validation Precision (weighted)', precision, NUM_EPOCHS)
writer.add_scalar('Validation Recall (weighted)', recall, NUM_EPOCHS)

# Optional: Log confusion matrix as text to TensorBoard
conf_matrix_str = pd.DataFrame(conf_matrix, index=unique, columns=unique).to_string()
writer.add_text('Confusion Matrix', conf_matrix_str, NUM_EPOCHS)

print("Validation with advanced metrics complete.")


Starting validation with advanced metrics...

Validation Loss: 0.1028
Validation Accuracy: 0.9632
Validation F1 Score (weighted): 0.9619
Validation Precision (weighted): 0.9630
Validation Recall (weighted): 0.9632

Confusion Matrix:
[[1916   10    7]
 [  22  333    0]
 [  45    8  159]]
Validation with advanced metrics complete.


## Generazione delle Previsioni sul Test Set

In [None]:
print("Generating predictions on the test set...")

# Set the model to evaluation mode
model.eval()

# Convert the X_test numpy array to a PyTorch tensor and move it to the appropriate device
X_test_tensor = torch.from_numpy(X_test).to(device).float()

with torch.no_grad():
    #  Pass the X_test tensor through the model to get the raw output logits
    outputs = model(X_test_tensor)

    # Apply torch.softmax to the raw outputs to get probabilities across classes
    probabilities = torch.softmax(outputs, dim=1)

    # Get the predicted class index for each sample
    predicted_indices = torch.argmax(probabilities, dim=1)

# Convert the predicted class indices (PyTorch tensor) to a numpy array
predicted_indices_numpy = predicted_indices.cpu().numpy()

# Map the numerical predictions back to their original string labels
predictions = np.array([inverse_label_encoder[idx] for idx in predicted_indices_numpy])

print("Predictions generated successfully.")
print("\nFirst 10 predictions:")
print(predictions[:10])
print(f"\nTotal predictions: {len(predictions)}")

Generating predictions on the test set...
Predictions generated successfully.

First 10 predictions:
['no_pain' 'no_pain' 'no_pain' 'no_pain' 'no_pain' 'no_pain' 'no_pain'
 'no_pain' 'no_pain' 'no_pain']

Total predictions: 211840


## Salvataggio delle Previsioni su CSV


In [None]:
print("Creating submission file...")

# Create a temporary DataFrame to associate each prediction with its original sample_index
temp_predictions_df = pd.DataFrame({
    'sample_index': pirate_pain_test['sample_index'],
    'label': predictions
})

# Group by sample_index and get the mode (most frequent label) for each
final_predictions_df = temp_predictions_df.groupby('sample_index')['label'].agg(lambda x: x.mode()[0]).reset_index()

# Format the 'sample_index' column to have leading zeros (e.g., '000', '001')
final_predictions_df['sample_index'] = final_predictions_df['sample_index'].astype(str).str.zfill(3)

# Save the DataFrame to a CSV file
submission_filename = 'submission.csv'
final_predictions_df.to_csv(submission_filename, index=False)

print(f"Submission file '{submission_filename}' created successfully.")
print("First 5 rows of the submission file:")
print(final_predictions_df.head())

Creating submission file...
Submission file 'submission.csv' created successfully.
First 5 rows of the submission file:
  sample_index    label
0          000  no_pain
1          001  no_pain
2          002  no_pain
3          003  no_pain
4          004  no_pain
