In [None]:
# Mount Google Drive at /content/drive (Colab only) so later cells can read files stored there.
from google.colab import drive; drive.mount('/content/drive')   # optional: only needed if your kaggle.json is stored in Google Drive

Mounted at /content/drive


In [None]:
!pip -q install --upgrade --force-reinstall --no-deps kaggle > log  # upgrade kaggle package (to avoid a warning)
!mkdir -p ~/.kaggle                                           # .kaggle folder must contain kaggle.json for kaggle executable to properly authenticate you to Kaggle.com
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json >log  # First, download kaggle.json from kaggle.com (in Account page) and place it in the root of mounted Google Drive
!cp kaggle.json ~/.kaggle/kaggle.json > log                   # Alternative location of kaggle.json (without a connection to Google Drive)
!chmod 600 ~/.kaggle/kaggle.json                              # give only the owner full read/write access to kaggle.json
!kaggle config set -n competition -v 6-feb-25-hse-har           # set the competition context for the next few kaggle API calls. !kaggle config view - shows current settings
!kaggle competitions download >> log                          # download competition dataset as a zip file
!unzip -o *.zip >> log                                        # Kaggle dataset is copied as a single file and needs to be unzipped.
!kaggle competitions leaderboard --show                       # print public leaderboard

- competition is now set to: 6-feb-25-hse-har
100% 601M/601M [00:04<00:00, 155MB/s]
Using competition: 6-feb-25-hse-har
  teamId  teamName              submissionDate       score    
--------  --------------------  -------------------  -------  
13317731  AH                    2025-02-17 14:01:20  0.98234  
13308368  Küî´üî´                   2025-02-20 14:29:32  0.97895  
13305124  D XX                  2025-02-20 13:37:15  0.97759  
13309866  Q                     2025-02-20 14:29:55  0.97556  
13330366  F                     2025-02-20 11:55:12  0.97216  
13327034  A                     2025-02-20 06:08:09  0.97080  
13318042  Z                     2025-02-19 16:35:09  0.96945  
13313084  AA                    2025-02-20 11:56:09  0.96945  
13308774  AG                    2025-02-20 05:27:02  0.96877  
13319546  L                     2025-02-20 12:55:27  0.96673  
13320317  –°                     2025-02-18 21:28:39  0.96673  
13316475  Team W                2025-02-20 13:39:51  0

In [None]:
%%time
%%capture
%reset -f

from IPython.core.interactiveshell import InteractiveShell as IS; IS.ast_node_interactivity = "all"
import numpy as np, pandas as pd, time, os, random

# Display settings: wide, compact printing for numpy arrays and pandas frames.
np.set_printoptions(precision=2, suppress=True, edgeitems=20, linewidth=10000)
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 4)

def ToCSV(df, fname):
    """Round `df` to 2 decimals and write it to '<fname>.csv', labelling the index column 'id'."""
    rounded = df.round(2)
    return rounded.to_csv(f'{fname}.csv', index_label='id')
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from torchsummary import summary

class Timer():
  """Wall-clock stopwatch that compares elapsed time with a runtime limit (in seconds)."""

  def __init__(self, lim:'RunTimeLimit'=60):
    # Take the start timestamp first, then store the limit and announce it.
    self.t0 = time.time()
    self.lim = lim
    print(f'‚è≥ started. You have {lim} sec. Good luck!')

  def ShowTime(self):
    """Print the elapsed seconds since construction.

    When more than `lim` seconds (plus a 1-second grace) have elapsed, the message
    is wrapped in ANSI escape codes and extended with the exceeded limit.
    """
    msg = f'Runtime is {time.time()-self.t0:.0f} sec'
    if (time.time()-self.t0-1) > self.lim:
      msg = f'\033[91m\033[1m' + msg + f' > {self.lim} sec limit!!!\033[0m'
    print(msg)

CPU times: user 2.37 s, sys: 462 ms, total: 2.83 s
Wall time: 5.82 s


In [None]:
def set_seed(seed: int = 42) -> None:
    """Seed the numpy, `random` and torch (CPU and CUDA) generators and request deterministic cuDNN.

    Args:
        seed: integer used for every generator and exported as PYTHONHASHSEED.
    """
    # Same seeding order as before: numpy, stdlib random, torch CPU, torch CUDA.
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    # cuDNN: choose deterministic kernels and disable the auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so this
    # presumably only influences child processes — confirm it is still wanted.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(0)

Random seed set as 0


In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
# For a GPU in Colab: Runtime -> Change runtime type -> select 'T4 GPU'.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print('cuda activated' if use_cuda else 'cpu activated')

cuda activated


In [None]:
# Read the two competition CSV files; %time reports how long each read takes.
%time vX  = pd.read_csv('testX.csv', index_col='id')  # load testing input features X (only), indexed by the 'id' column
%time tYX = pd.read_csv('trainYX.csv')                # load the training table: label column y followed by the input features X
tYX  # 561 input features

CPU times: user 335 ms, sys: 28.4 ms, total: 363 ms
Wall time: 464 ms
CPU times: user 47.8 s, sys: 5.9 s, total: 53.7 s
Wall time: 1min 8s


Unnamed: 0,y,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,...,511,512,513,514,515,516,517,518,519,520,521,522,523,524,525,526,527,528,529,530,531,532,533,534,535,536,537,538,539,540,541,542,543,544,545,546,547,548,549,550,551,552,553,554,555,556,557,558,559,560
0,5,0.2778,0.0092,-0.0676,-0.9785,-0.9160,-0.9610,-0.9834,-0.9170,-0.9590,-0.9390,-0.4230,-0.7520,0.8496,0.6226,0.8400,-0.9434,-0.9614,-1.0370,-1.0150,-1.0070,-0.9640,-0.9550,-0.6772,0.0568,0.0192,0.5900,-0.3162,0.1833,0.4440,-0.2622,0.1092,0.4468,-0.4443,-0.1484,0.1718,-0.2727,0.0954,-0.4720,-0.5264,0.2332,0.9640,-0.1309,0.1071,-0.9814,-0.948,-0.9727,-0.9720,-0.9575,-0.9585,...,-0.9126,-0.2037,-0.5300,-0.8164,-0.9170,-0.8850,-0.9033,-0.9120,-0.9750,-0.9326,-1.014,-0.9560,-0.6780,-0.9966,-0.6180,-0.1021,-0.5977,-0.9546,-0.9110,-0.9260,-0.9297,-1.017,-0.9460,-1.022,-0.9570,-0.2930,-1.0100,-0.3455,-0.1411,-0.5215,-0.9585,-0.9160,-0.9434,-0.9414,-0.9750,-0.9414,-0.9890,-0.9610,-0.4453,-1.002,-0.5415,-0.0308,-0.5093,0.0380,-0.0912,-0.1415,-0.1316,-0.8200,0.1721,-0.0535
1,1,0.2454,0.0073,-0.1046,-0.2010,0.1426,-0.2668,-0.2776,0.0648,-0.2605,-0.0572,-0.0364,-0.2830,-0.2830,-0.1448,0.4443,-0.0844,-0.6733,-0.7603,-0.7847,-0.4136,-0.3633,-0.1837,0.2830,0.5100,0.0582,-0.2502,0.3079,-0.1384,0.0822,0.0902,-0.0034,0.1969,0.0538,0.2996,-0.0258,0.0936,-0.3472,-0.1434,-0.4058,0.3690,0.9326,-0.2942,-0.0916,-0.9966,-0.964,-0.9663,-0.9746,-0.9736,-0.9634,...,-0.8115,0.4165,-0.4731,-0.8210,0.2542,0.2410,0.2688,0.0928,-0.7710,0.2430,-0.221,-0.1018,0.7134,-0.8994,-0.0642,-0.0842,-0.4750,-0.1345,-0.3853,-0.2573,-0.5430,-0.757,-0.1365,-0.677,-0.1826,0.6777,-0.7866,0.3240,-0.6206,-0.8530,-0.2500,-0.3025,-0.3176,-0.3198,-0.6426,-0.2488,-0.7236,-0.2512,0.6177,-0.910,0.1069,-0.0397,-0.4220,0.5480,0.6455,0.2296,-0.0335,-0.7000,0.2998,0.0880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499998,4,0.2740,-0.0132,-0.1257,-0.9834,-1.0020,-0.9590,-0.9897,-0.9746,-0.9873,-0.9346,-0.5630,-0.8394,0.8306,0.6846,0.8350,-0.9840,-0.9824,-0.9960,-1.0200,-0.9950,-1.0160,-0.9590,-0.6500,-0.5225,-0.7974,0.5020,-0.2532,0.3723,0.1772,0.2920,-0.2756,0.3179,-0.1398,0.0948,0.0180,-0.1853,0.1871,0.0790,-0.0402,-0.0880,0.9785,-0.0442,-0.0532,-0.9950,-1.028,-0.9790,-1.0010,-0.9697,-1.0060,...,-0.2356,0.4312,-0.6030,-0.8706,-0.9700,-0.9863,-1.0010,-0.9990,-0.9927,-0.9910,-1.020,-1.0210,-0.9920,-0.9736,0.3857,-0.4620,-0.7485,-0.9985,-0.9575,-0.9897,-0.9814,-1.000,-0.9990,-1.008,-0.9660,-0.8574,-0.9210,0.1049,-0.6284,-0.8970,-1.0200,-1.0150,-0.9750,-1.0170,-0.9746,-0.9937,-0.9927,-0.9950,-1.0030,-0.844,0.2454,-0.3782,-0.7183,-0.0227,0.1957,0.1864,0.4556,-0.9326,0.1137,0.0595
499999,5,0.2695,-0.0251,-0.1010,-1.0170,-0.9050,-0.9375,-0.9736,-0.8920,-0.9673,-0.9575,-0.5293,-0.8022,0.8530,0.6714,0.8480,-0.9624,-1.0205,-0.9900,-0.9600,-0.9960,-0.9480,-0.9720,-0.7320,-0.5117,-0.3535,0.3710,-0.2270,0.2700,-0.0636,-0.2438,0.0608,0.2050,-0.0218,-0.1199,0.0678,0.0154,-0.1132,-0.2886,-0.3882,0.6284,0.9966,-0.1277,0.0722,-1.0050,-0.925,-0.9440,-1.0050,-0.9824,-0.9233,...,-0.9500,0.0488,-0.3591,-0.7050,-1.0240,-0.9790,-0.9746,-0.9814,-0.9920,-0.9814,-1.013,-0.9860,-0.9650,-1.0150,-0.1430,-0.1555,-0.5180,-0.9320,-0.9200,-0.9424,-0.9326,-0.932,-0.9170,-0.985,-0.9463,-0.4020,-0.9640,-0.3160,-0.0948,-0.4695,-0.9590,-0.9500,-0.9976,-0.9680,-1.0340,-0.9727,-0.9900,-0.9790,-0.6980,-1.017,-0.4863,0.0084,-0.3293,-0.0127,-0.1399,0.4624,-0.7610,-0.8696,0.1720,-0.0272


In [None]:
# Class balance check; sort=False means the counts are not ordered by frequency.
tYX.y.value_counts(sort=False).to_frame().T  # counts of observations in each label category

y,5,1,3,4,2,6
count,93667,83502,66901,87427,72554,95949


In [None]:
# Start the stopwatch with Timer's default limit (60 sec); everything timed must come after this cell.
tmr = Timer() # runtime limit (in seconds). Add all of your code after the timer

‚è≥ started. You have 60 sec. Good luck!


During preprocessing we tried PCA and stratified sampling. The parameters were chosen by brute-force search. The result was a PCA that retains 99.85% of the variance (`n_components=0.9985`). Training on a stratified sample of 50k observations consistently yielded a lower score than simply taking the first 50k observations. We then found that training on the first 75k observations gave a higher score, and that became the final solution.


The idea was to work mainly on the hyperparameters of the model (namely the number of layers and neurons, the number of PCA components, the loader batch size, the dropout ratio and the number of epochs).
The number of neurons follows from the number of PCA components, 321, roughly halved at each layer (161, then 81). All of this was tuned manually and evaluated by the Kaggle accuracy score. We also tested different optimizers; Adam converged the fastest. The final parameters are shown in the model & training cell.

In [None]:
# Re-seed all generators with 42 for the timed part of the notebook.
set_seed(42)

Random seed set as 42


In [None]:
# Training subset: the first N_TRAIN rows of the training table.
# Truncate first, then drop the label column, so only N_TRAIN rows are copied
# instead of the whole table.
N_TRAIN = 75_000
train_rows = tYX.head(N_TRAIN)
tX, tY = train_rows.drop('y', axis=1), train_rows.y - 1   # shift labels by -1 to range {0,1,2,3,4,5}

In [None]:
# Re-seed immediately before the model/training cell so that cell starts from seed 42 however the cells above were run.
set_seed(42)

Random seed set as 42


In [None]:
# Define the PyTorch model
class Model(nn.Module):
    """Feed-forward classifier with two hidden blocks and a linear output layer.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout. The output
    layer returns raw logits (no softmax).

    Args:
        input_size: number of input features per sample.
        hidden1: width of the first hidden layer (default 161).
        hidden2: width of the second hidden layer (default 81).
        num_classes: number of output logits (default 6).
        dropout: dropout probability used after both hidden blocks (default 0.2).
    """

    def __init__(self, input_size, hidden1=161, hidden2=81, num_classes=6, dropout=0.2):
        super().__init__()

        # Layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged for the default arguments.
        self.fc1 = nn.Linear(input_size, hidden1)
        self.bn1 = nn.BatchNorm1d(hidden1)
        self.dropout1 = nn.Dropout(dropout)

        self.fc2 = nn.Linear(hidden1, hidden2)
        self.bn2 = nn.BatchNorm1d(hidden2)
        self.dropout2 = nn.Dropout(dropout)

        self.fc3 = nn.Linear(hidden2, num_classes)

    def forward(self, x):
        """Map a (batch, input_size) float tensor to (batch, num_classes) logits."""
        x = self.dropout1(torch.relu(self.bn1(self.fc1(x))))
        x = self.dropout2(torch.relu(self.bn2(self.fc2(x))))
        return self.fc3(x)

# PCA: keep as many principal components as needed to explain 99.85% of the variance.
from sklearn.decomposition import PCA
pca = PCA(n_components=0.9985)
# The projection is stored under its own name; tX is left untouched so that
# re-running this cell does not apply PCA to already-projected data.
tX_pca = pca.fit_transform(tX.values)
n_features = tX_pca.shape[1]

# Convert numpy arrays to torch tensors and move them to the selected device
tX_tensor = torch.tensor(tX_pca, dtype=torch.float32).to(device)
tY_tensor = torch.tensor(tY.values, dtype=torch.long).to(device)

# Create TensorDataset and split into train (70%) / validation (30%) sets
dataset = TensorDataset(tX_tensor, tY_tensor)
val_size = int(len(dataset) * 0.3)
train_size = len(dataset) - val_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders (only the training batches are shuffled)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

# Instantiate the model
model = Model(input_size=n_features).to(device)
summary(model, input_size=(n_features,))   # summary() prints the table itself and returns None, so it is not wrapped in print()

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0002)

# Training loop
epochs = 15
for epoch in range(epochs):
    # Training mode: dropout is active and BatchNorm normalises with batch statistics.
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {np.round(running_loss/len(train_loader), 4)}")

    # Validation loop. Evaluation mode disables dropout and makes BatchNorm use
    # its running statistics, so the validation batches no longer alter the model.
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Validation Loss: {np.round(val_loss/len(val_loader), 4)}")
    print(f"Accuracy: {np.round(100 * correct / total, 4)}%")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 161]          51,842
       BatchNorm1d-2                  [-1, 161]             322
           Dropout-3                  [-1, 161]               0
            Linear-4                   [-1, 81]          13,122
       BatchNorm1d-5                   [-1, 81]             162
           Dropout-6                   [-1, 81]               0
            Linear-7                    [-1, 6]             492
Total params: 65,940
Trainable params: 65,940
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.01
Params size (MB): 0.25
Estimated Total Size (MB): 0.26
----------------------------------------------------------------
None
Epoch 1/15, Loss: 0.4657
Validation Loss: 0.1208
Accuracy: 98.4133%
Epoch 2/15, Loss: 0.0746
Validation

In [None]:
# Project the test features with the PCA that was fitted on the training data
vX_pca = pca.transform(vX.values)
# Use the notebook-wide `device` (CUDA if available, otherwise CPU) instead of a
# hard-coded 'cuda', which fails on a CPU-only runtime.
vX_tensor = torch.tensor(vX_pca, dtype=torch.float32).to(device)
# Assignment form: a bare `model.eval()` expression would echo the module repr.
model = model.to(device).eval()

# No need to compute gradients (for memory efficiency)
with torch.no_grad():
    # Make predictions (raw logits)
    predictions = model(vX_tensor)
    # Class probabilities: softmax over the class dimension
    probabilities = torch.softmax(predictions, dim=1)

# Predicted class label = index of the largest probability in each row
predicted_labels = torch.max(probabilities, 1)[1]

# Convert to numpy arrays (for further processing in non-PyTorch code)
probabilities_np = probabilities.cpu().numpy()
predicted_labels_np = predicted_labels.cpu().numpy()

# Now 'probabilities_np' holds class probabilities and 'predicted_labels_np' holds class predictions

# Now 'probabilities_np' holds class probabilities and 'predicted_labels_np' holds class predictions

Model(
  (fc1): Linear(in_features=321, out_features=161, bias=True)
  (bn1): BatchNorm1d(161, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=161, out_features=81, bias=True)
  (bn2): BatchNorm1d(81, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=81, out_features=6, bias=True)
)

Model(
  (fc1): Linear(in_features=321, out_features=161, bias=True)
  (bn1): BatchNorm1d(161, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=161, out_features=81, bias=True)
  (bn2): BatchNorm1d(81, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=81, out_features=6, bias=True)
)

In [None]:
# Column labels of the form '<class index>/<activity name>'
activity_names = 'walking walking_upstairs walking_downstairs sitting standing laying'.split()
YLab = [f'{idx}/{name}' for idx, name in enumerate(activity_names)]
# Show the first few predicted probability rows, colour-graded within each row
preview = pd.DataFrame(probabilities_np[:3,:], columns=YLab)
preview.style.background_gradient(cmap='coolwarm', axis=1)

Unnamed: 0,0/walking,1/walking_upstairs,2/walking_downstairs,3/sitting,4/standing,5/laying
0,0.99963,3.2e-05,0.000225,9e-06,1.1e-05,9.3e-05
1,2e-06,2e-06,3e-06,0.0,2e-06,0.999991
2,2e-06,2e-06,1e-06,0.0,0.0,0.999995


In [None]:
# Submission frame: predicted class indices shifted back by +1 to the original label range.
# Rows are indexed by the test set's own 'id' index (predictions are in the same row
# order as vX), so the ids written by ToCSV always match the test rows.
result = pd.DataFrame({'y': predicted_labels_np + 1}, index=vX.index)

In [None]:
# Transposed view: the predicted labels displayed as a single wide row.
result.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,...,2897,2898,2899,2900,2901,2902,2903,2904,2905,2906,2907,2908,2909,2910,2911,2912,2913,2914,2915,2916,2917,2918,2919,2920,2921,2922,2923,2924,2925,2926,2927,2928,2929,2930,2931,2932,2933,2934,2935,2936,2937,2938,2939,2940,2941,2942,2943,2944,2945,2946
y,1,6,6,2,1,6,2,6,1,5,6,3,6,5,1,5,3,2,2,2,2,3,4,2,5,1,4,5,5,5,3,3,2,3,3,3,5,5,6,5,3,5,6,6,6,2,2,1,6,3,...,2,5,4,5,2,3,2,6,1,5,1,5,2,3,4,5,2,2,1,3,1,5,5,2,6,3,6,6,6,3,6,5,3,5,1,4,1,3,6,6,2,1,3,2,4,5,6,3,4,2


In [None]:
# Write the submission file; ToCSV appends '.csv' to the given name.
# File name used for an earlier experiment, kept for reference:
#ToCSV(result, 't28 2dropout0.2 batchnorm1d64 161-81-6 28 epochs pca.9985 adamlr0.0002 load batch 128 75kobs')
ToCSV(result, 'final5 15 epochs fin loss 0.0009')

In [None]:
# Print the time elapsed since `tmr` was created; the message is highlighted if the limit was exceeded.
tmr.ShowTime()    # measure Colab's runtime. Do not remove. Keep as the last cell in your notebook.

Runtime is 34 sec
