# Backpack prediction Competition
[link](https://www.kaggle.com/competitions/playground-series-s5e2/overview)

In [13]:
%pip install kagglehub numpy Pillow scikit-learn torch torchvision matplotlib opencv-python pandas 1>/dev/null
import torch
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pandas as pd
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
# Force CPU
# device = 'cpu'
print(f"Using {device} device")

%matplotlib inline


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/home/pj/.pyenv/versions/py310_env/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Using cuda device


## Load Data

In [35]:
import kagglehub
from PIL import Image
import os
from sklearn.model_selection import train_test_split

# Download the latest version of the dataset to the specified subdirectory.
dataset_path = kagglehub.competition_download("playground-series-s5e2")

print("Path to dataset files:", dataset_path)

# with open(f"{dataset_path}/train.csv", 'r') as f:
#     print(f.readlines())

raw_data = pd.read_csv(f"{dataset_path}/train.csv")

X_pd_raw = raw_data.copy().drop('Price', axis=1).drop('id', axis=1)
y_pd_raw = raw_data['Price']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_pd_raw, y_pd_raw)

X_train_raw

Path to dataset files: /home/pj/.cache/kagglehub/competitions/playground-series-s5e2


Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
54618,Under Armour,Canvas,Medium,10.0,Yes,No,Tote,Black,17.039422
229739,Nike,Leather,,7.0,No,Yes,Tote,Red,19.749471
241419,Jansport,Canvas,Large,6.0,Yes,Yes,Messenger,Red,27.806947
125077,Puma,Leather,Small,1.0,Yes,Yes,Tote,Gray,17.389870
164992,Nike,Leather,Small,7.0,Yes,No,Tote,Black,12.943406
...,...,...,...,...,...,...,...,...,...
231621,,Polyester,Large,10.0,No,,Tote,,8.086978
47632,Nike,Canvas,Small,7.0,Yes,No,Messenger,Gray,15.151423
24197,Under Armour,Leather,Large,10.0,Yes,No,Backpack,Pink,18.574369
2862,Under Armour,Leather,Large,3.0,Yes,Yes,Tote,Gray,26.032457


## Preprocessing

In [71]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

num_features = ['Compartments', 'Weight Capacity (kg)']
cat_features = ['Brand', 'Material', 'Size', 'Laptop Compartment', 'Waterproof', 'Style', 'Color']

missing_cols = [
    col
    for col in X_pd.columns
    if (col not in num_features) and (col not in cat_features)
]
assert len(missing_cols) == 0, f"not all columns accounted for; missing {missing_cols}"

cat_pipeline = Pipeline(steps=[
    ('imputer_mf', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder())
])

num_pipeline = Pipeline(steps=[
    ('imputer_mean', SimpleImputer(strategy="mean")),
    ('scaler', StandardScaler())
])

full_pipeline = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features)
    ],
    remainder='drop'  # Drop any columns not specified (optional, default is 'drop')
)

full_pipeline.fit(X_train_raw)
X_train = full_pipeline.transform(X_train_raw)
X_test = full_pipeline.transform(X_test_raw)

X_test.shape

(75000, 27)

## Create Torch DataLoader 

In [77]:
from torch.utils.data import DataLoader, random_split, TensorDataset
from torchvision import datasets, transforms

X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).to(device)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


## Training the network

In [78]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import torch.nn.functional as F

class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        # We can use nn.Flatten to flatten the input in the forward pass.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(27, 27),
            nn.ReLU(),
            nn.Linear(27, 13),
            nn.ReLU(),
            nn.Linear(13, 1)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [85]:
batch_size = 64

def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # print("train:", X.shape)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
        # loss.requires_grad = True
        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 1000 == 0:
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"Train - loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    return loss


def test_loop(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            print("test:", X.shape)
            pred = model(X).squeeze(-1)
            test_loss += loss_fn(pred, y).item()
            # correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    # correct /= size
    # print(f"Test - Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f}")

    return test_loss

model = NeuralNetwork().to(device)

# Initialize the loss function
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

trains = []
tests = []
# start = time.time()
for t in range(100):
    train_loss = train_loop(train_loader, model, loss_fn, optimizer)
    test_loss = test_loop(test_loader, model, loss_fn)

    if t % 1 == 0:
        print(f"epoch {t} - train_loss: {train_loss}, test_loss: {test_loss}")
    trains.append(float(train_loss))
    tests.append(float(test_loss))

# torch.Size([64, 27])
# 

plt.plot(trains)
plt.plot(tests)
plt.legend(['train', 'test'])

Train - loss: 8185.339355  [   64/225000]
Train - loss: 1313.951172  [64064/225000]
Train - loss: 1569.489990  [128064/225000]
Train - loss: 1590.723633  [192064/225000]
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64, 27])
test: torch.Size([64

KeyboardInterrupt: 